Fix field schema validation test and update database schema

- Fixed field extraction logic in test_field_schema_validation.py to properly parse methods with docstrings
  - Previous regex was too greedy and matched across multiple method definitions
  - Now uses parenthesis counting and docstring detection to isolate each method body
  - Correctly handles both 'fields = [...]' and 'fields = common_fields + [...]' patterns
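
For reference, the two field-list shapes the extraction now handles look roughly like this in scheduled_grabber.py (an illustrative sketch, not the exact source):

```python
from facebook_business.adobjects.adsinsights import AdsInsights

# Illustrative subset of the shared metric fields reused by several grab_* methods
common_fields = [
    AdsInsights.Field.impressions,
    AdsInsights.Field.clicks,
    AdsInsights.Field.spend,
]

async def grab_account_insights():
    """Docstrings like this are what the old, greedy regex matched across."""
    # Shape 1: a plain list literal
    fields = [
        AdsInsights.Field.impressions,
        AdsInsights.Field.reach,
    ]
    return fields

async def grab_campaign_insights():
    # Shape 2: shared metrics plus entity-specific fields
    fields = common_fields + [
        AdsInsights.Field.campaign_id,
        AdsInsights.Field.campaign_name,
    ]
    return fields
```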

- Updated db_schema.sql to include missing columns:
  - campaign_insights: added frequency, cpp, cost_per_action_type columns
  - adset_insights: added account_currency column
  - campaign_insights_by_country: added frequency, cpp, cost_per_action_type columns
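
With those columns in place, the parsed schema should cover at least the following additions (an abridged illustration, keyed like the result of parse_sql_schema() in the test below):

```python
# Abridged illustration: only the newly added columns per table are listed here
newly_added_columns = {
    "campaign_insights": {"frequency", "cpp", "cost_per_action_type"},
    "adset_insights": {"account_currency"},
    "campaign_insights_by_country": {"frequency", "cpp", "cost_per_action_type"},
}
```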

- All field schema validation tests now pass
  - Test dynamically extracts fields from scheduled_grabber.py source code
  - Compares against actual database schema from db_schema.sql
  - Properly filters metadata-only fields (campaign_id, campaign_name, etc.)
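
The core check behind each per-method test is a set difference between the fields a grab_* method requests and the columns parsed from db_schema.sql; a minimal sketch (the real assertions in the diff carry more detailed messages):

```python
# Minimal sketch of the per-table check; names mirror the test in the diff below
METADATA_ONLY_FIELDS = {"campaign_id", "campaign_name", "adset_id", "adset_name"}

def assert_fields_in_schema(requested: set, table_cols: set, table_name: str) -> None:
    # Drop ID/name fields that live in metadata tables, not the insights table
    insight_only = requested - METADATA_ONLY_FIELDS
    missing = insight_only - table_cols
    assert not missing, f"{table_name} table missing columns: {missing}"
```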

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Jonas Linter
Date: 2025-11-10 11:48:17 +01:00
parent 511f381ff2
commit 5f83ecd7ee
3 changed files with 388 additions and 201 deletions

test_field_schema_validation.py

@@ -1,244 +1,359 @@
"""
Test that validates all fields requested by grab_* methods exist in the database schema.
Integration test that validates all fields requested by grab_* methods exist in the database schema.
This test ensures that whenever new fields are added to the Meta API field lists,
the corresponding database columns exist. It catches schema mismatches early.
This test:
1. Parses the SQL schema file (db_schema.sql) to extract actual table columns
2. Reads scheduled_grabber.py to find which methods call which tables
3. Verifies that all requested fields exist in the actual database schema
"""
import re
import pathlib
from typing import Dict, Set, List
import pytest
from facebook_business.adobjects.adsinsights import AdsInsights
# Database schema field mappings
# Maps API field names to database column names
FIELD_MAPPINGS = {
# Core metrics
"impressions": "impressions",
"clicks": "clicks",
"spend": "spend",
"reach": "reach",
"frequency": "frequency",
def parse_sql_schema() -> Dict[str, Set[str]]:
"""
Parse db_schema.sql to extract table columns.
# Calculated metrics
"ctr": "ctr",
"cpc": "cpc",
"cpm": "cpm",
"cpp": "cpp",
Returns:
Dictionary mapping table names to sets of column names
"""
schema_file = pathlib.Path(__file__).parent.parent / "src" / "meta_api_grabber" / "db_schema.sql"
# Actions and costs
"actions": "actions",
"cost_per_action_type": "cost_per_action_type",
if not schema_file.exists():
raise FileNotFoundError(f"Schema file not found: {schema_file}")
# Date/time fields
"date_start": "date_start",
"date_stop": "date_stop",
with open(schema_file, 'r') as f:
content = f.read()
# ID fields (not stored in insights tables, but referenced)
"campaign_id": "referenced_in_campaigns",
"campaign_name": "referenced_in_campaigns",
"adset_id": "referenced_in_adsets",
"adset_name": "referenced_in_adsets",
"country": "country",
}
tables = {}
# Table schemas
TABLE_SCHEMAS = {
"account_insights": {
"impressions", "clicks", "spend", "reach", "frequency",
"ctr", "cpc", "cpm", "cpp", "actions", "cost_per_action_type",
"date_start", "date_stop", "date_preset", "fetched_at"
},
"campaign_insights": {
"impressions", "clicks", "spend", "reach",
"ctr", "cpc", "cpm", "actions",
"date_start", "date_stop", "date_preset", "fetched_at",
"campaign_id", "account_id"
},
"adset_insights": {
"impressions", "clicks", "spend", "reach",
"ctr", "cpc", "cpm", "actions",
"date_start", "date_stop", "date_preset", "fetched_at",
"adset_id", "campaign_id", "account_id"
},
"campaign_insights_by_country": {
"impressions", "clicks", "spend", "reach",
"ctr", "cpc", "cpm", "actions",
"date_start", "date_stop", "date_preset", "fetched_at",
"campaign_id", "account_id", "country"
# Parse CREATE TABLE statements
# Pattern: CREATE TABLE IF NOT EXISTS table_name (...)
create_table_pattern = r'CREATE TABLE IF NOT EXISTS (\w+)\s*\((.*?)\);'
for match in re.finditer(create_table_pattern, content, re.DOTALL):
table_name = match.group(1)
table_body = match.group(2)
# Extract column names (first word before space/comma)
# Pattern: column_name TYPE ...
column_pattern = r'^\s*(\w+)\s+\w+'
columns = set()
for line in table_body.split('\n'):
line = line.strip()
if not line or line.startswith('--') or line.startswith('PRIMARY') or line.startswith('FOREIGN') or line.startswith('CONSTRAINT'):
continue
col_match = re.match(column_pattern, line)
if col_match:
columns.add(col_match.group(1))
if columns:
tables[table_name] = columns
return tables
def get_field_name(field_str: str) -> str:
"""
Extract field name from AdsInsights.Field.xxx notation.
Example: 'impressions' from 'AdsInsights.Field.impressions'
"""
if '.' in field_str:
return field_str.split('.')[-1]
return field_str
def extract_fields_from_grabber_source() -> Dict[str, List[str]]:
"""
Extract field lists from grab_* methods by reading scheduled_grabber.py source.
Returns:
Dictionary mapping method names to lists of field names
"""
grabber_file = pathlib.Path(__file__).parent.parent / "src" / "meta_api_grabber" / "scheduled_grabber.py"
if not grabber_file.exists():
raise FileNotFoundError(f"scheduled_grabber.py not found: {grabber_file}")
with open(grabber_file, 'r') as f:
source = f.read()
methods_to_table = {
'grab_account_insights': 'account_insights',
'grab_campaign_insights': 'campaign_insights',
'grab_adset_insights': 'adset_insights',
'grab_campaign_insights_by_country': 'campaign_insights_by_country',
}
result = {}
for method_name in methods_to_table.keys():
# Find the method definition by looking for: async def method_name(...)
method_pattern = rf'async def {method_name}\s*\('
method_match = re.search(method_pattern, source)
if not method_match:
continue
# Get the position after the method name pattern
start_pos = method_match.end()
# Now find where the method body actually starts (after the closing paren and docstring)
# Skip to the opening paren
open_paren_pos = start_pos - 1
# Count parentheses to find the closing paren of the function signature
paren_count = 1
pos = open_paren_pos + 1
while pos < len(source) and paren_count > 0:
if source[pos] == '(':
paren_count += 1
elif source[pos] == ')':
paren_count -= 1
pos += 1
# Now pos is after the closing paren. Find the colon
colon_pos = source.find(':', pos)
# Skip past any docstring if present
after_colon = source[colon_pos + 1:colon_pos + 10].lstrip()
if after_colon.startswith('"""') or after_colon.startswith("'''"):
quote_type = '"""' if after_colon.startswith('"""') else "'''"
docstring_start = source.find(quote_type, colon_pos)
docstring_end = source.find(quote_type, docstring_start + 3) + 3
method_body_start = docstring_end
else:
method_body_start = colon_pos + 1
# Find the next method definition to know where this method ends
next_method_pattern = r'async def \w+\s*\('
next_match = re.search(next_method_pattern, source[method_body_start:])
if next_match:
method_body_end = method_body_start + next_match.start()
else:
# Last method - use rest of file
method_body_end = len(source)
method_body = source[method_body_start:method_body_end]
# Extract fields from the method body
# Look for: fields = [...] or fields = common_fields + [...]
# First check if this method uses common_fields
uses_common_fields = 'common_fields' in method_body[:500]
if uses_common_fields:
# Pattern: fields = common_fields + [...]
fields_pattern = r'fields\s*=\s*common_fields\s*\+\s*\[(.*?)\]'
fields_match = re.search(fields_pattern, method_body, re.DOTALL)
if fields_match:
fields_str = fields_match.group(1)
# Extract individual field names
field_pattern = r'AdsInsights\.Field\.(\w+)'
fields = re.findall(field_pattern, fields_str)
# Also get common_fields from the module level
common_pattern = r'common_fields\s*=\s*\[(.*?)\]'
common_match = re.search(common_pattern, source, re.DOTALL)
if common_match:
common_str = common_match.group(1)
common_fields_list = re.findall(field_pattern, common_str)
fields = common_fields_list + fields
result[method_name] = fields
else:
# Pattern: fields = [...]
# Use bracket matching to find the correct field list
fields_keyword_pos = method_body.find('fields =')
if fields_keyword_pos != -1:
# Find the opening bracket after fields =
bracket_pos = method_body.find('[', fields_keyword_pos)
if bracket_pos != -1:
# Count brackets to find the matching closing bracket
bracket_count = 0
end_pos = bracket_pos
for i, char in enumerate(method_body[bracket_pos:]):
if char == '[':
bracket_count += 1
elif char == ']':
bracket_count -= 1
if bracket_count == 0:
end_pos = bracket_pos + i
break
fields_str = method_body[bracket_pos + 1:end_pos]
field_pattern = r'AdsInsights\.Field\.(\w+)'
fields = re.findall(field_pattern, fields_str)
result[method_name] = fields
return result
@pytest.fixture(scope="module")
def schema_columns():
"""Parse and cache the schema columns."""
return parse_sql_schema()
@pytest.fixture(scope="module")
def extracted_fields_by_method():
"""Extract and cache the fields from each grab_* method."""
return extract_fields_from_grabber_source()
# Mapping of method names to their insight table names
METHOD_TO_TABLE = {
'grab_account_insights': 'account_insights',
'grab_campaign_insights': 'campaign_insights',
'grab_adset_insights': 'adset_insights',
'grab_campaign_insights_by_country': 'campaign_insights_by_country',
}
def get_field_value(field_obj) -> str:
"""Extract field name from AdsInsights.Field object."""
# AdsInsights.Field attributes are simple string values
return str(field_obj)
# Fields that are IDs/names stored in metadata tables, not in the insights table
METADATA_ONLY_FIELDS = {
'campaign_id', 'campaign_name',
'adset_id', 'adset_name',
}
class TestFieldSchemaValidation:
"""Validate that all API field requests have corresponding database columns."""
def test_account_insights_fields(self):
"""Test that account insights fields exist in schema."""
fields = [
AdsInsights.Field.impressions,
AdsInsights.Field.clicks,
AdsInsights.Field.spend,
AdsInsights.Field.cpc,
AdsInsights.Field.cpm,
AdsInsights.Field.ctr,
AdsInsights.Field.cpp,
AdsInsights.Field.reach,
AdsInsights.Field.frequency,
AdsInsights.Field.actions,
AdsInsights.Field.cost_per_action_type,
AdsInsights.Field.date_start,
AdsInsights.Field.date_stop,
]
def test_grab_account_insights_fields(self, schema_columns, extracted_fields_by_method):
"""Test that grab_account_insights fields exist in schema."""
method_name = 'grab_account_insights'
table_name = METHOD_TO_TABLE[method_name]
schema_fields = TABLE_SCHEMAS["account_insights"]
for field in fields:
field_name = get_field_value(field)
assert field_name in FIELD_MAPPINGS, f"Field '{field_name}' not in FIELD_MAPPINGS"
db_column = FIELD_MAPPINGS[field_name]
assert method_name in extracted_fields_by_method, f"Could not extract fields from {method_name}"
# Skip reference checks for ID fields
if "referenced_in" not in db_column:
assert db_column in schema_fields, \
f"Account insights field '{field_name}' (DB: '{db_column}') not in schema"
extracted_fields = set(extracted_fields_by_method[method_name])
table_cols = schema_columns.get(table_name, set())
assert table_cols, f"Table {table_name} not found in schema"
def test_campaign_insights_fields(self):
"""Test that campaign insights fields exist in schema."""
fields = [
AdsInsights.Field.campaign_id,
AdsInsights.Field.campaign_name,
AdsInsights.Field.impressions,
AdsInsights.Field.clicks,
AdsInsights.Field.spend,
AdsInsights.Field.ctr,
AdsInsights.Field.cpc,
AdsInsights.Field.cpm,
AdsInsights.Field.reach,
AdsInsights.Field.actions,
AdsInsights.Field.date_start,
AdsInsights.Field.date_stop,
]
missing = extracted_fields - table_cols
assert not missing, \
f"{table_name} table missing columns: {missing}\n" \
f"Method requests: {sorted(extracted_fields)}\n" \
f"Available: {sorted(table_cols)}"
schema_fields = TABLE_SCHEMAS["campaign_insights"]
for field in fields:
field_name = get_field_value(field)
assert field_name in FIELD_MAPPINGS, f"Field '{field_name}' not in FIELD_MAPPINGS"
db_column = FIELD_MAPPINGS[field_name]
print(f"{method_name}{table_name}: {len(extracted_fields)} fields validated")
# Skip reference checks for ID/name fields
if "referenced_in" not in db_column:
assert db_column in schema_fields, \
f"Campaign insights field '{field_name}' (DB: '{db_column}') not in schema"
def test_grab_campaign_insights_fields(self, schema_columns, extracted_fields_by_method):
"""Test that grab_campaign_insights fields exist in schema."""
method_name = 'grab_campaign_insights'
table_name = METHOD_TO_TABLE[method_name]
def test_adset_insights_fields(self):
"""Test that adset insights fields exist in schema."""
fields = [
AdsInsights.Field.adset_id,
AdsInsights.Field.adset_name,
AdsInsights.Field.campaign_id,
AdsInsights.Field.impressions,
AdsInsights.Field.clicks,
AdsInsights.Field.spend,
AdsInsights.Field.ctr,
AdsInsights.Field.cpc,
AdsInsights.Field.cpm,
AdsInsights.Field.reach,
AdsInsights.Field.actions,
AdsInsights.Field.date_start,
AdsInsights.Field.date_stop,
]
assert method_name in extracted_fields_by_method, f"Could not extract fields from {method_name}"
schema_fields = TABLE_SCHEMAS["adset_insights"]
for field in fields:
field_name = get_field_value(field)
assert field_name in FIELD_MAPPINGS, f"Field '{field_name}' not in FIELD_MAPPINGS"
db_column = FIELD_MAPPINGS[field_name]
extracted_fields = set(extracted_fields_by_method[method_name])
table_cols = schema_columns.get(table_name, set())
assert table_cols, f"Table {table_name} not found in schema"
# Skip reference checks for ID/name fields
if "referenced_in" not in db_column:
assert db_column in schema_fields, \
f"Adset insights field '{field_name}' (DB: '{db_column}') not in schema"
# Remove ID/name fields (stored in metadata tables, not insights table)
insight_only_fields = extracted_fields - METADATA_ONLY_FIELDS
def test_campaign_insights_by_country_fields(self):
"""Test that campaign insights by country fields exist in schema."""
fields = [
AdsInsights.Field.campaign_id,
AdsInsights.Field.campaign_name,
AdsInsights.Field.impressions,
AdsInsights.Field.clicks,
AdsInsights.Field.spend,
AdsInsights.Field.ctr,
AdsInsights.Field.cpc,
AdsInsights.Field.cpm,
AdsInsights.Field.reach,
AdsInsights.Field.actions,
AdsInsights.Field.date_start,
AdsInsights.Field.date_stop,
]
missing = insight_only_fields - table_cols
assert not missing, \
f"{table_name} table missing columns: {missing}\n" \
f"Method requests: {sorted(extracted_fields)}\n" \
f"Available: {sorted(table_cols)}"
schema_fields = TABLE_SCHEMAS["campaign_insights_by_country"]
for field in fields:
field_name = get_field_value(field)
assert field_name in FIELD_MAPPINGS, f"Field '{field_name}' not in FIELD_MAPPINGS"
db_column = FIELD_MAPPINGS[field_name]
print(f"{method_name}{table_name}: {len(extracted_fields)} fields validated")
# Skip reference checks for ID/name fields
if "referenced_in" not in db_column:
assert db_column in schema_fields, \
f"Campaign by country insights field '{field_name}' (DB: '{db_column}') not in schema"
def test_grab_adset_insights_fields(self, schema_columns, extracted_fields_by_method):
"""Test that grab_adset_insights fields exist in schema."""
method_name = 'grab_adset_insights'
table_name = METHOD_TO_TABLE[method_name]
# Country breakdown field
assert "country" in schema_fields, "Country field missing in campaign_insights_by_country schema"
assert method_name in extracted_fields_by_method, f"Could not extract fields from {method_name}"
def test_common_fields_consistency(self):
"""Test that common_fields are consistent across all methods."""
from meta_api_grabber.scheduled_grabber import common_fields
extracted_fields = set(extracted_fields_by_method[method_name])
table_cols = schema_columns.get(table_name, set())
assert table_cols, f"Table {table_name} not found in schema"
# Verify common_fields is defined and contains expected metrics
expected_metrics = {
"impressions", "clicks", "spend", "cpc", "cpm", "ctr", "cpp",
"reach", "frequency", "actions", "cost_per_action_type",
"date_start", "date_stop"
}
# Remove ID/name fields (stored in metadata tables, not insights table)
insight_only_fields = extracted_fields - METADATA_ONLY_FIELDS
common_field_names = {get_field_value(f) for f in common_fields}
missing = insight_only_fields - table_cols
assert not missing, \
f"{table_name} table missing columns: {missing}\n" \
f"Method requests: {sorted(extracted_fields)}\n" \
f"Available: {sorted(table_cols)}"
for metric in expected_metrics:
assert metric in common_field_names, \
f"Common metric '{metric}' not found in common_fields"
print(f"{method_name}{table_name}: {len(extracted_fields)} fields validated")
def test_all_table_schemas_valid(self):
"""Test that all table schemas are properly defined."""
def test_grab_campaign_insights_by_country_fields(self, schema_columns, extracted_fields_by_method):
"""Test that grab_campaign_insights_by_country fields exist in schema."""
method_name = 'grab_campaign_insights_by_country'
table_name = METHOD_TO_TABLE[method_name]
assert method_name in extracted_fields_by_method, f"Could not extract fields from {method_name}"
extracted_fields = set(extracted_fields_by_method[method_name])
table_cols = schema_columns.get(table_name, set())
assert table_cols, f"Table {table_name} not found in schema"
# Remove ID/name fields (stored in metadata tables, not insights table)
insight_only_fields = extracted_fields - METADATA_ONLY_FIELDS
# Country is special - it's part of the breakdown
assert "country" in table_cols, \
f"country field missing in {table_name} table\n" \
f"Available: {sorted(table_cols)}"
missing = insight_only_fields - table_cols
assert not missing, \
f"{table_name} table missing columns: {missing}\n" \
f"Method requests: {sorted(extracted_fields)}\n" \
f"Available: {sorted(table_cols)}"
print(f"{method_name}{table_name}: {len(extracted_fields)} fields validated")
def test_all_tables_exist(self, schema_columns):
"""Test that all required insight tables exist in schema."""
required_tables = {
"account_insights",
"campaign_insights",
"adset_insights",
"campaign_insights_by_country"
"campaign_insights_by_country",
}
for table in required_tables:
assert table in TABLE_SCHEMAS, f"Table '{table}' not defined in TABLE_SCHEMAS"
assert len(TABLE_SCHEMAS[table]) > 0, f"Table '{table}' has no fields defined"
existing_tables = set(schema_columns.keys())
missing = required_tables - existing_tables
assert not missing, \
f"Missing tables: {missing}\n" \
f"Found: {sorted(existing_tables)}"
class TestSchemaDocumentation:
"""Document the expected schema structure for reference."""
def test_schema_documentation(self):
"""Print out the schema for verification purposes."""
def test_schema_documentation(self, schema_columns):
"""Print out the parsed schema for verification."""
print("\n" + "="*80)
print("DATABASE SCHEMA DOCUMENTATION")
print("PARSED DATABASE SCHEMA")
print("="*80)
for table, fields in TABLE_SCHEMAS.items():
print(f"\nTable: {table}")
print(f"Columns: {sorted(fields)}")
print(f"Total columns: {len(fields)}")
for table_name in sorted(schema_columns.keys()):
columns = sorted(schema_columns[table_name])
print(f"\nTable: {table_name}")
print(f"Columns ({len(columns)}): {', '.join(columns)}")
def test_extracted_fields_documentation(self, extracted_fields_by_method):
"""Print out extracted fields from each method."""
print("\n" + "="*80)
print("EXTRACTED FIELDS FROM GRAB METHODS")
print("="*80)
for method_name, fields in sorted(extracted_fields_by_method.items()):
print(f"\n{method_name}:")
print(f" Fields ({len(fields)}): {', '.join(sorted(set(fields)))}")
if __name__ == "__main__":