Files
dictia-public/tests/test_migration_compatibility.py

252 lines
11 KiB
Python

"""
Test suite to ensure database migrations are compatible with both SQLite and PostgreSQL.
These tests scan the init_db.py file for patterns that would break on PostgreSQL,
such as SQLite-only boolean defaults (0/1 instead of FALSE/TRUE) and unquoted
reserved keywords.
Run with: python tests/test_migration_compatibility.py
"""
import re
import unittest
import os
class TestMigrationCompatibility(unittest.TestCase):
"""Tests to ensure init_db.py uses cross-database compatible SQL."""
@classmethod
def setUpClass(cls):
"""Load init_db.py content once for all tests."""
# Find the project root
test_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(test_dir)
init_db_path = os.path.join(project_root, 'src', 'init_db.py')
with open(init_db_path, 'r') as f:
cls.content = f.read()
def test_no_raw_boolean_defaults_in_alter_table(self):
"""
Ensure no raw ALTER TABLE statements use SQLite-only boolean defaults.
The pattern 'BOOLEAN DEFAULT 0' or 'BOOLEAN DEFAULT 1' in raw SQL
will fail on PostgreSQL, which requires 'DEFAULT FALSE' or 'DEFAULT TRUE'.
Using add_column_if_not_exists() handles this conversion automatically.
"""
# Pattern to find raw SQL with text() that has BOOLEAN DEFAULT 0/1
# This matches: text('... BOOLEAN DEFAULT 0 ...') or text("...")
pattern = r"conn\.execute\s*\(\s*text\s*\(['\"]([^'\"]*BOOLEAN\s+DEFAULT\s+[01][^'\"]*)['\"]"
matches = re.findall(pattern, self.content, re.IGNORECASE)
# Filter out false positives - we're looking for raw ALTER TABLE statements
# not UPDATE statements or other SQL that legitimately uses 0/1
problematic = []
for match in matches:
match_upper = match.upper()
# Only flag if it's an ALTER TABLE with BOOLEAN DEFAULT 0/1
if 'ALTER TABLE' in match_upper and 'BOOLEAN' in match_upper:
if 'DEFAULT 0' in match or 'DEFAULT 1' in match:
problematic.append(match)
self.assertEqual(
len(problematic), 0,
f"Found SQLite-only boolean defaults in raw ALTER TABLE statements. "
f"Use add_column_if_not_exists() instead:\n" +
"\n".join(f" - {m[:100]}..." if len(m) > 100 else f" - {m}" for m in problematic)
)
def test_no_boolean_integer_comparisons_in_raw_sql(self):
"""
Ensure raw SQL doesn't compare boolean columns to integers (0/1).
PostgreSQL strictly separates boolean and integer types:
- 'column = 1' fails with 'operator does not exist: boolean = integer'
- 'column = TRUE' works on both SQLite (3.23+) and PostgreSQL
Known boolean columns in migrations: protect_from_deletion, email_verified,
auto_share_on_apply, share_with_group_lead, is_inbox, is_highlighted,
deletion_exempt, is_admin, can_share_publicly.
"""
boolean_columns = [
'protect_from_deletion', 'email_verified', 'auto_share_on_apply',
'share_with_group_lead', 'is_inbox', 'is_highlighted',
'deletion_exempt', 'is_admin', 'can_share_publicly',
'auto_speaker_labelling', 'auto_summarization'
]
# Find raw SQL in text() calls
sql_pattern = r"text\s*\(\s*['\"\"]\"\"(.*?)['\"\"]\"\"?\s*\)"
# Simpler: find lines with known boolean column = 0 or = 1
problematic = []
for col in boolean_columns:
# Match: column = 0 or column = 1 (not = TRUE/FALSE)
pattern = rf"{col}\s*=\s*[01]\b"
matches = re.finditer(pattern, self.content, re.IGNORECASE)
for match in matches:
# Get surrounding context to check if it's in a text() SQL call
start = max(0, match.start() - 200)
context = self.content[start:match.end() + 50]
if 'text(' in context and 'sqlite_master' not in context:
problematic.append(f"{col}: ...{match.group()}...")
self.assertEqual(
len(problematic), 0,
f"Found boolean columns compared to integers in raw SQL. "
f"Use TRUE/FALSE instead of 1/0 for PostgreSQL compatibility:\n" +
"\n".join(f" - {p}" for p in problematic)
)
def test_reserved_keywords_quoted_in_index_creation(self):
"""
Ensure reserved keywords like 'user' are properly quoted in index creation.
Raw SQL like 'CREATE INDEX ... ON user (column)' will fail on some databases
because 'user' is a reserved keyword. It should be quoted as "user" or use
the create_index_if_not_exists() utility.
"""
reserved_keywords = ['user', 'order', 'group', 'table', 'select', 'index']
problematic = []
for keyword in reserved_keywords:
# Pattern to find unquoted reserved keyword after ON in index creation
# Matches: CREATE INDEX ... ON user ( but not ON "user" or ON `user`
pattern = rf"CREATE\s+(?:UNIQUE\s+)?INDEX[^;]*\s+ON\s+{keyword}\s*\("
matches = re.findall(pattern, self.content, re.IGNORECASE)
for match in matches:
# Skip if the keyword is already quoted
if f'"{keyword}"' in match.lower() or f'`{keyword}`' in match.lower():
continue
problematic.append((keyword, match[:80]))
self.assertEqual(
len(problematic), 0,
f"Found unquoted reserved keywords in index creation. "
f"Use create_index_if_not_exists() or quote the table name:\n" +
"\n".join(f" - '{kw}' in: {sql}..." for kw, sql in problematic)
)
def test_add_column_uses_utility(self):
"""
Ensure most ADD COLUMN operations use add_column_if_not_exists().
Direct ALTER TABLE ADD COLUMN statements should use the utility function
to ensure cross-database compatibility with boolean defaults and quoting.
"""
# Count direct ALTER TABLE ADD COLUMN in text() calls
direct_pattern = r"conn\.execute\s*\(\s*text\s*\(['\"][^'\"]*ALTER\s+TABLE[^'\"]*ADD\s+COLUMN"
direct_matches = re.findall(direct_pattern, self.content, re.IGNORECASE)
# Count uses of add_column_if_not_exists
utility_pattern = r"add_column_if_not_exists\s*\("
utility_matches = re.findall(utility_pattern, self.content)
# We expect most ADD COLUMN operations to use the utility
# Allow some direct usage for special cases (e.g., table recreation)
# but utility usage should significantly outnumber direct usage
self.assertGreater(
len(utility_matches), len(direct_matches),
f"Found {len(direct_matches)} direct ALTER TABLE ADD COLUMN statements "
f"vs {len(utility_matches)} add_column_if_not_exists() calls. "
f"Consider using the utility function for cross-database compatibility."
)
def test_incompatible_types_handled_by_utility(self):
"""
Ensure columns with PostgreSQL-incompatible types (DATETIME, BLOB) are
added through add_column_if_not_exists() which auto-converts them,
and NOT via raw ALTER TABLE statements that would bypass conversion.
PostgreSQL type differences:
- DATETIME -> TIMESTAMP
- BLOB -> BYTEA
"""
incompatible_types = ['DATETIME', 'BLOB']
# Check for raw ALTER TABLE statements using incompatible types
for sql_type in incompatible_types:
pattern = rf"conn\.execute\s*\(\s*text\s*\(['\"][^'\"]*ALTER\s+TABLE[^'\"]*\b{sql_type}\b[^'\"]*['\"]"
matches = re.findall(pattern, self.content, re.IGNORECASE)
self.assertEqual(
len(matches), 0,
f"Found raw ALTER TABLE statements using '{sql_type}' which is incompatible with PostgreSQL. "
f"Use add_column_if_not_exists() which auto-converts types:\n" +
"\n".join(f" - {m[:100]}..." if len(m) > 100 else f" - {m}" for m in matches)
)
# Verify that add_column_if_not_exists calls using these types exist
# (confirming they go through the utility which handles conversion)
for sql_type in incompatible_types:
pattern = rf"add_column_if_not_exists\s*\([^)]*['\"]({sql_type})['\"]"
matches = re.findall(pattern, self.content, re.IGNORECASE)
# Just informational - these are fine because the utility converts them
def test_no_double_quoted_string_defaults(self):
"""
Ensure no SQL DEFAULT values use double-quoted strings.
In SQL, double quotes denote identifiers (column/table names), not string
literals. SQLite tolerates this, but PostgreSQL will interpret DEFAULT "en"
as a reference to a column named "en" and fail with 'column "en" does not exist'.
String defaults must use single quotes: DEFAULT 'en'
"""
# Match DEFAULT followed by a double-quoted string value
pattern = r'DEFAULT\s+"[^"]*"'
lines = self.content.splitlines()
problematic = []
for i, line in enumerate(lines, 1):
if re.search(pattern, line, re.IGNORECASE):
problematic.append(f" Line {i}: {line.strip()}")
self.assertEqual(
len(problematic), 0,
f"Found double-quoted string defaults in init_db.py. "
f"PostgreSQL interprets double quotes as column identifiers, not string literals. "
f"Use single quotes instead (e.g., DEFAULT 'en' not DEFAULT \"en\"):\n" +
"\n".join(problematic)
)
def test_create_index_uses_utility_for_user_table(self):
"""
Ensure index creation on 'user' table uses create_index_if_not_exists().
The 'user' table name is a reserved keyword that requires special quoting.
Using create_index_if_not_exists() handles this automatically.
"""
# Find all index creation on user table
pattern = r"CREATE\s+(?:UNIQUE\s+)?INDEX[^;]*ON\s+[\"'`]?user[\"'`]?\s*\("
# Count raw index creation on user table in text() calls
raw_pattern = r"conn\.execute\s*\(\s*text\s*\(['\"][^'\"]*CREATE\s+(?:UNIQUE\s+)?INDEX[^'\"]*ON\s+[\"'`]?user"
raw_matches = re.findall(raw_pattern, self.content, re.IGNORECASE)
# Count uses of create_index_if_not_exists for user table
utility_pattern = r"create_index_if_not_exists\s*\([^)]*['\"]user['\"]"
utility_matches = re.findall(utility_pattern, self.content, re.IGNORECASE)
# All index creation on user table should use the utility
# (excluding table recreation scenarios which have their own quoting)
if len(raw_matches) > 0:
# Check if these are in table recreation blocks (acceptable)
table_recreation_pattern = r"CREATE\s+TABLE\s+user_new"
has_table_recreation = re.search(table_recreation_pattern, self.content, re.IGNORECASE)
if not has_table_recreation or len(raw_matches) > 1:
self.fail(
f"Found {len(raw_matches)} raw CREATE INDEX statements on 'user' table. "
f"Use create_index_if_not_exists() for proper quoting of reserved keywords."
)
# Allow running this module directly: python tests/test_migration_compatibility.py
if __name__ == '__main__':
    unittest.main()