Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)

2026-03-16 21:47:37 +00:00
commit 42772a31ed
365 changed files with 103572 additions and 0 deletions
--- a/tests/test_connector_architecture.py
+++ b/tests/test_connector_architecture.py
@@ -0,0 +1,564 @@
+#!/usr/bin/env python3
+"""
+Test script for the transcription connector architecture.
+
+This script tests:
+1. Connector auto-detection from environment variables
+2. Backwards compatibility with legacy config
+3. Connector specifications and capabilities
+4. Chunking logic (connector-aware)
+5. Codec handling per connector
+6. Request/Response data types
+
+Run with: docker exec speakr-dev python /app/tests/test_connector_architecture.py
+"""
+
+import os
+import sys
+import io
+import json
+from pathlib import Path
+
+# Put the directory two levels above this file at the front of sys.path so
+# the `src.*` imports used by the tests below are looked up there first.
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+# Module-level tallies updated by run_test(): counts of passed/failed checks,
+# plus a list of (test name, error message) tuples for the final summary.
+PASSED = 0
+FAILED = 0
+ERRORS = []
+
+
+def run_test(name, func):
+    """Run one named check, record its outcome and print a status line.
+
+    Args:
+        name: Human-readable label printed next to the result.
+        func: Zero-argument callable; it signals failure by raising.
+
+    A successful call increments ``PASSED``. An ``AssertionError`` or any
+    other ``Exception`` increments ``FAILED`` and appends ``(name, message)``
+    to ``ERRORS``.
+
+    When this file is executed as a script the error is swallowed so the
+    remaining checks still run. When the check runs under pytest (detected
+    through the ``PYTEST_CURRENT_TEST`` environment variable that pytest sets
+    while a test is executing) the error is re-raised after being recorded;
+    otherwise every ``test_*`` function that routes its checks through this
+    helper would be reported as passing no matter what happened.
+    """
+    global PASSED, FAILED, ERRORS
+    try:
+        func()
+        print(f"  ✓ {name}")
+        PASSED += 1
+    except AssertionError as e:
+        print(f"  ✗ {name}: {e}")
+        FAILED += 1
+        ERRORS.append((name, str(e)))
+        # Surface the failure to pytest instead of hiding it.
+        if 'PYTEST_CURRENT_TEST' in os.environ:
+            raise
+    except Exception as e:
+        print(f"  ✗ {name}: EXCEPTION - {e}")
+        FAILED += 1
+        ERRORS.append((name, f"Exception: {e}"))
+        # Surface the failure to pytest instead of hiding it.
+        if 'PYTEST_CURRENT_TEST' in os.environ:
+            raise
+
+
+def clear_env():
+    """Remove every transcription-related variable from ``os.environ``.
+
+    Variables that are not currently set are skipped, so the function can be
+    called repeatedly without raising.
+    """
+    transcription_vars = (
+        'TRANSCRIPTION_CONNECTOR',
+        'TRANSCRIPTION_API_KEY',
+        'TRANSCRIPTION_BASE_URL',
+        'TRANSCRIPTION_MODEL',
+        'WHISPER_MODEL',
+        'USE_ASR_ENDPOINT',
+        'ASR_BASE_URL',
+        'ASR_DIARIZE',
+        'ASR_RETURN_SPEAKER_EMBEDDINGS',
+        'ASR_TIMEOUT',
+        'ASR_MIN_SPEAKERS',
+        'ASR_MAX_SPEAKERS',
+        'ENABLE_CHUNKING',
+        'CHUNK_LIMIT',
+        'CHUNK_OVERLAP_SECONDS',
+        'AUDIO_UNSUPPORTED_CODECS',
+    )
+    for var_name in transcription_vars:
+        if var_name in os.environ:
+            del os.environ[var_name]
+
+
+def reset_registry():
+    """Return the connector registry to its uninitialised state.
+
+    Clears the module-level ``_registry`` reference and the class-level
+    ``_instance``, ``_initialized``, ``_active_connector`` and
+    ``_connector_name`` attributes of ``ConnectorRegistry``.
+    """
+    from src.services.transcription import registry as registry_module
+
+    registry_module._registry = None
+
+    registry_cls = registry_module.ConnectorRegistry
+    registry_cls._instance = None
+    registry_cls._initialized = False
+    registry_cls._active_connector = None
+    registry_cls._connector_name = ""
+
+
+# =============================================================================
+# TEST SECTION 1: Base Classes and Data Types
+# =============================================================================
+
+def test_base_classes():
+    """Exercise the enum and data classes defined in the transcription base module."""
+    print("\n=== Testing Base Classes ===")
+
+    from src.services.transcription.base import (
+        TranscriptionCapability, ConnectorSpecifications, TranscriptionRequest,
+        TranscriptionResponse, TranscriptionSegment,
+    )
+
+    def check_capability_members():
+        # Each expected member must exist and be non-None.
+        for member in ('DIARIZATION', 'TIMESTAMPS', 'SPEAKER_COUNT_CONTROL'):
+            assert getattr(TranscriptionCapability, member) is not None
+    run_test("TranscriptionCapability enum has expected values", check_capability_members)
+
+    def check_spec_defaults():
+        default_specs = ConnectorSpecifications()
+        assert default_specs.max_file_size_bytes is None
+        assert default_specs.handles_chunking_internally is False
+        assert default_specs.recommended_chunk_seconds == 600
+    run_test("ConnectorSpecifications has correct defaults", check_spec_defaults)
+
+    def check_spec_overrides():
+        size_limit = 25 * 1024 * 1024
+        custom_specs = ConnectorSpecifications(
+            max_file_size_bytes=size_limit,
+            handles_chunking_internally=True,
+            unsupported_codecs=frozenset({'opus'}),
+        )
+        assert custom_specs.max_file_size_bytes == size_limit
+        assert 'opus' in custom_specs.unsupported_codecs
+    run_test("ConnectorSpecifications with custom values", check_spec_overrides)
+
+    def check_request_fields():
+        fake_audio = io.BytesIO(b"fake audio data")
+        req = TranscriptionRequest(audio_file=fake_audio, filename="test.wav", diarize=True)
+        assert req.filename == "test.wav"
+        assert req.diarize is True
+    run_test("TranscriptionRequest creation", check_request_fields)
+
+    def check_storage_format():
+        first = TranscriptionSegment(text="Hello", speaker="SPEAKER_00", start_time=0.0, end_time=1.0)
+        second = TranscriptionSegment(text="World", speaker="SPEAKER_01", start_time=1.0, end_time=2.0)
+        resp = TranscriptionResponse(text="Hello World", segments=[first, second], provider="test")
+        # The storage format is expected to be a JSON document with one entry per segment.
+        stored = json.loads(resp.to_storage_format())
+        assert len(stored) == 2
+        assert stored[0]['speaker'] == "SPEAKER_00"
+    run_test("TranscriptionResponse to_storage_format", check_storage_format)
+
+    def check_has_diarization():
+        with_speaker = TranscriptionResponse(
+            text="Hello",
+            segments=[TranscriptionSegment(text="Hello", speaker="SPEAKER_00")],
+        )
+        assert with_speaker.has_diarization() is True
+
+        without_segments = TranscriptionResponse(text="Hello", segments=None)
+        assert without_segments.has_diarization() is False
+    run_test("TranscriptionResponse has_diarization", check_has_diarization)
+
+
+# =============================================================================
+# TEST SECTION 2: Connector Auto-Detection
+# =============================================================================
+
+def test_auto_detection():
+    """Check which connector the registry selects for various environment setups."""
+    print("\n=== Testing Connector Auto-Detection ===")
+
+    from src.services.transcription.registry import get_registry
+
+    def detected_connector(env):
+        """Initialise a fresh registry with only *env* set; return the active connector name."""
+        clear_env()
+        reset_registry()
+        os.environ.update(env)
+        fresh_registry = get_registry()
+        fresh_registry.initialize_from_env()
+        return fresh_registry.get_active_connector_name()
+
+    def explicit_connector_wins():
+        chosen = detected_connector({
+            'TRANSCRIPTION_CONNECTOR': 'openai_whisper',
+            'TRANSCRIPTION_API_KEY': 'test-key',
+            'ASR_BASE_URL': 'http://should-be-ignored:9000',
+        })
+        assert chosen == 'openai_whisper'
+    run_test("Explicit TRANSCRIPTION_CONNECTOR takes priority", explicit_connector_wins)
+
+    def asr_base_url_only():
+        chosen = detected_connector({'ASR_BASE_URL': 'http://whisperx:9000'})
+        assert chosen == 'asr_endpoint'
+    run_test("ASR_BASE_URL auto-detects asr_endpoint", asr_base_url_only)
+
+    def legacy_asr_flag():
+        chosen = detected_connector({
+            'USE_ASR_ENDPOINT': 'true',
+            'ASR_BASE_URL': 'http://whisperx:9000',
+        })
+        assert chosen == 'asr_endpoint'
+    run_test("Legacy USE_ASR_ENDPOINT=true still works", legacy_asr_flag)
+
+    def gpt4o_model():
+        chosen = detected_connector({
+            'TRANSCRIPTION_API_KEY': 'test-key',
+            'TRANSCRIPTION_MODEL': 'gpt-4o-transcribe-diarize',
+        })
+        assert chosen == 'openai_transcribe'
+    run_test("gpt-4o model auto-detects openai_transcribe", gpt4o_model)
+
+    def whisper_model():
+        chosen = detected_connector({
+            'TRANSCRIPTION_API_KEY': 'test-key',
+            'TRANSCRIPTION_MODEL': 'whisper-1',
+        })
+        assert chosen == 'openai_whisper'
+    run_test("whisper-1 model uses openai_whisper", whisper_model)
+
+    def legacy_whisper_model_var():
+        chosen = detected_connector({
+            'TRANSCRIPTION_API_KEY': 'test-key',
+            'WHISPER_MODEL': 'whisper-1',
+        })
+        assert chosen == 'openai_whisper'
+    run_test("Legacy WHISPER_MODEL still works", legacy_whisper_model_var)
+
+    def api_key_only():
+        chosen = detected_connector({'TRANSCRIPTION_API_KEY': 'test-key'})
+        assert chosen == 'openai_whisper'
+    run_test("Default falls back to openai_whisper", api_key_only)
+
+
+# =============================================================================
+# TEST SECTION 3: Connector Specifications
+# =============================================================================
+
+def test_connector_specifications():
+    """Verify the SPECIFICATIONS and CAPABILITIES declared by each built-in connector."""
+    print("\n=== Testing Connector Specifications ===")
+
+    from src.services.transcription.connectors.openai_whisper import OpenAIWhisperConnector
+    from src.services.transcription.connectors.openai_transcribe import OpenAITranscribeConnector
+    from src.services.transcription.connectors.asr_endpoint import ASREndpointConnector
+    from src.services.transcription.base import TranscriptionCapability
+
+    def whisper_size_limit():
+        whisper_specs = OpenAIWhisperConnector.SPECIFICATIONS
+        assert whisper_specs.max_file_size_bytes == 25 * 1024 * 1024
+        assert whisper_specs.handles_chunking_internally is False
+    run_test("OpenAI Whisper has 25MB limit", whisper_size_limit)
+
+    def whisper_rejects_opus():
+        blocked = OpenAIWhisperConnector.SPECIFICATIONS.unsupported_codecs
+        assert blocked is not None
+        assert 'opus' in blocked
+    run_test("OpenAI Whisper declares opus as unsupported", whisper_rejects_opus)
+
+    def transcribe_chunks_internally():
+        transcribe_specs = OpenAITranscribeConnector.SPECIFICATIONS
+        assert transcribe_specs.handles_chunking_internally is True
+        assert transcribe_specs.requires_chunking_param is True
+    run_test("OpenAI Transcribe has internal chunking", transcribe_chunks_internally)
+
+    def asr_unlimited():
+        asr_specs = ASREndpointConnector.SPECIFICATIONS
+        assert asr_specs.max_file_size_bytes is None
+        assert asr_specs.handles_chunking_internally is True
+    run_test("ASR Endpoint has no limits (handles internally)", asr_unlimited)
+
+    def whisper_no_diarization():
+        whisper_caps = OpenAIWhisperConnector.CAPABILITIES
+        assert TranscriptionCapability.DIARIZATION not in whisper_caps
+    run_test("OpenAI Whisper does NOT support diarization", whisper_no_diarization)
+
+    def transcribe_diarize_model():
+        # Checked on an instance: the diarization capability is expected to
+        # depend on the configured model rather than on the class alone.
+        settings = {'api_key': 'test', 'model': 'gpt-4o-transcribe-diarize'}
+        instance = OpenAITranscribeConnector(settings)
+        assert TranscriptionCapability.DIARIZATION in instance.CAPABILITIES
+        assert instance.supports_diarization is True
+    run_test("OpenAI Transcribe with diarize model supports diarization", transcribe_diarize_model)
+
+    def asr_diarization_and_speaker_count():
+        asr_caps = ASREndpointConnector.CAPABILITIES
+        assert TranscriptionCapability.DIARIZATION in asr_caps
+        assert TranscriptionCapability.SPEAKER_COUNT_CONTROL in asr_caps
+    run_test("ASR Endpoint supports diarization and speaker count control", asr_diarization_and_speaker_count)
+
+    def openai_no_speaker_count():
+        for openai_cls in (OpenAIWhisperConnector, OpenAITranscribeConnector):
+            assert TranscriptionCapability.SPEAKER_COUNT_CONTROL not in openai_cls.CAPABILITIES
+    run_test("OpenAI connectors do NOT support speaker count control", openai_no_speaker_count)
+
+
+# =============================================================================
+# TEST SECTION 4: Chunking Logic
+# =============================================================================
+
+def test_chunking_logic():
+    """Check how env settings and connector specs combine into the chunking config."""
+    print("\n=== Testing Chunking Logic ===")
+
+    from src.audio_chunking import get_effective_chunking_config
+    from src.services.transcription.base import ConnectorSpecifications
+
+    def chunking_config(env, internal_chunking):
+        """Return the effective config for *env* and a connector with the given flag."""
+        clear_env()
+        os.environ.update(env)
+        connector_specs = ConnectorSpecifications(handles_chunking_internally=internal_chunking)
+        return get_effective_chunking_config(connector_specs)
+
+    def internal_chunking_wins():
+        cfg = chunking_config({'ENABLE_CHUNKING': 'true', 'CHUNK_LIMIT': '20MB'}, True)
+        assert cfg.enabled is False
+        assert cfg.source == 'connector_internal'
+    run_test("Connector with internal chunking disables app chunking", internal_chunking_wins)
+
+    def env_settings_used():
+        cfg = chunking_config(
+            {
+                'ENABLE_CHUNKING': 'true',
+                'CHUNK_LIMIT': '15MB',
+                'CHUNK_OVERLAP_SECONDS': '5',
+            },
+            False,
+        )
+        assert cfg.enabled is True
+        assert cfg.source == 'env'
+        assert cfg.mode == 'size'
+        assert cfg.limit_value == 15.0
+    run_test("Connector without internal chunking uses ENV settings", env_settings_used)
+
+    def chunking_switched_off():
+        cfg = chunking_config({'ENABLE_CHUNKING': 'false'}, False)
+        assert cfg.enabled is False
+        assert cfg.source == 'disabled'
+    run_test("ENABLE_CHUNKING=false disables chunking", chunking_switched_off)
+
+    def duration_limit():
+        cfg = chunking_config({'ENABLE_CHUNKING': 'true', 'CHUNK_LIMIT': '10m'}, False)
+        assert cfg.enabled is True
+        assert cfg.mode == 'duration'
+        assert cfg.limit_value == 600.0
+    run_test("Duration-based chunk limit parsing (10m = 600s)", duration_limit)
+
+
+# =============================================================================
+# TEST SECTION 5: Codec Handling
+# =============================================================================
+
+def test_codec_handling():
+    """Check the supported-codec set under connector specs and the env override."""
+    print("\n=== Testing Codec Handling ===")
+
+    from src.services.transcription.base import ConnectorSpecifications
+
+    def reload_audio_module():
+        """Re-import audio_conversion so it picks up the current env vars."""
+        # Evict the cached src.utils* / src.config* modules so the import below
+        # runs them again (app_config reads AUDIO_UNSUPPORTED_CODECS at import time).
+        stale_modules = [
+            loaded for loaded in sys.modules
+            if loaded.startswith(('src.utils', 'src.config'))
+        ]
+        for loaded in stale_modules:
+            del sys.modules[loaded]
+        from src.utils import audio_conversion
+        return audio_conversion
+
+    def fresh_audio_module(unsupported_env=None):
+        """Clear the env, optionally set AUDIO_UNSUPPORTED_CODECS, then reload."""
+        clear_env()
+        if unsupported_env is not None:
+            os.environ['AUDIO_UNSUPPORTED_CODECS'] = unsupported_env
+        return reload_audio_module()
+
+    def defaults_include_common():
+        codecs = fresh_audio_module().get_supported_codecs()
+        assert 'mp3' in codecs
+        assert 'flac' in codecs
+    run_test("Default supported codecs include common formats", defaults_include_common)
+
+    def connector_exclusions():
+        audio_mod = fresh_audio_module()
+        connector = ConnectorSpecifications(unsupported_codecs=frozenset({'opus', 'vorbis'}))
+        codecs = audio_mod.get_supported_codecs(connector_specs=connector)
+        assert 'opus' not in codecs
+        assert 'vorbis' not in codecs
+        assert 'mp3' in codecs
+    run_test("Connector unsupported_codecs removes from defaults", connector_exclusions)
+
+    def env_exclusions():
+        codecs = fresh_audio_module('aac,opus').get_supported_codecs()
+        assert 'aac' not in codecs, f"aac should not be in {codecs}"
+        assert 'opus' not in codecs, f"opus should not be in {codecs}"
+    run_test("AUDIO_UNSUPPORTED_CODECS env var still works", env_exclusions)
+
+    def combined_exclusions():
+        audio_mod = fresh_audio_module('aac')
+        connector = ConnectorSpecifications(unsupported_codecs=frozenset({'opus'}))
+        codecs = audio_mod.get_supported_codecs(connector_specs=connector)
+        assert 'aac' not in codecs, f"aac should not be in {codecs}"
+        assert 'opus' not in codecs, f"opus should not be in {codecs}"
+        assert 'mp3' in codecs
+    run_test("Both connector specs and ENV var work together", combined_exclusions)
+
+
+# =============================================================================
+# TEST SECTION 6: Connector Capabilities
+# =============================================================================
+
+def test_connector_capabilities():
+    """Check the capability properties and supports() on connector instances."""
+    print("\n=== Testing Connector Capabilities ===")
+
+    from src.services.transcription.connectors.asr_endpoint import ASREndpointConnector
+    from src.services.transcription.connectors.openai_transcribe import OpenAITranscribeConnector
+    from src.services.transcription.base import TranscriptionCapability
+
+    def make_asr_connector():
+        """Build an ASR connector pointing at a dummy base URL."""
+        return ASREndpointConnector({'base_url': 'http://test:9000'})
+
+    def asr_diarization_property():
+        assert make_asr_connector().supports_diarization is True
+    run_test("ASR connector supports_diarization property", asr_diarization_property)
+
+    def asr_speaker_count_property():
+        assert make_asr_connector().supports_speaker_count_control is True
+    run_test("ASR connector supports_speaker_count_control property", asr_speaker_count_property)
+
+    def transcribe_properties():
+        settings = {'api_key': 'test-key', 'model': 'gpt-4o-transcribe-diarize'}
+        transcribe = OpenAITranscribeConnector(settings)
+        assert transcribe.supports_diarization is True
+        assert transcribe.supports_speaker_count_control is False
+    run_test("OpenAI Transcribe supports diarization but not speaker_count_control", transcribe_properties)
+
+    def supports_method():
+        asr = make_asr_connector()
+        assert asr.supports(TranscriptionCapability.DIARIZATION) is True
+        assert asr.supports(TranscriptionCapability.STREAMING) is False
+    run_test("supports() method works correctly", supports_method)
+
+
+# =============================================================================
+# TEST SECTION 7: Registry Operations
+# =============================================================================
+
+def test_registry_operations():
+    """Check connector listing, connector info and reinitialisation on the registry."""
+    print("\n=== Testing Registry Operations ===")
+
+    from src.services.transcription.registry import get_registry
+
+    def builtin_connectors_listed():
+        clear_env()
+        reset_registry()
+        listed_names = [info['name'] for info in get_registry().list_connectors()]
+        for expected in ('openai_whisper', 'openai_transcribe', 'asr_endpoint'):
+            assert expected in listed_names
+    run_test("Registry lists all built-in connectors", builtin_connectors_listed)
+
+    def capabilities_in_info():
+        clear_env()
+        reset_registry()
+        all_info = get_registry().list_connectors()
+        # next() raises StopIteration if the ASR connector is not listed.
+        asr_matches = (info for info in all_info if info['name'] == 'asr_endpoint')
+        asr_info = next(asr_matches)
+        assert 'DIARIZATION' in asr_info['capabilities']
+        assert 'SPEAKER_COUNT_CONTROL' in asr_info['capabilities']
+    run_test("Connector info includes capabilities", capabilities_in_info)
+
+    def reinitialize_switches_connector():
+        clear_env()
+        reset_registry()
+        os.environ.update({
+            'TRANSCRIPTION_API_KEY': 'test-key',
+            'TRANSCRIPTION_MODEL': 'whisper-1',
+        })
+        active_registry = get_registry()
+        active_registry.initialize_from_env()
+        assert active_registry.get_active_connector_name() == 'openai_whisper'
+
+        # Change the model and re-run detection on the same registry object.
+        os.environ['TRANSCRIPTION_MODEL'] = 'gpt-4o-transcribe-diarize'
+        active_registry.reinitialize()
+        assert active_registry.get_active_connector_name() == 'openai_transcribe'
+    run_test("reinitialize() resets the active connector", reinitialize_switches_connector)
+
+
+# =============================================================================
+# TEST SECTION 8: Edge Cases
+# =============================================================================
+
+def test_edge_cases():
+    """Cover empty responses, missing speakers and configuration errors."""
+    print("\n=== Testing Edge Cases ===")
+
+    from src.services.transcription.registry import get_registry
+    from src.services.transcription.exceptions import ConfigurationError
+    from src.services.transcription.base import TranscriptionResponse, TranscriptionSegment
+
+    def empty_response():
+        # With no segments the storage format is the (empty) text, not "[]".
+        blank = TranscriptionResponse(text="", segments=[], provider="test")
+        assert blank.to_storage_format() == ""
+        assert blank.has_diarization() is False
+    run_test("Empty transcription response handling", empty_response)
+
+    def missing_speaker():
+        anonymous = TranscriptionSegment(text="Hello", speaker=None)
+        resp = TranscriptionResponse(text="Hello", segments=[anonymous])
+        stored = json.loads(resp.to_storage_format())
+        assert stored[0]['speaker'] == 'Unknown Speaker'
+    run_test("Transcription with unknown speaker handling", missing_speaker)
+
+    def unknown_connector_name():
+        clear_env()
+        reset_registry()
+        os.environ['TRANSCRIPTION_CONNECTOR'] = 'nonexistent_connector'
+        fresh_registry = get_registry()
+        try:
+            fresh_registry.initialize_from_env()
+        except ConfigurationError as err:
+            assert 'Unknown connector' in str(err)
+        else:
+            assert False, "Should have raised ConfigurationError"
+    run_test("Invalid connector name raises ConfigurationError", unknown_connector_name)
+
+    def asr_requires_base_url():
+        from src.services.transcription.connectors.asr_endpoint import ASREndpointConnector
+        try:
+            ASREndpointConnector({})
+        except ConfigurationError as err:
+            assert 'base_url' in str(err)
+        else:
+            assert False, "Should have raised ConfigurationError"
+    run_test("ASR connector validates base_url is required", asr_requires_base_url)
+
+    def base_url_with_inline_comment():
+        clear_env()
+        reset_registry()
+        os.environ['ASR_BASE_URL'] = 'http://whisperx:9000  # This is a comment'
+        active = get_registry().initialize_from_env()
+        assert active.base_url == 'http://whisperx:9000'
+    run_test("ASR_BASE_URL with trailing comment is handled", base_url_with_inline_comment)
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main():
+    """Run every test section, print a summary and return an exit code.
+
+    Returns:
+        0 when no check failed, 1 otherwise.
+    """
+    banner = "=" * 60
+
+    print(banner)
+    print("Transcription Connector Architecture Tests")
+    print(banner)
+
+    sections = (
+        test_base_classes,
+        test_auto_detection,
+        test_connector_specifications,
+        test_chunking_logic,
+        test_codec_handling,
+        test_connector_capabilities,
+        test_registry_operations,
+        test_edge_cases,
+    )
+    for section in sections:
+        section()
+
+    print("\n" + banner)
+    print(f"RESULTS: {PASSED} passed, {FAILED} failed")
+    print(banner)
+
+    if ERRORS:
+        print("\nFailed tests:")
+        for failed_name, failure in ERRORS:
+            print(f"  - {failed_name}: {failure}")
+
+    # Leave no transcription-related variables behind in the environment.
+    clear_env()
+    return 1 if FAILED else 0
+
+
+if __name__ == '__main__':
+    # Use main()'s return value as the process exit status.
+    raise SystemExit(main())