Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)

2026-03-16 21:47:37 +00:00
commit 42772a31ed
365 changed files with 103572 additions and 0 deletions
--- a/tests/test_json_standalone.py
+++ b/tests/test_json_standalone.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+Standalone test for JSON preprocessing functionality.
+Tests the safe_json_loads function with various malformed JSON scenarios.
+"""
+
+import json
+import re
+import ast
+from unittest.mock import Mock
+
+# Mock logger for testing
+class MockLogger:
+    def warning(self, msg): print(f"WARNING: {msg}")
+    def info(self, msg): print(f"INFO: {msg}")
+    def debug(self, msg): print(f"DEBUG: {msg}")
+    def error(self, msg): print(f"ERROR: {msg}")
+
+# Create mock app with logger
+class MockApp:
+    logger = MockLogger()
+
+app = MockApp()
+
+def safe_json_loads(json_string, fallback_value=None):
+    """
+    Safely parse JSON with preprocessing to handle common LLM JSON formatting issues.
+    
+    Args:
+        json_string (str): The JSON string to parse
+        fallback_value: Value to return if parsing fails (default: None)
+    
+    Returns:
+        Parsed JSON object or fallback_value if parsing fails
+    """
+    if not json_string or not isinstance(json_string, str):
+        app.logger.warning(f"Invalid JSON input: {type(json_string)} - {json_string}")
+        return fallback_value
+    
+    # Step 1: Clean the input string
+    cleaned_json = json_string.strip()
+    
+    # Step 2: Extract JSON from markdown code blocks if present
+    json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', cleaned_json, re.DOTALL)
+    if json_match:
+        cleaned_json = json_match.group(1).strip()
+    
+    # Step 3: Try multiple parsing strategies
+    parsing_strategies = [
+        # Strategy 1: Direct parsing (for well-formed JSON)
+        lambda x: json.loads(x),
+        
+        # Strategy 2: Fix common escape issues
+        lambda x: json.loads(preprocess_json_escapes(x)),
+        
+        # Strategy 3: Use ast.literal_eval as fallback for simple cases
+        lambda x: ast.literal_eval(x) if x.startswith(('{', '[')) else None,
+        
+        # Strategy 4: Extract JSON object/array using regex
+        lambda x: json.loads(extract_json_object(x)),
+    ]
+    
+    for i, strategy in enumerate(parsing_strategies):
+        try:
+            result = strategy(cleaned_json)
+            if result is not None:
+                if i > 0:  # Log if we had to use a fallback strategy
+                    app.logger.info(f"JSON parsed successfully using strategy {i+1}")
+                return result
+        except (json.JSONDecodeError, ValueError, SyntaxError) as e:
+            if i == 0:  # Only log the first failure to avoid spam
+                app.logger.debug(f"JSON parsing strategy {i+1} failed: {e}")
+            continue
+    
+    # All strategies failed
+    app.logger.error(f"All JSON parsing strategies failed for: {cleaned_json[:200]}...")
+    return fallback_value
+
+def preprocess_json_escapes(json_string):
+    """
+    Preprocess JSON string to fix common escape issues from LLM responses.
+    Uses a more sophisticated approach to handle nested quotes properly.
+    """
+    if not json_string:
+        return json_string
+    
+    result = []
+    i = 0
+    in_string = False
+    escape_next = False
+    expecting_value = False  # Track if we're expecting a value (after :)
+    
+    while i < len(json_string):
+        char = json_string[i]
+        
+        if escape_next:
+            # This character is escaped, add it as-is
+            result.append(char)
+            escape_next = False
+        elif char == '\\':
+            # This is an escape character
+            result.append(char)
+            escape_next = True
+        elif char == ':' and not in_string:
+            # We found a colon, next string will be a value
+            result.append(char)
+            expecting_value = True
+        elif char == ',' and not in_string:
+            # We found a comma, reset expecting_value
+            result.append(char)
+            expecting_value = False
+        elif char == '"':
+            if not in_string:
+                # Starting a string
+                in_string = True
+                result.append(char)
+            else:
+                # We're in a string, check if this quote should be escaped
+                # Look ahead to see if this is the end of the string value
+                j = i + 1
+                while j < len(json_string) and json_string[j].isspace():
+                    j += 1
+                
+                # For keys (not expecting_value), only end on colon
+                # For values (expecting_value), end on comma, closing brace, or closing bracket
+                if expecting_value:
+                    end_chars = ',}]'
+                else:
+                    end_chars = ':'
+                
+                if j < len(json_string) and json_string[j] in end_chars:
+                    # This is the end of the string
+                    in_string = False
+                    result.append(char)
+                    if not expecting_value:
+                        # We just finished a key, next will be expecting value
+                        expecting_value = True
+                else:
+                    # This is an inner quote that should be escaped
+                    result.append('\\"')
+        else:
+            result.append(char)
+        
+        i += 1
+    
+    return ''.join(result)
+
+def extract_json_object(text):
+    """
+    Extract the first complete JSON object or array from text using regex.
+    """
+    # Look for JSON object
+    obj_match = re.search(r'\{.*\}', text, re.DOTALL)
+    if obj_match:
+        return obj_match.group(0)
+    
+    # Look for JSON array
+    arr_match = re.search(r'\[.*\]', text, re.DOTALL)
+    if arr_match:
+        return arr_match.group(0)
+    
+    # Return original if no JSON structure found
+    return text
+
+def run_comprehensive_test():
+    """Run a comprehensive test with various malformed JSON examples."""
+    print("🧪 Running comprehensive JSON preprocessing tests...\n")
+    
+    test_cases = [
+        {
+            "name": "Valid JSON",
+            "input": '{"title": "Test", "summary": "Valid JSON"}',
+            "should_succeed": True
+        },
+        {
+            "name": "Unescaped quotes in title",
+            "input": '{"title": "Meeting about "Project X"", "summary": "Discussion summary"}',
+            "should_succeed": True
+        },
+        {
+            "name": "Multiple unescaped quotes",
+            "input": '{"title": "John said "Hello" and Mary replied "Hi there"", "summary": "Conversation log"}',
+            "should_succeed": True
+        },
+        {
+            "name": "Markdown code block",
+            "input": '```json\n{"title": "Wrapped", "summary": "In code block"}\n```',
+            "should_succeed": True
+        },
+        {
+            "name": "Mixed quotes and apostrophes",
+            "input": '{"title": "Alice\'s "big idea" presentation", "summary": "She said it\'s "revolutionary""}',
+            "should_succeed": True
+        },
+        {
+            "name": "JSON with newlines",
+            "input": '{"title": "Multi-line", "summary": "Line 1\\nLine 2 with \\"quotes\\"\\nLine 3"}',
+            "should_succeed": True
+        },
+        {
+            "name": "LLM response with explanation",
+            "input": '''Here's the JSON:
+```json
+{
+  "title": "Q3 Planning",
+  "summary": "We discussed the "new initiative" for next quarter."
+}
+```
+Hope this helps!''',
+            "should_succeed": True
+        },
+        {
+            "name": "Speaker identification with quotes",
+            "input": '{"SPEAKER_00": "John Smith", "SPEAKER_01": "Jane "The Expert" Doe", "SPEAKER_02": "Bob"}',
+            "should_succeed": True
+        },
+        {
+            "name": "Completely malformed",
+            "input": '{"title": "Test", "summary": this is not valid json at all}',
+            "should_succeed": False
+        },
+        {
+            "name": "Empty string",
+            "input": "",
+            "should_succeed": False
+        }
+    ]
+    
+    passed = 0
+    failed = 0
+    
+    for i, test_case in enumerate(test_cases, 1):
+        print(f"Test {i}: {test_case['name']}")
+        print(f"Input: {test_case['input'][:100]}{'...' if len(test_case['input']) > 100 else ''}")
+        
+        try:
+            result = safe_json_loads(test_case['input'], {"error": "fallback"})
+            
+            if test_case['should_succeed']:
+                if result != {"error": "fallback"} and isinstance(result, (dict, list)):
+                    print("✅ PASSED - Successfully parsed JSON")
+                    print(f"   Result: {result}")
+                    passed += 1
+                else:
+                    print("❌ FAILED - Expected successful parsing but got fallback")
+                    failed += 1
+            else:
+                if result == {"error": "fallback"}:
+                    print("✅ PASSED - Correctly returned fallback for malformed JSON")
+                    passed += 1
+                else:
+                    print("❌ FAILED - Expected fallback but got parsed result")
+                    print(f"   Unexpected result: {result}")
+                    failed += 1
+                    
+        except Exception as e:
+            print(f"❌ FAILED - Exception occurred: {e}")
+            failed += 1
+        
+        print("-" * 50)
+    
+    print(f"\n📊 Test Results: {passed} passed, {failed} failed")
+    return failed == 0
+
+def test_preprocessing_function():
+    """Test the preprocessing function directly."""
+    print("\n🔧 Testing preprocessing function directly...\n")
+    
+    test_input = '{"title": "Meeting about "Project X"", "summary": "Discussion summary"}'
+    print(f"Original: {test_input}")
+    
+    processed = preprocess_json_escapes(test_input)
+    print(f"Processed: {processed}")
+    
+    try:
+        result = json.loads(processed)
+        print(f"✅ Successfully parsed: {result}")
+    except json.JSONDecodeError as e:
+        print(f"❌ Still failed: {e}")
+
+def test_specific_scenarios():
+    """Test specific real-world scenarios."""
+    print("\n🎯 Testing specific LLM response scenarios...\n")
+    
+    # Test case from the original issue
+    gemini_response = '''{"title": "Meeting about "Project Phoenix" and budget allocation", "summary": "The team discussed John's "breakthrough idea" and Mary said "this will change everything" during the Q3 planning session."}'''
+    
+    print("Testing Gemini-style response with unescaped quotes:")
+    print(f"Input: {gemini_response}")
+    
+    # Test preprocessing directly
+    processed = preprocess_json_escapes(gemini_response)
+    print(f"Processed: {processed}")
+    
+    result = safe_json_loads(gemini_response)
+    if isinstance(result, dict) and "title" in result and "summary" in result:
+        print("✅ SUCCESS - Parsed Gemini response correctly!")
+        print(f"Title: {result['title']}")
+        print(f"Summary: {result['summary'][:100]}...")
+    else:
+        print("❌ FAILED - Could not parse Gemini response")
+        print(f"Result: {result}")
+    
+    print("-" * 50)
+    
+    # Test speaker identification scenario
+    speaker_response = '''{"SPEAKER_00": "John "The Manager" Smith", "SPEAKER_01": "Alice Johnson", "SPEAKER_02": "Bob "Tech Lead" Wilson"}'''
+    
+    print("Testing speaker identification with quotes in names:")
+    print(f"Input: {speaker_response}")
+    
+    # Test preprocessing directly
+    processed = preprocess_json_escapes(speaker_response)
+    print(f"Processed: {processed}")
+    
+    result = safe_json_loads(speaker_response)
+    if isinstance(result, dict) and len(result) >= 3:
+        print("✅ SUCCESS - Parsed speaker identification correctly!")
+        for speaker, name in result.items():
+            print(f"  {speaker}: {name}")
+    else:
+        print("❌ FAILED - Could not parse speaker identification")
+        print(f"Result: {result}")
+
+if __name__ == "__main__":
+    print("🚀 Starting Standalone JSON Preprocessing Tests\n")
+    
+    # Test preprocessing function directly
+    test_preprocessing_function()
+    
+    # Run the comprehensive test
+    success = run_comprehensive_test()
+    
+    # Test specific scenarios
+    test_specific_scenarios()
+    
+    if success:
+        print("\n🎉 All tests completed successfully! JSON preprocessing should handle LLM response issues gracefully.")
+    else:
+        print("\n⚠️  Some tests failed. The implementation may need refinement.")