Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)

This commit is contained in:
InnovA AI
2026-03-16 21:47:37 +00:00
commit 42772a31ed
365 changed files with 103572 additions and 0 deletions

View File

@@ -0,0 +1,340 @@
#!/usr/bin/env python3
"""
Standalone test for JSON preprocessing functionality.
Tests the safe_json_loads function with various malformed JSON scenarios.
"""
import json
import re
import ast
from unittest.mock import Mock
# Mock logger for testing
class MockLogger:
def warning(self, msg): print(f"WARNING: {msg}")
def info(self, msg): print(f"INFO: {msg}")
def debug(self, msg): print(f"DEBUG: {msg}")
def error(self, msg): print(f"ERROR: {msg}")
# Create mock app with logger
class MockApp:
logger = MockLogger()
app = MockApp()
def safe_json_loads(json_string, fallback_value=None):
"""
Safely parse JSON with preprocessing to handle common LLM JSON formatting issues.
Args:
json_string (str): The JSON string to parse
fallback_value: Value to return if parsing fails (default: None)
Returns:
Parsed JSON object or fallback_value if parsing fails
"""
if not json_string or not isinstance(json_string, str):
app.logger.warning(f"Invalid JSON input: {type(json_string)} - {json_string}")
return fallback_value
# Step 1: Clean the input string
cleaned_json = json_string.strip()
# Step 2: Extract JSON from markdown code blocks if present
json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', cleaned_json, re.DOTALL)
if json_match:
cleaned_json = json_match.group(1).strip()
# Step 3: Try multiple parsing strategies
parsing_strategies = [
# Strategy 1: Direct parsing (for well-formed JSON)
lambda x: json.loads(x),
# Strategy 2: Fix common escape issues
lambda x: json.loads(preprocess_json_escapes(x)),
# Strategy 3: Use ast.literal_eval as fallback for simple cases
lambda x: ast.literal_eval(x) if x.startswith(('{', '[')) else None,
# Strategy 4: Extract JSON object/array using regex
lambda x: json.loads(extract_json_object(x)),
]
for i, strategy in enumerate(parsing_strategies):
try:
result = strategy(cleaned_json)
if result is not None:
if i > 0: # Log if we had to use a fallback strategy
app.logger.info(f"JSON parsed successfully using strategy {i+1}")
return result
except (json.JSONDecodeError, ValueError, SyntaxError) as e:
if i == 0: # Only log the first failure to avoid spam
app.logger.debug(f"JSON parsing strategy {i+1} failed: {e}")
continue
# All strategies failed
app.logger.error(f"All JSON parsing strategies failed for: {cleaned_json[:200]}...")
return fallback_value
def preprocess_json_escapes(json_string):
"""
Preprocess JSON string to fix common escape issues from LLM responses.
Uses a more sophisticated approach to handle nested quotes properly.
"""
if not json_string:
return json_string
result = []
i = 0
in_string = False
escape_next = False
expecting_value = False # Track if we're expecting a value (after :)
while i < len(json_string):
char = json_string[i]
if escape_next:
# This character is escaped, add it as-is
result.append(char)
escape_next = False
elif char == '\\':
# This is an escape character
result.append(char)
escape_next = True
elif char == ':' and not in_string:
# We found a colon, next string will be a value
result.append(char)
expecting_value = True
elif char == ',' and not in_string:
# We found a comma, reset expecting_value
result.append(char)
expecting_value = False
elif char == '"':
if not in_string:
# Starting a string
in_string = True
result.append(char)
else:
# We're in a string, check if this quote should be escaped
# Look ahead to see if this is the end of the string value
j = i + 1
while j < len(json_string) and json_string[j].isspace():
j += 1
# For keys (not expecting_value), only end on colon
# For values (expecting_value), end on comma, closing brace, or closing bracket
if expecting_value:
end_chars = ',}]'
else:
end_chars = ':'
if j < len(json_string) and json_string[j] in end_chars:
# This is the end of the string
in_string = False
result.append(char)
if not expecting_value:
# We just finished a key, next will be expecting value
expecting_value = True
else:
# This is an inner quote that should be escaped
result.append('\\"')
else:
result.append(char)
i += 1
return ''.join(result)
def extract_json_object(text):
"""
Extract the first complete JSON object or array from text using regex.
"""
# Look for JSON object
obj_match = re.search(r'\{.*\}', text, re.DOTALL)
if obj_match:
return obj_match.group(0)
# Look for JSON array
arr_match = re.search(r'\[.*\]', text, re.DOTALL)
if arr_match:
return arr_match.group(0)
# Return original if no JSON structure found
return text
def run_comprehensive_test():
"""Run a comprehensive test with various malformed JSON examples."""
print("🧪 Running comprehensive JSON preprocessing tests...\n")
test_cases = [
{
"name": "Valid JSON",
"input": '{"title": "Test", "summary": "Valid JSON"}',
"should_succeed": True
},
{
"name": "Unescaped quotes in title",
"input": '{"title": "Meeting about "Project X"", "summary": "Discussion summary"}',
"should_succeed": True
},
{
"name": "Multiple unescaped quotes",
"input": '{"title": "John said "Hello" and Mary replied "Hi there"", "summary": "Conversation log"}',
"should_succeed": True
},
{
"name": "Markdown code block",
"input": '```json\n{"title": "Wrapped", "summary": "In code block"}\n```',
"should_succeed": True
},
{
"name": "Mixed quotes and apostrophes",
"input": '{"title": "Alice\'s "big idea" presentation", "summary": "She said it\'s "revolutionary""}',
"should_succeed": True
},
{
"name": "JSON with newlines",
"input": '{"title": "Multi-line", "summary": "Line 1\\nLine 2 with \\"quotes\\"\\nLine 3"}',
"should_succeed": True
},
{
"name": "LLM response with explanation",
"input": '''Here's the JSON:
```json
{
"title": "Q3 Planning",
"summary": "We discussed the "new initiative" for next quarter."
}
```
Hope this helps!''',
"should_succeed": True
},
{
"name": "Speaker identification with quotes",
"input": '{"SPEAKER_00": "John Smith", "SPEAKER_01": "Jane "The Expert" Doe", "SPEAKER_02": "Bob"}',
"should_succeed": True
},
{
"name": "Completely malformed",
"input": '{"title": "Test", "summary": this is not valid json at all}',
"should_succeed": False
},
{
"name": "Empty string",
"input": "",
"should_succeed": False
}
]
passed = 0
failed = 0
for i, test_case in enumerate(test_cases, 1):
print(f"Test {i}: {test_case['name']}")
print(f"Input: {test_case['input'][:100]}{'...' if len(test_case['input']) > 100 else ''}")
try:
result = safe_json_loads(test_case['input'], {"error": "fallback"})
if test_case['should_succeed']:
if result != {"error": "fallback"} and isinstance(result, (dict, list)):
print("✅ PASSED - Successfully parsed JSON")
print(f" Result: {result}")
passed += 1
else:
print("❌ FAILED - Expected successful parsing but got fallback")
failed += 1
else:
if result == {"error": "fallback"}:
print("✅ PASSED - Correctly returned fallback for malformed JSON")
passed += 1
else:
print("❌ FAILED - Expected fallback but got parsed result")
print(f" Unexpected result: {result}")
failed += 1
except Exception as e:
print(f"❌ FAILED - Exception occurred: {e}")
failed += 1
print("-" * 50)
print(f"\n📊 Test Results: {passed} passed, {failed} failed")
return failed == 0
def test_preprocessing_function():
"""Test the preprocessing function directly."""
print("\n🔧 Testing preprocessing function directly...\n")
test_input = '{"title": "Meeting about "Project X"", "summary": "Discussion summary"}'
print(f"Original: {test_input}")
processed = preprocess_json_escapes(test_input)
print(f"Processed: {processed}")
try:
result = json.loads(processed)
print(f"✅ Successfully parsed: {result}")
except json.JSONDecodeError as e:
print(f"❌ Still failed: {e}")
def test_specific_scenarios():
"""Test specific real-world scenarios."""
print("\n🎯 Testing specific LLM response scenarios...\n")
# Test case from the original issue
gemini_response = '''{"title": "Meeting about "Project Phoenix" and budget allocation", "summary": "The team discussed John's "breakthrough idea" and Mary said "this will change everything" during the Q3 planning session."}'''
print("Testing Gemini-style response with unescaped quotes:")
print(f"Input: {gemini_response}")
# Test preprocessing directly
processed = preprocess_json_escapes(gemini_response)
print(f"Processed: {processed}")
result = safe_json_loads(gemini_response)
if isinstance(result, dict) and "title" in result and "summary" in result:
print("✅ SUCCESS - Parsed Gemini response correctly!")
print(f"Title: {result['title']}")
print(f"Summary: {result['summary'][:100]}...")
else:
print("❌ FAILED - Could not parse Gemini response")
print(f"Result: {result}")
print("-" * 50)
# Test speaker identification scenario
speaker_response = '''{"SPEAKER_00": "John "The Manager" Smith", "SPEAKER_01": "Alice Johnson", "SPEAKER_02": "Bob "Tech Lead" Wilson"}'''
print("Testing speaker identification with quotes in names:")
print(f"Input: {speaker_response}")
# Test preprocessing directly
processed = preprocess_json_escapes(speaker_response)
print(f"Processed: {processed}")
result = safe_json_loads(speaker_response)
if isinstance(result, dict) and len(result) >= 3:
print("✅ SUCCESS - Parsed speaker identification correctly!")
for speaker, name in result.items():
print(f" {speaker}: {name}")
else:
print("❌ FAILED - Could not parse speaker identification")
print(f"Result: {result}")
if __name__ == "__main__":
print("🚀 Starting Standalone JSON Preprocessing Tests\n")
# Test preprocessing function directly
test_preprocessing_function()
# Run the comprehensive test
success = run_comprehensive_test()
# Test specific scenarios
test_specific_scenarios()
if success:
print("\n🎉 All tests completed successfully! JSON preprocessing should handle LLM response issues gracefully.")
else:
print("\n⚠️ Some tests failed. The implementation may need refinement.")