Files
dictia-public/tests/test_json_preprocessing.py

292 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Test suite for JSON preprocessing functionality in Speakr app.
Tests the safe_json_loads function with various malformed JSON scenarios.
"""
import sys
import os
import json
import unittest
from unittest.mock import Mock
# Add the directory above this file's directory to sys.path so that `src.app` can be imported
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Mock the Flask app and logger for testing
class MockApp:
    """Minimal stand-in for an application object that exposes a logger.

    The only attribute provided is ``logger``, a ``unittest.mock.Mock``
    that accepts any logging call without side effects.
    """

    def __init__(self) -> None:
        # A fresh Mock per instance so recorded calls never leak between apps.
        self.logger = Mock()
# Set up the mock app before importing
# NOTE(review): in the code visible here, `app` is never referenced again and
# is not installed into sys.modules, so it is unclear whether the import of
# `src.app` below ever sees this stub -- confirm it is still needed.
app = MockApp()
# Import the functions we want to test
from src.app import safe_json_loads, preprocess_json_escapes, extract_json_object
class TestJSONPreprocessing(unittest.TestCase):
    """Test cases for JSON preprocessing functionality.

    Exercises ``safe_json_loads``, ``preprocess_json_escapes`` and
    ``extract_json_object`` with well-formed input, input wrapped in
    markdown fences or surrounding prose, and input containing unescaped
    quotes.  Tests that bundle several independent scenarios run each one
    inside ``self.subTest`` so a failure in one scenario does not hide the
    outcome of the others.
    """

    def _assert_has_title_and_summary(self, result):
        """Assert *result* is a dict that contains "title" and "summary" keys."""
        self.assertIsInstance(result, dict)
        self.assertIn("title", result)
        self.assertIn("summary", result)

    def test_valid_json(self):
        """Test that valid JSON is parsed correctly."""
        valid_json = '{"title": "Test Meeting", "summary": "This is a test summary"}'
        result = safe_json_loads(valid_json)
        expected = {"title": "Test Meeting", "summary": "This is a test summary"}
        self.assertEqual(result, expected)

    def test_json_with_markdown_code_blocks(self):
        """Test JSON wrapped in markdown code blocks."""
        markdown_json = '''```json
{
"title": "Meeting Notes",
"summary": "Key points discussed"
}
```'''
        result = safe_json_loads(markdown_json)
        expected = {"title": "Meeting Notes", "summary": "Key points discussed"}
        self.assertEqual(result, expected)

    def test_json_with_unescaped_quotes(self):
        """Test JSON with unescaped quotes in string values."""
        malformed_json = '{"title": "John said "Hello world" to everyone", "summary": "Meeting summary"}'
        result = safe_json_loads(malformed_json)
        expected = {"title": 'John said "Hello world" to everyone', "summary": "Meeting summary"}
        self.assertEqual(result, expected)

    def test_json_with_mixed_quotes(self):
        """Test JSON with mixed quote scenarios."""
        malformed_json = '{"title": "Alice\'s "big idea" presentation", "summary": "She said "this will change everything""}'
        result = safe_json_loads(malformed_json)
        self._assert_has_title_and_summary(result)

    def test_json_with_newlines_and_special_chars(self):
        """Test JSON with newlines and special characters."""
        # The literal is not a raw string, so each \n below is a real newline
        # character embedded inside the JSON string value.
        malformed_json = '''{"title": "Complex Meeting", "summary": "Discussion about:\n- Point 1\n- Point 2 with "quotes"\n- Point 3"}'''
        result = safe_json_loads(malformed_json)
        self._assert_has_title_and_summary(result)

    def test_empty_or_invalid_input(self):
        """Test handling of empty or invalid input."""
        unusable_inputs = (
            ("empty string", ""),
            ("None input", None),
            ("non-string input", 123),
        )
        for label, unusable in unusable_inputs:
            with self.subTest(case=label):
                # A fresh default per call keeps the scenarios independent.
                result = safe_json_loads(unusable, {"default": "value"})
                self.assertEqual(result, {"default": "value"})

    def test_completely_malformed_json(self):
        """Test completely malformed JSON that can't be fixed."""
        malformed_json = '{"title": "Test", "summary": unclosed string and missing quotes}'
        result = safe_json_loads(malformed_json, {"error": "fallback"})
        self.assertEqual(result, {"error": "fallback"})

    def test_json_with_nested_quotes(self):
        """Test JSON with deeply nested quote scenarios."""
        malformed_json = '{"title": "Meeting about "Project Alpha" and "Project Beta"", "summary": "Both projects involve "cutting-edge" technology"}'
        result = safe_json_loads(malformed_json)
        self.assertIsInstance(result, dict)
        # Should have successfully parsed something; assertGreater reports the
        # actual length on failure instead of "False is not true".
        self.assertGreater(len(result), 0)

    def test_json_array_format(self):
        """Test JSON array format (for transcription data)."""
        json_array = '[{"speaker": "John", "sentence": "Hello everyone"}, {"speaker": "Jane", "sentence": "Good morning"}]'
        result = safe_json_loads(json_array)
        expected = [{"speaker": "John", "sentence": "Hello everyone"}, {"speaker": "Jane", "sentence": "Good morning"}]
        self.assertEqual(result, expected)

    def test_preprocess_json_escapes_function(self):
        """Test the preprocess_json_escapes function directly."""
        input_json = '{"title": "John said "Hello" to Mary", "summary": "Simple test"}'
        processed = preprocess_json_escapes(input_json)
        # Should be valid JSON after preprocessing
        result = json.loads(processed)
        self._assert_has_title_and_summary(result)

    def test_extract_json_object_function(self):
        """Test the extract_json_object function directly."""
        with self.subTest(case="object surrounded by text"):
            text_with_json = 'Here is some text {"title": "Test", "summary": "Content"} and more text'
            extracted = extract_json_object(text_with_json)
            result = json.loads(extracted)
            expected = {"title": "Test", "summary": "Content"}
            self.assertEqual(result, expected)

        with self.subTest(case="array surrounded by text"):
            text_with_array = 'Some text [{"item": "one"}, {"item": "two"}] more text'
            extracted = extract_json_object(text_with_array)
            result = json.loads(extracted)
            expected = [{"item": "one"}, {"item": "two"}]
            self.assertEqual(result, expected)

    def test_real_world_llm_response_scenarios(self):
        """Test real-world scenarios that might come from LLM responses."""
        # Scenario 1: LLM response with explanation text
        with self.subTest(scenario="explanation text around fenced JSON"):
            llm_response1 = '''Here's the JSON response you requested:
```json
{
"title": "Q3 Planning Meeting",
"summary": "We discussed the "new initiative" and John's "breakthrough idea" for next quarter."
}
```
This should help with your transcription needs.'''
            result1 = safe_json_loads(llm_response1)
            self._assert_has_title_and_summary(result1)

        # Scenario 2: LLM response with unescaped quotes and no code blocks
        with self.subTest(scenario="unescaped quotes, no code block"):
            llm_response2 = '{"title": "Team Standup", "summary": "Alice mentioned "the deadline is tight" and Bob said "we need more resources""}'
            result2 = safe_json_loads(llm_response2)
            self._assert_has_title_and_summary(result2)

        # Scenario 3: LLM response with speaker identification
        with self.subTest(scenario="speaker identification"):
            llm_response3 = '''{"SPEAKER_00": "John Smith", "SPEAKER_01": "Jane "The Expert" Doe", "SPEAKER_02": "Bob"}'''
            result3 = safe_json_loads(llm_response3)
            self.assertIsInstance(result3, dict)
            # Should have parsed at least some speakers
            self.assertGreaterEqual(len(result3), 2)

    def test_fallback_strategies(self):
        """Test that different parsing strategies work as fallbacks."""
        # Single-quoted, Python-literal style dict (the original author's note
        # describes this as the ast.literal_eval fallback for simple cases).
        with self.subTest(strategy="python literal dict"):
            simple_dict = "{'title': 'Simple', 'summary': 'Test'}"
            result = safe_json_loads(simple_dict)
            expected = {"title": "Simple", "summary": "Test"}
            self.assertEqual(result, expected)

        # JSON object embedded in surrounding text (the original author's note
        # describes this as the regex extraction fallback).
        with self.subTest(strategy="embedded object extraction"):
            messy_response = 'Some text before {"title": "Extracted", "summary": "From regex"} some text after'
            result = safe_json_loads(messy_response)
            expected = {"title": "Extracted", "summary": "From regex"}
            self.assertEqual(result, expected)

    def test_performance_with_large_content(self):
        """Test performance with larger JSON content."""
        # No timing is measured here; this only checks that a long value parses.
        large_summary = "This is a very long summary. " * 100  # Create a long string
        large_json = f'{{"title": "Large Content Test", "summary": "{large_summary}"}}'
        result = safe_json_loads(large_json)
        self._assert_has_title_and_summary(result)
        self.assertEqual(result["title"], "Large Content Test")
def run_comprehensive_test():
    """Run a table of sample inputs through ``safe_json_loads`` and report.

    Every scenario is parsed with ``{"error": "fallback"}`` as the default.
    A scenario marked ``should_succeed`` passes when the result is a dict
    other than that fallback; any other scenario passes when the fallback
    itself comes back.  Progress is printed per scenario, followed by a
    summary line.

    Returns:
        bool: True when no scenario failed, False otherwise.
    """
    print("🧪 Running comprehensive JSON preprocessing tests...\n")

    scenarios = [
        {
            "name": "Valid JSON",
            "input": '{"title": "Test", "summary": "Valid JSON"}',
            "should_succeed": True,
        },
        {
            "name": "Unescaped quotes in title",
            "input": '{"title": "Meeting about "Project X"", "summary": "Discussion summary"}',
            "should_succeed": True,
        },
        {
            "name": "Multiple unescaped quotes",
            "input": '{"title": "John said "Hello" and Mary replied "Hi there"", "summary": "Conversation log"}',
            "should_succeed": True,
        },
        {
            "name": "Markdown code block",
            "input": '```json\n{"title": "Wrapped", "summary": "In code block"}\n```',
            "should_succeed": True,
        },
        {
            "name": "Mixed quotes and apostrophes",
            "input": '{"title": "Alice\'s "big idea" presentation", "summary": "She said it\'s "revolutionary""}',
            "should_succeed": True,
        },
        {
            "name": "JSON with newlines",
            "input": '{"title": "Multi-line", "summary": "Line 1\\nLine 2 with \\"quotes\\"\\nLine 3"}',
            "should_succeed": True,
        },
        {
            "name": "Completely malformed",
            "input": '{"title": "Test", "summary": this is not valid json at all}',
            "should_succeed": False,
        },
        {
            "name": "Empty string",
            "input": "",
            "should_succeed": False,
        },
    ]

    # Pristine reference copy; each call receives its own copy as the default.
    fallback = {"error": "fallback"}
    pass_count = 0
    fail_count = 0

    for number, scenario in enumerate(scenarios, start=1):
        print(f"Test {number}: {scenario['name']}")
        raw = scenario['input']
        suffix = '...' if len(raw) > 100 else ''
        print(f"Input: {raw[:100]}{suffix}")

        try:
            parsed = safe_json_loads(raw, dict(fallback))
            if scenario['should_succeed']:
                succeeded = parsed != fallback and isinstance(parsed, dict)
                verdict = (
                    "✅ PASSED - Successfully parsed JSON"
                    if succeeded
                    else "❌ FAILED - Expected successful parsing but got fallback"
                )
            else:
                succeeded = parsed == fallback
                verdict = (
                    "✅ PASSED - Correctly returned fallback for malformed JSON"
                    if succeeded
                    else "❌ FAILED - Expected fallback but got parsed result"
                )
            print(verdict)
        except Exception as exc:
            # Any exception while parsing or reporting counts as a failure.
            print(f"❌ FAILED - Exception occurred: {exc}")
            succeeded = False

        if succeeded:
            pass_count += 1
        else:
            fail_count += 1
        print("-" * 50)

    print(f"\n📊 Test Results: {pass_count} passed, {fail_count} failed")
    return fail_count == 0
if __name__ == "__main__":
    # Script entry point: run the manual scenario table, then the unittest
    # suite, and report and exit according to the combined outcome.
    print("🚀 Starting JSON Preprocessing Tests for Speakr App\n")
    # Run the comprehensive manual test
    manual_success = run_comprehensive_test()
    print("\n" + "="*60)
    print("🔬 Running Unit Tests")
    print("="*60)
    # Run the unit tests. exit=False returns control here instead of calling
    # sys.exit, and the returned TestProgram exposes the TestResult so that
    # unit-test failures are not silently ignored.
    unit_program = unittest.main(argv=[''], exit=False, verbosity=2)
    unit_success = unit_program.result.wasSuccessful()
    all_passed = manual_success and unit_success
    if all_passed:
        print("\n🎉 All tests completed! JSON preprocessing should handle LLM response issues gracefully.")
    else:
        print("\n⚠️ Some tests failed. Please review the implementation.")
    # Non-zero exit status lets shells and CI detect a failed run.
    sys.exit(0 if all_passed else 1)