#!/usr/bin/env python3
"""
Standalone test for JSON preprocessing functionality.
Tests the safe_json_loads function with various malformed JSON scenarios.
"""

import json
import re
import ast
from unittest.mock import Mock


# Mock logger for testing
class MockLogger:
    """Minimal logger stand-in that prints every message with its level."""

    def warning(self, msg):
        print(f"WARNING: {msg}")

    def info(self, msg):
        print(f"INFO: {msg}")

    def debug(self, msg):
        print(f"DEBUG: {msg}")

    def error(self, msg):
        print(f"ERROR: {msg}")


# Create mock app with logger
class MockApp:
    """Stand-in for the application object; only exposes ``logger``."""

    logger = MockLogger()


app = MockApp()

# Everything a parsing strategy may legitimately raise on bad input.
# ast.literal_eval can raise TypeError (e.g. an unhashable dict key),
# RecursionError and MemoryError in addition to ValueError/SyntaxError.
_PARSE_ERRORS = (
    json.JSONDecodeError,
    ValueError,
    SyntaxError,
    TypeError,
    RecursionError,
    MemoryError,
)


def _literal_eval_json_like(text):
    """Parse *text* as a Python literal, accepting only a dict or list result.

    Returns None when the text does not start with ``{`` or ``[``, or when
    the literal evaluates to something that is not a JSON container at the
    top level (for example the set produced by ``{1, 2}``).
    """
    if not text.startswith(('{', '[')):
        return None
    value = ast.literal_eval(text)
    return value if isinstance(value, (dict, list)) else None


def safe_json_loads(json_string, fallback_value=None):
    """
    Safely parse JSON with preprocessing to handle common LLM JSON formatting issues.

    The input is stripped, unwrapped from a markdown code fence if one is
    present, and then handed to a series of increasingly lenient strategies:

    1. ``json.loads`` on the text as-is.
    2. ``json.loads`` after ``preprocess_json_escapes`` escaped inner quotes.
    3. ``ast.literal_eval`` (handles e.g. single-quoted dicts); only a dict
       or list result is accepted.
    4. ``json.loads`` on the span returned by ``extract_json_object``.

    The first strategy that yields a non-None result wins. Note that a
    document that parses to ``None`` (JSON ``null``) therefore counts as a
    failure and produces ``fallback_value``.

    Args:
        json_string (str): The JSON string to parse
        fallback_value: Value to return if parsing fails (default: None)

    Returns:
        Parsed JSON object or fallback_value if parsing fails
    """
    if not json_string or not isinstance(json_string, str):
        app.logger.warning(f"Invalid JSON input: {type(json_string)} - {json_string}")
        return fallback_value

    # Step 1: Clean the input string
    cleaned_json = json_string.strip()

    # Step 2: Extract JSON from markdown code blocks if present
    json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', cleaned_json, re.DOTALL)
    if json_match:
        cleaned_json = json_match.group(1).strip()

    # Step 3: Try multiple parsing strategies, strictest first
    parsing_strategies = (
        json.loads,
        lambda text: json.loads(preprocess_json_escapes(text)),
        _literal_eval_json_like,
        lambda text: json.loads(extract_json_object(text)),
    )

    for number, strategy in enumerate(parsing_strategies, start=1):
        try:
            result = strategy(cleaned_json)
        except _PARSE_ERRORS as exc:
            if number == 1:
                # Only log the first failure to avoid spam
                app.logger.debug(f"JSON parsing strategy {number} failed: {exc}")
            continue

        if result is not None:
            if number > 1:
                # Log if we had to use a fallback strategy
                app.logger.info(f"JSON parsed successfully using strategy {number}")
            return result

    # All strategies failed
    app.logger.error(f"All JSON parsing strategies failed for: {cleaned_json[:200]}...")
    return fallback_value


def preprocess_json_escapes(json_string):
    """
    Preprocess JSON string to fix common escape issues from LLM responses.

    Walks the text once and escapes double quotes that appear *inside* a
    string. A quote is taken to close the current string only when the next
    non-whitespace character is a plausible terminator: ``:`` for a key, or
    one of ``,`` ``}`` ``]`` for a value. Any other quote inside a string is
    rewritten as ``\\"``.

    Whether a string is a key or a value is derived from the enclosing
    container: strings directly inside ``[...]`` are always values, strings
    inside ``{...}`` are keys until the ``:`` has been seen. This keeps
    nested objects and arrays of strings intact.

    This is a heuristic: an inner quote that happens to be followed by a
    terminator character cannot be told apart from a real closing quote.

    Args:
        json_string (str): Possibly malformed JSON text.

    Returns:
        The text with inner quotes escaped; falsy input is returned unchanged.
    """
    if not json_string:
        return json_string

    result = []
    containers = []  # Stack of '{' / '[' opened outside of strings
    in_string = False
    escape_next = False
    expecting_value = False  # True when the next/current string is a value
    length = len(json_string)

    for i, char in enumerate(json_string):
        if escape_next:
            # This character is escaped, add it as-is
            result.append(char)
            escape_next = False
        elif char == '\\':
            # Escape character: the following character is copied verbatim
            result.append(char)
            escape_next = True
        elif in_string:
            if char != '"':
                result.append(char)
                continue

            # Look ahead past whitespace to decide whether this quote
            # closes the string or is an unescaped inner quote.
            j = i + 1
            while j < length and json_string[j].isspace():
                j += 1

            # Keys may only be followed by a colon; values by a comma or
            # by the closer of the enclosing container.
            end_chars = ',}]' if expecting_value else ':'
            if j < length and json_string[j] in end_chars:
                in_string = False
                result.append(char)
                # We just finished a key (or a value): a value comes next
                # until a comma says otherwise.
                expecting_value = True
            else:
                # Inner quote that should be escaped
                result.append('\\"')
        elif char == '"':
            # Starting a string
            in_string = True
            result.append(char)
        else:
            if char == '{':
                containers.append(char)
                expecting_value = False  # An object starts with a key
            elif char == '[':
                containers.append(char)
                expecting_value = True  # Array elements are values
            elif char in '}]':
                if containers:
                    containers.pop()
            elif char == ':':
                expecting_value = True
            elif char == ',':
                # After a comma: another element in an array, a key otherwise
                expecting_value = bool(containers) and containers[-1] == '['
            result.append(char)

    return ''.join(result)


def _decode_span(text, start):
    """Return ``(start, end)`` of the valid JSON value beginning at *start*.

    Returns None when *start* is -1 or the text at that position does not
    decode as JSON.
    """
    if start == -1:
        return None
    try:
        _, consumed = json.JSONDecoder().raw_decode(text[start:])
    except ValueError:  # json.JSONDecodeError is a ValueError
        return None
    return start, start + consumed


def extract_json_object(text):
    """
    Extract the first complete JSON object or array from text.

    The first ``{`` and the first ``[`` are each tried as the start of a
    valid JSON value:

    * If a valid array is found and the first ``{`` lies inside it (or the
      text has no ``{`` at all), the array is returned, so an array of
      objects is not reduced to its first element.
    * Otherwise, if a valid object starts at the first ``{``, that object
      is returned, ignoring any trailing text (even text containing braces).
    * If neither decodes, the function falls back to a greedy regex match
      (first ``{`` to last ``}``, then first ``[`` to last ``]``) and
      finally to the original text, leaving validation to the caller.

    Args:
        text (str): Text that may contain a JSON document.

    Returns:
        str: The extracted span, or *text* itself if no structure is found.
    """
    obj_start = text.find('{')
    obj_span = _decode_span(text, obj_start)
    arr_span = _decode_span(text, text.find('['))

    if arr_span is not None:
        arr_start, arr_end = arr_span
        # The array is the outermost structure when no object exists or
        # the first object is nested inside it.
        if obj_start == -1 or arr_start < obj_start < arr_end:
            return text[arr_start:arr_end]

    if obj_span is not None:
        return text[obj_span[0]:obj_span[1]]

    # Nothing decodes cleanly: keep the lenient regex behaviour.
    # Look for JSON object
    obj_match = re.search(r'\{.*\}', text, re.DOTALL)
    if obj_match:
        return obj_match.group(0)

    # Look for JSON array
    arr_match = re.search(r'\[.*\]', text, re.DOTALL)
    if arr_match:
        return arr_match.group(0)

    # Return original if no JSON structure found
    return text


def run_comprehensive_test():
    """Run a comprehensive test with various malformed JSON examples.

    Returns:
        bool: True when every case behaved as expected.
    """
    print("๐Ÿงช Running comprehensive JSON preprocessing tests...\n")

    test_cases = [
        {
            "name": "Valid JSON",
            "input": '{"title": "Test", "summary": "Valid JSON"}',
            "should_succeed": True,
        },
        {
            "name": "Unescaped quotes in title",
            "input": '{"title": "Meeting about "Project X"", "summary": "Discussion summary"}',
            "should_succeed": True,
        },
        {
            "name": "Multiple unescaped quotes",
            "input": '{"title": "John said "Hello" and Mary replied "Hi there"", "summary": "Conversation log"}',
            "should_succeed": True,
        },
        {
            "name": "Markdown code block",
            "input": '```json\n{"title": "Wrapped", "summary": "In code block"}\n```',
            "should_succeed": True,
        },
        {
            "name": "Mixed quotes and apostrophes",
            "input": '{"title": "Alice\'s "big idea" presentation", "summary": "She said it\'s "revolutionary""}',
            "should_succeed": True,
        },
        {
            "name": "JSON with newlines",
            "input": '{"title": "Multi-line", "summary": "Line 1\\nLine 2 with \\"quotes\\"\\nLine 3"}',
            "should_succeed": True,
        },
        {
            "name": "LLM response with explanation",
            "input": (
                "Here's the JSON:\n"
                "```json\n"
                "{\n"
                '    "title": "Q3 Planning",\n'
                '    "summary": "We discussed the "new initiative" for next quarter."\n'
                "}\n"
                "```\n"
                "Hope this helps!"
            ),
            "should_succeed": True,
        },
        {
            "name": "Speaker identification with quotes",
            "input": '{"SPEAKER_00": "John Smith", "SPEAKER_01": "Jane "The Expert" Doe", "SPEAKER_02": "Bob"}',
            "should_succeed": True,
        },
        {
            "name": "Completely malformed",
            "input": '{"title": "Test", "summary": this is not valid json at all}',
            "should_succeed": False,
        },
        {
            "name": "Empty string",
            "input": "",
            "should_succeed": False,
        },
    ]

    passed = 0
    failed = 0

    for i, test_case in enumerate(test_cases, 1):
        print(f"Test {i}: {test_case['name']}")
        print(f"Input: {test_case['input'][:100]}{'...' if len(test_case['input']) > 100 else ''}")

        try:
            result = safe_json_loads(test_case['input'], {"error": "fallback"})

            if test_case['should_succeed']:
                if result != {"error": "fallback"} and isinstance(result, (dict, list)):
                    print("โœ… PASSED - Successfully parsed JSON")
                    print(f" Result: {result}")
                    passed += 1
                else:
                    print("โŒ FAILED - Expected successful parsing but got fallback")
                    failed += 1
            else:
                if result == {"error": "fallback"}:
                    print("โœ… PASSED - Correctly returned fallback for malformed JSON")
                    passed += 1
                else:
                    print("โŒ FAILED - Expected fallback but got parsed result")
                    print(f" Unexpected result: {result}")
                    failed += 1
        except Exception as e:
            # Top-level test boundary: report any crash as a failed case
            print(f"โŒ FAILED - Exception occurred: {e}")
            failed += 1

        print("-" * 50)

    print(f"\n๐Ÿ“Š Test Results: {passed} passed, {failed} failed")
    return failed == 0


def test_preprocessing_function():
    """Test the preprocessing function directly."""
    print("\n๐Ÿ”ง Testing preprocessing function directly...\n")

    test_input = '{"title": "Meeting about "Project X"", "summary": "Discussion summary"}'
    print(f"Original: {test_input}")

    processed = preprocess_json_escapes(test_input)
    print(f"Processed: {processed}")

    try:
        result = json.loads(processed)
        print(f"โœ… Successfully parsed: {result}")
    except json.JSONDecodeError as e:
        print(f"โŒ Still failed: {e}")


def test_specific_scenarios():
    """Test specific real-world scenarios."""
    print("\n๐ŸŽฏ Testing specific LLM response scenarios...\n")

    # Test case from the original issue
    gemini_response = (
        '{"title": "Meeting about "Project Phoenix" and budget allocation", '
        '"summary": "The team discussed John\'s "breakthrough idea" and Mary said '
        '"this will change everything" during the Q3 planning session."}'
    )

    print("Testing Gemini-style response with unescaped quotes:")
    print(f"Input: {gemini_response}")

    # Test preprocessing directly
    processed = preprocess_json_escapes(gemini_response)
    print(f"Processed: {processed}")

    result = safe_json_loads(gemini_response)

    if isinstance(result, dict) and "title" in result and "summary" in result:
        print("โœ… SUCCESS - Parsed Gemini response correctly!")
        print(f"Title: {result['title']}")
        print(f"Summary: {result['summary'][:100]}...")
    else:
        print("โŒ FAILED - Could not parse Gemini response")
        print(f"Result: {result}")

    print("-" * 50)

    # Test speaker identification scenario
    speaker_response = (
        '{"SPEAKER_00": "John "The Manager" Smith", '
        '"SPEAKER_01": "Alice Johnson", '
        '"SPEAKER_02": "Bob "Tech Lead" Wilson"}'
    )

    print("Testing speaker identification with quotes in names:")
    print(f"Input: {speaker_response}")

    # Test preprocessing directly
    processed = preprocess_json_escapes(speaker_response)
    print(f"Processed: {processed}")

    result = safe_json_loads(speaker_response)

    if isinstance(result, dict) and len(result) >= 3:
        print("โœ… SUCCESS - Parsed speaker identification correctly!")
        for speaker, name in result.items():
            print(f" {speaker}: {name}")
    else:
        print("โŒ FAILED - Could not parse speaker identification")
        print(f"Result: {result}")


def main():
    """Run every test group and print an overall verdict."""
    print("๐Ÿš€ Starting Standalone JSON Preprocessing Tests\n")

    # Test preprocessing function directly
    test_preprocessing_function()

    # Run the comprehensive test
    success = run_comprehensive_test()

    # Test specific scenarios
    test_specific_scenarios()

    if success:
        print("\n๐ŸŽ‰ All tests completed successfully! JSON preprocessing should handle LLM response issues gracefully.")
    else:
        print("\nโš ๏ธ Some tests failed. The implementation may need refinement.")


if __name__ == "__main__":
    main()