Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)

2026-03-16 21:47:37 +00:00
commit 42772a31ed
365 changed files with 103572 additions and 0 deletions
--- a/src/services/speaker.py
+++ b/src/services/speaker.py
@@ -0,0 +1,217 @@
+"""
+Speaker identification and management services.
+"""
+
+import os
+import re
+from datetime import datetime
+from flask import current_app
+from flask_login import current_user
+
+from src.database import db
+from src.models import Speaker, SystemSetting
+from src.services.llm import call_llm_completion
+from src.utils import safe_json_loads
+
+# NOTE: format_transcription_for_llm below is a minimal placeholder implementation (see the TODO in its docstring).
+def format_transcription_for_llm(transcription):
+    """
+    Format transcription for LLM processing.
+
+    TODO: This function needs proper implementation.
+
+    If ``transcription`` is a string holding a JSON list (the diarized
+    format), every dict segment that has a ``text`` key is rendered as
+    ``[<speaker>] <text>`` on its own line; ``UNKNOWN`` is used when the
+    segment has no ``speaker`` key. Segments that are not dicts, or that
+    have no ``text`` key, are skipped.
+
+    Anything else (plain text, JSON that is not a list, non-string input)
+    is returned as ``str(transcription)``.
+    """
+    if not isinstance(transcription, str):
+        return str(transcription)
+
+    import json
+
+    try:
+        data = json.loads(transcription)
+    except (ValueError, RecursionError):
+        # Not JSON (json.JSONDecodeError is a ValueError) or nested too
+        # deeply to parse: treat the input as plain text.
+        return str(transcription)
+
+    if not isinstance(data, list):
+        return str(transcription)
+
+    # Only dict segments can carry speaker/text fields; a stray string or
+    # number in the list must not abort formatting of the valid segments.
+    return '\n'.join(
+        f"[{seg.get('speaker', 'UNKNOWN')}] {seg.get('text', '')}"
+        for seg in data
+        if isinstance(seg, dict) and 'text' in seg
+    )
+
+# Import TEXT_MODEL_API_KEY from llm service
+from src.services.llm import TEXT_MODEL_API_KEY
+
+
+def update_speaker_usage(speaker_names):
+    """
+    Record one use of each given speaker name for the logged-in user.
+
+    Does nothing when ``speaker_names`` is empty or the current user is not
+    authenticated. Each name is stripped and blank names are skipped. An
+    existing Speaker row (matched on user id and name) gets its use count
+    incremented and its last-used timestamp refreshed; otherwise a new
+    Speaker row is added. All changes are committed together; on any
+    exception the error is logged and the session is rolled back.
+    """
+    if not (speaker_names and current_user.is_authenticated):
+        return
+
+    try:
+        for raw_name in speaker_names:
+            cleaned = raw_name.strip()
+            if cleaned:
+                existing = Speaker.query.filter_by(user_id=current_user.id, name=cleaned).first()
+                if not existing:
+                    # First time this user refers to this name: start tracking it.
+                    db.session.add(
+                        Speaker(
+                            name=cleaned,
+                            user_id=current_user.id,
+                            use_count=1,
+                            created_at=datetime.utcnow(),
+                            last_used=datetime.utcnow(),
+                        )
+                    )
+                else:
+                    existing.use_count += 1
+                    existing.last_used = datetime.utcnow()
+
+        db.session.commit()
+    except Exception as e:
+        current_app.logger.error(f"Error updating speaker usage: {e}")
+        db.session.rollback()
+
+
+
+def identify_speakers_from_text(transcription):
+    """
+    Uses an LLM to identify speakers from a transcription.
+
+    Args:
+        transcription: The transcription, either plain text or the JSON
+            diarized format; it is passed through
+            format_transcription_for_llm before use.
+
+    Returns:
+        A dict mapping speaker labels (e.g. "SPEAKER_00") to the names
+        returned by the LLM. Names reported as "Unknown" (any case) or as
+        JSON null are replaced with "". An empty dict is returned when the
+        formatted transcription contains no "[SPEAKER_<n>]" labels or when
+        the LLM response does not parse to a JSON object.
+
+    Raises:
+        ValueError: If TEXT_MODEL_API_KEY is not configured.
+        Exception: Any error raised while calling the LLM or reading its
+            response is logged and re-raised unchanged.
+    """
+    if not TEXT_MODEL_API_KEY:
+        raise ValueError("TEXT_MODEL_API_KEY not configured.")
+
+    # The transcription passed here could be JSON, so we format it.
+    formatted_transcription = format_transcription_for_llm(transcription)
+
+    # Extract existing speaker labels (e.g., SPEAKER_00, SPEAKER_01) in order
+    # of first appearance; dict.fromkeys de-duplicates while keeping order.
+    all_labels = re.findall(r'\[(SPEAKER_\d+)\]', formatted_transcription)
+    speaker_labels = list(dict.fromkeys(all_labels))
+
+    if not speaker_labels:
+        return {}
+
+    # Get configurable transcript length limit (-1 means no limit)
+    transcript_limit = SystemSetting.get_setting('transcript_length_limit', 30000)
+    if transcript_limit == -1:
+        transcript_text = formatted_transcription
+    else:
+        transcript_text = formatted_transcription[:transcript_limit]
+
+    prompt = f"""Analyze the following transcription and identify the names of the speakers. The speakers are labeled as {', '.join(speaker_labels)}. Based on the context of the conversation, determine the most likely name for each speaker label.
+
+Transcription:
+---
+{transcript_text}
+---
+
+Respond with a single JSON object where keys are the speaker labels (e.g., "SPEAKER_00") and values are the identified full names. If a name cannot be determined, use the value "Unknown".
+
+Example:
+{{
+  "SPEAKER_00": "John Doe",
+  "SPEAKER_01": "Jane Smith",
+  "SPEAKER_02": "Unknown"
+}}
+
+JSON Response:
+"""
+
+    try:
+        completion = call_llm_completion(
+            messages=[
+                {"role": "system", "content": "You are an expert in analyzing conversation transcripts to identify speakers. Your response must be a single, valid JSON object."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.2
+        )
+        response_content = completion.choices[0].message.content
+        speaker_map = safe_json_loads(response_content, {})
+
+        # The LLM may answer with a JSON array or scalar despite the prompt;
+        # there is nothing to map in that case.
+        if not isinstance(speaker_map, dict):
+            current_app.logger.warning(
+                f"LLM speaker identification returned non-object JSON: {type(speaker_map).__name__}"
+            )
+            return {}
+
+        # Post-process the map to replace "Unknown" (or a JSON null) with an
+        # empty string. Non-string values are left untouched rather than
+        # crashing on .strip().
+        for speaker_label, identified_name in speaker_map.items():
+            if identified_name is None or (
+                isinstance(identified_name, str)
+                and identified_name.strip().lower() == "unknown"
+            ):
+                speaker_map[speaker_label] = ""
+
+        return speaker_map
+    except Exception as e:
+        current_app.logger.error(f"Error calling LLM for speaker identification: {e}")
+        raise
+
+
+def identify_unidentified_speakers_from_text(transcription, unidentified_speakers):
+    """
+    Uses an LLM to identify only the unidentified speakers from a transcription.
+
+    Args:
+        transcription: The transcription, either plain text or the JSON
+            diarized format; it is passed through
+            format_transcription_for_llm before use.
+        unidentified_speakers: Speaker labels (strings such as
+            "SPEAKER_01") that still need a name.
+
+    Returns:
+        A dict mapping speaker labels to the names returned by the LLM.
+        Placeholder answers ("unknown", "n/a", "not available", "unclear",
+        any case) and JSON null are replaced with "". An empty dict is
+        returned when ``unidentified_speakers`` is empty or when the LLM
+        response does not parse to a JSON object.
+
+    Raises:
+        ValueError: If TEXT_MODEL_API_KEY is not configured.
+        Exception: Any error raised while calling the LLM or reading its
+            response is logged (with traceback) and re-raised unchanged.
+    """
+    if not TEXT_MODEL_API_KEY:
+        raise ValueError("TEXT_MODEL_API_KEY not configured.")
+
+    # Nothing to identify: return before doing any formatting work.
+    if not unidentified_speakers:
+        return {}
+
+    # The transcription passed here could be JSON, so we format it.
+    formatted_transcription = format_transcription_for_llm(transcription)
+
+    # Get configurable transcript length limit (-1 means no limit)
+    transcript_limit = SystemSetting.get_setting('transcript_length_limit', 30000)
+    if transcript_limit == -1:
+        transcript_text = formatted_transcription
+    else:
+        transcript_text = formatted_transcription[:transcript_limit]
+
+    prompt = f"""Analyze the following conversation transcript and identify the names of the UNIDENTIFIED speakers based on the context and content of their dialogue.
+
+The speakers that need to be identified are: {', '.join(unidentified_speakers)}
+
+Look for clues in the conversation such as:
+- Names mentioned by other speakers when addressing someone
+- Self-introductions or references to their own name
+- Context clues about roles, relationships, or positions
+- Any direct mentions of names in the dialogue
+
+Here is the complete conversation transcript:
+
+{transcript_text}
+
+Based on the conversation above, identify the most likely real names for the unidentified speakers. Pay close attention to how speakers address each other and any names that are mentioned in the dialogue.
+
+Respond with a single JSON object where keys are the speaker labels (e.g., "SPEAKER_01") and values are the identified full names. If a name cannot be determined from the conversation context, use an empty string "".
+
+Example format:
+{{
+  "SPEAKER_01": "Jane Smith",
+  "SPEAKER_03": "Bob Johnson",
+  "SPEAKER_05": ""
+}}
+
+JSON Response:
+"""
+
+    # Answers the LLM tends to give instead of the requested empty string.
+    placeholder_names = ("unknown", "n/a", "not available", "unclear")
+
+    try:
+        current_app.logger.info(f"[Auto-Identify] Calling LLM to identify speakers: {unidentified_speakers}")
+        current_app.logger.info(f"[Auto-Identify] Transcript excerpt (first 500 chars): {transcript_text[:500]}")
+
+        completion = call_llm_completion(
+            messages=[
+                {"role": "system", "content": "You are an expert in analyzing conversation transcripts to identify speakers based on contextual clues in the dialogue. Analyze the conversation carefully to find names mentioned when speakers address each other or introduce themselves. Your response must be a single, valid JSON object containing only the requested speaker identifications."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0.2
+        )
+        response_content = completion.choices[0].message.content
+        current_app.logger.info(f"[Auto-Identify] LLM Raw Response: {response_content}")
+
+        speaker_map = safe_json_loads(response_content, {})
+        current_app.logger.info(f"[Auto-Identify] Parsed speaker_map: {speaker_map}")
+
+        # The LLM may answer with a JSON array or scalar despite the prompt;
+        # there is nothing to map in that case.
+        if not isinstance(speaker_map, dict):
+            current_app.logger.warning(
+                f"[Auto-Identify] LLM returned non-object JSON: {type(speaker_map).__name__}"
+            )
+            return {}
+
+        # Post-process the map to replace placeholder answers (or a JSON
+        # null) with an empty string. Non-string values are left untouched
+        # rather than crashing on .strip().
+        for speaker_label, identified_name in speaker_map.items():
+            if identified_name is None or (
+                isinstance(identified_name, str)
+                and identified_name.strip().lower() in placeholder_names
+            ):
+                speaker_map[speaker_label] = ""
+
+        current_app.logger.info(f"[Auto-Identify] Final speaker_map after post-processing: {speaker_map}")
+        return speaker_map
+    except Exception as e:
+        current_app.logger.error(f"Error calling LLM for speaker identification: {e}", exc_info=True)
+        raise
+