# dictia-public/src/services/speaker_identification.py

"""
Shared speaker identification service.

Provides LLM-based speaker identification from transcript context,
used by both the web UI (recordings.py) and REST API (api_v1.py).
"""

import os
import re
import json
from flask import current_app


def identify_speakers_from_transcript(transcription_data, user_id):
    """
    Ask an LLM to guess real names for the speakers of a transcript.

    The original speaker labels are replaced by temporary ``SPEAKER_NN``
    labels (numbered by order of first appearance) before the transcript is
    sent to the model; the model's answer is sanitized and then translated
    back to the original labels.

    Args:
        transcription_data: Iterable of segment mappings. Each segment is read
            with ``.get('speaker')`` and ``.get('sentence')``.
        user_id: Forwarded to ``call_llm_completion`` as ``user_id``.

    Returns:
        dict keyed by original speaker label. A value is the cleaned name, or
        "" when the model gave no usable name. Speakers whose temporary label
        is absent from the sanitized model answer are omitted. An empty dict
        is returned when no segment carries a truthy ``speaker``.

    Raises:
        ValueError: Per the original documentation, when the LLM API key is
            not configured (raised by the LLM layer, not visible here).
        Exception: Anything raised by the non-schema LLM call propagates.
            Failures of the optional json_schema attempt are logged and
            trigger the plain call instead.
    """
    from src.services.llm import call_llm_completion
    from src.utils import safe_json_loads
    from src.models import SystemSetting

    # Distinct truthy speakers, first-seen order (dict keys keep insertion order).
    ordered_speakers = list(dict.fromkeys(
        name
        for name in (seg.get('speaker') for seg in transcription_data)
        if name
    ))
    if not ordered_speakers:
        return {}

    # original label -> temporary SPEAKER_NN label shown to the LLM
    speaker_to_label = {
        name: f'SPEAKER_{position:02d}'
        for position, name in enumerate(ordered_speakers)
    }
    speaker_labels = list(speaker_to_label.values())
    labels_csv = ', '.join(speaker_labels)

    # Transcript rendered with the temporary labels; segments without a known
    # speaker are attributed to 'Unknown Speaker'.
    formatted_transcription = "\n".join(
        f"[{speaker_to_label.get(seg.get('speaker'), 'Unknown Speaker')}]: {seg.get('sentence', '')}"
        for seg in transcription_data
    )

    log = current_app.logger
    log.info(f"[Auto-Identify] Formatted transcript (first 500 chars): {formatted_transcription[:500]}")
    log.info(f"[Auto-Identify] Speaker labels: {speaker_labels}")

    # Configurable cap on how much transcript is sent; -1 means "no cap".
    transcript_limit = SystemSetting.get_setting('transcript_length_limit', 30000)
    transcript_text = (
        formatted_transcription
        if transcript_limit == -1
        else formatted_transcription[:transcript_limit]
    )

    prompt = f"""Analyse cette transcription de conversation et identifie les noms des locuteurs à partir du contexte et du contenu de leurs dialogues.

Les locuteurs à identifier sont : {labels_csv}

Indices à chercher :
- Noms mentionnés par d'autres locuteurs quand ils s'adressent à quelqu'un
- Présentations ou références à son propre nom
- Indices contextuels sur les rôles, relations ou postes
- Toute mention directe de noms dans le dialogue

Transcription complète :

{transcript_text}

À partir de cette conversation, identifie les noms les plus probables pour chaque locuteur. Porte une attention particulière à la façon dont les locuteurs s'adressent les uns aux autres.

Réponds avec un seul objet JSON où les clés sont les étiquettes de locuteurs (ex. "SPEAKER_01") et les valeurs sont les noms complets identifiés. Si un nom ne peut pas être déterminé, utilise une chaîne vide "".

Exemple :
{{
  "SPEAKER_01": "Marie Lavoie",
  "SPEAKER_03": "Jean Tremblay",
  "SPEAKER_05": ""
}}

Réponse JSON :
"""

    log.info("[Auto-Identify] Calling LLM")

    system_msg = (
        "You are an expert in analyzing conversation transcripts to identify speakers "
        "based on contextual clues in the dialogue. Analyze the conversation carefully "
        "to find names mentioned when speakers address each other or introduce themselves. "
        "Your response must be a single, valid JSON object containing only the requested "
        "speaker identifications."
    )

    def _ask_llm(user_content, **extra_kwargs):
        """Send one system+user exchange and return the first choice's text."""
        completion = call_llm_completion(
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_content},
            ],
            temperature=0.2,
            user_id=user_id,
            operation_type='speaker_identification',
            **extra_kwargs,
        )
        return completion.choices[0].message.content

    # Opt-in structured-output mode, toggled through the environment.
    schema_flag = os.environ.get('AUTO_IDENTIFY_RESPONSE_SCHEMA', '').strip()
    use_schema = schema_flag in ('1', 'true', 'yes')

    response_content = None
    if use_schema:
        # Strict schema: exactly one string property per temporary label.
        schema_response_format = {
            "type": "json_schema",
            "json_schema": {
                "name": "speaker_identification",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {label: {"type": "string"} for label in speaker_labels},
                    "required": speaker_labels,
                    "additionalProperties": False,
                },
            },
        }
        schema_prompt = prompt + f"\n\nIMPORTANT: Your JSON response must contain exactly these keys: {labels_csv}"
        try:
            log.info("[Auto-Identify] Trying json_schema response format")
            response_content = _ask_llm(schema_prompt, response_format=schema_response_format)
            log.info(f"[Auto-Identify] LLM Raw Response (schema mode): {response_content}")
        except Exception as schema_err:
            # Any failure here is non-fatal: fall through to the plain call.
            log.warning(f"[Auto-Identify] json_schema mode failed, falling back to json_object: {schema_err}")
            response_content = None

    if response_content is None:
        # Plain call (no response_format); errors propagate to the caller.
        response_content = _ask_llm(prompt)
        log.info(f"[Auto-Identify] LLM Raw Response: {response_content}")

    identified_map = safe_json_loads(response_content, {})
    log.info(f"[Auto-Identify] Parsed identified_map: {identified_map}")

    # Normalize whatever the model produced into {SPEAKER_NN: clean name or ""}.
    identified_map = _sanitize_identified_map(identified_map, speaker_labels)
    log.info(f"[Auto-Identify] Sanitized identified_map: {identified_map}")

    # Translate temporary labels back; labels the model skipped are left out.
    final_speaker_map = {
        original_speaker: identified_map[temp_label]
        for original_speaker, temp_label in speaker_to_label.items()
        if temp_label in identified_map
    }

    log.info(f"[Auto-Identify] Final speaker_map: {final_speaker_map}")
    return final_speaker_map


def _sanitize_identified_map(identified_map, speaker_labels):
    """
    Clean up LLM output: handle inverted maps, strip commentary,
    clear placeholders, etc.
    """
    speaker_label_re = re.compile(r'^SPEAKER_\d{2}$')

    # Detect inverted map ({name: "SPEAKER_XX"}) and flip it
    if identified_map and all(
        speaker_label_re.match(str(v)) for v in identified_map.values() if v
    ) and not any(speaker_label_re.match(str(k)) for k in identified_map.keys()):
        current_app.logger.warning("[Auto-Identify] Detected inverted map, flipping keys/values")
        identified_map = {v: k for k, v in identified_map.items() if v}

    sanitized = {}
    for speaker_label, identified_name in identified_map.items():
        # Skip entries whose key isn't a valid SPEAKER_XX label
        if not speaker_label_re.match(str(speaker_label)):
            continue
        if not identified_name or not isinstance(identified_name, str):
            sanitized[speaker_label] = ""
            continue

        name = identified_name.strip()

        # Clear generic placeholders
        if name.lower() in ["unknown", "n/a", "not available", "unclear", "unidentified", ""]:
            sanitized[speaker_label] = ""
            continue

        # Clear label-to-label entries (e.g. "SPEAKER_01": "SPEAKER_02")
        if speaker_label_re.match(name):
            sanitized[speaker_label] = ""
            continue

        # Strip parenthetical content: "John (the host)" -> "John"
        name = re.sub(r'\s*\([^)]*\)', '', name).strip()

        # Take first name segment before comma, semicolon, or slash
        name = re.split(r'[,;/]', name)[0].strip()

        # Collapse whitespace
        name = re.sub(r'\s+', ' ', name)

        # Final check: if result still matches SPEAKER_XX, clear it
        if speaker_label_re.match(name) or not name:
            sanitized[speaker_label] = ""
            continue

        sanitized[speaker_label] = name

    return sanitized