Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)

2026-03-16 21:47:37 +00:00
commit 42772a31ed
365 changed files with 103572 additions and 0 deletions
--- a/src/utils/ffprobe.py
+++ b/src/utils/ffprobe.py
@@ -0,0 +1,499 @@
+"""
+FFprobe utility for detecting audio/video codecs and format information.
+
+This module provides functions to inspect media files using ffprobe and return
+structured information about their codecs, streams, and formats.
+"""
+
+import json
+import logging
+import subprocess
+from datetime import datetime
+from typing import Optional, Dict, Any, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class FFProbeError(Exception):
+    """Raised when ffprobe fails to analyze a file."""
+    pass
+
+
+def probe(filename: str, cmd: str = 'ffprobe', timeout: Optional[int] = None) -> Dict[str, Any]:
+    """
+    Run ffprobe on the specified file and return a JSON representation of the output.
+
+    Args:
+        filename: Path to the media file to probe
+        cmd: Command to use (default: 'ffprobe')
+        timeout: Optional timeout in seconds
+
+    Returns:
+        Dictionary containing streams and format information
+
+    Raises:
+        FFProbeError: if ffprobe returns a non-zero exit code
+    """
+    args = [cmd, '-show_format', '-show_streams', '-of', 'json', filename]
+    p = None
+
+    try:
+        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        communicate_kwargs = {}
+        if timeout is not None:
+            communicate_kwargs['timeout'] = timeout
+        out, err = p.communicate(**communicate_kwargs)
+        
+        if p.returncode != 0:
+            error_msg = err.decode('utf-8', errors='ignore')
+            raise FFProbeError(f'ffprobe failed: {error_msg}')
+        
+        return json.loads(out.decode('utf-8'))
+    except subprocess.TimeoutExpired:
+        if p:
+            p.kill()
+        raise FFProbeError(f'ffprobe timed out after {timeout} seconds')
+    except FileNotFoundError:
+        raise FFProbeError('ffprobe command not found. Please ensure ffmpeg is installed.')
+    except json.JSONDecodeError as e:
+        raise FFProbeError(f'Failed to parse ffprobe output: {e}')
+
+
+def get_codec_info(filename: str, timeout: Optional[int] = None) -> Dict[str, Any]:
+    """
+    Get codec information for a media file.
+
+    Args:
+        filename: Path to the media file
+        timeout: Optional timeout in seconds
+
+    Returns:
+        Dictionary with keys:
+        - audio_codec: Audio codec name (e.g., 'pcm_s16le', 'aac', 'mp3')
+        - video_codec: Video codec name if present, or None
+        - has_video: Boolean indicating if file contains video stream
+        - has_audio: Boolean indicating if file contains audio stream
+        - format_name: Container format name (e.g., 'wav', 'mov,mp4,m4a')
+        - duration: Duration in seconds (float)
+        - sample_rate: Audio sample rate if available
+        - channels: Number of audio channels if available
+        - bit_rate: Bit rate if available
+
+    Raises:
+        FFProbeError: if ffprobe fails to analyze the file
+    """
+    try:
+        probe_data = probe(filename, timeout=timeout)
+    except FFProbeError:
+        raise
+
+    result = {
+        'audio_codec': None,
+        'video_codec': None,
+        'has_video': False,
+        'has_audio': False,
+        'format_name': None,
+        'duration': None,
+        'sample_rate': None,
+        'channels': None,
+        'bit_rate': None
+    }
+
+    # Extract format information
+    if 'format' in probe_data:
+        fmt = probe_data['format']
+        result['format_name'] = fmt.get('format_name')
+        
+        if 'duration' in fmt:
+            try:
+                result['duration'] = float(fmt['duration'])
+            except (ValueError, TypeError):
+                pass
+        
+        if 'bit_rate' in fmt:
+            try:
+                result['bit_rate'] = int(fmt['bit_rate'])
+            except (ValueError, TypeError):
+                pass
+
+    # Extract stream information
+    if 'streams' in probe_data:
+        for stream in probe_data['streams']:
+            codec_type = stream.get('codec_type')
+            codec_name = stream.get('codec_name')
+            
+            if codec_type == 'audio':
+                result['has_audio'] = True
+                if result['audio_codec'] is None:  # Use first audio stream
+                    result['audio_codec'] = codec_name
+                    result['sample_rate'] = stream.get('sample_rate')
+                    result['channels'] = stream.get('channels')
+            
+            elif codec_type == 'video':
+                result['has_video'] = True
+                if result['video_codec'] is None:  # Use first video stream
+                    result['video_codec'] = codec_name
+
+    return result
+
+
+def is_video_file(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
+    """
+    Check if a file contains video streams.
+
+    Args:
+        filename: Path to the media file
+        timeout: Optional timeout in seconds
+        codec_info: Optional pre-fetched codec info to avoid redundant probe calls
+
+    Returns:
+        True if file contains video streams, False otherwise
+    """
+    try:
+        if codec_info is None:
+            codec_info = get_codec_info(filename, timeout=timeout)
+        return codec_info['has_video']
+    except FFProbeError as e:
+        logger.warning(f"Failed to probe {filename}: {e}")
+        return False
+
+
+def is_audio_file(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
+    """
+    Check if a file contains audio streams.
+
+    Args:
+        filename: Path to the media file
+        timeout: Optional timeout in seconds
+        codec_info: Optional pre-fetched codec info to avoid redundant probe calls
+
+    Returns:
+        True if file contains audio streams, False otherwise
+    """
+    try:
+        if codec_info is None:
+            codec_info = get_codec_info(filename, timeout=timeout)
+        return codec_info['has_audio']
+    except FFProbeError as e:
+        logger.warning(f"Failed to probe {filename}: {e}")
+        return False
+
+
+def get_audio_codec(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Optional[str]:
+    """
+    Get the audio codec name for a file.
+
+    Args:
+        filename: Path to the media file
+        timeout: Optional timeout in seconds
+        codec_info: Optional pre-fetched codec info to avoid redundant probe calls
+
+    Returns:
+        Audio codec name (e.g., 'pcm_s16le', 'aac', 'mp3', 'opus'), or None if no audio
+    """
+    try:
+        if codec_info is None:
+            codec_info = get_codec_info(filename, timeout=timeout)
+        return codec_info['audio_codec']
+    except FFProbeError as e:
+        logger.warning(f"Failed to probe {filename}: {e}")
+        return None
+
+
+def needs_audio_conversion(filename: str, supported_codecs: list, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Tuple[bool, Optional[str]]:
+    """
+    Check if a file needs audio conversion based on its codec.
+
+    Args:
+        filename: Path to the media file
+        supported_codecs: List of supported audio codec names
+        timeout: Optional timeout in seconds
+        codec_info: Optional pre-fetched codec info to avoid redundant probe calls
+
+    Returns:
+        Tuple of (needs_conversion: bool, current_codec: str or None)
+    """
+    try:
+        if codec_info is None:
+            codec_info = get_codec_info(filename, timeout=timeout)
+        
+        # If it has video, it likely needs conversion
+        if codec_info['has_video']:
+            return True, codec_info.get('audio_codec')
+        
+        # If no audio at all, cannot convert
+        if not codec_info['has_audio']:
+            logger.warning(f"File {filename} has no audio streams")
+            return False, None
+        
+        audio_codec = codec_info['audio_codec']
+        
+        # Check if codec is in supported list
+        if audio_codec in supported_codecs:
+            return False, audio_codec
+        
+        return True, audio_codec
+        
+    except FFProbeError as e:
+        logger.warning(f"Failed to probe {filename}: {e}")
+        # Default to attempting conversion on error
+        return True, None
+
+
+def is_lossless_audio(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
+    """
+    Check if a file uses a lossless audio codec.
+
+    Args:
+        filename: Path to the media file
+        timeout: Optional timeout in seconds
+        codec_info: Optional pre-fetched codec info to avoid redundant probe calls
+
+    Returns:
+        True if file uses lossless audio codec, False otherwise
+    """
+    lossless_codecs = {
+        'pcm_s16le', 'pcm_s24le', 'pcm_s32le',
+        'pcm_f32le', 'pcm_f64le',
+        'pcm_u8', 'pcm_u16le', 'pcm_u24le', 'pcm_u32le',
+        'flac', 'alac', 'ape', 'wavpack', 'tta',
+        'mlp', 'truehd'
+    }
+    
+    try:
+        if codec_info is None:
+            codec_info = get_codec_info(filename, timeout=timeout)
+        audio_codec = codec_info['audio_codec']
+        return audio_codec in lossless_codecs if audio_codec else False
+    except FFProbeError as e:
+        logger.warning(f"Failed to probe {filename}: {e}")
+        return False
+
+
+def get_duration(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Optional[float]:
+    """
+    Get the duration of a media file in seconds.
+
+    Uses multiple methods to determine duration:
+    1. Format-level duration (fastest, works for most files)
+    2. Packet timestamps fallback (for files without duration metadata like some WebM)
+
+    Args:
+        filename: Path to the media file
+        timeout: Optional timeout in seconds
+        codec_info: Optional pre-fetched codec info to avoid redundant probe calls
+
+    Returns:
+        Duration in seconds, or None if unable to determine
+    """
+    try:
+        if codec_info is None:
+            codec_info = get_codec_info(filename, timeout=timeout)
+
+        # Try format-level duration first
+        if codec_info['duration'] is not None:
+            return codec_info['duration']
+
+        # Fallback: scan packets to find the last timestamp
+        # This works for WebM and other files without duration metadata
+        return _get_duration_from_packets(filename, timeout=timeout)
+    except FFProbeError as e:
+        logger.warning(f"Failed to probe {filename}: {e}")
+        return None
+
+
+def _get_duration_from_packets(filename: str, timeout: Optional[int] = None) -> Optional[float]:
+    """
+    Get duration by scanning packet timestamps (fallback for files without duration metadata).
+
+    This is slower than format-level duration but works for WebM and similar files
+    that don't store duration in the container metadata.
+
+    Args:
+        filename: Path to the media file
+        timeout: Optional timeout in seconds
+
+    Returns:
+        Duration in seconds, or None if unable to determine
+    """
+    try:
+        args = [
+            'ffprobe', '-v', 'error',
+            '-show_entries', 'packet=pts_time',
+            '-select_streams', 'a:0',  # First audio stream
+            '-of', 'csv=p=0',
+            filename
+        ]
+
+        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        communicate_kwargs = {}
+        if timeout is not None:
+            communicate_kwargs['timeout'] = timeout
+        out, err = p.communicate(**communicate_kwargs)
+
+        if p.returncode != 0:
+            logger.debug(f"Packet scan failed for {filename}")
+            return None
+
+        # Parse the output to find the last timestamp
+        lines = out.decode('utf-8').strip().split('\n')
+        last_valid_time = None
+        for line in reversed(lines):
+            line = line.strip()
+            if line and line != 'N/A':
+                try:
+                    last_valid_time = float(line)
+                    break
+                except ValueError:
+                    continue
+
+        if last_valid_time is not None:
+            logger.debug(f"Got duration from packets for {filename}: {last_valid_time}")
+            return last_valid_time
+
+        return None
+    except subprocess.TimeoutExpired:
+        logger.warning(f"Packet scan timed out for {filename}")
+        return None
+    except Exception as e:
+        logger.warning(f"Error scanning packets for {filename}: {e}")
+        return None
+
+
+def get_creation_date(filename: str, timeout: Optional[int] = None, use_file_mtime: bool = True) -> Optional[datetime]:
+    """
+    Extract the creation/recording date from a media file's metadata.
+
+    Checks various metadata tags commonly used by recorders and devices:
+    - creation_time (MP4, M4A, MOV)
+    - date (various formats)
+    - encoded_date (some encoders)
+
+    Falls back to file modification time if no metadata found and use_file_mtime is True.
+
+    Args:
+        filename: Path to the media file
+        timeout: Optional timeout in seconds
+        use_file_mtime: If True, fall back to file modification time when no metadata found
+
+    Returns:
+        datetime object if creation date found, None otherwise
+    """
+    import os
+
+    try:
+        probe_data = probe(filename, timeout=timeout)
+    except FFProbeError as e:
+        logger.warning(f"Failed to probe {filename} for creation date: {e}")
+        # Even if probe fails, we can still try file mtime
+        if use_file_mtime:
+            return _get_file_mtime(filename)
+        return None
+
+    # Tags to check for creation date (in order of preference)
+    date_tags = ['creation_time', 'date', 'encoded_date', 'date_recorded', 'recording_time']
+
+    # Check format-level tags first
+    if 'format' in probe_data and 'tags' in probe_data['format']:
+        tags = probe_data['format']['tags']
+        for tag in date_tags:
+            # Check both lowercase and original case
+            value = tags.get(tag) or tags.get(tag.upper())
+            if value:
+                parsed = _parse_date_string(value)
+                if parsed:
+                    logger.debug(f"Found creation date from format tag '{tag}': {parsed}")
+                    return parsed
+
+    # Check stream-level tags
+    if 'streams' in probe_data:
+        for stream in probe_data['streams']:
+            if 'tags' in stream:
+                tags = stream['tags']
+                for tag in date_tags:
+                    value = tags.get(tag) or tags.get(tag.upper())
+                    if value:
+                        parsed = _parse_date_string(value)
+                        if parsed:
+                            logger.debug(f"Found creation date from stream tag '{tag}': {parsed}")
+                            return parsed
+
+    # Fall back to file modification time
+    if use_file_mtime:
+        mtime = _get_file_mtime(filename)
+        if mtime:
+            logger.debug(f"Using file modification time as creation date: {mtime}")
+            return mtime
+
+    logger.debug(f"No creation date found for {filename}")
+    return None
+
+
+def _get_file_mtime(filename: str) -> Optional[datetime]:
+    """
+    Get the file's modification time as a datetime.
+
+    Args:
+        filename: Path to the file
+
+    Returns:
+        datetime object or None if unable to get mtime
+    """
+    import os
+
+    try:
+        stat_info = os.stat(filename)
+        return datetime.fromtimestamp(stat_info.st_mtime)
+    except (OSError, ValueError) as e:
+        logger.warning(f"Failed to get file mtime for {filename}: {e}")
+        return None
+
+
+def _parse_date_string(date_str: str) -> Optional[datetime]:
+    """
+    Parse various date string formats commonly found in media metadata.
+
+    Args:
+        date_str: Date string to parse
+
+    Returns:
+        datetime object if parsing successful, None otherwise
+    """
+    if not date_str:
+        return None
+
+    # Common formats in media files
+    formats = [
+        '%Y-%m-%dT%H:%M:%S.%fZ',      # ISO 8601 with microseconds and Z
+        '%Y-%m-%dT%H:%M:%SZ',          # ISO 8601 with Z
+        '%Y-%m-%dT%H:%M:%S.%f%z',      # ISO 8601 with microseconds and timezone
+        '%Y-%m-%dT%H:%M:%S%z',         # ISO 8601 with timezone
+        '%Y-%m-%dT%H:%M:%S.%f',        # ISO 8601 with microseconds
+        '%Y-%m-%dT%H:%M:%S',           # ISO 8601 basic
+        '%Y-%m-%d %H:%M:%S',           # Common datetime
+        '%Y/%m/%d %H:%M:%S',           # Alternate datetime
+        '%Y-%m-%d',                     # Date only
+        '%Y/%m/%d',                     # Alternate date only
+        '%d-%m-%Y %H:%M:%S',           # European format
+        '%d/%m/%Y %H:%M:%S',           # European format alternate
+    ]
+
+    # Clean up the string
+    date_str = date_str.strip()
+
+    for fmt in formats:
+        try:
+            return datetime.strptime(date_str, fmt)
+        except ValueError:
+            continue
+
+    # Try fromisoformat as a fallback (handles many ISO variants)
+    try:
+        # Replace Z with +00:00 for fromisoformat compatibility
+        clean_str = date_str.replace('Z', '+00:00')
+        return datetime.fromisoformat(clean_str)
+    except ValueError:
+        pass
+
+    logger.debug(f"Could not parse date string: {date_str}")
+    return None