Files
dictia-public/src/utils/ffprobe.py

499 lines
17 KiB
Python

"""
FFprobe utility for detecting audio/video codecs and format information.
This module provides functions to inspect media files using ffprobe and return
structured information about their codecs, streams, and formats.
"""
import json
import logging
import subprocess
from datetime import datetime
from typing import Optional, Dict, Any, Tuple
logger = logging.getLogger(__name__)
class FFProbeError(Exception):
    """Error raised when ffprobe is unable to analyze a media file."""
def probe(filename: str, cmd: str = 'ffprobe', timeout: Optional[int] = None) -> Dict[str, Any]:
    """
    Run ffprobe on the specified file and return a JSON representation of the output.

    Args:
        filename: Path to the media file to probe
        cmd: Command to use (default: 'ffprobe')
        timeout: Optional timeout in seconds; None waits without a limit

    Returns:
        Dictionary parsed from ffprobe's JSON output (requested with
        -show_format and -show_streams)

    Raises:
        FFProbeError: if the command cannot be found, the timeout expires,
            ffprobe returns a non-zero exit code, or its output cannot be
            decoded as UTF-8 JSON
    """
    args = [cmd, '-show_format', '-show_streams', '-of', 'json', filename]
    try:
        # subprocess.run() kills the child and waits for it when the timeout
        # expires, so a timed-out ffprobe leaves no zombie or open pipe behind.
        completed = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as e:
        raise FFProbeError(f'ffprobe timed out after {timeout} seconds') from e
    except FileNotFoundError as e:
        raise FFProbeError('ffprobe command not found. Please ensure ffmpeg is installed.') from e

    if completed.returncode != 0:
        error_msg = completed.stderr.decode('utf-8', errors='ignore')
        raise FFProbeError(f'ffprobe failed: {error_msg}')

    try:
        return json.loads(completed.stdout.decode('utf-8'))
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Undecodable bytes are reported like malformed JSON so callers only
        # ever have to handle FFProbeError.
        raise FFProbeError(f'Failed to parse ffprobe output: {e}') from e
def get_codec_info(filename: str, timeout: Optional[int] = None) -> Dict[str, Any]:
    """
    Probe a media file and summarize its container and first audio/video streams.

    Args:
        filename: Path to the media file
        timeout: Optional timeout in seconds, forwarded to probe()

    Returns:
        Dictionary with keys:
        - audio_codec: Codec name of the first audio stream that reports one, or None
        - video_codec: Codec name of the first video stream that reports one, or None
        - has_video: True if any stream has codec_type 'video'
        - has_audio: True if any stream has codec_type 'audio'
        - format_name: Container format name as reported by ffprobe, or None
        - duration: Container duration in seconds (float), or None if absent/unparseable
        - sample_rate: Sample rate of the selected audio stream, passed through
          exactly as ffprobe reported it (no conversion), or None
        - channels: Channel count of the selected audio stream, or None
        - bit_rate: Container bit rate (int), or None if absent/unparseable

    Raises:
        FFProbeError: if ffprobe fails to analyze the file
    """
    # Any FFProbeError raised by probe() propagates to the caller unchanged.
    probe_data = probe(filename, timeout=timeout)

    info: Dict[str, Any] = dict(
        audio_codec=None,
        video_codec=None,
        has_video=False,
        has_audio=False,
        format_name=None,
        duration=None,
        sample_rate=None,
        channels=None,
        bit_rate=None,
    )

    # Container-level fields
    if 'format' in probe_data:
        container = probe_data['format']
        info['format_name'] = container.get('format_name')
        # A value that cannot be converted leaves the field at None.
        for key, convert in (('duration', float), ('bit_rate', int)):
            if key in container:
                try:
                    info[key] = convert(container[key])
                except (ValueError, TypeError):
                    pass

    # Stream-level fields
    if 'streams' not in probe_data:
        return info

    for stream in probe_data['streams']:
        kind = stream.get('codec_type')
        if kind == 'audio':
            info['has_audio'] = True
            if info['audio_codec'] is None:  # nothing selected yet
                info['audio_codec'] = stream.get('codec_name')
                info['sample_rate'] = stream.get('sample_rate')
                info['channels'] = stream.get('channels')
        elif kind == 'video':
            info['has_video'] = True
            if info['video_codec'] is None:  # nothing selected yet
                info['video_codec'] = stream.get('codec_name')

    return info
def is_video_file(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
    """
    Report whether a media file has at least one video stream.

    Args:
        filename: Path to the media file
        timeout: Optional timeout in seconds
        codec_info: Optional pre-fetched codec info; when given, the file is not probed

    Returns:
        The 'has_video' value of the codec info, or False if probing fails
    """
    try:
        info = get_codec_info(filename, timeout=timeout) if codec_info is None else codec_info
        return info['has_video']
    except FFProbeError as e:
        # A file that cannot be probed is reported as having no video.
        logger.warning(f"Failed to probe {filename}: {e}")
        return False
def is_audio_file(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
    """
    Report whether a media file has at least one audio stream.

    Args:
        filename: Path to the media file
        timeout: Optional timeout in seconds
        codec_info: Optional pre-fetched codec info; when given, the file is not probed

    Returns:
        The 'has_audio' value of the codec info, or False if probing fails
    """
    try:
        info = get_codec_info(filename, timeout=timeout) if codec_info is None else codec_info
        return info['has_audio']
    except FFProbeError as e:
        # A file that cannot be probed is reported as having no audio.
        logger.warning(f"Failed to probe {filename}: {e}")
        return False
def get_audio_codec(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """
    Return the audio codec name recorded for a file.

    Args:
        filename: Path to the media file
        timeout: Optional timeout in seconds
        codec_info: Optional pre-fetched codec info; when given, the file is not probed

    Returns:
        Audio codec name (e.g., 'pcm_s16le', 'aac', 'mp3', 'opus'), or None if
        there is no audio codec or probing fails
    """
    try:
        if codec_info is not None:
            return codec_info['audio_codec']
        return get_codec_info(filename, timeout=timeout)['audio_codec']
    except FFProbeError as e:
        logger.warning(f"Failed to probe {filename}: {e}")
        return None
def needs_audio_conversion(filename: str, supported_codecs: list, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Tuple[bool, Optional[str]]:
    """
    Decide whether a file's audio must be converted before use.

    Decision order:
    1. Any file with a video stream is reported as needing conversion.
    2. A file without audio streams is reported as not needing conversion.
    3. Otherwise conversion is needed only when the audio codec is not in
       supported_codecs.

    Args:
        filename: Path to the media file
        supported_codecs: List of supported audio codec names
        timeout: Optional timeout in seconds
        codec_info: Optional pre-fetched codec info; when given, the file is not probed

    Returns:
        Tuple of (needs_conversion: bool, current_codec: str or None).
        If probing fails the result is (True, None).
    """
    try:
        info = get_codec_info(filename, timeout=timeout) if codec_info is None else codec_info

        # Video containers are always sent through conversion
        if info['has_video']:
            return True, info.get('audio_codec')

        # Nothing to convert when there is no audio stream
        if not info['has_audio']:
            logger.warning(f"File {filename} has no audio streams")
            return False, None

        current_codec = info['audio_codec']
        return current_codec not in supported_codecs, current_codec
    except FFProbeError as e:
        logger.warning(f"Failed to probe {filename}: {e}")
        # When the file cannot be inspected, err on the side of converting
        return True, None
def is_lossless_audio(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
    """
    Check if a file uses a lossless audio codec.

    Linear PCM is recognized in both little- and big-endian variants. Companded
    PCM codecs such as 'pcm_mulaw' and 'pcm_alaw' are deliberately not listed.

    Args:
        filename: Path to the media file
        timeout: Optional timeout in seconds
        codec_info: Optional pre-fetched codec info to avoid redundant probe calls

    Returns:
        True if the file's audio codec is in the lossless set, False otherwise
        (including when there is no audio codec or probing fails)
    """
    lossless_codecs = {
        # Linear PCM, integer and float, both byte orders
        # (big-endian PCM is what AIFF files typically carry)
        'pcm_s8', 'pcm_u8',
        'pcm_s16le', 'pcm_s16be', 'pcm_u16le', 'pcm_u16be',
        'pcm_s24le', 'pcm_s24be', 'pcm_u24le', 'pcm_u24be',
        'pcm_s32le', 'pcm_s32be', 'pcm_u32le', 'pcm_u32be',
        'pcm_f32le', 'pcm_f32be', 'pcm_f64le', 'pcm_f64be',
        # Losslessly compressed codecs
        'flac', 'alac', 'ape', 'wavpack', 'tta',
        'mlp', 'truehd',
    }
    try:
        if codec_info is None:
            codec_info = get_codec_info(filename, timeout=timeout)
        audio_codec = codec_info['audio_codec']
        return bool(audio_codec) and audio_codec in lossless_codecs
    except FFProbeError as e:
        logger.warning(f"Failed to probe {filename}: {e}")
        return False
def get_duration(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Optional[float]:
    """
    Return the duration of a media file in seconds.

    Two sources are consulted, in order:
    1. The format-level duration from the codec info (cheap)
    2. A packet-timestamp scan, used only when the codec info carries no
       duration (e.g. some WebM files)

    Args:
        filename: Path to the media file
        timeout: Optional timeout in seconds
        codec_info: Optional pre-fetched codec info; when given, the file is not
            probed for codec info (the packet-scan fallback may still run)

    Returns:
        Duration in seconds, or None if unable to determine
    """
    try:
        info = get_codec_info(filename, timeout=timeout) if codec_info is None else codec_info

        container_duration = info['duration']
        if container_duration is None:
            # No duration metadata: derive it from the packet timestamps instead
            return _get_duration_from_packets(filename, timeout=timeout)
        return container_duration
    except FFProbeError as e:
        logger.warning(f"Failed to probe {filename}: {e}")
        return None
def _get_duration_from_packets(filename: str, timeout: Optional[int] = None) -> Optional[float]:
    """
    Get duration by scanning packet timestamps (fallback for files without duration metadata).

    Runs ffprobe over the first audio stream, lists every packet's pts_time and
    returns the last value that parses as a number. This is slower than reading
    the format-level duration because every packet is listed.

    This function is best-effort: it never raises; every failure is logged and
    reported as None.

    Args:
        filename: Path to the media file
        timeout: Optional timeout in seconds

    Returns:
        Timestamp in seconds of the last packet with a numeric pts_time, or
        None if the scan fails, times out, or yields no usable timestamp
    """
    args = [
        'ffprobe', '-v', 'error',
        '-show_entries', 'packet=pts_time',
        '-select_streams', 'a:0',  # First audio stream
        '-of', 'csv=p=0',
        filename
    ]
    try:
        # subprocess.run() kills the child and waits for it when the timeout
        # expires, so a slow scan does not leave an ffprobe process running.
        completed = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
        if completed.returncode != 0:
            logger.debug(f"Packet scan failed for {filename}")
            return None

        # Walk backwards so the first numeric value found is the last timestamp
        for raw_line in reversed(completed.stdout.decode('utf-8').splitlines()):
            # Only the first CSV field is the timestamp; ignore anything after
            # a comma so extra fields on a line do not defeat the parse.
            field = raw_line.split(',', 1)[0].strip()
            if not field or field == 'N/A':
                continue
            try:
                last_valid_time = float(field)
            except ValueError:
                continue
            logger.debug(f"Got duration from packets for {filename}: {last_valid_time}")
            return last_valid_time
        return None
    except subprocess.TimeoutExpired:
        logger.warning(f"Packet scan timed out for {filename}")
        return None
    except Exception as e:
        # Deliberately broad: this is a fallback path and must not raise
        logger.warning(f"Error scanning packets for {filename}: {e}")
        return None
def _find_date_in_tags(tags: Dict[str, Any], date_tags: Tuple[str, ...], source: str) -> Optional[datetime]:
    """Return the first parseable date found under any of date_tags in tags, or None."""
    # Case-insensitive fallback view of the non-empty tags; the first spelling
    # encountered wins when several keys differ only by case.
    folded: Dict[str, Any] = {}
    for key, value in tags.items():
        if value:
            folded.setdefault(str(key).lower(), value)

    for tag in date_tags:
        # Exact lowercase, then uppercase, then any other casing
        value = tags.get(tag) or tags.get(tag.upper()) or folded.get(tag)
        if value:
            parsed = _parse_date_string(value)
            if parsed:
                logger.debug(f"Found creation date from {source} tag '{tag}': {parsed}")
                return parsed
    return None


def get_creation_date(filename: str, timeout: Optional[int] = None, use_file_mtime: bool = True) -> Optional[datetime]:
    """
    Extract the creation/recording date from a media file's metadata.

    The following tags are checked, in order of preference and regardless of
    letter case: creation_time, date, encoded_date, date_recorded,
    recording_time. Format-level tags are searched first, then each stream's
    tags in stream order.

    Falls back to the file modification time if no metadata date is found (or
    the file cannot be probed) and use_file_mtime is True.

    Args:
        filename: Path to the media file
        timeout: Optional timeout in seconds
        use_file_mtime: If True, fall back to file modification time when no metadata found

    Returns:
        datetime object if a creation date was found, None otherwise. The value
        is whatever _parse_date_string() or _get_file_mtime() produced, so it
        may be timezone-aware or naive depending on the source.
    """
    try:
        probe_data = probe(filename, timeout=timeout)
    except FFProbeError as e:
        logger.warning(f"Failed to probe {filename} for creation date: {e}")
        # Even if probe fails, we can still try file mtime
        return _get_file_mtime(filename) if use_file_mtime else None

    # Tags to check for creation date (in order of preference)
    date_tags = ('creation_time', 'date', 'encoded_date', 'date_recorded', 'recording_time')

    # Check format-level tags first
    format_tags = (probe_data.get('format') or {}).get('tags')
    if format_tags:
        found = _find_date_in_tags(format_tags, date_tags, 'format')
        if found:
            return found

    # Then stream-level tags
    for stream in probe_data.get('streams') or []:
        stream_tags = stream.get('tags')
        if stream_tags:
            found = _find_date_in_tags(stream_tags, date_tags, 'stream')
            if found:
                return found

    # Fall back to file modification time
    if use_file_mtime:
        mtime = _get_file_mtime(filename)
        if mtime:
            logger.debug(f"Using file modification time as creation date: {mtime}")
            return mtime

    logger.debug(f"No creation date found for {filename}")
    return None
def _get_file_mtime(filename: str) -> Optional[datetime]:
    """
    Get the file's modification time as a datetime.

    Args:
        filename: Path to the file

    Returns:
        Naive datetime in local time built from the file's st_mtime, or None if
        the file cannot be stat'ed or the timestamp cannot be converted
    """
    import os
    try:
        return datetime.fromtimestamp(os.stat(filename).st_mtime)
    except (OSError, OverflowError, ValueError) as e:
        # OverflowError: fromtimestamp() raises it for timestamps outside the
        # range the platform's time functions support.
        logger.warning(f"Failed to get file mtime for {filename}: {e}")
        return None
def _parse_date_string(date_str: str) -> Optional[datetime]:
    """
    Turn a metadata date string into a datetime, trying several known layouts.

    Explicit strptime layouts are tried first, in order; the first match wins.
    If none match, datetime.fromisoformat() is tried with every 'Z' replaced
    by '+00:00'.

    Note: in the layouts ending in a literal 'Z' the suffix is matched as plain
    text, so those results are naive datetimes; layouts using %z yield
    timezone-aware datetimes.

    Args:
        date_str: Date string to parse

    Returns:
        datetime object if parsing successful, None otherwise
    """
    if not date_str:
        return None

    # Surrounding whitespace is ignored
    candidate = date_str.strip()

    # Layouts commonly seen in media files, most specific first
    known_layouts = (
        '%Y-%m-%dT%H:%M:%S.%fZ',   # ISO 8601 with microseconds and Z
        '%Y-%m-%dT%H:%M:%SZ',      # ISO 8601 with Z
        '%Y-%m-%dT%H:%M:%S.%f%z',  # ISO 8601 with microseconds and timezone
        '%Y-%m-%dT%H:%M:%S%z',     # ISO 8601 with timezone
        '%Y-%m-%dT%H:%M:%S.%f',    # ISO 8601 with microseconds
        '%Y-%m-%dT%H:%M:%S',       # ISO 8601 basic
        '%Y-%m-%d %H:%M:%S',       # Common datetime
        '%Y/%m/%d %H:%M:%S',       # Alternate datetime
        '%Y-%m-%d',                # Date only
        '%Y/%m/%d',                # Alternate date only
        '%d-%m-%Y %H:%M:%S',       # European format
        '%d/%m/%Y %H:%M:%S',       # European format alternate
    )
    for layout in known_layouts:
        try:
            return datetime.strptime(candidate, layout)
        except ValueError:
            pass

    # Last resort: fromisoformat covers further ISO variants
    try:
        return datetime.fromisoformat(candidate.replace('Z', '+00:00'))
    except ValueError:
        logger.debug(f"Could not parse date string: {candidate}")
        return None