Initial release: DictIA v0.8.14-alpha (fork of Speakr, AGPL-3.0)

InnovA AI
2026-03-16 21:47:37 +00:00
commit 42772a31ed
365 changed files with 103572 additions and 0 deletions

src/utils/__init__.py (new file, 81 lines)

@@ -0,0 +1,81 @@
"""
Utility functions package for the Speakr application.
This package contains various utility modules for:
- JSON parsing and handling
- Markdown to HTML conversion
- Datetime formatting and timezone handling
- Security utilities
"""
from .json_parser import (
auto_close_json,
safe_json_loads,
preprocess_json_escapes,
extract_json_object
)
from .markdown import (
md_to_html,
sanitize_html
)
from .datetime import (
local_datetime_filter
)
from .security import (
password_check,
is_safe_url,
admin_required,
)
from .database import (
add_column_if_not_exists,
migrate_column_type,
create_index_if_not_exists
)
from .token_auth import (
extract_token_from_request,
hash_token,
load_user_from_token,
is_token_authenticated
)
from .error_formatting import (
is_transcription_error,
format_error_for_user,
format_error_for_storage,
parse_stored_error
)
__all__ = [
# JSON parsing
'auto_close_json',
'safe_json_loads',
'preprocess_json_escapes',
'extract_json_object',
# Markdown/HTML
'md_to_html',
'sanitize_html',
# Datetime
'local_datetime_filter',
# Security
'password_check',
    'is_safe_url',
    'admin_required',
# Database
'add_column_if_not_exists',
'migrate_column_type',
'create_index_if_not_exists',
# Token authentication
'extract_token_from_request',
'hash_token',
'load_user_from_token',
'is_token_authenticated',
# Error formatting
'is_transcription_error',
'format_error_for_user',
'format_error_for_storage',
'parse_stored_error',
]
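
A quick usage sketch of the package facade (assuming the project root is on PYTHONPATH; safe_json_loads behavior is inferred from the json_parser docstring later in this commit):

from src.utils import safe_json_loads, local_datetime_filter

data = safe_json_loads('{"title": "Weekly sync", "count": 3')  # intended to tolerate unterminated JSON
print(local_datetime_filter(None))  # -> "" (None-safe, see the datetime module below)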


@@ -0,0 +1,405 @@
"""
Audio conversion utility for handling codec detection and file conversion.
This module provides a single, unified interface for handling ALL audio/video
conversion needs:
- Video to audio extraction
- Unsupported codec conversion
- Lossless audio compression
Callers should ONLY use convert_if_needed() - it handles everything.
"""
import os
import logging
from pathlib import Path
from typing import Optional, Tuple, Set, Dict, Any
from src.utils.ffprobe import get_codec_info, is_lossless_audio, FFProbeError
from src.utils.ffmpeg_utils import compress_audio, extract_audio_from_video, FFmpegError, FFmpegNotFoundError
from src.config.app_config import AUDIO_COMPRESS_UPLOADS, AUDIO_CODEC, AUDIO_BITRATE, AUDIO_UNSUPPORTED_CODECS
logger = logging.getLogger(__name__)
class ConversionResult:
"""Result of a conversion operation."""
def __init__(
self,
output_path: str,
mime_type: str,
was_converted: bool,
was_compressed: bool,
original_size: int,
final_size: int,
original_codec: Optional[str] = None,
final_codec: Optional[str] = None
):
self.output_path = output_path
self.mime_type = mime_type
self.was_converted = was_converted
self.was_compressed = was_compressed
self.original_size = original_size
self.final_size = final_size
self.original_codec = original_codec
self.final_codec = final_codec
@property
def size_reduction_percent(self) -> float:
"""Calculate size reduction percentage."""
if self.original_size == 0:
return 0.0
return ((self.original_size - self.final_size) / self.original_size) * 100
@property
def original_size_mb(self) -> float:
"""Original size in megabytes."""
return self.original_size / (1024 * 1024)
@property
def final_size_mb(self) -> float:
"""Final size in megabytes."""
return self.final_size / (1024 * 1024)
def get_supported_codecs(needs_chunking: bool = False, connector_specs: Optional[Any] = None) -> Set[str]:
"""
Get the set of supported audio codecs.
Args:
needs_chunking: If True, return only codecs that work well with chunking
connector_specs: Optional ConnectorSpecifications with provider-specific codec restrictions
Returns:
Set of supported codec names (minus any excluded via env var or connector specs)
"""
# If connector defines explicit supported codecs, use those
if connector_specs and connector_specs.supported_codecs:
base_codecs = set(connector_specs.supported_codecs)
elif needs_chunking:
# For chunking: only support codecs that work well with chunking
base_codecs = {'pcm_s16le', 'pcm_s24le', 'pcm_f32le', 'mp3', 'flac'}
else:
# For direct transcription: support common formats
# Note: WebM containers are handled separately (by extension check in convert_if_needed)
# because MediaRecorder WebM files often lack seek cues, but the opus/vorbis codecs
# themselves are fine in proper containers (.opus, .ogg)
base_codecs = {'pcm_s16le', 'pcm_s24le', 'pcm_f32le', 'mp3', 'flac', 'aac', 'opus', 'vorbis'}
# Remove connector-specific unsupported codecs
if connector_specs and connector_specs.unsupported_codecs:
excluded = base_codecs & set(connector_specs.unsupported_codecs)
if excluded:
logger.info(f"Excluding codecs from supported list (via connector specs): {excluded}")
base_codecs = base_codecs - set(connector_specs.unsupported_codecs)
# Remove any global user-specified unsupported codecs (env var still applies)
if AUDIO_UNSUPPORTED_CODECS:
excluded = base_codecs & AUDIO_UNSUPPORTED_CODECS
if excluded:
logger.info(f"Excluding codecs from supported list (via AUDIO_UNSUPPORTED_CODECS): {excluded}")
return base_codecs - AUDIO_UNSUPPORTED_CODECS
return base_codecs
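# Illustration (not part of this module): a duck-typed stand-in for
# ConnectorSpecifications, showing how the two attribute sets combine.
# The class and codec names here are assumptions for the example, and
# AUDIO_UNSUPPORTED_CODECS is taken to be empty.
#
#     class _DemoSpecs:
#         supported_codecs = None
#         unsupported_codecs = {'opus', 'vorbis'}
#
#     get_supported_codecs(connector_specs=_DemoSpecs())
#     # -> {'pcm_s16le', 'pcm_s24le', 'pcm_f32le', 'mp3', 'flac', 'aac'}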
def convert_if_needed(
filepath: str,
original_filename: Optional[str] = None,
codec_info: Optional[Dict[str, Any]] = None,
needs_chunking: bool = False,
is_asr_endpoint: bool = False,
delete_original: bool = True,
connector_specs: Optional[Any] = None
) -> ConversionResult:
"""
Handle ALL audio conversion needs in one place.
This is the ONLY function callers should use. It handles:
1. Video to audio extraction (if has_video)
2. Unsupported codec conversion (if codec not supported)
3. Lossless audio compression (if AUDIO_COMPRESS_UPLOADS enabled)
The function makes intelligent decisions about what processing is needed
and performs it in the optimal order.
Args:
filepath: Path to the audio/video file
original_filename: Original filename for logging (defaults to basename)
codec_info: Optional pre-fetched codec info to avoid redundant probe calls
needs_chunking: Whether chunking will be used (affects supported codecs)
is_asr_endpoint: Whether using ASR endpoint (affects AAC handling)
delete_original: Whether to delete original file after successful conversion
connector_specs: Optional ConnectorSpecifications with provider-specific codec restrictions
Returns:
ConversionResult with output path, mime type, and conversion stats
Raises:
FFmpegNotFoundError: If FFmpeg is not available
FFmpegError: If conversion fails
"""
if original_filename is None:
original_filename = os.path.basename(filepath)
# Get original file size
original_size = os.path.getsize(filepath)
# Probe if codec info not provided
if codec_info is None:
try:
codec_info = get_codec_info(filepath, timeout=10)
logger.info(
f"Detected codec for {original_filename}: "
f"audio_codec={codec_info.get('audio_codec')}, "
f"has_video={codec_info.get('has_video', False)}"
)
except FFProbeError as e:
logger.warning(f"Failed to probe {filepath}: {e}. Will attempt conversion.")
codec_info = None
original_codec = codec_info.get('audio_codec') if codec_info else None
audio_codec = original_codec
has_video = codec_info.get('has_video', False) if codec_info else False
# Get supported codecs based on processing mode and connector specs
supported_codecs = get_supported_codecs(needs_chunking, connector_specs)
# Handle video files - extract audio
if has_video:
# Determine target codec for video extraction - fall back to mp3 if AUDIO_CODEC is unsupported
video_target_codec = AUDIO_CODEC
if connector_specs and connector_specs.unsupported_codecs:
if AUDIO_CODEC in connector_specs.unsupported_codecs:
video_target_codec = 'mp3'
logger.warning(
f"AUDIO_CODEC '{AUDIO_CODEC}' is not supported by connector, "
f"falling back to mp3 for video extraction from {original_filename}"
)
# Check if we can remux (copy) instead of transcode
can_remux = False
if audio_codec and audio_codec in supported_codecs:
try:
# Remux if audio is lossy, or if lossless but compression is disabled
is_lossless = is_lossless_audio(filepath, codec_info=codec_info)
can_remux = not is_lossless or not AUDIO_COMPRESS_UPLOADS
except Exception as e:
logger.warning(f"Could not determine if audio is lossless: {e}. Will transcode.")
try:
if can_remux:
logger.info(f"Extracting audio from video (remux, no transcoding): {original_filename}")
output_filepath, mime_type = extract_audio_from_video(
filepath,
output_format='copy',
cleanup_original=delete_original,
copy_stream=True
)
final_codec = audio_codec
else:
logger.info(f"Extracting and converting audio from video to {video_target_codec.upper()}: {original_filename}")
output_filepath, mime_type = extract_audio_from_video(
filepath,
output_format=video_target_codec,
bitrate=AUDIO_BITRATE,
cleanup_original=delete_original,
copy_stream=False
)
final_codec = video_target_codec
final_size = os.path.getsize(output_filepath)
reduction = ((original_size - final_size) / original_size * 100) if original_size > 0 else 0
logger.info(
f"Successfully extracted audio from {original_filename}: "
f"{original_size/1024/1024:.1f}MB -> {final_size/1024/1024:.1f}MB "
f"({reduction:.1f}% reduction)"
)
return ConversionResult(
output_path=output_filepath,
mime_type=mime_type,
was_converted=not can_remux,
was_compressed=False,
original_size=original_size,
final_size=final_size,
original_codec=original_codec,
final_codec=final_codec
)
except FFmpegNotFoundError:
logger.error("FFmpeg not found")
raise
except FFmpegError as e:
logger.error(f"Failed to extract audio from video {filepath}: {e}")
raise
# Handle audio files - check if conversion needed
needs_conversion = False
file_ext = os.path.splitext(filepath)[1].lower()
# Note: Connector-specific codec restrictions are handled via connector_specs.unsupported_codecs
# which is already applied in get_supported_codecs() above
if audio_codec is None:
needs_conversion = True
logger.info(f"Unknown codec for {original_filename}, will attempt conversion")
elif file_ext == '.webm':
# WebM containers from MediaRecorder often lack seek cues, making browser
# audio players unable to seek. Force conversion to a seekable format.
needs_conversion = True
logger.info(f"Converting {original_filename} - WebM container lacks seek support")
elif is_asr_endpoint and audio_codec == 'aac':
needs_conversion = True
logger.info(f"Converting AAC-encoded file for ASR endpoint compatibility")
elif audio_codec not in supported_codecs:
needs_conversion = True
logger.info(f"Converting {original_filename} (codec: {audio_codec}) - unsupported for processing")
if needs_conversion:
# Determine target codec
# If chunking is needed, always convert to MP3 (chunking requires MP3 anyway)
# This avoids double conversion: original → configured codec → mp3
if needs_chunking:
target_codec = 'mp3'
logger.info(f"Using MP3 for {original_filename} since chunking is needed")
else:
# Fall back to mp3 if AUDIO_CODEC is unsupported by connector
target_codec = AUDIO_CODEC
if connector_specs and connector_specs.unsupported_codecs:
if AUDIO_CODEC in connector_specs.unsupported_codecs:
target_codec = 'mp3'
logger.warning(
f"AUDIO_CODEC '{AUDIO_CODEC}' is not supported by connector, "
f"falling back to mp3 for {original_filename}"
)
logger.info(f"Converting {original_filename} to {target_codec.upper()}")
try:
output_filepath, mime_type, _ = compress_audio(
filepath,
codec=target_codec,
bitrate=AUDIO_BITRATE,
delete_original=delete_original,
codec_info=codec_info
)
final_size = os.path.getsize(output_filepath)
reduction = ((original_size - final_size) / original_size * 100) if original_size > 0 else 0
logger.info(
f"Successfully converted {original_filename}: "
f"{original_size/1024/1024:.1f}MB -> {final_size/1024/1024:.1f}MB "
f"({reduction:.1f}% reduction)"
)
return ConversionResult(
output_path=output_filepath,
mime_type=mime_type,
was_converted=True,
was_compressed=False,
original_size=original_size,
final_size=final_size,
original_codec=original_codec,
final_codec=target_codec
)
except FFmpegNotFoundError:
logger.error("FFmpeg not found")
raise
except FFmpegError as e:
logger.error(f"FFmpeg conversion failed for {filepath}: {e}")
raise
# Audio file with supported codec - check if we should compress lossless
logger.info(f"Codec {audio_codec} is supported, no conversion needed")
if AUDIO_COMPRESS_UPLOADS:
# Determine target codec for compression - fall back to mp3 if AUDIO_CODEC is unsupported
compress_target_codec = AUDIO_CODEC
if connector_specs and connector_specs.unsupported_codecs:
if AUDIO_CODEC in connector_specs.unsupported_codecs:
compress_target_codec = 'mp3'
logger.warning(
f"AUDIO_CODEC '{AUDIO_CODEC}' is not supported by connector, "
f"falling back to mp3 for lossless compression of {original_filename}"
)
try:
# Check if file is lossless
if is_lossless_audio(filepath, codec_info=codec_info):
# Skip if already in target codec (e.g., FLAC to FLAC)
if audio_codec == compress_target_codec:
logger.info(f"File already in target codec {compress_target_codec}, no compression needed")
return ConversionResult(
output_path=filepath,
mime_type=_guess_mime_type(filepath),
was_converted=False,
was_compressed=False,
original_size=original_size,
final_size=original_size,
original_codec=original_codec,
final_codec=audio_codec
)
logger.info(f"Compressing lossless audio ({audio_codec}) to {compress_target_codec.upper()}")
# Perform compression
compressed_path, mime_type, _ = compress_audio(
filepath,
codec=compress_target_codec,
bitrate=AUDIO_BITRATE,
delete_original=delete_original,
codec_info=codec_info
)
final_size = os.path.getsize(compressed_path)
reduction = ((original_size - final_size) / original_size * 100) if original_size > 0 else 0
logger.info(
f"Successfully compressed {original_filename}: "
f"{original_size/1024/1024:.1f}MB -> {final_size/1024/1024:.1f}MB "
f"({reduction:.1f}% reduction)"
)
return ConversionResult(
output_path=compressed_path,
mime_type=mime_type,
was_converted=False,
was_compressed=True,
original_size=original_size,
final_size=final_size,
original_codec=original_codec,
final_codec=compress_target_codec
)
except Exception as e:
logger.warning(f"Failed to compress lossless audio: {e}. Continuing with original.")
# Fall through to return original file
# No processing needed - return original file
return ConversionResult(
output_path=filepath,
mime_type=_guess_mime_type(filepath),
was_converted=False,
was_compressed=False,
original_size=original_size,
final_size=original_size,
original_codec=original_codec,
final_codec=audio_codec
)
def _guess_mime_type(filepath: str) -> str:
"""
Guess MIME type from file extension.
Args:
filepath: Path to the file
Returns:
MIME type string
"""
import mimetypes
mime_type, _ = mimetypes.guess_type(filepath)
return mime_type or 'application/octet-stream'
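
A hedged usage sketch of the single entry point this module exposes (the module path src.utils.audio_conversion and the upload path are assumptions; only the src.utils package prefix is confirmed by the imports above):

from src.utils.audio_conversion import convert_if_needed  # module name assumed

result = convert_if_needed('/data/uploads/meeting.mkv', needs_chunking=True)
print(result.output_path, result.mime_type)
if result.was_converted or result.was_compressed:
    print(f"{result.original_codec} -> {result.final_codec}, "
          f"{result.size_reduction_percent:.1f}% smaller")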

src/utils/database.py (new file, 227 lines)

@@ -0,0 +1,227 @@
"""
Database schema migration utilities.
IMPORTANT: All migrations must be compatible with both SQLite and PostgreSQL.
- Boolean defaults: SQLite uses 0/1, PostgreSQL requires FALSE/TRUE
- Type differences: SQLite DATETIME -> PostgreSQL TIMESTAMP, BLOB -> BYTEA
- Reserved keywords: "user", "order" etc. must be quoted
- The add_column_if_not_exists() function handles these automatically
- Use create_index_if_not_exists() for index creation with proper quoting
"""
import re
from sqlalchemy import inspect, text
def add_column_if_not_exists(engine, table_name, column_name, column_type):
"""
Add a column to a table if it doesn't already exist.
Args:
engine: SQLAlchemy engine
table_name: Name of the table
column_name: Name of the column to add
column_type: SQL type definition for the column
Returns:
bool: True if column was added, False if it already existed
"""
inspector = inspect(engine)
columns = [col['name'] for col in inspector.get_columns(table_name)]
if column_name not in columns:
if engine.name == 'postgresql':
# PostgreSQL requires TRUE/FALSE for boolean defaults, not 0/1
if 'BOOLEAN' in column_type.upper():
column_type = column_type.replace('DEFAULT 0', 'DEFAULT FALSE')
column_type = column_type.replace('DEFAULT 1', 'DEFAULT TRUE')
# PostgreSQL uses TIMESTAMP, not DATETIME
column_type = re.sub(r'\bDATETIME\b', 'TIMESTAMP', column_type, flags=re.IGNORECASE)
# PostgreSQL uses BYTEA, not BLOB
column_type = re.sub(r'\bBLOB\b', 'BYTEA', column_type, flags=re.IGNORECASE)
# PostgreSQL interprets double-quoted strings as identifiers, not literals
# Convert DEFAULT "value" to DEFAULT 'value'
column_type = re.sub(r'''DEFAULT\s+"([^"]*)"''', r"DEFAULT '\1'", column_type, flags=re.IGNORECASE)
with engine.connect() as conn:
# Quote identifiers to handle reserved keywords (e.g., "user" in PostgreSQL)
# MySQL uses backticks, PostgreSQL/SQLite use double quotes
# Handle special case where column_type includes the column name
if column_name in column_type:
if engine.name == 'mysql':
conn.execute(text(f'ALTER TABLE `{table_name}` ADD COLUMN {column_type}'))
else:
conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN {column_type}'))
else:
if engine.name == 'mysql':
conn.execute(text(f'ALTER TABLE `{table_name}` ADD COLUMN `{column_name}` {column_type}'))
else:
conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN "{column_name}" {column_type}'))
conn.commit()
return True
return False
def create_index_if_not_exists(engine, index_name, table_name, columns, unique=False):
"""
Create an index on a table if it doesn't already exist.
Handles cross-database compatibility by properly quoting table names,
especially important for reserved keywords like 'user', 'order', etc.
Args:
engine: SQLAlchemy engine
index_name: Name of the index to create
table_name: Name of the table
columns: Column(s) to index (string, can be comma-separated for composite)
unique: Whether to create a unique index (default False)
Returns:
bool: True if index was created, False if it already existed or table doesn't exist
"""
inspector = inspect(engine)
# Check if table exists
if table_name not in inspector.get_table_names():
return False
# Check if index already exists
existing_indexes = [idx['name'] for idx in inspector.get_indexes(table_name)]
if index_name in existing_indexes:
return False
unique_clause = 'UNIQUE ' if unique else ''
with engine.connect() as conn:
# Quote table name to handle reserved keywords (e.g., "user" in PostgreSQL)
# MySQL uses backticks, PostgreSQL/SQLite use double quotes
if engine.name == 'mysql':
quoted_table = f'`{table_name}`'
else:
quoted_table = f'"{table_name}"'
# Note: IF NOT EXISTS may not be supported on all databases, but we already
# checked for existence above, so it's just a safety net
try:
conn.execute(text(
f'CREATE {unique_clause}INDEX IF NOT EXISTS {index_name} ON {quoted_table} ({columns})'
))
except Exception:
# Some databases don't support IF NOT EXISTS, try without
conn.execute(text(
f'CREATE {unique_clause}INDEX {index_name} ON {quoted_table} ({columns})'
))
conn.commit()
return True
def migrate_column_type(engine, table_name, column_name, new_type, transform_sql=None):
"""
Migrate a column to a new type if it exists.
For SQLite, this uses a temporary column approach since SQLite doesn't support ALTER COLUMN.
Args:
engine: SQLAlchemy engine
table_name: Name of the table
column_name: Name of the column to modify
new_type: New SQL type for the column
transform_sql: Optional SQL expression to transform existing data (e.g., "datetime(meeting_date || ' 12:00:00')")
If None, data is copied as-is
Returns:
bool: True if column was migrated, False if it didn't exist or migration wasn't needed
"""
inspector = inspect(engine)
# Check if table exists
if table_name not in inspector.get_table_names():
return False
columns = {col['name']: col for col in inspector.get_columns(table_name)}
if column_name not in columns:
return False
engine_name = engine.name
with engine.connect() as conn:
if engine_name == 'sqlite':
# SQLite approach: use temporary column
temp_col = f"{column_name}_new"
# Check if temp column already exists (migration interrupted?)
if temp_col in columns:
try:
# Try to drop it and start over
conn.execute(text(f'ALTER TABLE "{table_name}" DROP COLUMN "{temp_col}"'))
conn.commit()
except Exception:
# If we can't drop it, the migration may have partially completed
# Check if old column still exists
if column_name not in columns:
# Old column is gone, temp exists - just rename temp to complete migration
try:
conn.execute(text(f'ALTER TABLE "{table_name}" RENAME COLUMN "{temp_col}" TO "{column_name}"'))
conn.commit()
return True
                        except Exception:
                            # Can't complete the rename; leave as-is
                            return False
# Both columns exist - abort to avoid data issues
return False
# Add temporary column with new type
conn.execute(text(f'ALTER TABLE "{table_name}" ADD COLUMN "{temp_col}" {new_type}'))
# Copy data with optional transformation
if transform_sql:
conn.execute(text(f'UPDATE "{table_name}" SET "{temp_col}" = {transform_sql} WHERE "{column_name}" IS NOT NULL'))
else:
conn.execute(text(f'UPDATE "{table_name}" SET "{temp_col}" = "{column_name}"'))
# Drop old column (SQLite 3.35.0+ only)
try:
conn.execute(text(f'ALTER TABLE "{table_name}" DROP COLUMN "{column_name}"'))
# Drop succeeded, now rename temp to original name
conn.execute(text(f'ALTER TABLE "{table_name}" RENAME COLUMN "{temp_col}" TO "{column_name}"'))
conn.commit()
except Exception:
# Older SQLite - can't drop columns
# Rename temp column to original name (this will fail if original still exists)
try:
conn.execute(text(f'ALTER TABLE "{table_name}" RENAME COLUMN "{temp_col}" TO "{column_name}"'))
conn.commit()
                except Exception:
                    # Can't rename because the old column still exists. Roll
                    # back and keep the original column: its data is still
                    # valid, and the app continues to use column_name with
                    # the old type.
                    conn.rollback()
                    return False
elif engine_name == 'postgresql':
# PostgreSQL can alter column type directly
if transform_sql:
conn.execute(text(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE {new_type} USING {transform_sql}'))
else:
conn.execute(text(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE {new_type}'))
conn.commit()
elif engine_name == 'mysql':
# MySQL can modify column type
conn.execute(text(f'ALTER TABLE `{table_name}` MODIFY COLUMN `{column_name}` {new_type}'))
# Apply transformation if provided
if transform_sql:
conn.execute(text(f'UPDATE `{table_name}` SET `{column_name}` = {transform_sql} WHERE `{column_name}` IS NOT NULL'))
conn.commit()
return True
return False
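
A startup-migration sketch using the three helpers (table and column names are hypothetical; the SQLite-style BOOLEAN DEFAULT 0 is rewritten to DEFAULT FALSE on PostgreSQL, as described above, and the transform_sql value mirrors the docstring example):

from sqlalchemy import create_engine
from src.utils.database import (
    add_column_if_not_exists, create_index_if_not_exists, migrate_column_type
)

engine = create_engine('sqlite:///app.db')  # or a postgresql:// / mysql:// URL
add_column_if_not_exists(engine, 'recording', 'is_archived', 'BOOLEAN DEFAULT 0')
create_index_if_not_exists(engine, 'idx_recording_user', 'recording', 'user_id')
migrate_column_type(engine, 'recording', 'meeting_date', 'DATETIME',
                    transform_sql="datetime(meeting_date || ' 12:00:00')")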

src/utils/datetime.py (new file, 46 lines)

@@ -0,0 +1,46 @@
"""
Datetime utilities for timezone handling and formatting.
This module provides functions for converting and formatting datetimes
with timezone awareness.
"""
import os
import logging
import pytz
from babel.dates import format_datetime
# Module-level logger
logger = logging.getLogger(__name__)
def local_datetime_filter(dt):
"""
Format a UTC datetime object to the user's local timezone.
Args:
dt: datetime object to format (assumed UTC if naive)
Returns:
str: Formatted datetime string in user's timezone
"""
if dt is None:
return ""
# Get timezone from .env, default to UTC
user_tz_name = os.environ.get('TIMEZONE', 'UTC')
try:
user_tz = pytz.timezone(user_tz_name)
except pytz.UnknownTimeZoneError:
user_tz = pytz.utc
logger.warning(f"Invalid TIMEZONE '{user_tz_name}' in .env. Defaulting to UTC.")
# If the datetime object is naive, assume it's UTC
if dt.tzinfo is None:
dt = pytz.utc.localize(dt)
# Convert to the user's timezone
local_dt = dt.astimezone(user_tz)
# Format it nicely
return format_datetime(local_dt, format='medium', locale='en_US')
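
The _filter suffix suggests this function is registered as a Jinja template filter; a minimal sketch of that wiring (assumed, not shown in this commit):

from flask import Flask
from src.utils.datetime import local_datetime_filter

app = Flask(__name__)
app.jinja_env.filters['localdatetime'] = local_datetime_filter
# In a template: {{ recording.created_at | localdatetime }}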

src/utils/error_formatting.py (new file, 365 lines)

@@ -0,0 +1,365 @@
"""
User-friendly error formatting utility.
Transforms technical error messages into user-friendly explanations with
actionable guidance. Works for both known error patterns and unknown errors.
"""
import re
import json
from typing import Dict, Optional
# Known error patterns with user-friendly messages
ERROR_PATTERNS = [
# File size errors
{
'patterns': [
r'maximum content size limit.*exceeded',
r'file.*too large',
r'413.*exceeded',
r'payload too large',
],
'title': 'File Too Large',
'message': 'The audio file exceeds the maximum size allowed by the transcription service.',
'guidance': 'Try enabling audio chunking in your settings, or compress the audio file before uploading.',
'icon': 'fa-file-audio',
'type': 'size_limit'
},
# Timeout errors
{
'patterns': [
r'timed?\s*out',
r'timeout',
r'deadline exceeded',
r'request took too long',
],
'title': 'Processing Timeout',
'message': 'The transcription took too long to complete.',
'guidance': 'This can happen with very long recordings. Try splitting the audio into smaller parts, or increase the timeout setting if available.',
'icon': 'fa-clock',
'type': 'timeout'
},
# Authentication errors
{
'patterns': [
r'401.*unauthorized',
r'invalid.*api.*key',
r'authentication.*failed',
r'api key.*invalid',
r'incorrect api key',
],
'title': 'Authentication Error',
'message': 'The transcription service rejected the API credentials.',
'guidance': 'Please check that your API key is correct and has not expired. Contact your administrator if the problem persists.',
'icon': 'fa-key',
'type': 'auth'
},
# Rate limit errors
{
'patterns': [
r'rate.*limit',
r'too many requests',
r'429',
r'quota.*exceeded',
],
'title': 'Rate Limit Exceeded',
'message': 'Too many requests were sent to the transcription service.',
'guidance': 'Please wait a few minutes before trying again. The system will automatically retry failed jobs.',
'icon': 'fa-hourglass-half',
'type': 'rate_limit'
},
# Connection errors
{
'patterns': [
r'connection.*refused',
r'connection.*reset',
r'could not connect',
r'network.*unreachable',
r'name.*resolution.*failed',
r'dns.*failed',
],
'title': 'Connection Error',
'message': 'Could not connect to the transcription service.',
'guidance': 'Please check your internet connection and ensure the transcription service is available. If using a self-hosted service, verify it is running.',
'icon': 'fa-wifi',
'type': 'connection'
},
# Service unavailable
{
'patterns': [
r'503.*service unavailable',
r'502.*bad gateway',
r'500.*internal server error',
r'service.*unavailable',
r'server.*error',
],
'title': 'Service Unavailable',
'message': 'The transcription service is temporarily unavailable.',
'guidance': 'This is usually temporary. Please try again in a few minutes.',
'icon': 'fa-server',
'type': 'service_error'
},
# Invalid audio format
{
'patterns': [
r'invalid.*file.*format',
r'unsupported.*format',
r'could not.*decode',
r'audio.*corrupt',
r'not.*valid.*audio',
],
'title': 'Invalid Audio Format',
'message': 'The audio file format is not supported or the file may be corrupted.',
'guidance': 'Try converting the audio to MP3 or WAV format before uploading. If the file plays correctly on your device, try re-exporting it.',
'icon': 'fa-file-audio',
'type': 'format'
},
# Insufficient funds/billing
{
'patterns': [
r'insufficient.*funds',
r'billing.*issue',
r'payment.*required',
r'account.*suspended',
],
'title': 'Billing Issue',
'message': 'There is a billing issue with the transcription service account.',
'guidance': 'Please check your account status and payment information with the transcription service provider.',
'icon': 'fa-credit-card',
'type': 'billing'
},
# Model not found
{
'patterns': [
r'model.*not.*found',
r'invalid.*model',
r'model.*does not exist',
],
'title': 'Model Not Available',
'message': 'The requested transcription model is not available.',
'guidance': 'Please check the model name in your settings. The model may have been deprecated or renamed.',
'icon': 'fa-microchip',
'type': 'model'
},
# Audio extraction failed
{
'patterns': [
r'audio.*extraction.*failed',
r'could not.*extract.*audio',
r'ffmpeg.*failed',
r'no audio.*stream',
],
'title': 'Audio Extraction Failed',
'message': 'Could not extract audio from the uploaded file.',
'guidance': 'The file may be corrupted or in an unsupported format. Try converting it to a standard audio format (MP3, WAV) before uploading.',
'icon': 'fa-file-video',
'type': 'extraction'
},
]
def extract_error_details(error_text: str) -> Dict:
"""
Extract structured error details from raw error text.
Attempts to parse JSON error responses from APIs.
"""
details = {
'raw': error_text,
'code': None,
'message': None,
'type': None,
}
# Try to extract error code
code_match = re.search(r'(?:error\s*code|status)[:\s]*(\d{3})', error_text, re.IGNORECASE)
if code_match:
details['code'] = code_match.group(1)
# Try to parse JSON error structure
json_match = re.search(r'\{[^{}]*["\']error["\'][^{}]*\}', error_text)
if json_match:
try:
# Clean up the JSON-like string
json_str = json_match.group(0).replace("'", '"')
error_obj = json.loads(json_str)
if 'error' in error_obj:
err = error_obj['error']
if isinstance(err, dict):
details['message'] = err.get('message')
details['type'] = err.get('type')
details['code'] = details['code'] or err.get('code')
except (json.JSONDecodeError, KeyError):
pass
# Try to extract message from common patterns
if not details['message']:
msg_match = re.search(r"['\"]message['\"]\s*:\s*['\"]([^'\"]+)['\"]", error_text)
if msg_match:
details['message'] = msg_match.group(1)
return details
def format_error_for_user(error_text: str) -> Dict:
"""
Transform a technical error message into a user-friendly format.
Returns:
Dict with keys:
- title: Short, user-friendly title
- message: Plain language explanation
- guidance: Actionable suggestion
- icon: FontAwesome icon class
- type: Error category
- technical: Original error (for advanced users/debugging)
- is_known: Whether this matched a known pattern
"""
if not error_text:
return {
'title': 'Unknown Error',
'message': 'An unexpected error occurred.',
'guidance': 'Please try again. If the problem persists, contact support.',
'icon': 'fa-exclamation-triangle',
'type': 'unknown',
'technical': '',
'is_known': False
}
error_lower = error_text.lower()
# Check against known patterns
for pattern_info in ERROR_PATTERNS:
for pattern in pattern_info['patterns']:
if re.search(pattern, error_lower):
return {
'title': pattern_info['title'],
'message': pattern_info['message'],
'guidance': pattern_info['guidance'],
'icon': pattern_info['icon'],
'type': pattern_info['type'],
'technical': error_text,
'is_known': True
}
# Unknown error - try to make it more readable
details = extract_error_details(error_text)
# Clean up the error message for display
clean_message = details['message'] or error_text
# Remove common prefixes
for prefix in ['Transcription failed:', 'Processing failed:', 'Error:', 'Exception:']:
if clean_message.startswith(prefix):
clean_message = clean_message[len(prefix):].strip()
# Truncate very long messages
if len(clean_message) > 200:
clean_message = clean_message[:200] + '...'
# Generate a reasonable title based on error code
title = 'Processing Error'
if details['code']:
code = details['code']
if code.startswith('4'):
title = 'Request Error'
elif code.startswith('5'):
title = 'Server Error'
return {
'title': title,
'message': clean_message,
'guidance': 'If this error persists, try reprocessing the recording or contact support for assistance.',
'icon': 'fa-exclamation-circle',
'type': 'unknown',
'technical': error_text,
'is_known': False
}
def format_error_for_storage(error_text: str) -> str:
"""
Format an error message for storage in the database.
Returns a JSON string that can be parsed by the frontend for nice display.
The format is: ERROR_JSON:{"title": "...", "message": "...", ...}
This allows the frontend to detect formatted errors and display them nicely,
while still being human-readable if viewed raw.
"""
formatted = format_error_for_user(error_text)
# Create a compact JSON representation
error_data = {
't': formatted['title'],
'm': formatted['message'],
'g': formatted['guidance'],
'i': formatted['icon'],
'y': formatted['type'],
'k': formatted['is_known'],
}
# Only include technical details if it adds value
if formatted['technical'] and formatted['technical'] != formatted['message']:
error_data['d'] = formatted['technical'][:500] # Limit technical detail length
try:
json_str = json.dumps(error_data, ensure_ascii=False)
return f"ERROR_JSON:{json_str}"
except (TypeError, ValueError):
# Fallback to plain text if JSON encoding fails
return f"{formatted['title']}: {formatted['message']}"
def parse_stored_error(stored_text: str) -> Optional[Dict]:
"""
Parse a stored error message. Returns the formatted error dict if it's
a JSON-formatted error, or None if it's plain text.
"""
if not stored_text or not stored_text.startswith('ERROR_JSON:'):
return None
try:
json_str = stored_text[11:] # Remove 'ERROR_JSON:' prefix
data = json.loads(json_str)
return {
'title': data.get('t', 'Error'),
'message': data.get('m', 'An error occurred'),
'guidance': data.get('g', ''),
'icon': data.get('i', 'fa-exclamation-circle'),
'type': data.get('y', 'unknown'),
'is_known': data.get('k', False),
'technical': data.get('d', ''),
}
except (json.JSONDecodeError, KeyError, TypeError):
return None
def is_transcription_error(transcription_text: str) -> bool:
"""
Check if the transcription text is actually an error message.
Returns True if the text is an error message (not valid transcription content).
This should be used to prevent operations like summarization or chat on failed recordings.
"""
if not transcription_text:
return False
# Check for JSON-formatted error
if transcription_text.startswith('ERROR_JSON:'):
return True
# Check for legacy error prefixes
error_prefixes = [
'Transcription failed:',
'Processing failed:',
'ASR processing failed:',
'Audio extraction failed:',
'Upload/Processing failed:',
]
for prefix in error_prefixes:
if transcription_text.startswith(prefix):
return True
return False
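
A round-trip sketch of the ERROR_JSON storage format (the error string is invented for illustration; it matches the rate-limit pattern above):

from src.utils.error_formatting import (
    format_error_for_storage, parse_stored_error, is_transcription_error
)

stored = format_error_for_storage('429 Too Many Requests: quota exceeded')
# stored -> 'ERROR_JSON:{"t": "Rate Limit Exceeded", ...}'
err = parse_stored_error(stored)
print(err['title'], '-', err['guidance'])
assert is_transcription_error(stored)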

src/utils/ffmpeg_utils.py (new file, 448 lines)

@@ -0,0 +1,448 @@
"""Centralized FFmpeg utilities for consistent audio/video processing."""
import os
import subprocess
import tempfile
from contextlib import contextmanager
from typing import Optional, Tuple
from flask import current_app
# Configuration constants
DEFAULT_MP3_BITRATE = os.getenv('AUDIO_BITRATE', '128k')
DEFAULT_SAMPLE_RATE = os.getenv('AUDIO_SAMPLE_RATE', '44100')
DEFAULT_CHANNELS = int(os.getenv('AUDIO_CHANNELS', '1')) # Mono for speech
DEFAULT_COMPRESSION_LEVEL = int(os.getenv('AUDIO_COMPRESSION_LEVEL', '2'))
class FFmpegError(Exception):
"""Custom exception for FFmpeg-related errors."""
pass
class FFmpegNotFoundError(FFmpegError):
"""Raised when FFmpeg executable is not found."""
pass
def convert_to_mp3(
input_path: str,
output_path: Optional[str] = None,
bitrate: str = DEFAULT_MP3_BITRATE,
sample_rate: str = DEFAULT_SAMPLE_RATE,
channels: int = DEFAULT_CHANNELS,
compression_level: int = DEFAULT_COMPRESSION_LEVEL
) -> str:
"""
Convert audio/video file to MP3 format using FFmpeg.
Args:
input_path: Path to input audio/video file
output_path: Path for output MP3 file (auto-generated if None)
bitrate: MP3 bitrate (e.g., '128k', '192k')
sample_rate: Sample rate in Hz (e.g., '44100', '48000')
channels: Number of audio channels (1=mono, 2=stereo)
compression_level: MP3 compression level (0-9, higher=better compression)
Returns:
Path to the created MP3 file
Raises:
FFmpegNotFoundError: If FFmpeg is not installed
FFmpegError: If conversion fails
"""
if output_path is None:
base = os.path.splitext(input_path)[0]
output_path = f"{base}.mp3"
cmd = [
'ffmpeg',
'-i', input_path,
'-y', # Overwrite output
'-acodec', 'libmp3lame',
'-b:a', bitrate,
'-ar', sample_rate,
'-ac', str(channels),
'-compression_level', str(compression_level),
output_path
]
_run_ffmpeg_command(cmd, f"MP3 conversion of {os.path.basename(input_path)}")
return output_path
def extract_audio_from_video(
video_path: str,
output_format: str = 'mp3',
bitrate: str = DEFAULT_MP3_BITRATE,
cleanup_original: bool = True,
copy_stream: bool = False
) -> Tuple[str, str]:
"""
Extract audio track from video file.
Args:
video_path: Path to video file
output_format: Audio format ('mp3', 'wav', 'flac', 'copy')
bitrate: Audio bitrate for lossy formats (ignored if copy_stream=True)
cleanup_original: Whether to delete the original video file
copy_stream: If True, copy audio stream without re-encoding (fast, preserves quality)
If False, re-encode to specified format
Returns:
Tuple of (audio_filepath, mime_type)
Raises:
FFmpegNotFoundError: If FFmpeg is not installed
FFmpegError: If extraction fails
"""
    base_path = os.path.splitext(video_path)[0]
    temp_audio_path = None  # assigned per-branch below; guarded in the error handler
    try:
if copy_stream or output_format == 'copy':
# Copy audio stream without re-encoding - need to detect the format first
from src.utils.ffprobe import get_codec_info
try:
codec_info = get_codec_info(video_path, timeout=10)
audio_codec = codec_info.get('audio_codec', 'unknown')
# Map codec to extension and MIME type
codec_map = {
'aac': {'ext': 'm4a', 'mime': 'audio/mp4'},
'mp3': {'ext': 'mp3', 'mime': 'audio/mpeg'},
'opus': {'ext': 'opus', 'mime': 'audio/opus'},
'vorbis': {'ext': 'ogg', 'mime': 'audio/ogg'},
'flac': {'ext': 'flac', 'mime': 'audio/flac'},
}
if audio_codec in codec_map:
output_ext = codec_map[audio_codec]['ext']
mime_type = codec_map[audio_codec]['mime']
else:
# Default to m4a for unknown codecs
current_app.logger.warning(f"Unknown audio codec '{audio_codec}', defaulting to m4a container")
output_ext = 'm4a'
mime_type = 'audio/mp4'
temp_audio_path = f"{base_path}_audio_temp.{output_ext}"
final_audio_path = f"{base_path}_audio.{output_ext}"
cmd = [
'ffmpeg',
'-i', video_path,
'-y',
'-vn', # No video
'-acodec', 'copy', # Copy audio stream without re-encoding
temp_audio_path
]
current_app.logger.info(f"Copying audio stream (codec: {audio_codec}) without re-encoding")
except Exception as probe_error:
current_app.logger.warning(f"Failed to detect audio codec: {probe_error}. Falling back to MP3 encoding.")
# Fallback to MP3 encoding if we can't detect the codec
output_ext = 'mp3'
mime_type = 'audio/mpeg'
temp_audio_path = f"{base_path}_audio_temp.{output_ext}"
final_audio_path = f"{base_path}_audio.{output_ext}"
cmd = [
'ffmpeg',
'-i', video_path,
'-y',
'-vn',
'-acodec', 'libmp3lame',
'-b:a', bitrate,
'-ar', DEFAULT_SAMPLE_RATE,
'-ac', str(DEFAULT_CHANNELS),
'-compression_level', str(DEFAULT_COMPRESSION_LEVEL),
temp_audio_path
]
elif output_format == 'mp3':
temp_audio_path = f"{base_path}_audio_temp.mp3"
final_audio_path = f"{base_path}_audio.mp3"
cmd = [
'ffmpeg',
'-i', video_path,
'-y',
'-vn', # No video
'-acodec', 'libmp3lame',
'-b:a', bitrate,
'-ar', DEFAULT_SAMPLE_RATE,
'-ac', str(DEFAULT_CHANNELS),
'-compression_level', str(DEFAULT_COMPRESSION_LEVEL),
temp_audio_path
]
mime_type = 'audio/mpeg'
elif output_format == 'wav':
temp_audio_path = f"{base_path}_audio_temp.wav"
final_audio_path = f"{base_path}_audio.wav"
cmd = [
'ffmpeg',
'-i', video_path,
'-y',
'-vn',
'-acodec', 'pcm_s16le',
'-ar', DEFAULT_SAMPLE_RATE,
temp_audio_path
]
mime_type = 'audio/wav'
elif output_format == 'flac':
temp_audio_path = f"{base_path}_audio_temp.flac"
final_audio_path = f"{base_path}_audio.flac"
cmd = [
'ffmpeg',
'-i', video_path,
'-y',
'-vn',
'-acodec', 'flac',
'-compression_level', '12',
temp_audio_path
]
mime_type = 'audio/flac'
elif output_format == 'opus':
temp_audio_path = f"{base_path}_audio_temp.opus"
final_audio_path = f"{base_path}_audio.opus"
cmd = [
'ffmpeg',
'-i', video_path,
'-y',
'-vn',
'-acodec', 'libopus',
'-b:a', bitrate,
temp_audio_path
]
mime_type = 'audio/opus'
else:
raise ValueError(f"Unsupported output format: {output_format}")
_run_ffmpeg_command(cmd, f"Audio extraction from {os.path.basename(video_path)}")
# Optionally preserve temp file for debugging
if os.getenv('PRESERVE_TEMP_AUDIO', 'false').lower() == 'true':
import shutil
debug_path = temp_audio_path.replace('_temp', '_debug')
shutil.copy2(temp_audio_path, debug_path)
current_app.logger.info(f"Debug: Preserved temp audio file as {debug_path}")
# Rename temp file to final filename
os.rename(temp_audio_path, final_audio_path)
if cleanup_original:
try:
os.remove(video_path)
current_app.logger.info(f"Cleaned up original video: {os.path.basename(video_path)}")
except Exception as e:
current_app.logger.warning(f"Failed to cleanup video {video_path}: {e}")
return final_audio_path, mime_type
    except Exception:
        # Clean up temp file on error (temp_audio_path stays None if the
        # failure happened before a conversion branch was selected)
        if temp_audio_path and os.path.exists(temp_audio_path):
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass
        raise
def compress_audio(
input_path: str,
codec: str = 'mp3',
bitrate: str = DEFAULT_MP3_BITRATE,
delete_original: bool = True,
codec_info: Optional[dict] = None
) -> Tuple[str, str, Optional[dict]]:
"""
Compress audio file to specified codec.
Args:
input_path: Path to input audio file
codec: Target codec ('mp3', 'flac', 'opus')
bitrate: Bitrate for lossy codecs (ignored for FLAC)
delete_original: Whether to delete the original file after compression
        codec_info: Optional pre-fetched codec info for the input file
    Returns:
        Tuple of (output_path, mime_type, codec_info)
        Note: codec_info in the returned tuple is always None, since the
        pre-fetched probe data no longer describes the converted file
Raises:
FFmpegNotFoundError: If FFmpeg is not installed
FFmpegError: If compression fails
"""
codec_config = {
'mp3': {
'ext': '.mp3',
'mime': 'audio/mpeg',
'cmd_args': [
'-acodec', 'libmp3lame',
'-b:a', bitrate,
'-ar', DEFAULT_SAMPLE_RATE,
'-ac', str(DEFAULT_CHANNELS)
]
},
'flac': {
'ext': '.flac',
'mime': 'audio/flac',
'cmd_args': ['-acodec', 'flac', '-compression_level', '12']
},
'opus': {
'ext': '.opus',
'mime': 'audio/opus',
'cmd_args': ['-acodec', 'libopus', '-b:a', bitrate]
}
}
if codec not in codec_config:
raise ValueError(f"Unsupported codec: {codec}. Supported: {list(codec_config.keys())}")
config = codec_config[codec]
base_path = os.path.splitext(input_path)[0]
temp_output_path = f"{base_path}_compressed_temp{config['ext']}"
final_output_path = f"{base_path}{config['ext']}"
try:
# Get original file size for logging
original_size = os.path.getsize(input_path)
cmd = ['ffmpeg', '-i', input_path, '-y'] + config['cmd_args'] + [temp_output_path]
_run_ffmpeg_command(cmd, f"Compression of {os.path.basename(input_path)} to {codec}")
# Get compressed file size
compressed_size = os.path.getsize(temp_output_path)
ratio = (1 - compressed_size / original_size) * 100 if original_size > 0 else 0
current_app.logger.info(
f"Compressed {os.path.basename(input_path)}: "
f"{original_size / 1024 / 1024:.1f}MB -> "
f"{compressed_size / 1024 / 1024:.1f}MB ({ratio:.1f}% reduction)"
)
# Remove original and rename temp to final
if delete_original:
os.remove(input_path)
current_app.logger.debug(f"Deleted original file: {input_path}")
os.rename(temp_output_path, final_output_path)
# Return codec_info as None since file was converted (codec changed)
return final_output_path, config['mime'], None
    except Exception:
        # Clean up temp file if it exists, then re-raise the original error
        if os.path.exists(temp_output_path):
            try:
                os.remove(temp_output_path)
            except OSError:
                pass
        raise
def extract_audio_segment(
input_path: str,
output_path: str,
start_time: float,
duration: float
) -> None:
"""
Extract a segment from an audio file.
Args:
input_path: Path to input audio file
output_path: Path for output segment
start_time: Start time in seconds
duration: Duration in seconds
Raises:
FFmpegNotFoundError: If FFmpeg is not installed
FFmpegError: If extraction fails
"""
cmd = [
'ffmpeg',
'-i', input_path,
'-ss', str(start_time),
'-t', str(duration),
'-vn', # Drop video streams (audio segment only)
'-c:a', 'copy', # Copy audio codec (no re-encoding)
'-y',
output_path
]
_run_ffmpeg_command(cmd, f"Segment extraction from {os.path.basename(input_path)}")
@contextmanager
def temp_audio_conversion(input_path: str, target_format: str = 'mp3'):
"""
Context manager for temporary audio conversion.
Automatically cleans up temp file on exit.
Example:
with temp_audio_conversion(input_path, 'mp3') as mp3_path:
# Use mp3_path
process_audio(mp3_path)
# mp3_path is automatically deleted
Args:
input_path: Path to input audio file
target_format: Target format ('mp3', 'wav', etc.)
Yields:
Path to temporary converted file
"""
temp_path = None
try:
with tempfile.NamedTemporaryFile(suffix=f'.{target_format}', delete=False) as temp_file:
temp_path = temp_file.name
if target_format == 'mp3':
convert_to_mp3(input_path, temp_path)
else:
raise ValueError(f"Unsupported target format: {target_format}")
yield temp_path
finally:
if temp_path and os.path.exists(temp_path):
try:
os.unlink(temp_path)
except Exception as e:
current_app.logger.warning(f"Failed to cleanup temp file {temp_path}: {e}")
def _run_ffmpeg_command(cmd: list, operation_description: str) -> None:
"""
Execute FFmpeg command with consistent error handling.
Args:
cmd: FFmpeg command as list of strings
operation_description: Human-readable description for error messages
Raises:
FFmpegNotFoundError: If FFmpeg is not installed
FFmpegError: If FFmpeg command fails
"""
try:
current_app.logger.debug(f"Running FFmpeg command: {' '.join(cmd)}")
result = subprocess.run(
cmd,
check=True,
capture_output=True,
text=True
)
current_app.logger.debug(f"FFmpeg {operation_description} completed successfully")
except FileNotFoundError:
error_msg = "FFmpeg not found. Please ensure FFmpeg is installed and in the system's PATH."
current_app.logger.error(error_msg)
raise FFmpegNotFoundError(error_msg)
except subprocess.CalledProcessError as e:
error_msg = f"{operation_description} failed: {e.stderr}"
current_app.logger.error(f"FFmpeg error: {error_msg}")
raise FFmpegError(error_msg)
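
Note that these helpers log through flask.current_app, so they must run inside an application context; a minimal sketch (file paths hypothetical, FFmpeg assumed installed):

from flask import Flask
from src.utils.ffmpeg_utils import compress_audio, temp_audio_conversion

app = Flask(__name__)
with app.app_context():  # required for the current_app.logger calls
    flac_path, mime, _ = compress_audio('/data/uploads/raw.wav', codec='flac')
    print('compressed to', flac_path, mime)
    with temp_audio_conversion(flac_path, 'mp3') as mp3_path:
        print('temporary mp3 at', mp3_path)  # deleted on exit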

src/utils/ffprobe.py (new file, 499 lines)

@@ -0,0 +1,499 @@
"""
FFprobe utility for detecting audio/video codecs and format information.
This module provides functions to inspect media files using ffprobe and return
structured information about their codecs, streams, and formats.
"""
import json
import logging
import subprocess
from datetime import datetime
from typing import Optional, Dict, Any, Tuple
logger = logging.getLogger(__name__)
class FFProbeError(Exception):
"""Raised when ffprobe fails to analyze a file."""
pass
def probe(filename: str, cmd: str = 'ffprobe', timeout: Optional[int] = None) -> Dict[str, Any]:
"""
Run ffprobe on the specified file and return a JSON representation of the output.
Args:
filename: Path to the media file to probe
cmd: Command to use (default: 'ffprobe')
timeout: Optional timeout in seconds
Returns:
Dictionary containing streams and format information
Raises:
FFProbeError: if ffprobe returns a non-zero exit code
"""
args = [cmd, '-show_format', '-show_streams', '-of', 'json', filename]
p = None
try:
p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
communicate_kwargs = {}
if timeout is not None:
communicate_kwargs['timeout'] = timeout
out, err = p.communicate(**communicate_kwargs)
if p.returncode != 0:
error_msg = err.decode('utf-8', errors='ignore')
raise FFProbeError(f'ffprobe failed: {error_msg}')
return json.loads(out.decode('utf-8'))
except subprocess.TimeoutExpired:
if p:
p.kill()
raise FFProbeError(f'ffprobe timed out after {timeout} seconds')
except FileNotFoundError:
raise FFProbeError('ffprobe command not found. Please ensure ffmpeg is installed.')
except json.JSONDecodeError as e:
raise FFProbeError(f'Failed to parse ffprobe output: {e}')
def get_codec_info(filename: str, timeout: Optional[int] = None) -> Dict[str, Any]:
"""
Get codec information for a media file.
Args:
filename: Path to the media file
timeout: Optional timeout in seconds
Returns:
Dictionary with keys:
- audio_codec: Audio codec name (e.g., 'pcm_s16le', 'aac', 'mp3')
- video_codec: Video codec name if present, or None
- has_video: Boolean indicating if file contains video stream
- has_audio: Boolean indicating if file contains audio stream
- format_name: Container format name (e.g., 'wav', 'mov,mp4,m4a')
- duration: Duration in seconds (float)
- sample_rate: Audio sample rate if available
- channels: Number of audio channels if available
- bit_rate: Bit rate if available
Raises:
FFProbeError: if ffprobe fails to analyze the file
"""
    probe_data = probe(filename, timeout=timeout)
result = {
'audio_codec': None,
'video_codec': None,
'has_video': False,
'has_audio': False,
'format_name': None,
'duration': None,
'sample_rate': None,
'channels': None,
'bit_rate': None
}
# Extract format information
if 'format' in probe_data:
fmt = probe_data['format']
result['format_name'] = fmt.get('format_name')
if 'duration' in fmt:
try:
result['duration'] = float(fmt['duration'])
except (ValueError, TypeError):
pass
if 'bit_rate' in fmt:
try:
result['bit_rate'] = int(fmt['bit_rate'])
except (ValueError, TypeError):
pass
# Extract stream information
if 'streams' in probe_data:
for stream in probe_data['streams']:
codec_type = stream.get('codec_type')
codec_name = stream.get('codec_name')
if codec_type == 'audio':
result['has_audio'] = True
if result['audio_codec'] is None: # Use first audio stream
result['audio_codec'] = codec_name
result['sample_rate'] = stream.get('sample_rate')
result['channels'] = stream.get('channels')
elif codec_type == 'video':
result['has_video'] = True
if result['video_codec'] is None: # Use first video stream
result['video_codec'] = codec_name
return result
def is_video_file(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
"""
Check if a file contains video streams.
Args:
filename: Path to the media file
timeout: Optional timeout in seconds
codec_info: Optional pre-fetched codec info to avoid redundant probe calls
Returns:
True if file contains video streams, False otherwise
"""
try:
if codec_info is None:
codec_info = get_codec_info(filename, timeout=timeout)
return codec_info['has_video']
except FFProbeError as e:
logger.warning(f"Failed to probe {filename}: {e}")
return False
def is_audio_file(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
"""
Check if a file contains audio streams.
Args:
filename: Path to the media file
timeout: Optional timeout in seconds
codec_info: Optional pre-fetched codec info to avoid redundant probe calls
Returns:
True if file contains audio streams, False otherwise
"""
try:
if codec_info is None:
codec_info = get_codec_info(filename, timeout=timeout)
return codec_info['has_audio']
except FFProbeError as e:
logger.warning(f"Failed to probe {filename}: {e}")
return False
def get_audio_codec(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Optional[str]:
"""
Get the audio codec name for a file.
Args:
filename: Path to the media file
timeout: Optional timeout in seconds
codec_info: Optional pre-fetched codec info to avoid redundant probe calls
Returns:
Audio codec name (e.g., 'pcm_s16le', 'aac', 'mp3', 'opus'), or None if no audio
"""
try:
if codec_info is None:
codec_info = get_codec_info(filename, timeout=timeout)
return codec_info['audio_codec']
except FFProbeError as e:
logger.warning(f"Failed to probe {filename}: {e}")
return None
def needs_audio_conversion(filename: str, supported_codecs: list, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Tuple[bool, Optional[str]]:
"""
Check if a file needs audio conversion based on its codec.
Args:
filename: Path to the media file
supported_codecs: List of supported audio codec names
timeout: Optional timeout in seconds
codec_info: Optional pre-fetched codec info to avoid redundant probe calls
Returns:
Tuple of (needs_conversion: bool, current_codec: str or None)
"""
try:
if codec_info is None:
codec_info = get_codec_info(filename, timeout=timeout)
# If it has video, it likely needs conversion
if codec_info['has_video']:
return True, codec_info.get('audio_codec')
# If no audio at all, cannot convert
if not codec_info['has_audio']:
logger.warning(f"File {filename} has no audio streams")
return False, None
audio_codec = codec_info['audio_codec']
# Check if codec is in supported list
if audio_codec in supported_codecs:
return False, audio_codec
return True, audio_codec
except FFProbeError as e:
logger.warning(f"Failed to probe {filename}: {e}")
# Default to attempting conversion on error
return True, None
def is_lossless_audio(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> bool:
"""
Check if a file uses a lossless audio codec.
Args:
filename: Path to the media file
timeout: Optional timeout in seconds
codec_info: Optional pre-fetched codec info to avoid redundant probe calls
Returns:
True if file uses lossless audio codec, False otherwise
"""
lossless_codecs = {
'pcm_s16le', 'pcm_s24le', 'pcm_s32le',
'pcm_f32le', 'pcm_f64le',
'pcm_u8', 'pcm_u16le', 'pcm_u24le', 'pcm_u32le',
'flac', 'alac', 'ape', 'wavpack', 'tta',
'mlp', 'truehd'
}
try:
if codec_info is None:
codec_info = get_codec_info(filename, timeout=timeout)
audio_codec = codec_info['audio_codec']
return audio_codec in lossless_codecs if audio_codec else False
except FFProbeError as e:
logger.warning(f"Failed to probe {filename}: {e}")
return False
def get_duration(filename: str, timeout: Optional[int] = None, codec_info: Optional[Dict[str, Any]] = None) -> Optional[float]:
"""
Get the duration of a media file in seconds.
Uses multiple methods to determine duration:
1. Format-level duration (fastest, works for most files)
2. Packet timestamps fallback (for files without duration metadata like some WebM)
Args:
filename: Path to the media file
timeout: Optional timeout in seconds
codec_info: Optional pre-fetched codec info to avoid redundant probe calls
Returns:
Duration in seconds, or None if unable to determine
"""
try:
if codec_info is None:
codec_info = get_codec_info(filename, timeout=timeout)
# Try format-level duration first
if codec_info['duration'] is not None:
return codec_info['duration']
# Fallback: scan packets to find the last timestamp
# This works for WebM and other files without duration metadata
return _get_duration_from_packets(filename, timeout=timeout)
except FFProbeError as e:
logger.warning(f"Failed to probe {filename}: {e}")
return None
def _get_duration_from_packets(filename: str, timeout: Optional[int] = None) -> Optional[float]:
"""
Get duration by scanning packet timestamps (fallback for files without duration metadata).
This is slower than format-level duration but works for WebM and similar files
that don't store duration in the container metadata.
Args:
filename: Path to the media file
timeout: Optional timeout in seconds
Returns:
Duration in seconds, or None if unable to determine
"""
try:
args = [
'ffprobe', '-v', 'error',
'-show_entries', 'packet=pts_time',
'-select_streams', 'a:0', # First audio stream
'-of', 'csv=p=0',
filename
]
p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
communicate_kwargs = {}
if timeout is not None:
communicate_kwargs['timeout'] = timeout
out, err = p.communicate(**communicate_kwargs)
if p.returncode != 0:
logger.debug(f"Packet scan failed for {filename}")
return None
# Parse the output to find the last timestamp
lines = out.decode('utf-8').strip().split('\n')
last_valid_time = None
for line in reversed(lines):
line = line.strip()
if line and line != 'N/A':
try:
last_valid_time = float(line)
break
except ValueError:
continue
if last_valid_time is not None:
logger.debug(f"Got duration from packets for {filename}: {last_valid_time}")
return last_valid_time
return None
except subprocess.TimeoutExpired:
logger.warning(f"Packet scan timed out for {filename}")
return None
except Exception as e:
logger.warning(f"Error scanning packets for {filename}: {e}")
return None
def get_creation_date(filename: str, timeout: Optional[int] = None, use_file_mtime: bool = True) -> Optional[datetime]:
"""
Extract the creation/recording date from a media file's metadata.
Checks various metadata tags commonly used by recorders and devices:
- creation_time (MP4, M4A, MOV)
- date (various formats)
- encoded_date (some encoders)
Falls back to file modification time if no metadata found and use_file_mtime is True.
Args:
filename: Path to the media file
timeout: Optional timeout in seconds
use_file_mtime: If True, fall back to file modification time when no metadata found
Returns:
datetime object if creation date found, None otherwise
"""
import os
try:
probe_data = probe(filename, timeout=timeout)
except FFProbeError as e:
logger.warning(f"Failed to probe {filename} for creation date: {e}")
# Even if probe fails, we can still try file mtime
if use_file_mtime:
return _get_file_mtime(filename)
return None
# Tags to check for creation date (in order of preference)
date_tags = ['creation_time', 'date', 'encoded_date', 'date_recorded', 'recording_time']
# Check format-level tags first
if 'format' in probe_data and 'tags' in probe_data['format']:
tags = probe_data['format']['tags']
for tag in date_tags:
# Check both lowercase and original case
value = tags.get(tag) or tags.get(tag.upper())
if value:
parsed = _parse_date_string(value)
if parsed:
logger.debug(f"Found creation date from format tag '{tag}': {parsed}")
return parsed
# Check stream-level tags
if 'streams' in probe_data:
for stream in probe_data['streams']:
if 'tags' in stream:
tags = stream['tags']
for tag in date_tags:
value = tags.get(tag) or tags.get(tag.upper())
if value:
parsed = _parse_date_string(value)
if parsed:
logger.debug(f"Found creation date from stream tag '{tag}': {parsed}")
return parsed
# Fall back to file modification time
if use_file_mtime:
mtime = _get_file_mtime(filename)
if mtime:
logger.debug(f"Using file modification time as creation date: {mtime}")
return mtime
logger.debug(f"No creation date found for {filename}")
return None
def _get_file_mtime(filename: str) -> Optional[datetime]:
"""
Get the file's modification time as a datetime.
Args:
filename: Path to the file
Returns:
datetime object or None if unable to get mtime
"""
import os
try:
stat_info = os.stat(filename)
return datetime.fromtimestamp(stat_info.st_mtime)
except (OSError, ValueError) as e:
logger.warning(f"Failed to get file mtime for {filename}: {e}")
return None
def _parse_date_string(date_str: str) -> Optional[datetime]:
"""
Parse various date string formats commonly found in media metadata.
Args:
date_str: Date string to parse
Returns:
datetime object if parsing successful, None otherwise
"""
if not date_str:
return None
# Common formats in media files
formats = [
'%Y-%m-%dT%H:%M:%S.%fZ', # ISO 8601 with microseconds and Z
'%Y-%m-%dT%H:%M:%SZ', # ISO 8601 with Z
'%Y-%m-%dT%H:%M:%S.%f%z', # ISO 8601 with microseconds and timezone
'%Y-%m-%dT%H:%M:%S%z', # ISO 8601 with timezone
'%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 with microseconds
'%Y-%m-%dT%H:%M:%S', # ISO 8601 basic
'%Y-%m-%d %H:%M:%S', # Common datetime
'%Y/%m/%d %H:%M:%S', # Alternate datetime
'%Y-%m-%d', # Date only
'%Y/%m/%d', # Alternate date only
'%d-%m-%Y %H:%M:%S', # European format
'%d/%m/%Y %H:%M:%S', # European format alternate
]
# Clean up the string
date_str = date_str.strip()
for fmt in formats:
try:
return datetime.strptime(date_str, fmt)
except ValueError:
continue
# Try fromisoformat as a fallback (handles many ISO variants)
try:
# Replace Z with +00:00 for fromisoformat compatibility
clean_str = date_str.replace('Z', '+00:00')
return datetime.fromisoformat(clean_str)
except ValueError:
pass
logger.debug(f"Could not parse date string: {date_str}")
return None
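
A minimal usage sketch for the helpers above (assuming they live in src.utils.ffprobe; the file path is illustrative):

# Sketch: resolve a recording date for an upload, preferring embedded
# metadata and falling back to the file's modification time.
from src.utils.ffprobe import get_creation_date  # assumed module path

recorded_at = get_creation_date('/data/uploads/meeting.webm', timeout=30)
if recorded_at is not None:
    print(f"Recorded at: {recorded_at.isoformat()}")
else:
    print("No creation date available")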

24
src/utils/file_hash.py Normal file
View File

@@ -0,0 +1,24 @@
"""File hashing utility for duplicate detection."""
import hashlib
def compute_file_sha256(filepath, chunk_size=8192):
"""
Compute SHA-256 hash of a file, reading in chunks to handle large files.
Args:
filepath: Path to the file to hash
chunk_size: Size of chunks to read at a time (default 8KB)
Returns:
64-character hex digest string
"""
sha256 = hashlib.sha256()
with open(filepath, 'rb') as f:
while True:
chunk = f.read(chunk_size)
if not chunk:
break
sha256.update(chunk)
return sha256.hexdigest()
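
A short sketch of how the hash can drive duplicate detection (the stored-hash set is illustrative; in practice it would come from the database):

from src.utils.file_hash import compute_file_sha256

new_hash = compute_file_sha256('/data/uploads/new_recording.m4a')
existing_hashes = {'9f86d081884c7d65...'}  # e.g. loaded from the recordings table
if new_hash in existing_hashes:
    print('Duplicate upload detected; skipping transcription')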

215
src/utils/json_parser.py Normal file
View File

@@ -0,0 +1,215 @@
"""
JSON parsing utilities for handling LLM responses and malformed JSON.
This module provides robust JSON parsing functions that can handle common
issues with LLM-generated JSON, including:
- Incomplete/unterminated JSON structures
- Escape sequence problems
- JSON embedded in markdown code blocks
- Nested quotes and special characters
"""
import json
import re
import ast
import logging
# Module-level logger
logger = logging.getLogger(__name__)
def auto_close_json(json_string):
"""
Attempts to close an incomplete JSON string by appending necessary brackets and braces.
This is a simplified parser and may not handle all edge cases, but is
designed to fix unterminated strings from API responses.
"""
if not isinstance(json_string, str):
return json_string
stack = []
in_string = False
escape_next = False
for char in json_string:
if escape_next:
escape_next = False
continue
if char == '\\':
escape_next = True
continue
        if char == '"':
            # Escaped quotes were already consumed by the escape handling
            # above, so any quote reaching this point toggles string state.
            in_string = not in_string
if not in_string:
if char == '{':
stack.append('}')
elif char == '[':
stack.append(']')
elif char == '}':
if stack and stack[-1] == '}':
stack.pop()
elif char == ']':
if stack and stack[-1] == ']':
stack.pop()
# If we are inside a string at the end, close it.
if in_string:
json_string += '"'
# Close any remaining open structures
while stack:
json_string += stack.pop()
return json_string
def preprocess_json_escapes(json_string):
"""
Preprocess JSON string to fix common escape issues from LLM responses.
Uses a more sophisticated approach to handle nested quotes properly.
"""
if not json_string:
return json_string
result = []
i = 0
in_string = False
escape_next = False
expecting_value = False # Track if we're expecting a value (after :)
while i < len(json_string):
char = json_string[i]
if escape_next:
# This character is escaped, add it as-is
result.append(char)
escape_next = False
elif char == '\\':
# This is an escape character
result.append(char)
escape_next = True
elif char == ':' and not in_string:
# We found a colon, next string will be a value
result.append(char)
expecting_value = True
elif char == ',' and not in_string:
# We found a comma, reset expecting_value
result.append(char)
expecting_value = False
elif char == '"':
if not in_string:
# Starting a string
in_string = True
result.append(char)
else:
# We're in a string, check if this quote should be escaped
# Look ahead to see if this is the end of the string value
j = i + 1
while j < len(json_string) and json_string[j].isspace():
j += 1
# For keys (not expecting_value), only end on colon
# For values (expecting_value), end on comma, closing brace, or closing bracket
if expecting_value:
end_chars = ',}]'
else:
end_chars = ':'
if j < len(json_string) and json_string[j] in end_chars:
# This is the end of the string
in_string = False
result.append(char)
if not expecting_value:
# We just finished a key, next will be expecting value
expecting_value = True
else:
# This is an inner quote that should be escaped
result.append('\\"')
else:
result.append(char)
i += 1
return ''.join(result)
def extract_json_object(text):
"""
Extract the first complete JSON object or array from text using regex.
"""
# Look for JSON object
obj_match = re.search(r'\{.*\}', text, re.DOTALL)
if obj_match:
return obj_match.group(0)
# Look for JSON array
arr_match = re.search(r'\[.*\]', text, re.DOTALL)
if arr_match:
return arr_match.group(0)
# Return original if no JSON structure found
return text
def safe_json_loads(json_string, fallback_value=None):
"""
Safely parse JSON with preprocessing to handle common LLM JSON formatting issues.
Args:
json_string (str): The JSON string to parse
fallback_value: Value to return if parsing fails (default: None)
Returns:
Parsed JSON object or fallback_value if parsing fails
"""
if not json_string or not isinstance(json_string, str):
logger.warning(f"Invalid JSON input: {type(json_string)} - {json_string}")
return fallback_value
# Step 1: Clean the input string
cleaned_json = json_string.strip()
# Step 2: Extract JSON from markdown code blocks if present
json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', cleaned_json, re.DOTALL)
if json_match:
cleaned_json = json_match.group(1).strip()
# Step 3: Try multiple parsing strategies
parsing_strategies = [
# Strategy 1: Direct parsing (for well-formed JSON)
lambda x: json.loads(x),
# Strategy 2: Fix common escape issues
lambda x: json.loads(preprocess_json_escapes(x)),
# Strategy 3: Use ast.literal_eval as fallback for simple cases
lambda x: ast.literal_eval(x) if x.startswith(('{', '[')) else None,
# Strategy 4: Extract JSON object/array using regex
lambda x: json.loads(extract_json_object(x)),
# Strategy 5: Auto-close incomplete JSON and parse
lambda x: json.loads(auto_close_json(x)),
]
for i, strategy in enumerate(parsing_strategies):
try:
result = strategy(cleaned_json)
if result is not None:
if i > 0: # Log if we had to use a fallback strategy
logger.info(f"JSON parsed successfully using strategy {i+1}")
return result
except (json.JSONDecodeError, ValueError, SyntaxError) as e:
if i == 0: # Only log the first failure to avoid spam
logger.debug(f"JSON parsing strategy {i+1} failed: {e}")
continue
# All strategies failed
logger.error(f"All JSON parsing strategies failed for: {cleaned_json[:200]}...")
return fallback_value
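
A quick demonstration of the fallback chain on a typical malformed LLM reply (fenced and truncated): the markdown fence is stripped in step 2, direct parsing fails, and auto_close_json finally repairs it:

from src.utils.json_parser import safe_json_loads

raw = '```json\n{"title": "Weekly sync", "speakers": ["Ana", "Ben"\n```'
parsed = safe_json_loads(raw, fallback_value={})
print(parsed)  # {'title': 'Weekly sync', 'speakers': ['Ana', 'Ben']}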

214
src/utils/localization.py Normal file
View File

@@ -0,0 +1,214 @@
"""
Server-side localization utilities for export templates.
This module provides utilities to load localized labels from
static/locales/*.json files for use in export templates.
"""
import json
import os
import logging
from pathlib import Path
from typing import Dict, Optional
from datetime import datetime
logger = logging.getLogger(__name__)
# Cache for loaded locales
_locale_cache: Dict[str, dict] = {}
def get_locales_dir() -> Path:
"""Get the path to the locales directory."""
# Navigate from src/utils to static/locales
base_dir = Path(__file__).parent.parent.parent
return base_dir / 'static' / 'locales'
def load_locale(language: str) -> dict:
"""
Load locale data for a given language.
Args:
language: Language code (e.g., 'en', 'de', 'fr')
Returns:
Dictionary containing all locale strings
"""
# Check cache first
if language in _locale_cache:
return _locale_cache[language]
locales_dir = get_locales_dir()
locale_file = locales_dir / f'{language}.json'
# Fallback to English if requested language doesn't exist
if not locale_file.exists():
logger.warning(f"Locale file not found for '{language}', falling back to English")
locale_file = locales_dir / 'en.json'
language = 'en'
try:
with open(locale_file, 'r', encoding='utf-8') as f:
locale_data = json.load(f)
_locale_cache[language] = locale_data
return locale_data
except (json.JSONDecodeError, IOError) as e:
logger.error(f"Error loading locale file '{locale_file}': {e}")
# Return empty dict on error
return {}
def get_export_labels(language: str) -> dict:
"""
Get localized export labels for a given language.
Args:
language: Language code (e.g., 'en', 'de', 'fr')
Returns:
Dictionary containing export-specific labels
"""
locale_data = load_locale(language)
# Get exportLabels section, or fall back to defaults
export_labels = locale_data.get('exportLabels', {})
# Default English labels as fallback
defaults = {
'metadata': 'Metadata',
'notes': 'Notes',
'summary': 'Summary',
'transcription': 'Transcription',
'date': 'Date',
'created': 'Created',
'originalFile': 'Original File',
'fileSize': 'File Size',
'participants': 'Participants',
'tags': 'Tags',
'transcriptionTime': 'Transcription Time',
'summarizationTime': 'Summarization Time',
'footer': 'Generated with [Speakr](https://github.com/learnedmachine/speakr)'
}
# Merge defaults with loaded labels
result = defaults.copy()
result.update(export_labels)
return result
def format_date_localized(dt: datetime, language: str) -> str:
"""
Format a datetime in a localized format.
Args:
dt: The datetime to format
language: Language code for localization
Returns:
Localized date string
"""
if dt is None:
return ''
# Define locale-specific date formats
date_formats = {
'en': '%B %d, %Y', # January 15, 2026
'de': '%d. %B %Y', # 15. Januar 2026
'fr': '%d %B %Y', # 15 janvier 2026
'es': '%d de %B de %Y', # 15 de enero de 2026
        'zh': '%Y年%m月%d日',      # 2026年01月15日
'ru': '%d %B %Y г.', # 15 января 2026 г.
}
# Month names for different languages
month_names = {
'de': {
'January': 'Januar', 'February': 'Februar', 'March': 'März',
'April': 'April', 'May': 'Mai', 'June': 'Juni',
'July': 'Juli', 'August': 'August', 'September': 'September',
'October': 'Oktober', 'November': 'November', 'December': 'Dezember'
},
'fr': {
'January': 'janvier', 'February': 'février', 'March': 'mars',
'April': 'avril', 'May': 'mai', 'June': 'juin',
'July': 'juillet', 'August': 'août', 'September': 'septembre',
'October': 'octobre', 'November': 'novembre', 'December': 'décembre'
},
'es': {
'January': 'enero', 'February': 'febrero', 'March': 'marzo',
'April': 'abril', 'May': 'mayo', 'June': 'junio',
'July': 'julio', 'August': 'agosto', 'September': 'septiembre',
'October': 'octubre', 'November': 'noviembre', 'December': 'diciembre'
},
'ru': {
'January': 'января', 'February': 'февраля', 'March': 'марта',
'April': 'апреля', 'May': 'мая', 'June': 'июня',
'July': 'июля', 'August': 'августа', 'September': 'сентября',
'October': 'октября', 'November': 'ноября', 'December': 'декабря'
}
}
# Get format for language, default to English
date_format = date_formats.get(language, date_formats['en'])
# Format the date
formatted = dt.strftime(date_format)
# Replace English month names with localized versions
if language in month_names:
for eng, local in month_names[language].items():
formatted = formatted.replace(eng, local)
return formatted
def format_datetime_localized(dt: datetime, language: str) -> str:
"""
Format a datetime with time in a localized format.
Args:
dt: The datetime to format
language: Language code for localization
Returns:
Localized datetime string
"""
if dt is None:
return ''
date_part = format_date_localized(dt, language)
# Time format varies by language
time_formats = {
'en': '%I:%M %p', # 02:30 PM
'de': '%H:%M Uhr', # 14:30 Uhr
'fr': '%H:%M', # 14:30
'es': '%H:%M', # 14:30
'zh': '%H:%M', # 14:30
'ru': '%H:%M', # 14:30
}
time_format = time_formats.get(language, time_formats['en'])
time_part = dt.strftime(time_format)
# Combine with appropriate connector
connectors = {
'en': ' at ',
'de': ' um ',
'fr': ' à ',
'es': ' a las ',
'zh': ' ',
'ru': ' в ',
}
connector = connectors.get(language, ' at ')
return f"{date_part}{connector}{time_part}"
def clear_locale_cache():
"""Clear the locale cache (useful for testing or hot-reloading)."""
global _locale_cache
_locale_cache = {}
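
A usage sketch (note that strftime's %B yields English month names under the default C locale, which the replacement tables above rely on):

from datetime import datetime
from src.utils.localization import format_datetime_localized, get_export_labels

dt = datetime(2026, 1, 15, 14, 30)
print(format_datetime_localized(dt, 'de'))  # 15. Januar 2026 um 14:30 Uhr
print(format_datetime_localized(dt, 'en'))  # January 15, 2026 at 02:30 PM

labels = get_export_labels('fr')
print(labels['summary'])  # from fr.json, or the English default 'Summary'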

132
src/utils/markdown.py Normal file
View File

@@ -0,0 +1,132 @@
"""
Markdown and HTML utilities for converting and sanitizing text content.
This module provides functions for converting markdown to HTML and
sanitizing HTML to prevent XSS and other security issues.
"""
import re
import markdown
import bleach
# --- Initialize Markdown Once (Performance Optimization) ---
# Create a single reusable Markdown instance to avoid reinitializing extensions on every call
_markdown_instance = markdown.Markdown(extensions=[
'fenced_code', # Fenced code blocks
'tables', # Table support
'attr_list', # Attribute lists
'def_list', # Definition lists
'footnotes', # Footnotes
'abbr', # Abbreviations
'codehilite', # Syntax highlighting for code blocks
'smarty' # Smart quotes, dashes, etc.
])
def sanitize_html(text):
"""
Sanitize HTML content to prevent XSS and other security issues.
Args:
text (str): HTML text to sanitize
Returns:
str: Sanitized HTML text
"""
if not text:
return ""
# Remove any template-like syntax that could be exploited
# Remove {{ }} style template syntax
text = re.sub(r'\{\{.*?\}\}', '', text, flags=re.DOTALL)
text = re.sub(r'\{%.*?%\}', '', text, flags=re.DOTALL)
# Remove other template-like syntax
text = re.sub(r'<%.*?%>', '', text, flags=re.DOTALL)
text = re.sub(r'<\?.*?\?>', '', text, flags=re.DOTALL)
# Define allowed tags and attributes for safe HTML
allowed_tags = [
'p', 'br', 'strong', 'b', 'em', 'i', 'u', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'ul', 'ol', 'li', 'blockquote', 'code', 'pre', 'a', 'img', 'table', 'thead',
'tbody', 'tr', 'th', 'td', 'dl', 'dt', 'dd', 'div', 'span', 'hr', 'sup', 'sub'
]
allowed_attributes = {
'a': ['href', 'title'],
'img': ['src', 'alt', 'title', 'width', 'height'],
'code': ['class'], # For syntax highlighting
'pre': ['class'], # For syntax highlighting
'div': ['class'], # For code blocks
'span': ['class'], # For syntax highlighting
'th': ['align'],
'td': ['align'],
'table': ['class']
}
# Sanitize the HTML to remove dangerous content
sanitized_html = bleach.clean(
text,
tags=allowed_tags,
attributes=allowed_attributes,
protocols=['http', 'https', 'mailto'],
strip=True # Strip disallowed tags instead of escaping them
)
return sanitized_html
def md_to_html(text):
"""
Convert markdown text to sanitized HTML.
Args:
text (str): Markdown text to convert
Returns:
str: Sanitized HTML output
"""
if not text:
return ""
# Fix list spacing
def fix_list_spacing(text):
lines = text.split('\n')
result = []
in_list = False
for line in lines:
stripped = line.strip()
# Check if this line is a list item (starts with -, *, +, or number.)
is_list_item = (
stripped.startswith(('- ', '* ', '+ ')) or
(stripped and stripped[0].isdigit() and '. ' in stripped[:10])
)
# If we're starting a new list or continuing a list, ensure proper spacing
if is_list_item:
if not in_list and result and result[-1].strip():
# Starting a new list - add blank line before
result.append('')
in_list = True
elif in_list and stripped and not is_list_item:
# Ending a list - add blank line after the list
if result and result[-1].strip():
result.append('')
in_list = False
result.append(line)
return '\n'.join(result)
# Fix list spacing
processed_text = fix_list_spacing(text)
# Convert markdown to HTML using the pre-configured singleton instance
# Reset the instance to clear any state from previous conversions
_markdown_instance.reset()
html = _markdown_instance.convert(processed_text)
# Apply sanitization to the generated HTML
return sanitize_html(html)
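
A small example: the list below lacks the blank line Markdown requires, fix_list_spacing inserts it before conversion, and the resulting HTML is bleach-sanitized:

from src.utils.markdown import md_to_html

text = "## Action items\nKey points:\n- ship the fix\n- <script>alert(1)</script>"
print(md_to_html(text))
# The heading and list render normally; the script tag is stripped.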

65
src/utils/security.py Normal file
View File

@@ -0,0 +1,65 @@
"""
Security utilities for password validation and other security functions.
This module provides security-related utility functions for the application.
"""
import re
from functools import wraps
from wtforms.validators import ValidationError
from urllib.parse import urlparse, urljoin
from flask import request, jsonify
from flask_login import login_required, current_user
def password_check(form, field):
"""
Custom WTForms validator for password strength.
Validates that passwords meet security requirements:
- At least 8 characters long
- Contains at least one uppercase letter
- Contains at least one lowercase letter
- Contains at least one number
- Contains at least one special character
Args:
form: WTForms form object
field: WTForms field object containing the password
Raises:
ValidationError: If password doesn't meet requirements
"""
password = field.data
if len(password) < 8:
raise ValidationError('Password must be at least 8 characters long.')
if not re.search(r'[A-Z]', password):
raise ValidationError('Password must contain at least one uppercase letter.')
if not re.search(r'[a-z]', password):
raise ValidationError('Password must contain at least one lowercase letter.')
if not re.search(r'[0-9]', password):
raise ValidationError('Password must contain at least one number.')
if not re.search(r'[!@#$%^&*(),.?":{}|<>]', password):
raise ValidationError('Password must contain at least one special character.')
# --- Access control decorators ---
def admin_required(f):
"""Decorator that requires the current user to be authenticated and an admin."""
@wraps(f)
@login_required
def decorated(*args, **kwargs):
if not current_user.is_admin:
return jsonify({'error': 'Admin access required'}), 403
return f(*args, **kwargs)
return decorated
# --- URL Security ---
def is_safe_url(target):
    """Return True if target resolves to the same host over http(s) (open-redirect guard)."""
ref_url = urlparse(request.host_url)
test_url = urlparse(urljoin(request.host_url, target))
return test_url.scheme in ('http', 'https') and ref_url.netloc == test_url.netloc
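
A route-level sketch of the decorator (assumes Flask-Login is already configured with a user_loader; the endpoint is illustrative):

from flask import Flask, jsonify
from src.utils.security import admin_required

app = Flask(__name__)

@app.route('/api/admin/users')
@admin_required
def list_users():
    # Reached only by authenticated admins; everyone else gets 401/403.
    return jsonify({'users': []})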

108
src/utils/token_auth.py Normal file
View File

@@ -0,0 +1,108 @@
"""
Token authentication utilities.
This module provides token-based authentication for API access,
allowing users to authenticate with Bearer tokens instead of session cookies.
"""
import hashlib
from datetime import datetime
from flask import request
from src.models import APIToken, User
def extract_token_from_request():
"""
Extract API token from various possible locations in the request.
Checks in order:
1. Authorization header with Bearer scheme
2. X-API-Token header
3. API-Token header
4. 'token' query parameter
Returns:
str: The extracted token, or None if not found
"""
# Check Authorization header (Bearer token)
auth_header = request.headers.get('Authorization', '')
if auth_header.startswith('Bearer '):
return auth_header[7:] # Remove 'Bearer ' prefix
# Check X-API-Token header
token = request.headers.get('X-API-Token')
if token:
return token
# Check API-Token header
token = request.headers.get('API-Token')
if token:
return token
# Check query parameter
token = request.args.get('token')
if token:
return token
return None
def hash_token(token):
"""
Hash a token using SHA-256.
Args:
token (str): The plaintext token to hash
Returns:
str: The hexadecimal hash of the token
"""
return hashlib.sha256(token.encode()).hexdigest()
def load_user_from_token():
"""
Load a user from an API token in the request.
This function is used by Flask-Login's request_loader to authenticate
users via API tokens instead of sessions.
Returns:
User: The authenticated user, or None if authentication fails
"""
# Extract token from request
token = extract_token_from_request()
if not token:
return None
# Hash the token to look up in database
token_hash = hash_token(token)
# Find the token in the database
api_token = APIToken.query.filter_by(token_hash=token_hash).first()
# Validate token
if not api_token:
return None
if not api_token.is_valid():
return None
# Update last used timestamp
api_token.last_used_at = datetime.utcnow()
from src.database import db
db.session.commit()
# Return the associated user
return api_token.user
def is_token_authenticated():
    """
    Check if the current request carries an API token.
    Note: this only checks that a token is present in the request; it does
    not validate it. Use load_user_from_token() when validation is needed.
    Returns:
        bool: True if a token was provided with the request, False otherwise
    """
    token = extract_token_from_request()
    return token is not None
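
Wiring the loader into Flask-Login is one line; a sketch (the LoginManager setup elsewhere is assumed):

from flask_login import LoginManager
from src.utils.token_auth import load_user_from_token

login_manager = LoginManager()

@login_manager.request_loader
def load_user_from_request(request):
    # Flask-Login passes the request object, but the helper reads the
    # token via flask.request itself, so the argument goes unused.
    return load_user_from_token()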

101
src/utils/vapid_keys.py Normal file
View File

@@ -0,0 +1,101 @@
"""
VAPID Key Management
Auto-generates and stores VAPID keys for push notifications
"""
import os
import json
from pathlib import Path
def generate_vapid_keys():
    """Generate new VAPID keys using py_vapid (installed alongside pywebpush)"""
    try:
        import base64
        from py_vapid import Vapid
        from cryptography.hazmat.primitives import serialization
        vapid = Vapid()
        vapid.generate_keys()
        # Public key: base64url-encoded uncompressed EC point, the format
        # the browser Push API expects for applicationServerKey.
        raw_public = vapid.public_key.public_bytes(
            serialization.Encoding.X962,
            serialization.PublicFormat.UncompressedPoint
        )
        # Private key: PEM-encoded PKCS#8.
        private_pem = vapid.private_key.private_bytes(
            serialization.Encoding.PEM,
            serialization.PrivateFormat.PKCS8,
            serialization.NoEncryption()
        )
        return {
            'public_key': base64.urlsafe_b64encode(raw_public).decode('utf-8').rstrip('='),
            'private_key': private_pem.decode('utf-8')
        }
except ImportError:
print("[VAPID] pywebpush not installed. Push notifications will be disabled.")
print("[VAPID] Install with: pip install pywebpush")
return None
except Exception as e:
print(f"[VAPID] Failed to generate keys: {e}")
return None
def get_vapid_keys_file():
"""Get path to VAPID keys storage file"""
# Store in /config directory (persistent in Docker)
config_dir = Path(os.getenv('CONFIG_DIR', '/config'))
config_dir.mkdir(parents=True, exist_ok=True)
return config_dir / 'vapid_keys.json'
def load_vapid_keys():
"""Load existing VAPID keys or generate new ones"""
keys_file = get_vapid_keys_file()
# Try to load existing keys
if keys_file.exists():
try:
with open(keys_file, 'r') as f:
keys = json.load(f)
print(f"[VAPID] Loaded existing keys from {keys_file}")
return keys
except Exception as e:
print(f"[VAPID] Failed to load existing keys: {e}")
# Continue to generate new keys
# Generate new keys
print("[VAPID] Generating new VAPID keys...")
keys = generate_vapid_keys()
if keys:
# Save keys to file
try:
with open(keys_file, 'w') as f:
json.dump(keys, f, indent=2)
# Set restrictive permissions (owner read/write only)
os.chmod(keys_file, 0o600)
print(f"[VAPID] Saved new keys to {keys_file}")
print(f"[VAPID] Public key: {keys['public_key'][:50]}...")
return keys
except Exception as e:
print(f"[VAPID] Failed to save keys: {e}")
return keys
else:
print("[VAPID] Push notifications disabled - pywebpush not available")
return None
def get_public_key():
"""Get the public VAPID key for client use"""
keys = load_vapid_keys()
return keys['public_key'] if keys else None
def get_private_key():
"""Get the private VAPID key for server use"""
keys = load_vapid_keys()
return keys['private_key'] if keys else None
# Initialize on module import
VAPID_KEYS = load_vapid_keys()
VAPID_ENABLED = VAPID_KEYS is not None
# Make keys available as module-level variables
VAPID_PUBLIC_KEY = VAPID_KEYS['public_key'] if VAPID_KEYS else None
VAPID_PRIVATE_KEY = VAPID_KEYS['private_key'] if VAPID_KEYS else None
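
A hedged sending sketch: pywebpush's webpush() accepts a PEM file path (or a raw base64url key) for vapid_private_key, so the PEM text stored above is written to a temporary file first; the subscription dict comes from the browser's PushSubscription, and the mailto contact is a placeholder:

import json
import os
import tempfile
from pywebpush import webpush, WebPushException

def send_push(subscription_info: dict, payload: dict) -> bool:
    """Send one web push notification using the keys loaded above."""
    if not VAPID_ENABLED:
        return False
    with tempfile.NamedTemporaryFile('w', suffix='.pem', delete=False) as f:
        f.write(VAPID_PRIVATE_KEY)
        key_path = f.name
    try:
        webpush(
            subscription_info=subscription_info,
            data=json.dumps(payload),
            vapid_private_key=key_path,
            vapid_claims={'sub': 'mailto:admin@example.com'},  # placeholder contact
        )
        return True
    except WebPushException as e:
        print(f"[VAPID] Push failed: {e}")
        return False
    finally:
        os.unlink(key_path)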