Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)
This commit is contained in:
289
config/env.transcription.example
Normal file
289
config/env.transcription.example
Normal file
@@ -0,0 +1,289 @@
|
||||
# =============================================================================
|
||||
# Transcription Connector Configuration
|
||||
# =============================================================================
|
||||
#
|
||||
# DictIA supports multiple transcription providers through a connector-based
|
||||
# architecture. This file documents all available configuration options.
|
||||
#
|
||||
# Quick Start (Simplified):
|
||||
# 1. For OpenAI with diarization: Set TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
|
||||
# 2. For self-hosted ASR: Set ASR_BASE_URL=http://your-asr:9000
|
||||
# 3. For legacy Whisper: Set TRANSCRIPTION_API_KEY and optionally TRANSCRIPTION_MODEL
|
||||
#
|
||||
# Auto-Detection Priority:
|
||||
# 1. TRANSCRIPTION_CONNECTOR - explicit connector name (if you need full control)
|
||||
# 2. ASR_BASE_URL - if set, uses ASR endpoint connector
|
||||
# 3. TRANSCRIPTION_MODEL contains 'gpt-4o' - uses OpenAI Transcribe connector
|
||||
# 4. Default - uses OpenAI Whisper connector with TRANSCRIPTION_MODEL or whisper-1
|
||||
|
||||
# =============================================================================
|
||||
# TEXT GENERATION MODEL (REQUIRED for summaries, titles, chat)
|
||||
# =============================================================================
|
||||
# DictIA uses a text/LLM model for generating summaries, titles, and chat.
|
||||
# This is separate from the transcription model (STT).
|
||||
#
|
||||
# You can use OpenRouter (recommended - access to many models) or direct OpenAI API.
|
||||
|
||||
# OpenRouter example (recommended - supports many models):
|
||||
TEXT_MODEL_BASE_URL=https://openrouter.ai/api/v1
|
||||
TEXT_MODEL_API_KEY=your_openrouter_api_key
|
||||
TEXT_MODEL_NAME=openai/gpt-4o-mini
|
||||
|
||||
# OpenAI direct example:
|
||||
# TEXT_MODEL_BASE_URL=https://api.openai.com/v1
|
||||
# TEXT_MODEL_API_KEY=sk-your_openai_api_key
|
||||
# TEXT_MODEL_NAME=gpt-4o-mini
|
||||
|
||||
# --- GPT-5 Specific Settings (only used with OpenAI API and GPT-5 models) ---
|
||||
# Reasoning effort: minimal, low, medium, high (default: medium)
|
||||
GPT5_REASONING_EFFORT=medium
|
||||
# Verbosity: low, medium, high (default: medium)
|
||||
GPT5_VERBOSITY=medium
|
||||
|
||||
# --- Chat Model Configuration (Optional) ---
|
||||
# Configure a separate model for real-time chat interactions.
|
||||
# If not set, chat will use the TEXT_MODEL_* settings above.
|
||||
# CHAT_MODEL_API_KEY=your_chat_api_key
|
||||
# CHAT_MODEL_BASE_URL=https://openrouter.ai/api/v1
|
||||
# CHAT_MODEL_NAME=openai/gpt-4o
|
||||
|
||||
# =============================================================================
|
||||
# CONNECTOR SELECTION (Auto-detected if not set)
|
||||
# =============================================================================
|
||||
# Options: openai_whisper, openai_transcribe, asr_endpoint
|
||||
# Leave empty to auto-detect based on other settings
|
||||
# TRANSCRIPTION_CONNECTOR=
|
||||
|
||||
# Feature flag to enable/disable new connector architecture (default: true)
|
||||
# Set to false to use legacy code path for troubleshooting
|
||||
# USE_NEW_TRANSCRIPTION_ARCHITECTURE=true
|
||||
|
||||
# =============================================================================
|
||||
# OPENAI CONFIGURATION (Required for openai_whisper and openai_transcribe)
|
||||
# =============================================================================
|
||||
TRANSCRIPTION_API_KEY=your_openai_api_key
|
||||
TRANSCRIPTION_BASE_URL=https://api.openai.com/v1
|
||||
|
||||
# Model Selection - determines which connector is used:
|
||||
#
|
||||
# whisper-1 - Legacy Whisper model, no diarization, $0.006/min
|
||||
# Supports: srt, vtt, json, verbose_json output formats
|
||||
#
|
||||
# gpt-4o-transcribe - High quality transcription, no diarization, $0.006/min
|
||||
# Better accuracy than whisper-1, accepts prompts
|
||||
#
|
||||
# gpt-4o-mini-transcribe - Cost-effective option, no diarization, $0.003/min
|
||||
# Good for high-volume, budget-conscious use
|
||||
#
|
||||
# gpt-4o-transcribe-diarize - Speaker diarization support, $0.006/min
|
||||
# Identifies speakers as A, B, C, D...
|
||||
# Requires chunking_strategy for audio >30s
|
||||
#
|
||||
TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
|
||||
|
||||
# Legacy Whisper model name (used when TRANSCRIPTION_MODEL is not set)
|
||||
# WHISPER_MODEL=whisper-1
|
||||
|
||||
# =============================================================================
|
||||
# ASR ENDPOINT CONFIGURATION (For self-hosted whisper services)
|
||||
# =============================================================================
|
||||
# Note: USE_ASR_ENDPOINT is deprecated. Just set ASR_BASE_URL instead.
|
||||
# The connector will auto-detect ASR mode when ASR_BASE_URL is set.
|
||||
# USE_ASR_ENDPOINT=true # Deprecated - kept for backwards compatibility
|
||||
|
||||
# Base URL of your ASR service (setting this enables the asr_endpoint connector)
|
||||
# Supports: whisper-asr-webservice, WhisperX, and compatible services
|
||||
# ASR_BASE_URL=http://whisper-asr:9000
|
||||
|
||||
# Request timeout in seconds (default: 1800 = 30 minutes)
|
||||
# Increase for very long audio files
|
||||
# ASR_TIMEOUT=1800
|
||||
|
||||
# Enable speaker diarization (default: true)
|
||||
# ASR_DIARIZE=true
|
||||
|
||||
# Speaker count hints (optional, helps with diarization accuracy)
|
||||
# ASR_MIN_SPEAKERS=1
|
||||
# ASR_MAX_SPEAKERS=5
|
||||
|
||||
# Return speaker embeddings for speaker identification (WhisperX only)
|
||||
# Enables automatic speaker matching across recordings
|
||||
# ASR_RETURN_SPEAKER_EMBEDDINGS=false
|
||||
|
||||
# =============================================================================
|
||||
# CHUNKING CONFIGURATION (For large files)
|
||||
# =============================================================================
|
||||
# Chunking is now connector-aware with this priority:
|
||||
# 1. Connector handles internally (openai_transcribe, asr_endpoint) → No app chunking
|
||||
# 2. ENABLE_CHUNKING=false → Disable chunking (only affects openai_whisper)
|
||||
# 3. CHUNK_LIMIT set → Use your settings
|
||||
# 4. Connector defaults → Use connector's recommended limits
|
||||
# 5. App default → 20MB size-based
|
||||
#
|
||||
# For openai_transcribe/asr_endpoint: These settings are IGNORED (connector handles it)
|
||||
# For openai_whisper: These settings control chunking behavior
|
||||
|
||||
# ENABLE_CHUNKING=false # Uncomment to disable chunking for openai_whisper
|
||||
|
||||
# Chunk limit - supports size (20MB) or duration (600s, 10m)
|
||||
CHUNK_LIMIT=20MB
|
||||
|
||||
# Overlap between chunks in seconds (helps with transcription accuracy at boundaries)
|
||||
CHUNK_OVERLAP_SECONDS=3
|
||||
|
||||
# =============================================================================
|
||||
# EXAMPLE CONFIGURATIONS (Simplified)
|
||||
# =============================================================================
|
||||
#
|
||||
# --- OpenAI with Speaker Diarization (Recommended) ---
|
||||
# Just two environment variables needed:
|
||||
# TRANSCRIPTION_API_KEY=sk-xxx
|
||||
# TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
|
||||
#
|
||||
# --- Self-hosted WhisperX (Best for privacy) ---
|
||||
# Just one environment variable needed (auto-detects ASR mode):
|
||||
# ASR_BASE_URL=http://whisper-asr:9000
|
||||
# Optional:
|
||||
# ASR_DIARIZE=true
|
||||
# ASR_RETURN_SPEAKER_EMBEDDINGS=true
|
||||
#
|
||||
# --- OpenAI Whisper (Legacy, no diarization) ---
|
||||
# TRANSCRIPTION_API_KEY=sk-xxx
|
||||
# TRANSCRIPTION_MODEL=whisper-1
|
||||
#
|
||||
# --- Custom Whisper model (local or compatible endpoint) ---
|
||||
# TRANSCRIPTION_API_KEY=not-needed
|
||||
# TRANSCRIPTION_BASE_URL=http://localhost:8080/v1
|
||||
# TRANSCRIPTION_MODEL=Systran/faster-distil-whisper-large-v3
|
||||
|
||||
# =============================================================================
|
||||
# APPLICATION SETTINGS
|
||||
# =============================================================================
|
||||
|
||||
# --- Admin User (created on first run) ---
|
||||
ADMIN_USERNAME=admin
|
||||
ADMIN_EMAIL=admin@example.com
|
||||
ADMIN_PASSWORD=changeme
|
||||
|
||||
# --- Registration & Access ---
|
||||
ALLOW_REGISTRATION=false
|
||||
# Comma-separated list of allowed email domains for registration.
|
||||
# Leave empty to allow all domains. Example: company.com,subsidiary.org
|
||||
REGISTRATION_ALLOWED_DOMAINS=
|
||||
|
||||
# --- Token Limits ---
|
||||
SUMMARY_MAX_TOKENS=8000
|
||||
CHAT_MAX_TOKENS=5000
|
||||
|
||||
# --- Timezone ---
|
||||
# Use a valid TZ database name (e.g., "America/New_York", "Europe/London", "UTC")
|
||||
TIMEZONE="UTC"
|
||||
|
||||
# --- Logging ---
|
||||
LOG_LEVEL="INFO"
|
||||
|
||||
# =============================================================================
|
||||
# AUDIO PROCESSING
|
||||
# =============================================================================
|
||||
|
||||
# --- Audio Compression ---
|
||||
# Automatically compress lossless uploads (WAV, AIFF) to save storage
|
||||
AUDIO_COMPRESS_UPLOADS=true
|
||||
|
||||
# Target codec: mp3 (lossy, smallest), flac (lossless), opus (lossy, efficient)
|
||||
AUDIO_CODEC=mp3
|
||||
|
||||
# Bitrate for lossy codecs (ignored for FLAC)
|
||||
AUDIO_BITRATE=128k
|
||||
|
||||
# Unsupported codecs - comma-separated list of codecs to exclude
|
||||
# Example: AUDIO_UNSUPPORTED_CODECS=opus,vorbis
|
||||
# AUDIO_UNSUPPORTED_CODECS=
|
||||
|
||||
# =============================================================================
|
||||
# OPTIONAL FEATURES
|
||||
# =============================================================================
|
||||
|
||||
# --- Inquire Mode (AI search across all recordings) ---
|
||||
ENABLE_INQUIRE_MODE=false
|
||||
|
||||
# --- Automated File Processing (Black Hole Directory) ---
|
||||
ENABLE_AUTO_PROCESSING=false
|
||||
# AUTO_PROCESS_MODE=admin_only
|
||||
# AUTO_PROCESS_WATCH_DIR=/data/auto-process
|
||||
|
||||
# --- Automated Export ---
|
||||
ENABLE_AUTO_EXPORT=false
|
||||
# AUTO_EXPORT_DIR=/data/exports
|
||||
|
||||
# --- Auto-Deletion & Retention ---
|
||||
ENABLE_AUTO_DELETION=false
|
||||
# GLOBAL_RETENTION_DAYS=90
|
||||
# DELETION_MODE=audio_only
|
||||
|
||||
# --- Sharing Settings ---
|
||||
ENABLE_INTERNAL_SHARING=false
|
||||
ENABLE_PUBLIC_SHARING=true
|
||||
# SHOW_USERNAMES_IN_UI=false
|
||||
|
||||
# --- Permission Controls ---
|
||||
USERS_CAN_DELETE=true
|
||||
|
||||
# Delete speaker profiles when all their recordings are removed.
|
||||
# Default: false (speaker profiles and voice embeddings are preserved)
|
||||
# Set to true for privacy-sensitive deployments where biometric voice data
|
||||
# should not outlive the recordings it was derived from.
|
||||
# DELETE_ORPHANED_SPEAKERS=false
|
||||
|
||||
# --- Video Retention ---
|
||||
# When enabled, uploaded video files keep their video stream for in-browser playback
|
||||
# The audio is extracted to a temp file for transcription, then cleaned up
|
||||
# Default: false (video uploads extract audio only, video stream is discarded)
|
||||
VIDEO_RETENTION=false
|
||||
|
||||
# --- Video Passthrough to ASR ---
|
||||
# Send original video files directly to ASR without extracting audio.
|
||||
# Useful for custom ASR backends that handle video internally (e.g., multi-track audio extraction).
|
||||
# When enabled, video files bypass audio extraction, codec conversion, and chunking.
|
||||
# Only affects video files — audio uploads are processed normally.
|
||||
# Default: false
|
||||
# VIDEO_PASSTHROUGH_ASR=false
|
||||
|
||||
# --- Concurrent Uploads ---
|
||||
# Maximum number of simultaneous file uploads (default: 3)
|
||||
MAX_CONCURRENT_UPLOADS=3
|
||||
|
||||
# =============================================================================
|
||||
# BACKGROUND PROCESSING
|
||||
# =============================================================================
|
||||
|
||||
# Transcription queue workers (default: 2)
|
||||
JOB_QUEUE_WORKERS=2
|
||||
|
||||
# Summary queue workers (default: 2)
|
||||
SUMMARY_QUEUE_WORKERS=2
|
||||
|
||||
# Maximum retry attempts for failed jobs (default: 3)
|
||||
JOB_MAX_RETRIES=3
|
||||
|
||||
# =============================================================================
|
||||
# DOCKER/DATABASE SETTINGS
|
||||
# =============================================================================
|
||||
|
||||
# Database URI - SQLite (default) or PostgreSQL
|
||||
SQLALCHEMY_DATABASE_URI=sqlite:////data/instance/transcriptions.db
|
||||
# For PostgreSQL: postgresql://username:password@hostname:5432/database_name
|
||||
|
||||
UPLOAD_FOLDER=/data/uploads
|
||||
|
||||
# =============================================================================
|
||||
# FUTURE: Additional Provider Notes
|
||||
# =============================================================================
|
||||
# The connector architecture is designed to support additional providers.
|
||||
# Future connectors may include:
|
||||
#
|
||||
# - Deepgram: Known for excellent diarization and real-time transcription
|
||||
# - AssemblyAI: Strong diarization with speaker labels
|
||||
# - Google Cloud Speech-to-Text: Enterprise-grade with speaker diarization
|
||||
#
|
||||
# To request a new connector, please open an issue on GitHub.
|
||||
Reference in New Issue
Block a user