# =============================================================================
# Transcription Connector Configuration
# =============================================================================
#
# DictIA supports multiple transcription providers through a connector-based
# architecture. This file documents all available configuration options.
#
# Quick Start (Simplified):
# 1. For OpenAI with diarization: Set TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
# 2. For self-hosted ASR: Set ASR_BASE_URL=http://your-asr:9000
# 3. For legacy Whisper: Set TRANSCRIPTION_API_KEY and optionally TRANSCRIPTION_MODEL
#
# Auto-Detection Priority:
# 1. TRANSCRIPTION_CONNECTOR - explicit connector name (if you need full control)
# 2. ASR_BASE_URL - if set, uses ASR endpoint connector
# 3. TRANSCRIPTION_MODEL contains 'gpt-4o' - uses OpenAI Transcribe connector
# 4. Default - uses OpenAI Whisper connector with TRANSCRIPTION_MODEL or whisper-1

# =============================================================================
# TEXT GENERATION MODEL (REQUIRED for summaries, titles, chat)
# =============================================================================
# DictIA uses a text/LLM model for generating summaries, titles, and chat.
# This is separate from the transcription model (STT).
#
# You can use OpenRouter (recommended - access to many models) or direct OpenAI API.
# OpenRouter example (recommended - supports many models):
TEXT_MODEL_BASE_URL=https://openrouter.ai/api/v1
TEXT_MODEL_API_KEY=your_openrouter_api_key
TEXT_MODEL_NAME=openai/gpt-4o-mini

# OpenAI direct example:
# TEXT_MODEL_BASE_URL=https://api.openai.com/v1
# TEXT_MODEL_API_KEY=sk-your_openai_api_key
# TEXT_MODEL_NAME=gpt-4o-mini

# --- GPT-5 Specific Settings (only used with OpenAI API and GPT-5 models) ---
# Reasoning effort: minimal, low, medium, high (default: medium)
GPT5_REASONING_EFFORT=medium
# Verbosity: low, medium, high (default: medium)
GPT5_VERBOSITY=medium

# --- Chat Model Configuration (Optional) ---
# Configure a separate model for real-time chat interactions.
# If not set, chat will use the TEXT_MODEL_* settings above.
# CHAT_MODEL_API_KEY=your_chat_api_key
# CHAT_MODEL_BASE_URL=https://openrouter.ai/api/v1
# CHAT_MODEL_NAME=openai/gpt-4o

# =============================================================================
# CONNECTOR SELECTION (Auto-detected if not set)
# =============================================================================
# Options: openai_whisper, openai_transcribe, asr_endpoint
# Leave empty to auto-detect based on other settings
# TRANSCRIPTION_CONNECTOR=

# Feature flag to enable/disable new connector architecture (default: true)
# Set to false to use legacy code path for troubleshooting
# USE_NEW_TRANSCRIPTION_ARCHITECTURE=true

# =============================================================================
# OPENAI CONFIGURATION (Required for openai_whisper and openai_transcribe)
# =============================================================================
TRANSCRIPTION_API_KEY=your_openai_api_key
TRANSCRIPTION_BASE_URL=https://api.openai.com/v1

# Model Selection - determines which connector is used:
#
# whisper-1                  - Legacy Whisper model, no diarization, $0.006/min
#                              Supports: srt, vtt, json, verbose_json output formats
#
# gpt-4o-transcribe          - High quality transcription, no diarization, $0.006/min
#                              Better accuracy than whisper-1, accepts prompts
#
# gpt-4o-mini-transcribe     - Cost-effective option, no diarization, $0.003/min
#                              Good for high-volume, budget-conscious use
#
# gpt-4o-transcribe-diarize  - Speaker diarization, $0.006/min
#                              Identifies speakers as A, B, C, D...
#                              Requires chunking_strategy for audio >30s
#
TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize

# Legacy Whisper model name (used when TRANSCRIPTION_MODEL is not set)
# WHISPER_MODEL=whisper-1

# =============================================================================
# ASR ENDPOINT CONFIGURATION (For self-hosted whisper services)
# =============================================================================
# Note: USE_ASR_ENDPOINT is deprecated. Just set ASR_BASE_URL instead.
# The connector will auto-detect ASR mode when ASR_BASE_URL is set.
# USE_ASR_ENDPOINT=true  # Deprecated - kept for backwards compatibility

# Base URL of your ASR service (setting this enables ASR mode)
# Supports: whisper-asr-webservice, WhisperX, and compatible services
# ASR_BASE_URL=http://whisper-asr:9000

# Request timeout in seconds (default: 1800 = 30 minutes)
# Increase for very long audio files
# ASR_TIMEOUT=1800

# Enable speaker diarization (default: true)
# ASR_DIARIZE=true

# Speaker count hints (optional, helps with diarization accuracy)
# ASR_MIN_SPEAKERS=1
# ASR_MAX_SPEAKERS=5

# Return speaker embeddings for speaker identification (WhisperX only)
# Enables automatic speaker matching across recordings
# ASR_RETURN_SPEAKER_EMBEDDINGS=false

# =============================================================================
# CHUNKING CONFIGURATION (For large files)
# =============================================================================
# Chunking is now connector-aware with this priority:
# 1. Connector handles internally (openai_transcribe, asr_endpoint) → No app chunking
# 2. ENABLE_CHUNKING=false → Disable chunking (only affects openai_whisper)
# 3. CHUNK_LIMIT set → Use your settings
# 4. Connector defaults → Use connector's recommended limits
# 5. App default → 20MB size-based
#
# For openai_transcribe/asr_endpoint: These settings are IGNORED (connector handles it)
# For openai_whisper: These settings control chunking behavior
# ENABLE_CHUNKING=false  # Uncomment to disable chunking for openai_whisper

# Chunk limit - supports size (20MB) or duration (600s, 10m)
CHUNK_LIMIT=20MB

# Overlap between chunks in seconds (helps with transcription accuracy at boundaries)
CHUNK_OVERLAP_SECONDS=3

# =============================================================================
# EXAMPLE CONFIGURATIONS (Simplified)
# =============================================================================
#
# --- OpenAI with Speaker Diarization (Recommended) ---
# Just two environment variables needed:
# TRANSCRIPTION_API_KEY=sk-xxx
# TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
#
# --- Self-hosted WhisperX (Best for privacy) ---
# Just one environment variable needed (auto-detects ASR mode):
# ASR_BASE_URL=http://whisper-asr:9000
# Optional:
# ASR_DIARIZE=true
# ASR_RETURN_SPEAKER_EMBEDDINGS=true
#
# --- OpenAI Whisper (Legacy, no diarization) ---
# TRANSCRIPTION_API_KEY=sk-xxx
# TRANSCRIPTION_MODEL=whisper-1
#
# --- Custom Whisper model (local or compatible endpoint) ---
# TRANSCRIPTION_API_KEY=not-needed
# TRANSCRIPTION_BASE_URL=http://localhost:8080/v1
# TRANSCRIPTION_MODEL=Systran/faster-distil-whisper-large-v3

# =============================================================================
# APPLICATION SETTINGS
# =============================================================================

# --- Admin User (created on first run) ---
ADMIN_USERNAME=admin
ADMIN_EMAIL=admin@example.com
ADMIN_PASSWORD=changeme

# --- Registration & Access ---
ALLOW_REGISTRATION=false
# Comma-separated list of allowed email domains for registration.
# Leave empty to allow all domains. Example: company.com,subsidiary.org
REGISTRATION_ALLOWED_DOMAINS=

# --- Token Limits ---
SUMMARY_MAX_TOKENS=8000
CHAT_MAX_TOKENS=5000

# --- Timezone ---
# Use a valid TZ database name (e.g., "America/New_York", "Europe/London", "UTC")
TIMEZONE="UTC"

# --- Logging ---
LOG_LEVEL="INFO"

# =============================================================================
# AUDIO PROCESSING
# =============================================================================

# --- Audio Compression ---
# Automatically compress lossless uploads (WAV, AIFF) to save storage
AUDIO_COMPRESS_UPLOADS=true
# Target codec: mp3 (lossy, smallest), flac (lossless), opus (lossy, efficient)
AUDIO_CODEC=mp3
# Bitrate for lossy codecs (ignored for FLAC)
AUDIO_BITRATE=128k

# Unsupported codecs - comma-separated list of codecs to exclude
# Example: AUDIO_UNSUPPORTED_CODECS=opus,vorbis
# AUDIO_UNSUPPORTED_CODECS=

# =============================================================================
# OPTIONAL FEATURES
# =============================================================================

# --- Inquire Mode (AI search across all recordings) ---
ENABLE_INQUIRE_MODE=false

# --- Automated File Processing (Black Hole Directory) ---
ENABLE_AUTO_PROCESSING=false
# AUTO_PROCESS_MODE=admin_only
# AUTO_PROCESS_WATCH_DIR=/data/auto-process

# --- Automated Export ---
ENABLE_AUTO_EXPORT=false
# AUTO_EXPORT_DIR=/data/exports

# --- Auto-Deletion & Retention ---
ENABLE_AUTO_DELETION=false
# GLOBAL_RETENTION_DAYS=90
# DELETION_MODE=audio_only

# --- Sharing Settings ---
ENABLE_INTERNAL_SHARING=false
ENABLE_PUBLIC_SHARING=true
# SHOW_USERNAMES_IN_UI=false

# --- Permission Controls ---
USERS_CAN_DELETE=true

# Delete speaker profiles when all their recordings are removed.
# Default: false (speaker profiles and voice embeddings are preserved)
# Set to true for privacy-sensitive deployments where biometric voice data
# should not outlive the recordings it was derived from.
# DELETE_ORPHANED_SPEAKERS=false

# --- Video Retention ---
# When enabled, uploaded video files keep their video stream for in-browser playback
# The audio is extracted to a temp file for transcription, then cleaned up
# Default: false (video uploads extract audio only, video stream is discarded)
VIDEO_RETENTION=false

# --- Video Passthrough to ASR ---
# Send original video files directly to ASR without extracting audio.
# Useful for custom ASR backends that handle video internally (e.g., multi-track audio extraction).
# When enabled, video files bypass audio extraction, codec conversion, and chunking.
# Only affects video files — audio uploads are processed normally.
# Default: false
# VIDEO_PASSTHROUGH_ASR=false

# --- Concurrent Uploads ---
# Maximum number of simultaneous file uploads (default: 3)
MAX_CONCURRENT_UPLOADS=3

# =============================================================================
# BACKGROUND PROCESSING
# =============================================================================

# Transcription queue workers (default: 2)
JOB_QUEUE_WORKERS=2
# Summary queue workers (default: 2)
SUMMARY_QUEUE_WORKERS=2
# Maximum retry attempts for failed jobs (default: 3)
JOB_MAX_RETRIES=3

# =============================================================================
# DOCKER/DATABASE SETTINGS
# =============================================================================

# Database URI - SQLite (default) or PostgreSQL
SQLALCHEMY_DATABASE_URI=sqlite:////data/instance/transcriptions.db
# For PostgreSQL: postgresql://username:password@hostname:5432/database_name

UPLOAD_FOLDER=/data/uploads

# =============================================================================
# FUTURE: Additional Provider Notes
# =============================================================================
# The connector architecture is designed to support additional providers.
# Future connectors may include:
#
# - Deepgram: Known for excellent diarization and real-time transcription
# - AssemblyAI: Strong diarization with speaker labels
# - Google Cloud Speech-to-Text: Enterprise-grade with speaker diarization
#
# To request a new connector, please open an issue on GitHub.