# File: dictia-public/config/env.transcription.example

# =============================================================================
# Transcription Connector Configuration
# =============================================================================
#
# DictIA supports multiple transcription providers through a connector-based
# architecture. This file documents all available configuration options.
#
# Quick Start (Simplified):
# 1. For OpenAI with diarization: Set TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
# 2. For self-hosted ASR: Set ASR_BASE_URL=http://your-asr:9000
# 3. For legacy Whisper: Set TRANSCRIPTION_API_KEY and optionally TRANSCRIPTION_MODEL
#
# Auto-Detection Priority:
# 1. TRANSCRIPTION_CONNECTOR - explicit connector name (if you need full control)
# 2. ASR_BASE_URL - if set, uses ASR endpoint connector
# 3. TRANSCRIPTION_MODEL contains 'gpt-4o' - uses OpenAI Transcribe connector
# 4. Default - uses OpenAI Whisper connector with TRANSCRIPTION_MODEL or whisper-1
# =============================================================================
# TEXT GENERATION MODEL (REQUIRED for summaries, titles, chat)
# =============================================================================
# DictIA uses a text/LLM model for generating summaries, titles, and chat.
# This is separate from the transcription model (STT).
#
# You can use OpenRouter (recommended - access to many models) or direct OpenAI API.
# OpenRouter example (recommended - supports many models):
TEXT_MODEL_BASE_URL=https://openrouter.ai/api/v1
TEXT_MODEL_API_KEY=your_openrouter_api_key
TEXT_MODEL_NAME=openai/gpt-4o-mini
# OpenAI direct example:
# TEXT_MODEL_BASE_URL=https://api.openai.com/v1
# TEXT_MODEL_API_KEY=sk-your_openai_api_key
# TEXT_MODEL_NAME=gpt-4o-mini
# --- GPT-5 Specific Settings (only used with OpenAI API and GPT-5 models) ---
# Reasoning effort: minimal, low, medium, high (default: medium)
GPT5_REASONING_EFFORT=medium
# Verbosity: low, medium, high (default: medium)
GPT5_VERBOSITY=medium
# --- Chat Model Configuration (Optional) ---
# Configure a separate model for real-time chat interactions.
# If not set, chat will use the TEXT_MODEL_* settings above.
# CHAT_MODEL_API_KEY=your_chat_api_key
# CHAT_MODEL_BASE_URL=https://openrouter.ai/api/v1
# CHAT_MODEL_NAME=openai/gpt-4o
# =============================================================================
# CONNECTOR SELECTION (Auto-detected if not set)
# =============================================================================
# Options: openai_whisper, openai_transcribe, asr_endpoint
# Leave empty to auto-detect based on other settings
# TRANSCRIPTION_CONNECTOR=
# Feature flag to enable/disable new connector architecture (default: true)
# Set to false to use legacy code path for troubleshooting
# USE_NEW_TRANSCRIPTION_ARCHITECTURE=true
# =============================================================================
# OPENAI CONFIGURATION (Required for openai_whisper and openai_transcribe)
# =============================================================================
TRANSCRIPTION_API_KEY=your_openai_api_key
TRANSCRIPTION_BASE_URL=https://api.openai.com/v1
# Model Selection - determines which connector is used:
#
# whisper-1 - Legacy Whisper model, no diarization, $0.006/min
# Supports: srt, vtt, json, verbose_json output formats
#
# gpt-4o-transcribe - High quality transcription, no diarization, $0.006/min
# Better accuracy than whisper-1, accepts prompts
#
# gpt-4o-mini-transcribe - Cost-effective option, no diarization, $0.003/min
# Good for high-volume, budget-conscious use
#
# gpt-4o-transcribe-diarize - Speaker diarization, $0.006/min
# Identifies speakers as A, B, C, D...
# Requires chunking_strategy for audio >30s
#
TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
# Legacy Whisper model name (used when TRANSCRIPTION_MODEL is not set)
# WHISPER_MODEL=whisper-1
# =============================================================================
# ASR ENDPOINT CONFIGURATION (For self-hosted whisper services)
# =============================================================================
# Note: USE_ASR_ENDPOINT is deprecated. Just set ASR_BASE_URL instead.
# The connector will auto-detect ASR mode when ASR_BASE_URL is set.
# Deprecated - kept for backwards compatibility only; prefer setting ASR_BASE_URL
# USE_ASR_ENDPOINT=true
# Base URL of your ASR service (required for ASR mode; setting it auto-selects
# the asr_endpoint connector)
# Supports: whisper-asr-webservice, WhisperX, and compatible services
# ASR_BASE_URL=http://whisper-asr:9000
# Request timeout in seconds (default: 1800 = 30 minutes)
# Increase for very long audio files
# ASR_TIMEOUT=1800
# Enable speaker diarization (default: true)
# ASR_DIARIZE=true
# Speaker count hints (optional, helps with diarization accuracy)
# ASR_MIN_SPEAKERS=1
# ASR_MAX_SPEAKERS=5
# Return speaker embeddings for speaker identification (WhisperX only)
# Enables automatic speaker matching across recordings
# ASR_RETURN_SPEAKER_EMBEDDINGS=false
# =============================================================================
# CHUNKING CONFIGURATION (For large files)
# =============================================================================
# Chunking is now connector-aware with this priority:
# 1. Connector handles internally (openai_transcribe, asr_endpoint) → No app chunking
# 2. ENABLE_CHUNKING=false → Disable chunking (only affects openai_whisper)
# 3. CHUNK_LIMIT set → Use your settings
# 4. Connector defaults → Use connector's recommended limits
# 5. App default → 20MB size-based
#
# For openai_transcribe/asr_endpoint: These settings are IGNORED (connector handles it)
# For openai_whisper: These settings control chunking behavior
# Uncomment the next line to disable chunking for openai_whisper
# ENABLE_CHUNKING=false
# Chunk limit - supports size (20MB) or duration (600s, 10m)
CHUNK_LIMIT=20MB
# Overlap between chunks in seconds (helps with transcription accuracy at boundaries)
CHUNK_OVERLAP_SECONDS=3
# =============================================================================
# EXAMPLE CONFIGURATIONS (Simplified)
# =============================================================================
#
# --- OpenAI with Speaker Diarization (Recommended) ---
# Just two environment variables needed:
# TRANSCRIPTION_API_KEY=sk-xxx
# TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
#
# --- Self-hosted WhisperX (Best for privacy) ---
# Just one environment variable needed (auto-detects ASR mode):
# ASR_BASE_URL=http://whisper-asr:9000
# Optional:
# ASR_DIARIZE=true
# ASR_RETURN_SPEAKER_EMBEDDINGS=true
#
# --- OpenAI Whisper (Legacy, no diarization) ---
# TRANSCRIPTION_API_KEY=sk-xxx
# TRANSCRIPTION_MODEL=whisper-1
#
# --- Custom Whisper model (local or compatible endpoint) ---
# TRANSCRIPTION_API_KEY=not-needed
# TRANSCRIPTION_BASE_URL=http://localhost:8080/v1
# TRANSCRIPTION_MODEL=Systran/faster-distil-whisper-large-v3
# =============================================================================
# APPLICATION SETTINGS
# =============================================================================
# --- Admin User (created on first run) ---
ADMIN_USERNAME=admin
ADMIN_EMAIL=admin@example.com
ADMIN_PASSWORD=changeme
# --- Registration & Access ---
ALLOW_REGISTRATION=false
# Comma-separated list of allowed email domains for registration.
# Leave empty to allow all domains. Example: company.com,subsidiary.org
REGISTRATION_ALLOWED_DOMAINS=
# --- Token Limits ---
SUMMARY_MAX_TOKENS=8000
CHAT_MAX_TOKENS=5000
# --- Timezone ---
# Use a valid TZ database name (e.g., "America/New_York", "Europe/London", "UTC")
TIMEZONE="UTC"
# --- Logging ---
LOG_LEVEL="INFO"
# =============================================================================
# AUDIO PROCESSING
# =============================================================================
# --- Audio Compression ---
# Automatically compress lossless uploads (WAV, AIFF) to save storage
AUDIO_COMPRESS_UPLOADS=true
# Target codec: mp3 (lossy, smallest), flac (lossless), opus (lossy, efficient)
AUDIO_CODEC=mp3
# Bitrate for lossy codecs (ignored for FLAC)
AUDIO_BITRATE=128k
# Unsupported codecs - comma-separated list of codecs to exclude
# Example: AUDIO_UNSUPPORTED_CODECS=opus,vorbis
# AUDIO_UNSUPPORTED_CODECS=
# =============================================================================
# OPTIONAL FEATURES
# =============================================================================
# --- Inquire Mode (AI search across all recordings) ---
ENABLE_INQUIRE_MODE=false
# --- Automated File Processing (Black Hole Directory) ---
ENABLE_AUTO_PROCESSING=false
# AUTO_PROCESS_MODE=admin_only
# AUTO_PROCESS_WATCH_DIR=/data/auto-process
# --- Automated Export ---
ENABLE_AUTO_EXPORT=false
# AUTO_EXPORT_DIR=/data/exports
# --- Auto-Deletion & Retention ---
ENABLE_AUTO_DELETION=false
# GLOBAL_RETENTION_DAYS=90
# DELETION_MODE=audio_only
# --- Sharing Settings ---
ENABLE_INTERNAL_SHARING=false
ENABLE_PUBLIC_SHARING=true
# SHOW_USERNAMES_IN_UI=false
# --- Permission Controls ---
USERS_CAN_DELETE=true
# Delete speaker profiles when all their recordings are removed.
# Default: false (speaker profiles and voice embeddings are preserved)
# Set to true for privacy-sensitive deployments where biometric voice data
# should not outlive the recordings it was derived from.
# DELETE_ORPHANED_SPEAKERS=false
# --- Video Retention ---
# When enabled, uploaded video files keep their video stream for in-browser playback
# The audio is extracted to a temp file for transcription, then cleaned up
# Default: false (video uploads extract audio only, video stream is discarded)
VIDEO_RETENTION=false
# --- Video Passthrough to ASR ---
# Send original video files directly to ASR without extracting audio.
# Useful for custom ASR backends that handle video internally (e.g., multi-track audio extraction).
# When enabled, video files bypass audio extraction, codec conversion, and chunking.
# Only affects video files — audio uploads are processed normally.
# Default: false
# VIDEO_PASSTHROUGH_ASR=false
# --- Concurrent Uploads ---
# Maximum number of simultaneous file uploads (default: 3)
MAX_CONCURRENT_UPLOADS=3
# =============================================================================
# BACKGROUND PROCESSING
# =============================================================================
# Transcription queue workers (default: 2)
JOB_QUEUE_WORKERS=2
# Summary queue workers (default: 2)
SUMMARY_QUEUE_WORKERS=2
# Maximum retry attempts for failed jobs (default: 3)
JOB_MAX_RETRIES=3
# =============================================================================
# DOCKER/DATABASE SETTINGS
# =============================================================================
# Database URI - SQLite (default) or PostgreSQL
SQLALCHEMY_DATABASE_URI=sqlite:////data/instance/transcriptions.db
# For PostgreSQL: postgresql://username:password@hostname:5432/database_name
UPLOAD_FOLDER=/data/uploads
# =============================================================================
# FUTURE: Additional Provider Notes
# =============================================================================
# The connector architecture is designed to support additional providers.
# Future connectors may include:
#
# - Deepgram: Known for excellent diarization and real-time transcription
# - AssemblyAI: Strong diarization with speaker labels
# - Google Cloud Speech-to-Text: Enterprise-grade with speaker diarization
#
# To request a new connector, please open an issue on GitHub.