Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)
This commit is contained in:
289
config/env.transcription.example
Normal file
289
config/env.transcription.example
Normal file
@@ -0,0 +1,289 @@
|
||||
# =============================================================================
|
||||
# Transcription Connector Configuration
|
||||
# =============================================================================
|
||||
#
|
||||
# DictIA supports multiple transcription providers through a connector-based
|
||||
# architecture. This file documents all available configuration options.
|
||||
#
|
||||
# Quick Start (Simplified):
|
||||
# 1. For OpenAI with diarization: Set TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
|
||||
# 2. For self-hosted ASR: Set ASR_BASE_URL=http://your-asr:9000
|
||||
# 3. For legacy Whisper: Set TRANSCRIPTION_API_KEY and optionally TRANSCRIPTION_MODEL
|
||||
#
|
||||
# Auto-Detection Priority:
|
||||
# 1. TRANSCRIPTION_CONNECTOR - explicit connector name (if you need full control)
|
||||
# 2. ASR_BASE_URL - if set, uses ASR endpoint connector
|
||||
# 3. TRANSCRIPTION_MODEL contains 'gpt-4o' - uses OpenAI Transcribe connector
|
||||
# 4. Default - uses OpenAI Whisper connector with TRANSCRIPTION_MODEL or whisper-1
|
||||
|
||||
# =============================================================================
|
||||
# TEXT GENERATION MODEL (REQUIRED for summaries, titles, chat)
|
||||
# =============================================================================
|
||||
# DictIA uses a text/LLM model for generating summaries, titles, and chat.
|
||||
# This is separate from the transcription model (STT).
|
||||
#
|
||||
# You can use OpenRouter (recommended - access to many models) or direct OpenAI API.
|
||||
|
||||
# OpenRouter example (recommended - supports many models):
|
||||
TEXT_MODEL_BASE_URL=https://openrouter.ai/api/v1
|
||||
TEXT_MODEL_API_KEY=your_openrouter_api_key
|
||||
TEXT_MODEL_NAME=openai/gpt-4o-mini
|
||||
|
||||
# OpenAI direct example:
|
||||
# TEXT_MODEL_BASE_URL=https://api.openai.com/v1
|
||||
# TEXT_MODEL_API_KEY=sk-your_openai_api_key
|
||||
# TEXT_MODEL_NAME=gpt-4o-mini
|
||||
|
||||
# --- GPT-5 Specific Settings (only used with OpenAI API and GPT-5 models) ---
|
||||
# Reasoning effort: minimal, low, medium, high (default: medium)
|
||||
GPT5_REASONING_EFFORT=medium
|
||||
# Verbosity: low, medium, high (default: medium)
|
||||
GPT5_VERBOSITY=medium
|
||||
|
||||
# --- Chat Model Configuration (Optional) ---
|
||||
# Configure a separate model for real-time chat interactions.
|
||||
# If not set, chat will use the TEXT_MODEL_* settings above.
|
||||
# CHAT_MODEL_API_KEY=your_chat_api_key
|
||||
# CHAT_MODEL_BASE_URL=https://openrouter.ai/api/v1
|
||||
# CHAT_MODEL_NAME=openai/gpt-4o
|
||||
|
||||
# =============================================================================
|
||||
# CONNECTOR SELECTION (Auto-detected if not set)
|
||||
# =============================================================================
|
||||
# Options: openai_whisper, openai_transcribe, asr_endpoint
|
||||
# Leave empty to auto-detect based on other settings
|
||||
# TRANSCRIPTION_CONNECTOR=
|
||||
|
||||
# Feature flag to enable/disable new connector architecture (default: true)
|
||||
# Set to false to use legacy code path for troubleshooting
|
||||
# USE_NEW_TRANSCRIPTION_ARCHITECTURE=true
|
||||
|
||||
# =============================================================================
|
||||
# OPENAI CONFIGURATION (Required for openai_whisper and openai_transcribe)
|
||||
# =============================================================================
|
||||
TRANSCRIPTION_API_KEY=your_openai_api_key
|
||||
TRANSCRIPTION_BASE_URL=https://api.openai.com/v1
|
||||
|
||||
# Model Selection - determines which connector is used:
|
||||
#
|
||||
# whisper-1 - Legacy Whisper model, no diarization, $0.006/min
|
||||
# Supports: srt, vtt, json, verbose_json output formats
|
||||
#
|
||||
# gpt-4o-transcribe - High quality transcription, no diarization, $0.006/min
|
||||
# Better accuracy than whisper-1, accepts prompts
|
||||
#
|
||||
# gpt-4o-mini-transcribe - Cost-effective option, no diarization, $0.003/min
|
||||
# Good for high-volume, budget-conscious use
|
||||
#
|
||||
# gpt-4o-transcribe-diarize - Speaker diarization support, $0.006/min
|
||||
# Identifies speakers as A, B, C, D...
|
||||
# Requires chunking_strategy for audio >30s
|
||||
#
|
||||
TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
|
||||
|
||||
# Legacy Whisper model name (used when TRANSCRIPTION_MODEL is not set)
|
||||
# WHISPER_MODEL=whisper-1
|
||||
|
||||
# =============================================================================
|
||||
# ASR ENDPOINT CONFIGURATION (For self-hosted whisper services)
|
||||
# =============================================================================
|
||||
# Note: USE_ASR_ENDPOINT is deprecated. Just set ASR_BASE_URL instead.
|
||||
# The connector will auto-detect ASR mode when ASR_BASE_URL is set.
|
||||
# USE_ASR_ENDPOINT=true # Deprecated - kept for backwards compatibility
|
||||
|
||||
# Base URL of your ASR service (setting this enables the asr_endpoint connector)
|
||||
# Supports: whisper-asr-webservice, WhisperX, and compatible services
|
||||
# ASR_BASE_URL=http://whisper-asr:9000
|
||||
|
||||
# Request timeout in seconds (default: 1800 = 30 minutes)
|
||||
# Increase for very long audio files
|
||||
# ASR_TIMEOUT=1800
|
||||
|
||||
# Enable speaker diarization (default: true)
|
||||
# ASR_DIARIZE=true
|
||||
|
||||
# Speaker count hints (optional, helps with diarization accuracy)
|
||||
# ASR_MIN_SPEAKERS=1
|
||||
# ASR_MAX_SPEAKERS=5
|
||||
|
||||
# Return speaker embeddings for speaker identification (WhisperX only)
|
||||
# Enables automatic speaker matching across recordings
|
||||
# ASR_RETURN_SPEAKER_EMBEDDINGS=false
|
||||
|
||||
# =============================================================================
|
||||
# CHUNKING CONFIGURATION (For large files)
|
||||
# =============================================================================
|
||||
# Chunking is now connector-aware with this priority:
|
||||
# 1. Connector handles internally (openai_transcribe, asr_endpoint) → No app chunking
|
||||
# 2. ENABLE_CHUNKING=false → Disable chunking (only affects openai_whisper)
|
||||
# 3. CHUNK_LIMIT set → Use your settings
|
||||
# 4. Connector defaults → Use connector's recommended limits
|
||||
# 5. App default → 20MB size-based
|
||||
#
|
||||
# For openai_transcribe/asr_endpoint: These settings are IGNORED (connector handles it)
|
||||
# For openai_whisper: These settings control chunking behavior
|
||||
|
||||
# ENABLE_CHUNKING=false # Uncomment to disable chunking for openai_whisper
|
||||
|
||||
# Chunk limit - supports size (20MB) or duration (600s, 10m)
|
||||
CHUNK_LIMIT=20MB
|
||||
|
||||
# Overlap between chunks in seconds (helps with transcription accuracy at boundaries)
|
||||
CHUNK_OVERLAP_SECONDS=3
|
||||
|
||||
# =============================================================================
|
||||
# EXAMPLE CONFIGURATIONS (Simplified)
|
||||
# =============================================================================
|
||||
#
|
||||
# --- OpenAI with Speaker Diarization (Recommended) ---
|
||||
# Just two environment variables needed:
|
||||
# TRANSCRIPTION_API_KEY=sk-xxx
|
||||
# TRANSCRIPTION_MODEL=gpt-4o-transcribe-diarize
|
||||
#
|
||||
# --- Self-hosted WhisperX (Best for privacy) ---
|
||||
# Just one environment variable needed (auto-detects ASR mode):
|
||||
# ASR_BASE_URL=http://whisper-asr:9000
|
||||
# Optional:
|
||||
# ASR_DIARIZE=true
|
||||
# ASR_RETURN_SPEAKER_EMBEDDINGS=true
|
||||
#
|
||||
# --- OpenAI Whisper (Legacy, no diarization) ---
|
||||
# TRANSCRIPTION_API_KEY=sk-xxx
|
||||
# TRANSCRIPTION_MODEL=whisper-1
|
||||
#
|
||||
# --- Custom Whisper model (local or compatible endpoint) ---
|
||||
# TRANSCRIPTION_API_KEY=not-needed
|
||||
# TRANSCRIPTION_BASE_URL=http://localhost:8080/v1
|
||||
# TRANSCRIPTION_MODEL=Systran/faster-distil-whisper-large-v3
|
||||
|
||||
# =============================================================================
|
||||
# APPLICATION SETTINGS
|
||||
# =============================================================================
|
||||
|
||||
# --- Admin User (created on first run) ---
|
||||
ADMIN_USERNAME=admin
|
||||
ADMIN_EMAIL=admin@example.com
|
||||
ADMIN_PASSWORD=changeme
|
||||
|
||||
# --- Registration & Access ---
|
||||
ALLOW_REGISTRATION=false
|
||||
# Comma-separated list of allowed email domains for registration.
|
||||
# Leave empty to allow all domains. Example: company.com,subsidiary.org
|
||||
REGISTRATION_ALLOWED_DOMAINS=
|
||||
|
||||
# --- Token Limits ---
|
||||
SUMMARY_MAX_TOKENS=8000
|
||||
CHAT_MAX_TOKENS=5000
|
||||
|
||||
# --- Timezone ---
|
||||
# Use a valid TZ database name (e.g., "America/New_York", "Europe/London", "UTC")
|
||||
TIMEZONE="UTC"
|
||||
|
||||
# --- Logging ---
|
||||
LOG_LEVEL="INFO"
|
||||
|
||||
# =============================================================================
|
||||
# AUDIO PROCESSING
|
||||
# =============================================================================
|
||||
|
||||
# --- Audio Compression ---
|
||||
# Automatically compress lossless uploads (WAV, AIFF) to save storage
|
||||
AUDIO_COMPRESS_UPLOADS=true
|
||||
|
||||
# Target codec: mp3 (lossy, smallest), flac (lossless), opus (lossy, efficient)
|
||||
AUDIO_CODEC=mp3
|
||||
|
||||
# Bitrate for lossy codecs (ignored for FLAC)
|
||||
AUDIO_BITRATE=128k
|
||||
|
||||
# Unsupported codecs - comma-separated list of codecs to exclude
|
||||
# Example: AUDIO_UNSUPPORTED_CODECS=opus,vorbis
|
||||
# AUDIO_UNSUPPORTED_CODECS=
|
||||
|
||||
# =============================================================================
|
||||
# OPTIONAL FEATURES
|
||||
# =============================================================================
|
||||
|
||||
# --- Inquire Mode (AI search across all recordings) ---
|
||||
ENABLE_INQUIRE_MODE=false
|
||||
|
||||
# --- Automated File Processing (Black Hole Directory) ---
|
||||
ENABLE_AUTO_PROCESSING=false
|
||||
# AUTO_PROCESS_MODE=admin_only
|
||||
# AUTO_PROCESS_WATCH_DIR=/data/auto-process
|
||||
|
||||
# --- Automated Export ---
|
||||
ENABLE_AUTO_EXPORT=false
|
||||
# AUTO_EXPORT_DIR=/data/exports
|
||||
|
||||
# --- Auto-Deletion & Retention ---
|
||||
ENABLE_AUTO_DELETION=false
|
||||
# GLOBAL_RETENTION_DAYS=90
|
||||
# DELETION_MODE=audio_only
|
||||
|
||||
# --- Sharing Settings ---
|
||||
ENABLE_INTERNAL_SHARING=false
|
||||
ENABLE_PUBLIC_SHARING=true
|
||||
# SHOW_USERNAMES_IN_UI=false
|
||||
|
||||
# --- Permission Controls ---
|
||||
USERS_CAN_DELETE=true
|
||||
|
||||
# Delete speaker profiles when all their recordings are removed.
|
||||
# Default: false (speaker profiles and voice embeddings are preserved)
|
||||
# Set to true for privacy-sensitive deployments where biometric voice data
|
||||
# should not outlive the recordings it was derived from.
|
||||
# DELETE_ORPHANED_SPEAKERS=false
|
||||
|
||||
# --- Video Retention ---
|
||||
# When enabled, uploaded video files keep their video stream for in-browser playback
|
||||
# The audio is extracted to a temp file for transcription, then cleaned up
|
||||
# Default: false (video uploads extract audio only, video stream is discarded)
|
||||
VIDEO_RETENTION=false
|
||||
|
||||
# --- Video Passthrough to ASR ---
|
||||
# Send original video files directly to ASR without extracting audio.
|
||||
# Useful for custom ASR backends that handle video internally (e.g., multi-track audio extraction).
|
||||
# When enabled, video files bypass audio extraction, codec conversion, and chunking.
|
||||
# Only affects video files — audio uploads are processed normally.
|
||||
# Default: false
|
||||
# VIDEO_PASSTHROUGH_ASR=false
|
||||
|
||||
# --- Concurrent Uploads ---
|
||||
# Maximum number of simultaneous file uploads (default: 3)
|
||||
MAX_CONCURRENT_UPLOADS=3
|
||||
|
||||
# =============================================================================
|
||||
# BACKGROUND PROCESSING
|
||||
# =============================================================================
|
||||
|
||||
# Transcription queue workers (default: 2)
|
||||
JOB_QUEUE_WORKERS=2
|
||||
|
||||
# Summary queue workers (default: 2)
|
||||
SUMMARY_QUEUE_WORKERS=2
|
||||
|
||||
# Maximum retry attempts for failed jobs (default: 3)
|
||||
JOB_MAX_RETRIES=3
|
||||
|
||||
# =============================================================================
|
||||
# DOCKER/DATABASE SETTINGS
|
||||
# =============================================================================
|
||||
|
||||
# Database URI - SQLite (default) or PostgreSQL
|
||||
SQLALCHEMY_DATABASE_URI=sqlite:////data/instance/transcriptions.db
|
||||
# For PostgreSQL: postgresql://username:password@hostname:5432/database_name
|
||||
|
||||
UPLOAD_FOLDER=/data/uploads
|
||||
|
||||
# =============================================================================
|
||||
# FUTURE: Additional Provider Notes
|
||||
# =============================================================================
|
||||
# The connector architecture is designed to support additional providers.
|
||||
# Future connectors may include:
|
||||
#
|
||||
# - Deepgram: Known for excellent diarization and real-time transcription
|
||||
# - AssemblyAI: Strong diarization with speaker labels
|
||||
# - Google Cloud Speech-to-Text: Enterprise-grade with speaker diarization
|
||||
#
|
||||
# To request a new connector, please open an issue on GitHub.
|
||||
Reference in New Issue
Block a user