# File: dictia-public/config/env.whisperx.example
# -----------------------------------------------------------------------------
# DictIA Configuration: WhisperX ASR Endpoint (with Voice Profiles)
#
# ⚠️ DEPRECATION NOTICE: This configuration style is still supported but
# we recommend using the new unified configuration in env.transcription.example
# which supports all transcription providers with auto-detection.
#
# Migration: Simply set ASR_BASE_URL and the connector will auto-detect ASR mode.
# USE_ASR_ENDPOINT=true is no longer required (but still works for backwards compat).
#
# This configuration is for use with the WhisperX ASR Service:
# https://github.com/murtaza-nasir/whisperx-asr-service
#
# Features supported:
# - Speaker diarization with pyannote/speaker-diarization-community-1
# - Voice profile embeddings (256-dimensional) for speaker recognition
# - Automatic speaker matching across recordings
# - Better timestamp alignment between speakers and words
#
# Instructions:
# 1. Copy this file to a new file named .env
# cp config/env.whisperx.example .env
# 2. Fill in the required URLs, API keys, and settings below.
# 3. Set up WhisperX ASR Service (see installation guide)
# -----------------------------------------------------------------------------
# --- Text Generation Model (for summaries, titles, etc.) ---
TEXT_MODEL_BASE_URL=https://openrouter.ai/api/v1
TEXT_MODEL_API_KEY=your_openrouter_api_key
TEXT_MODEL_NAME=openai/gpt-4o-mini
# --- GPT-5 Specific Settings (only used with OpenAI API and GPT-5 models) ---
# If using GPT-5 models (gpt-5, gpt-5-mini, gpt-5-nano, gpt-5-chat-latest) with OpenAI API,
# these parameters will be used instead of temperature.
#
# Example GPT-5 configuration:
# TEXT_MODEL_BASE_URL=https://api.openai.com/v1
# TEXT_MODEL_NAME=gpt-5-mini
#
# Reasoning effort: minimal, low, medium, high (default: medium)
# - minimal: Fastest responses, minimal reasoning tokens
# - low: Fast responses with basic reasoning
# - medium: Balanced reasoning and speed (recommended)
# - high: Maximum reasoning for complex tasks
GPT5_REASONING_EFFORT=medium
#
# Verbosity: low, medium, high (default: medium)
# - low: Concise responses
# - medium: Balanced detail
# - high: Detailed explanations
GPT5_VERBOSITY=medium
# --- Auto-Identify Speaker Response Format ---
# When enabled, auto-identify uses JSON Schema response format (structured outputs)
# to constrain LLM output to valid SPEAKER_XX keys. Falls back to json_object mode
# if the model doesn't support it. Leave disabled for widest model compatibility.
# AUTO_IDENTIFY_RESPONSE_SCHEMA=1
# --- Chat Model Configuration (Optional) ---
# Configure a separate model for real-time chat interactions.
# If not set, chat will use the TEXT_MODEL_* settings above.
#
# Use cases:
# - Use a faster model for chat while using a more capable model for summarization
# - Use a cheaper model for interactive chat to reduce costs
# - Use different service tiers for different operations
#
# CHAT_MODEL_API_KEY=your_chat_api_key
# CHAT_MODEL_BASE_URL=https://openrouter.ai/api/v1
# CHAT_MODEL_NAME=openai/gpt-4o
# --- Chat GPT-5 Settings (only used with OpenAI API and GPT-5 chat models) ---
# These settings allow independent control of GPT-5 parameters for chat.
# If not set, falls back to the main GPT5_* settings above.
#
# CHAT_GPT5_REASONING_EFFORT=medium
# CHAT_GPT5_VERBOSITY=medium
# --- Transcription Service (WhisperX ASR Endpoint) ---
# New connector architecture auto-detects ASR mode when ASR_BASE_URL is set.
# USE_ASR_ENDPOINT=true is deprecated but still works for backwards compatibility.
#
# Note: ASR endpoints handle chunking internally - CHUNK_LIMIT settings are ignored.
# WhisperX ASR Endpoint URL (setting this auto-enables ASR mode)
# For containers in same docker-compose: Use container name and internal port
# Example: http://whisperx-asr:9000 (NOT the host port or external IP)
# For external ASR: Use http://192.168.1.100:9000 or http://asr.example.com:9000
ASR_BASE_URL=http://whisperx-asr:9000
# Deprecated: No longer needed, kept for backwards compatibility
# USE_ASR_ENDPOINT=true
# Speaker diarization options
ASR_DIARIZE=true
# Hint for minimum number of speakers
# ASR_MIN_SPEAKERS=1
# Hint for maximum number of speakers
# ASR_MAX_SPEAKERS=5
# Enable speaker embeddings for voice profile matching (WhisperX only)
ASR_RETURN_SPEAKER_EMBEDDINGS=true
# --- Application Settings ---
# Set to "true" to allow user registration, "false" to disable
ALLOW_REGISTRATION=false
# Comma-separated list of allowed email domains for registration.
# Leave empty to allow all domains. Example: company.com,subsidiary.org
REGISTRATION_ALLOWED_DOMAINS=
SUMMARY_MAX_TOKENS=8000
CHAT_MAX_TOKENS=5000
# Timezone for displaying dates and times in the UI
# Use a valid TZ database name (e.g., "America/New_York", "Europe/London", "UTC")
TIMEZONE="UTC"
# Set the logging level for the application.
# Options: DEBUG, INFO, WARNING, ERROR
LOG_LEVEL="INFO"
# --- Audio Compression ---
# Automatically compress lossless uploads (WAV, AIFF) to save storage
AUDIO_COMPRESS_UPLOADS=true
# Target codec: mp3 (lossy, smallest), flac (lossless), opus (lossy, efficient)
AUDIO_CODEC=mp3
# Bitrate for lossy codecs (ignored for FLAC)
AUDIO_BITRATE=128k
# --- Admin User (created on first run) ---
ADMIN_USERNAME=admin
ADMIN_EMAIL=admin@example.com
ADMIN_PASSWORD=changeme
# --- Inquire Mode (AI search across all recordings) ---
# Set to "true" to enable semantic search and chat across all recordings
# Requires additional dependencies (already included in Docker image)
ENABLE_INQUIRE_MODE=false
# --- Automated File Processing (Black Hole Directory) ---
# Set to "true" to enable automated file processing
ENABLE_AUTO_PROCESSING=false
# --- Automated Export Settings ---
# Automatically export transcriptions and summaries to markdown files
ENABLE_AUTO_EXPORT=false
# Directory where exports will be saved (per-user subdirectories created automatically)
AUTO_EXPORT_DIR=/data/exports
# What to include in exports
AUTO_EXPORT_TRANSCRIPTION=true
AUTO_EXPORT_SUMMARY=true
# Processing mode: admin_only, user_directories, or single_user
AUTO_PROCESS_MODE=admin_only
# Directory to watch for new audio files
AUTO_PROCESS_WATCH_DIR=/data/auto-process
# How often to check for new files (seconds)
AUTO_PROCESS_CHECK_INTERVAL=30
# How long to wait (seconds) to confirm a file has stopped changing before processing.
# Increase for slow network transfers (NFS, SMB). Default: 5
# AUTO_PROCESS_STABILITY_TIME=5
# Default username for single_user mode (only used if AUTO_PROCESS_MODE=single_user)
# AUTO_PROCESS_DEFAULT_USERNAME=admin
# --- Auto-Deletion & Retention Settings ---
# Enable automated deletion of old recordings
ENABLE_AUTO_DELETION=false
# Number of days to retain recordings (0 = disabled)
# Example: 90 means recordings older than 90 days will be processed
GLOBAL_RETENTION_DAYS=90
# Deletion mode: 'audio_only' keeps transcription, 'full_recording' deletes everything
# audio_only: Deletes audio file but keeps transcription/summary/notes (recommended)
# full_recording: Permanently deletes the entire recording from database
DELETION_MODE=audio_only
# --- Permission-Based Deletion Controls ---
# Allow all users to delete their recordings, or restrict to admins only
# true: All users can delete their own recordings (default)
# false: Only admins can delete recordings
USERS_CAN_DELETE=true
# Delete speaker profiles when all their recordings are removed.
# Default: false (speaker profiles and voice embeddings are preserved)
# Set to true for privacy-sensitive deployments where biometric voice data
# should not outlive the recordings it was derived from.
# DELETE_ORPHANED_SPEAKERS=false
# --- Internal Sharing Settings ---
# Enable user-to-user sharing of recordings (works independently of groups)
ENABLE_INTERNAL_SHARING=false
# Show usernames in the UI (when sharing/viewing shared recordings)
# true: Display usernames throughout the interface
# false: Hide usernames (users must know each other's usernames to share)
SHOW_USERNAMES_IN_UI=false
# --- Public Sharing Settings ---
# Enable creation of public share links (anonymous access)
# true: Users can create public links to share recordings externally (default)
# false: Public sharing is disabled globally
ENABLE_PUBLIC_SHARING=true
# Note: Admins can control public sharing permissions per-user in the admin dashboard
# even when ENABLE_PUBLIC_SHARING is true
# --- Video Retention ---
# When enabled, uploaded video files keep their video stream for in-browser playback
# The audio is extracted to a temp file for transcription, then cleaned up
# Default: false (video uploads extract audio only, video stream is discarded)
VIDEO_RETENTION=false
# --- Concurrent Uploads ---
# Maximum number of simultaneous file uploads (default: 3)
MAX_CONCURRENT_UPLOADS=3
# --- Background Processing Queues ---
# Separate queues for transcription (slow) and summary (fast) jobs
# This prevents slow ASR jobs from blocking quick summary generation
# Transcription queue workers (for ASR processing, default: 2)
JOB_QUEUE_WORKERS=2
# Summary queue workers (for LLM summarization, default: 2)
SUMMARY_QUEUE_WORKERS=2
# Maximum retry attempts for failed jobs (default: 3)
JOB_MAX_RETRIES=3
# --- Docker Settings (rarely need to be changed) ---
# Database URI - SQLite (default) or PostgreSQL
SQLALCHEMY_DATABASE_URI=sqlite:////data/instance/transcriptions.db
# For PostgreSQL, use: postgresql://username:password@hostname:5432/database_name
# Example: postgresql://speakr:password@postgres:5432/speakr
UPLOAD_FOLDER=/data/uploads