556 lines
19 KiB
Python
556 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
File Exporter for Automated Recording Export
|
|
|
|
Exports transcriptions and summaries as markdown files to a configured directory.
|
|
Supports per-user subdirectories based on username.
|
|
Supports customizable export templates with localized labels.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from werkzeug.utils import secure_filename
|
|
|
|
# Environment-driven settings for the auto-export feature.


def _env_flag(name, default):
    """Return True when env var *name* (or *default* if unset) equals 'true', ignoring case."""
    return os.environ.get(name, default).lower() == 'true'


# Master switch for the exporter; off unless explicitly set to "true".
ENABLE_AUTO_EXPORT = _env_flag('ENABLE_AUTO_EXPORT', 'false')
# Base directory that receives the per-user export folders.
AUTO_EXPORT_DIR = os.environ.get('AUTO_EXPORT_DIR', '/data/exports')
# Content selectors; both default to enabled.
AUTO_EXPORT_TRANSCRIPTION = _env_flag('AUTO_EXPORT_TRANSCRIPTION', 'true')
AUTO_EXPORT_SUMMARY = _env_flag('AUTO_EXPORT_SUMMARY', 'true')

# Module logger, pinned to INFO at import time.
logger = logging.getLogger('file_exporter')
logger.setLevel(logging.INFO)
|
|
|
|
|
|
# Matches one ``{{...}}`` placeholder whose body contains no braces or newlines.
_SEGMENT_PLACEHOLDER_RE = re.compile(r'\{\{([^{}\n]*)\}\}')


def _format_segment_time(seconds, srt=False):
    """Format *seconds* as ``HH:MM:SS``, or as SRT ``HH:MM:SS,mmm`` when *srt* is true.

    ``None`` is rendered as an all-zero timestamp.
    """
    if seconds is None:
        return "00:00:00,000" if srt else "00:00:00"
    # timedelta keeps whole microseconds, so integer division yields exact
    # milliseconds (1.001 s -> 1001 ms) instead of a truncated float product.
    total_millis = timedelta(seconds=seconds) // timedelta(milliseconds=1)
    total_secs, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    clock = f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{clock},{millis:03d}" if srt else clock


def _render_transcript_segment(template_format, index, segment):
    """Render one transcript segment by filling the placeholders of *template_format*."""
    values = {
        'index': str(index),
        # ``or`` also covers keys that are present but null in the JSON.
        'speaker': str(segment.get('speaker') or 'Unknown'),
        'text': str(segment.get('sentence') or ''),
        'start_time': _format_segment_time(segment.get('start_time')),
        'end_time': _format_segment_time(segment.get('end_time')),
    }

    def resolve(match):
        expression = match.group(1)
        if expression in values:
            return values[expression]
        name, separator, filter_name = expression.rpartition('|')
        if not separator:
            # Unknown plain placeholder: leave it in the output untouched.
            return match.group(0)
        if filter_name == 'upper':
            # Unknown variable names render as an empty string.
            return values.get(name, '').upper()
        if filter_name == 'srt' and name in ('start_time', 'end_time'):
            return _format_segment_time(segment.get(name), srt=True)
        return match.group(0)

    # A single pass over the template means substituted speaker/text values
    # are never re-scanned for placeholders or filters.
    return _SEGMENT_PLACEHOLDER_RE.sub(resolve, template_format)


def format_transcription_with_template(transcription_text, user):
    """
    Format transcription using the user's default template.

    The transcription is expected to be a JSON list of segment objects with
    the keys ``speaker``, ``sentence``, ``start_time`` and ``end_time``.
    Supported placeholders are ``{{index}}``, ``{{speaker}}``, ``{{text}}``,
    ``{{start_time}}`` and ``{{end_time}}``, the ``|upper`` filter on any of
    them, and the ``|srt`` filter on the two time placeholders.

    Args:
        transcription_text: Raw transcription (JSON or plain text)
        user: User object to get template from

    Returns:
        Formatted transcription string, or ``transcription_text`` unchanged
        when it is not a JSON list.
    """
    # Try to parse as JSON
    try:
        transcription_data = json.loads(transcription_text)
    except (json.JSONDecodeError, TypeError):
        # Not JSON, return as-is
        return transcription_text

    if not isinstance(transcription_data, list):
        # Not our expected format, return as-is
        return transcription_text

    # Import here to avoid circular imports; deferred until a template is
    # actually needed so plain-text input never touches the models.
    from src.models import TranscriptTemplate

    # Get user's default template
    template = TranscriptTemplate.query.filter_by(
        user_id=user.id,
        is_default=True
    ).first()

    # Default format if no template (or an empty one) is set
    if template and template.template:
        template_format = template.template
    else:
        template_format = "[{{speaker}}]: {{text}}"

    return '\n'.join(
        _render_transcript_segment(template_format, index, segment)
        for index, segment in enumerate(transcription_data, 1)
    )
|
|
|
|
|
|
def get_export_directory(user):
    """Get the export directory for a user, creating if needed.

    The directory is a subdirectory of AUTO_EXPORT_DIR named after the
    sanitized username.

    Args:
        user: User object; ``username`` and ``id`` are read.

    Returns:
        Path of the (now existing) per-user export directory.
    """
    base_dir = Path(AUTO_EXPORT_DIR)

    # secure_filename() can return an empty string (e.g. for a username made
    # only of characters it strips). Joining '' would resolve to the shared
    # base directory, so fall back to a name derived from the user ID.
    safe_name = secure_filename(user.username or '')
    if not safe_name:
        safe_name = f"user_{user.id}"

    user_dir = base_dir / safe_name
    user_dir.mkdir(parents=True, exist_ok=True)

    return user_dir
|
|
|
|
|
|
def generate_safe_filename(recording):
    """Return the extension-less export file name for *recording*.

    Only the recording ID goes into the name, so the name stays the same
    for as long as the ID does.
    """
    return "recording_{}".format(recording.id)
|
|
|
|
|
|
def get_export_filepath(user, recording):
    """Return the markdown export path for *recording* inside *user*'s export directory."""
    # Resolving the directory first also creates it when it is missing.
    target_dir = get_export_directory(user)
    return target_dir / (generate_safe_filename(recording) + ".md")
|
|
|
|
|
|
def mark_export_as_deleted(recording_id):
    """
    Rename the export file to indicate the recording was deleted.

    Every user subdirectory of AUTO_EXPORT_DIR is searched for
    ``recording_<id>.md``; the first match is renamed to
    ``[deleted]_recording_<id>.md``. This is a pure filesystem operation, so
    no application context or database access is involved.

    Args:
        recording_id: ID of the deleted recording

    Returns:
        New filepath (as a string) if renamed, None otherwise — including
        when auto-export is disabled or any error occurs.
    """
    if not ENABLE_AUTO_EXPORT:
        return None

    export_name = f"recording_{recording_id}.md"

    try:
        base_dir = Path(AUTO_EXPORT_DIR)
        if not base_dir.exists():
            return None

        # The owner is not known here, so check every user subdirectory.
        for user_dir in base_dir.iterdir():
            if not user_dir.is_dir():
                continue

            old_filepath = user_dir / export_name
            if not old_filepath.exists():
                continue

            new_filepath = user_dir / f"[deleted]_{export_name}"
            old_filepath.rename(new_filepath)
            logger.info(f"Marked export as deleted: {new_filepath}")
            return str(new_filepath)

        return None

    except Exception as e:
        # Best effort: never let a failed rename break the caller, but keep
        # the traceback in the log.
        logger.error(
            f"Failed to mark export as deleted for recording {recording_id}: {e}",
            exc_info=True,
        )
        return None
|
|
|
|
|
|
def format_duration(seconds):
    """Format duration in seconds to human-readable string.

    Args:
        seconds: Duration in seconds. Fractional values are truncated to
            whole seconds so the output never contains ``2.0m``-style parts.

    Returns:
        ``"<h>h <m>m <s>s"``, ``"<m>m <s>s"`` or ``"<s>s"`` depending on the
        magnitude, or an empty string for a falsy input (None, 0).
    """
    if not seconds:
        return ""

    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)

    if hours > 0:
        return f"{hours}h {minutes}m {secs}s"
    if minutes > 0:
        return f"{minutes}m {secs}s"
    return f"{secs}s"
|
|
|
|
|
|
def format_file_size(bytes_size):
    """Render a byte count with one decimal place and a B/KB/MB/GB/TB unit.

    A falsy size (None, 0) yields an empty string. Values of 1024 GB and
    above are all expressed in TB.
    """
    if not bytes_size:
        return ""

    units = ('B', 'KB', 'MB', 'GB')
    remaining = bytes_size
    position = 0
    while position < len(units):
        if remaining < 1024:
            return f"{remaining:.1f} {units[position]}"
        # Step up to the next larger unit.
        remaining = remaining / 1024
        position += 1
    return f"{remaining:.1f} TB"
|
|
|
|
|
|
def get_user_export_template(user, recording=None):
    """
    Pick the export template that applies to a recording.

    Candidates are tried in this order, and the first one that resolves to
    an existing template wins:
        1. The export_template_id of the recording's folder
        2. The export_template_id of each of the recording's tags, in order
        3. The user's default export template (is_default=True)

    Args:
        user: User object
        recording: Optional Recording object (for folder/tag lookup)

    Returns:
        ExportTemplate object or None
    """
    from src.models import ExportTemplate

    def candidate_ids():
        # Lazily yields template IDs so later sources are only inspected
        # when the earlier ones did not produce a template.
        if recording and recording.folder and recording.folder.export_template_id:
            yield recording.folder.export_template_id
        if recording and recording.tags:
            for tag in recording.tags:
                if tag.export_template_id:
                    yield tag.export_template_id

    for template_id in candidate_ids():
        found = ExportTemplate.query.get(template_id)
        if found:
            return found

    # Nothing matched on the recording itself: use the user's default.
    default_query = ExportTemplate.query.filter_by(user_id=user.id, is_default=True)
    return default_query.first()
|
|
|
|
|
|
# Matches an innermost {{#if var}}...{{/if}} block: the body may not contain
# another "{{#if " opener, so nested blocks are resolved from the inside out.
_EXPORT_CONDITIONAL_RE = re.compile(
    r'\{\{#if\s+(\w+)\}\}((?:(?!\{\{#if\s).)*?)\{\{/if\}\}',
    re.DOTALL,
)
# Matches any {{...}} placeholder whose body contains no braces.
_EXPORT_PLACEHOLDER_RE = re.compile(r'\{\{([^{}]+)\}\}')


def render_export_template(template_str, context, labels):
    """
    Render an export template with variable substitution and conditionals.

    Processing order:
        1. ``{{#if var}}...{{/if}}`` blocks are kept when ``context[var]`` is
           truthy and dropped otherwise. Blocks may be nested.
        2. ``{{label.key}}`` is replaced by ``labels[key]`` (or by ``key``
           itself when no such label exists).
        3. ``{{variable}}`` is replaced by ``context[variable]``; falsy
           values render as an empty string.

    Steps 2 and 3 happen in a single pass over the template, so text that
    comes from a substituted value is never interpreted as a placeholder.
    Placeholders that match neither a label nor a context key are left as-is.

    Args:
        template_str: Template string with {{variables}} and {{#if var}}...{{/if}} blocks
        context: Dictionary of variable values
        labels: Dictionary of localized labels

    Returns:
        Rendered string
    """
    def replace_conditional(match):
        # Keep the block body only when the variable exists and is truthy.
        return match.group(2) if context.get(match.group(1), '') else ''

    result = template_str
    # Each pass resolves the currently innermost blocks; repeat until no
    # conditional is left. Every pass removes at least one opener, so the
    # loop terminates.
    while True:
        result, replaced = _EXPORT_CONDITIONAL_RE.subn(replace_conditional, result)
        if not replaced:
            break

    def replace_placeholder(match):
        name = match.group(1)
        if name.startswith('label.'):
            key = name[len('label.'):]
            if re.fullmatch(r'\w+', key):
                label = labels.get(key, key)
                return '' if label is None else str(label)
        if name in context:
            value = context[name]
            return str(value) if value else ''
        return match.group(0)

    return _EXPORT_PLACEHOLDER_RE.sub(replace_placeholder, result)
|
|
|
|
|
|
def generate_markdown_content(recording, user, include_transcription=True, include_summary=True):
    """Build the markdown document for a recording export.

    Labels are localized using the user's ``ui_language`` (English when the
    attribute is missing or empty). When an export template applies to the
    recording it is used for rendering; otherwise the built-in default
    layout is produced.

    Args:
        recording: Recording object to export
        user: User object for getting template preferences
        include_transcription: Whether to include transcription
        include_summary: Whether to include summary
    """
    from src.utils.localization import get_export_labels, format_date_localized, format_datetime_localized

    language = getattr(user, 'ui_language', 'en') or 'en'
    labels = get_export_labels(language)

    # Lookup order is folder, then tags, then the user's default template.
    custom_template = get_user_export_template(user, recording)

    if not custom_template:
        # No template configured: default (backwards compatible) layout.
        return generate_default_markdown(
            recording, user, labels, language,
            include_transcription, include_summary
        )

    return generate_from_template(
        recording, user, custom_template.template, labels, language,
        include_transcription, include_summary
    )
|
|
|
|
|
|
def generate_from_template(recording, user, template_str, labels, user_language,
                           include_transcription=True, include_summary=True):
    """
    Render a recording through a custom export template.

    A context dictionary is built from the recording's fields (empty string
    for anything missing), the template is rendered against it, and the
    footer is appended regardless of what the template contains.

    Args:
        recording: Recording object
        user: User object
        template_str: Template string
        labels: Localized labels dictionary
        user_language: User's language code
        include_transcription: Whether to include transcription
        include_summary: Whether to include summary

    Returns:
        Rendered markdown string
    """
    from src.utils.localization import format_date_localized, format_datetime_localized

    def formatted_or_blank(value, formatter):
        # Apply the formatter only to values that are actually present.
        return formatter(value) if value else ''

    tag_list = recording.tags

    context = {
        'title': recording.title or f"Recording {recording.id}",
        'meeting_date': formatted_or_blank(
            recording.meeting_date,
            lambda moment: format_date_localized(moment, user_language),
        ),
        'created_at': formatted_or_blank(
            recording.created_at,
            lambda moment: format_datetime_localized(moment, user_language),
        ),
        'original_filename': recording.original_filename or '',
        'file_size': formatted_or_blank(recording.file_size, format_file_size),
        'participants': recording.participants or '',
        'tags': ', '.join([tag.name for tag in tag_list]) if tag_list else '',
        'transcription_duration': formatted_or_blank(
            recording.transcription_duration_seconds, format_duration
        ),
        'summarization_duration': formatted_or_blank(
            recording.summarization_duration_seconds, format_duration
        ),
        # Notes follow the summary switch: both are blank when it is off.
        'notes': (recording.notes or '') if include_summary else '',
        'summary': (recording.summary or '') if include_summary else '',
        # Placeholder; filled in below when a transcription is wanted.
        'transcription': '',
    }

    if include_transcription and recording.transcription:
        context['transcription'] = format_transcription_with_template(recording.transcription, user)

    body = render_export_template(template_str, context, labels)

    # The footer is always appended after the rendered template.
    footer = labels.get('footer', 'Generated with [Speakr](https://github.com/learnedmachine/speakr)')
    return body + f"\n\n---\n\n*{footer}*\n"
|
|
|
|
|
|
def generate_default_markdown(recording, user, labels, user_language,
                              include_transcription=True, include_summary=True):
    """
    Build the export in the built-in (backwards compatible) layout.

    The document consists of a title, a metadata bullet list, then the
    optional notes, summary and transcription sections, and a footer.
    Metadata rows and sections are emitted only for fields that have a
    value. Notes are emitted whenever present, independent of
    ``include_summary``.

    Args:
        recording: Recording object
        user: User object
        labels: Localized labels dictionary
        user_language: User's language code
        include_transcription: Whether to include transcription
        include_summary: Whether to include summary

    Returns:
        Rendered markdown string
    """
    from src.utils.localization import format_date_localized, format_datetime_localized

    doc = []

    def heading(label_key, fallback):
        # Level-2 heading followed by a blank line.
        doc.extend([f"## {labels.get(label_key, fallback)}", ""])

    def meta_row(label_key, fallback, value):
        doc.append(f"- **{labels.get(label_key, fallback)}:** {value}")

    def section(label_key, fallback, text):
        heading(label_key, fallback)
        doc.extend([text, ""])

    # Title
    title = recording.title or f"Recording {recording.id}"
    doc.extend([f"# {title}", ""])

    # Metadata bullets
    heading('metadata', 'Metadata')

    if recording.meeting_date:
        meta_row('date', 'Date', format_date_localized(recording.meeting_date, user_language))

    if recording.created_at:
        meta_row('created', 'Created', format_datetime_localized(recording.created_at, user_language))

    if recording.original_filename:
        meta_row('originalFile', 'Original File', recording.original_filename)

    if recording.file_size:
        meta_row('fileSize', 'File Size', format_file_size(recording.file_size))

    if recording.participants:
        meta_row('participants', 'Participants', recording.participants)

    if recording.tags:
        meta_row('tags', 'Tags', ', '.join([tag.name for tag in recording.tags]))

    if recording.transcription_duration_seconds:
        meta_row('transcriptionTime', 'Transcription Time',
                 format_duration(recording.transcription_duration_seconds))

    if recording.summarization_duration_seconds:
        meta_row('summarizationTime', 'Summarization Time',
                 format_duration(recording.summarization_duration_seconds))

    doc.append("")

    # Notes
    if recording.notes:
        section('notes', 'Notes', recording.notes)

    # Summary
    if include_summary and recording.summary:
        section('summary', 'Summary', recording.summary)

    # Transcription, rendered with the user's transcript template
    if include_transcription and recording.transcription:
        heading('transcription', 'Transcription')
        doc.append(format_transcription_with_template(recording.transcription, user))
        doc.append("")

    # Footer
    footer = labels.get('footer', 'Generated with [Speakr](https://github.com/learnedmachine/speakr)')
    doc.extend(["---", "", f"*{footer}*", ""])

    return "\n".join(doc)
|
|
|
|
|
|
def export_recording(recording_id):
    """
    Export a recording to markdown file.

    The file is written to the owner's export directory as
    ``recording_<id>.md`` and overwrites any previous export of the same
    recording.

    Args:
        recording_id: ID of the recording to export

    Returns:
        Path to the exported file (as a string), or None if export
        failed/disabled or there is nothing to export
    """
    if not ENABLE_AUTO_EXPORT:
        return None

    # Check if we should export anything
    if not (AUTO_EXPORT_TRANSCRIPTION or AUTO_EXPORT_SUMMARY):
        logger.warning("Auto-export is enabled but both transcription and summary export are disabled")
        return None

    # Import here to avoid circular imports
    from src.app import app, db
    from src.models import Recording, User

    with app.app_context():
        try:
            recording = db.session.get(Recording, recording_id)
            if not recording:
                logger.error(f"Recording {recording_id} not found for export")
                return None

            # Get the owner
            user = db.session.get(User, recording.user_id)
            if not user:
                logger.error(f"User not found for recording {recording_id}")
                return None

            # Only content that both exists and is enabled counts.
            has_transcription = bool(recording.transcription) and AUTO_EXPORT_TRANSCRIPTION
            has_summary = bool(recording.summary) and AUTO_EXPORT_SUMMARY

            if not has_transcription and not has_summary:
                logger.debug(f"Recording {recording_id} has no content to export")
                return None

            # Shared helper keeps the path layout in one place (it also
            # creates the user's directory when missing).
            filepath = get_export_filepath(user, recording)

            # Generate content
            content = generate_markdown_content(
                recording,
                user,
                include_transcription=AUTO_EXPORT_TRANSCRIPTION,
                include_summary=AUTO_EXPORT_SUMMARY
            )

            # Write to file (overwrites if exists)
            filepath.write_text(content, encoding='utf-8')

            logger.info(f"Exported recording {recording_id} to {filepath}")
            return str(filepath)

        except Exception as e:
            # Best effort: an export failure must not propagate to the
            # caller, but the traceback is kept for diagnosis.
            logger.error(f"Failed to export recording {recording_id}: {e}", exc_info=True)
            return None
|
|
|
|
|
|
def initialize_export_directory():
    """Create the base export directory at startup and log what will be exported.

    Does nothing when auto-export is disabled. Any failure is logged rather
    than raised.
    """
    if not ENABLE_AUTO_EXPORT:
        return

    # Log line for each (transcription, summary) combination that exports something.
    mode_messages = {
        (True, True): "Exporting: transcription and summary",
        (True, False): "Exporting: transcription only",
        (False, True): "Exporting: summary only",
    }

    try:
        Path(AUTO_EXPORT_DIR).mkdir(parents=True, exist_ok=True)
        logger.info(f"Auto-export enabled, directory: {AUTO_EXPORT_DIR}")

        selection = (bool(AUTO_EXPORT_TRANSCRIPTION), bool(AUTO_EXPORT_SUMMARY))
        message = mode_messages.get(selection)
        if message is None:
            logger.warning("Auto-export enabled but no content types selected")
        else:
            logger.info(message)

    except Exception as e:
        logger.error(f"Failed to initialize export directory: {e}")
|