Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)
This commit is contained in:
555
src/file_exporter.py
Normal file
555
src/file_exporter.py
Normal file
@@ -0,0 +1,555 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
File Exporter for Automated Recording Export
|
||||
|
||||
Exports transcriptions and summaries as markdown files to a configured directory.
|
||||
Supports per-user subdirectories based on username.
|
||||
Supports customizable export templates with localized labels.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from werkzeug.utils import secure_filename
|
||||
|
||||
# Configuration from environment
def _env_flag(name, default):
    """Return True when env var *name* (or *default* if unset) is 'true', ignoring case."""
    return os.environ.get(name, default).lower() == 'true'


# Master switch for the auto-export feature.
ENABLE_AUTO_EXPORT = _env_flag('ENABLE_AUTO_EXPORT', 'false')
# Base directory that receives the exported markdown files.
AUTO_EXPORT_DIR = os.environ.get('AUTO_EXPORT_DIR', '/data/exports')
# Content selection flags for the generated files.
AUTO_EXPORT_TRANSCRIPTION = _env_flag('AUTO_EXPORT_TRANSCRIPTION', 'true')
AUTO_EXPORT_SUMMARY = _env_flag('AUTO_EXPORT_SUMMARY', 'true')

# Setup logging
logger = logging.getLogger('file_exporter')
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def format_transcription_with_template(transcription_text, user):
    """
    Format transcription using the user's default template.

    The transcription is expected to be a JSON list of segment dicts with the
    keys 'speaker', 'sentence', 'start_time' and 'end_time'. Anything else
    (plain text, non-list JSON, None) is returned unchanged.

    Supported placeholders: {{index}}, {{speaker}}, {{text}}, {{start_time}},
    {{end_time}}, the generic ``|upper`` filter and the ``|srt`` filter on
    {{start_time}} / {{end_time}}. Unknown plain placeholders are left in place;
    an unknown variable with ``|upper`` renders as an empty string.

    Args:
        transcription_text: Raw transcription (JSON or plain text)
        user: User object to get template from

    Returns:
        Formatted transcription string
    """
    # Try to parse as JSON; anything that is not a list of segments is returned as-is
    try:
        transcription_data = json.loads(transcription_text)
    except (json.JSONDecodeError, TypeError):
        return transcription_text
    if not isinstance(transcription_data, list):
        return transcription_text

    # Import here to avoid circular imports (deferred until a template lookup is needed)
    from src.models import TranscriptTemplate

    # Get user's default template, falling back to the built-in format
    template = TranscriptTemplate.query.filter_by(
        user_id=user.id,
        is_default=True
    ).first()
    template_format = template.template if template else "[{{speaker}}]: {{text}}"

    def split_clock(seconds):
        """Split a duration in seconds into (hours, minutes, secs, millis)."""
        td = timedelta(seconds=seconds)
        # Work on timedelta's integer fields: float modulo arithmetic on
        # total_seconds() loses a millisecond (e.g. 1.001 s -> 0 ms).
        whole_seconds = td.days * 86400 + td.seconds
        hours, remainder = divmod(whole_seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return hours, minutes, secs, td.microseconds // 1000

    def format_time(seconds):
        """Format seconds to HH:MM:SS"""
        if seconds is None:
            return "00:00:00"
        hours, minutes, secs, _ = split_clock(seconds)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"

    def format_srt_time(seconds):
        """Format seconds to SRT format HH:MM:SS,mmm"""
        if seconds is None:
            return "00:00:00,000"
        hours, minutes, secs, millis = split_clock(seconds)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    # {{name}} or {{name|filter}}; names are restricted to word characters so a
    # match can never start at one placeholder and end inside another.
    placeholder = re.compile(r'\{\{(\w+)(?:\|(\w+))?\}\}')

    def render_segment(index, segment):
        """Render one segment by substituting every placeholder in a single pass."""
        speaker = segment.get('speaker', 'Unknown')
        sentence = segment.get('sentence', '')
        start = segment.get('start_time')
        end = segment.get('end_time')

        values = {
            'index': str(index),
            # Explicit nulls in the JSON must not crash the substitution
            'speaker': 'Unknown' if speaker is None else str(speaker),
            'text': '' if sentence is None else str(sentence),
            'start_time': format_time(start),
            'end_time': format_time(end),
        }
        srt_values = {
            'start_time': format_srt_time(start),
            'end_time': format_srt_time(end),
        }

        def substitute(match):
            name, filter_name = match.group(1), match.group(2)
            if filter_name is None:
                # Unknown variables are left in the output untouched
                return values.get(name, match.group(0))
            if filter_name == 'upper':
                return values.get(name, '').upper()
            if filter_name == 'srt' and name in srt_values:
                return srt_values[name]
            return match.group(0)

        # Single pass: text inserted from the segment is never re-scanned, so
        # a sentence containing "{{text}}" is emitted literally.
        return placeholder.sub(substitute, template_format)

    # Generate formatted transcript
    output_lines = [
        render_segment(index, segment)
        for index, segment in enumerate(transcription_data, 1)
    ]
    return '\n'.join(output_lines)
|
||||
|
||||
|
||||
def get_export_directory(user):
    """
    Get the export directory for a user, creating it if needed.

    The directory is AUTO_EXPORT_DIR/<sanitized username>. When the sanitized
    username is empty, the directory name falls back to "user_<id>" so the
    export never lands directly in the base directory.

    Args:
        user: User object (reads ``username`` and, for the fallback, ``id``)

    Returns:
        Path of the (existing) per-user export directory
    """
    base_dir = Path(AUTO_EXPORT_DIR)

    # secure_filename() may return an empty string (e.g. for a username made
    # only of characters it strips); `base_dir / ''` would be base_dir itself.
    folder_name = secure_filename(user.username or '') or f"user_{user.id}"

    user_dir = base_dir / folder_name
    user_dir.mkdir(parents=True, exist_ok=True)

    return user_dir
|
||||
|
||||
|
||||
def generate_safe_filename(recording):
    """
    Build the export filename stem for a recording.

    Only the recording ID is used, so the name stays stable when the
    recording's other fields change.

    Args:
        recording: Object exposing an ``id`` attribute

    Returns:
        Filename stem without extension, e.g. "recording_42"
    """
    return "recording_{}".format(recording.id)
|
||||
|
||||
|
||||
def get_export_filepath(user, recording):
    """
    Return the full path of the markdown export for a recording.

    Resolving the user's export directory creates it as a side effect
    (see get_export_directory).

    Args:
        user: Owner of the recording
        recording: Recording whose export path is wanted

    Returns:
        Path to "<user export dir>/<safe filename>.md"
    """
    target_dir = get_export_directory(user)
    stem = generate_safe_filename(recording)
    return target_dir / (stem + ".md")
|
||||
|
||||
|
||||
def mark_export_as_deleted(recording_id):
    """
    Rename the export file to indicate the recording was deleted.

    The owner is not looked up; instead the export base directory and each of
    its immediate subdirectories are searched for "recording_<id>.md". The
    first match is renamed to "[deleted]_recording_<id>.md". Only filesystem
    operations are performed, so no application context or database access is
    needed.

    Args:
        recording_id: ID of the deleted recording

    Returns:
        New filepath (as a string) if renamed, None otherwise (feature
        disabled, base directory missing, file not found, or an error
        occurred; errors are logged rather than raised)
    """
    if not ENABLE_AUTO_EXPORT:
        return None

    try:
        base_dir = Path(AUTO_EXPORT_DIR)
        if not base_dir.exists():
            return None

        export_name = f"recording_{recording_id}.md"

        # Check the base directory as well as the per-user subdirectories: an
        # export can sit directly in the base directory when the owner's
        # sanitized username was empty at export time.
        search_dirs = [base_dir]
        search_dirs.extend(entry for entry in base_dir.iterdir() if entry.is_dir())

        for directory in search_dirs:
            old_filepath = directory / export_name
            if not old_filepath.exists():
                continue

            new_filepath = directory / f"[deleted]_{export_name}"
            old_filepath.rename(new_filepath)
            logger.info(f"Marked export as deleted: {new_filepath}")
            return str(new_filepath)

        return None

    except Exception as e:
        # Best-effort: a failed rename must not break recording deletion.
        logger.exception(f"Failed to mark export as deleted for recording {recording_id}: {e}")
        return None
|
||||
|
||||
|
||||
def format_duration(seconds):
    """
    Format a duration in seconds as a human-readable string.

    Fractional input is truncated to whole seconds so float durations render
    as "2m 5s" rather than "2.0m 5.700000000000003s".

    Args:
        seconds: Duration in seconds (int or float); falsy values (None, 0)
            produce an empty string

    Returns:
        "<h>h <m>m <s>s", "<m>m <s>s" or "<s>s" depending on magnitude,
        or "" when there is no duration
    """
    if not seconds:
        return ""

    total = int(seconds)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{hours}h {minutes}m {secs}s"
    if minutes > 0:
        return f"{minutes}m {secs}s"
    return f"{secs}s"
|
||||
|
||||
|
||||
def format_file_size(bytes_size):
    """
    Format a size in bytes as a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the unit list
    (B, KB, MB, GB) is exhausted, in which case it is reported in TB.

    Args:
        bytes_size: Size in bytes; falsy values (None, 0) produce ""

    Returns:
        Size with one decimal place and a unit, e.g. "1.5 KB"
    """
    if not bytes_size:
        return ""

    units = ('B', 'KB', 'MB', 'GB')
    size = bytes_size
    position = 0
    while position < len(units):
        if size < 1024:
            return f"{size:.1f} {units[position]}"
        size = size / 1024
        position += 1

    # Anything that survived four divisions is expressed in terabytes
    return f"{size:.1f} TB"
|
||||
|
||||
|
||||
def get_user_export_template(user, recording=None):
    """
    Pick the export template that applies to a recording.

    Candidates are tried in this order and the first one that resolves to an
    existing template wins:
    1. The export_template_id of the recording's folder
    2. The export_template_id of each of the recording's tags, in order
    3. The user's default export template (is_default=True)

    Args:
        user: User object
        recording: Optional Recording object (for folder/tag lookup)

    Returns:
        ExportTemplate object or None
    """
    from src.models import ExportTemplate

    def linked_template_ids():
        """Yield folder then tag template ids lazily, skipping unset ones."""
        if not recording:
            return
        folder = recording.folder
        if folder and folder.export_template_id:
            yield folder.export_template_id
        if recording.tags:
            for tag in recording.tags:
                if tag.export_template_id:
                    yield tag.export_template_id

    # An id may point at a template that no longer exists; keep looking then
    for template_id in linked_template_ids():
        linked = ExportTemplate.query.get(template_id)
        if linked:
            return linked

    # Nothing linked to the recording: fall back to the user's default
    default_query = ExportTemplate.query.filter_by(
        user_id=user.id,
        is_default=True
    )
    return default_query.first()
|
||||
|
||||
|
||||
def render_export_template(template_str, context, labels):
    """
    Render an export template with variable substitution and conditionals.

    Processing order:
    1. ``{{#if var}}...{{/if}}`` blocks are resolved innermost-first, so nested
       conditionals pair up correctly. A block is kept when ``context[var]``
       is truthy and dropped otherwise.
    2. ``{{label.key}}`` and ``{{variable}}`` placeholders are substituted in a
       single pass. Substituted values are never re-scanned, so user content
       (notes, summary, ...) containing placeholder-like text is emitted
       literally.

    Unknown labels render as their key; unknown variables are left in place;
    falsy context values render as an empty string.

    Args:
        template_str: Template string with {{variables}} and {{#if var}}...{{/if}} blocks
        context: Dictionary of variable values
        labels: Dictionary of localized labels

    Returns:
        Rendered string
    """
    # A conditional whose body contains no further opening tag, i.e. an
    # innermost block. DOTALL lets a block span multiple lines.
    innermost_conditional = re.compile(
        r'\{\{#if\s+(\w+)\}\}((?:(?!\{\{#if\s+\w+\}\}).)*?)\{\{/if\}\}',
        flags=re.DOTALL,
    )

    def replace_conditional(match):
        var_name, content = match.group(1), match.group(2)
        return content if context.get(var_name, '') else ''

    result = template_str
    # Each pass resolves the current innermost blocks, exposing their parents
    # for the next pass. Every replacement shortens the string, so this ends.
    while True:
        result, replaced = innermost_conditional.subn(replace_conditional, result)
        if not replaced:
            break

    label_key = re.compile(r'label\.(\w+)')

    def replace_placeholder(match):
        name = match.group(1)
        label_match = label_key.fullmatch(name)
        if label_match:
            key = label_match.group(1)
            return labels.get(key, key)
        if name in context:
            value = context[name]
            return str(value) if value else ''
        # Not a known variable: leave the placeholder untouched
        return match.group(0)

    return re.sub(r'\{\{([^{}]+)\}\}', replace_placeholder, result)
|
||||
|
||||
|
||||
def generate_markdown_content(recording, user, include_transcription=True, include_summary=True):
    """
    Build the markdown export for a recording.

    Labels are localized with the user's ``ui_language`` (English when unset).
    If an export template applies (folder, tag or user default) it is used;
    otherwise the built-in default layout is generated.

    Args:
        recording: Recording object to export
        user: User object for getting template preferences
        include_transcription: Whether to include transcription
        include_summary: Whether to include summary

    Returns:
        Markdown document as a string
    """
    from src.utils.localization import get_export_labels, format_date_localized, format_datetime_localized

    # Language preference drives both labels and date formatting
    user_language = getattr(user, 'ui_language', 'en') or 'en'
    labels = get_export_labels(user_language)

    # Folder template, then tag template, then the user's default
    export_template = get_user_export_template(user, recording)

    if not export_template:
        # No custom template: keep the backwards compatible default layout
        return generate_default_markdown(
            recording, user, labels, user_language,
            include_transcription, include_summary
        )

    return generate_from_template(
        recording, user, export_template.template, labels, user_language,
        include_transcription, include_summary
    )
|
||||
|
||||
|
||||
def generate_from_template(recording, user, template_str, labels, user_language,
                           include_transcription=True, include_summary=True):
    """
    Generate markdown content using a custom template.

    A context of template variables is built from the recording (missing
    values become empty strings), the template is rendered with it, and the
    footer is appended after the rendered template.

    Args:
        recording: Recording object
        user: User object
        template_str: Template string
        labels: Localized labels dictionary
        user_language: User's language code
        include_transcription: Whether to include transcription
        include_summary: Whether to include summary (also controls notes)

    Returns:
        Rendered markdown string
    """
    from src.utils.localization import format_date_localized, format_datetime_localized

    def localized(value, formatter):
        """Format a date-like value for the user's language, or '' if unset."""
        return formatter(value, user_language) if value else ''

    def optional(value, formatter):
        """Apply formatter to a value, or return '' if the value is unset."""
        return formatter(value) if value else ''

    if recording.tags:
        tag_list = ', '.join([tag.name for tag in recording.tags])
    else:
        tag_list = ''

    # Notes follow the summary setting
    if include_summary:
        notes_text = recording.notes or ''
        summary_text = recording.summary or ''
    else:
        notes_text = ''
        summary_text = ''

    # All variables available to the template
    context = {
        'title': recording.title or f"Recording {recording.id}",
        'meeting_date': localized(recording.meeting_date, format_date_localized),
        'created_at': localized(recording.created_at, format_datetime_localized),
        'original_filename': recording.original_filename or '',
        'file_size': optional(recording.file_size, format_file_size),
        'participants': recording.participants or '',
        'tags': tag_list,
        'transcription_duration': optional(recording.transcription_duration_seconds, format_duration),
        'summarization_duration': optional(recording.summarization_duration_seconds, format_duration),
        'notes': notes_text,
        'summary': summary_text,
        'transcription': '',  # Filled in below when requested and available
    }

    if include_transcription and recording.transcription:
        context['transcription'] = format_transcription_with_template(recording.transcription, user)

    rendered = render_export_template(template_str, context, labels)

    # The footer is appended regardless of what the template contains
    footer = labels.get('footer', 'Generated with [Speakr](https://github.com/learnedmachine/speakr)')
    return rendered + f"\n\n---\n\n*{footer}*\n"
|
||||
|
||||
|
||||
def generate_default_markdown(recording, user, labels, user_language,
                              include_transcription=True, include_summary=True):
    """
    Generate markdown using the default (backwards compatible) format.

    Layout: title heading, metadata bullet list (only fields that are set),
    then the notes, summary and transcription sections when available, and a
    footer. Notes are emitted whenever the recording has them; summary and
    transcription also depend on their include flags.

    Args:
        recording: Recording object
        user: User object
        labels: Localized labels dictionary
        user_language: User's language code
        include_transcription: Whether to include transcription
        include_summary: Whether to include summary

    Returns:
        Rendered markdown string
    """
    from src.utils.localization import format_date_localized, format_datetime_localized

    out = []

    def add_bullet(label_key, fallback, value):
        """Append one '- **Label:** value' metadata entry."""
        out.append(f"- **{labels.get(label_key, fallback)}:** {value}")

    def add_heading(label_key, fallback):
        """Append a level-2 heading followed by a blank line."""
        out.append(f"## {labels.get(label_key, fallback)}")
        out.append("")

    # Title
    title = recording.title or f"Recording {recording.id}"
    out.extend([f"# {title}", ""])

    # Metadata
    add_heading('metadata', 'Metadata')

    if recording.meeting_date:
        add_bullet('date', 'Date', format_date_localized(recording.meeting_date, user_language))

    if recording.created_at:
        add_bullet('created', 'Created', format_datetime_localized(recording.created_at, user_language))

    if recording.original_filename:
        add_bullet('originalFile', 'Original File', recording.original_filename)

    if recording.file_size:
        add_bullet('fileSize', 'File Size', format_file_size(recording.file_size))

    if recording.participants:
        add_bullet('participants', 'Participants', recording.participants)

    if recording.tags:
        add_bullet('tags', 'Tags', ', '.join([tag.name for tag in recording.tags]))

    if recording.transcription_duration_seconds:
        add_bullet('transcriptionTime', 'Transcription Time',
                   format_duration(recording.transcription_duration_seconds))

    if recording.summarization_duration_seconds:
        add_bullet('summarizationTime', 'Summarization Time',
                   format_duration(recording.summarization_duration_seconds))

    out.append("")

    # Notes (if available)
    if recording.notes:
        add_heading('notes', 'Notes')
        out.extend([recording.notes, ""])

    # Summary
    if include_summary and recording.summary:
        add_heading('summary', 'Summary')
        out.extend([recording.summary, ""])

    # Transcription, formatted with the user's transcript template
    if include_transcription and recording.transcription:
        add_heading('transcription', 'Transcription')
        out.append(format_transcription_with_template(recording.transcription, user))
        out.append("")

    # Footer
    footer = labels.get('footer', 'Generated with [Speakr](https://github.com/learnedmachine/speakr)')
    out.extend(["---", "", f"*{footer}*", ""])

    return "\n".join(out)
|
||||
|
||||
|
||||
def export_recording(recording_id):
    """
    Export a recording to a markdown file.

    The file is written to the owner's export directory and overwrites any
    previous export of the same recording. Nothing is written when the
    feature is disabled, both content types are disabled, the recording or
    its owner cannot be found, or the recording has none of the enabled
    content. Errors are logged (with traceback) and reported as None rather
    than raised.

    Args:
        recording_id: ID of the recording to export

    Returns:
        Path to the exported file (as a string), or None if export failed/disabled
    """
    if not ENABLE_AUTO_EXPORT:
        return None

    # Check if we should export anything
    if not AUTO_EXPORT_TRANSCRIPTION and not AUTO_EXPORT_SUMMARY:
        logger.warning("Auto-export is enabled but both transcription and summary export are disabled")
        return None

    # Import here to avoid circular imports
    from src.app import app, db
    from src.models import Recording, User

    with app.app_context():
        try:
            recording = db.session.get(Recording, recording_id)
            if not recording:
                logger.error(f"Recording {recording_id} not found for export")
                return None

            # Get the owner
            user = db.session.get(User, recording.user_id)
            if not user:
                logger.error(f"User not found for recording {recording_id}")
                return None

            # Check if we have content to export
            has_transcription = bool(recording.transcription) and AUTO_EXPORT_TRANSCRIPTION
            has_summary = bool(recording.summary) and AUTO_EXPORT_SUMMARY

            if not has_transcription and not has_summary:
                logger.debug(f"Recording {recording_id} has no content to export")
                return None

            # Resolve the target path through the shared helper so the naming
            # scheme lives in one place (this also creates the directory).
            filepath = get_export_filepath(user, recording)

            # Generate content
            content = generate_markdown_content(
                recording,
                user,
                include_transcription=AUTO_EXPORT_TRANSCRIPTION,
                include_summary=AUTO_EXPORT_SUMMARY
            )

            # Write to file (overwrites if exists)
            filepath.write_text(content, encoding='utf-8')

            logger.info(f"Exported recording {recording_id} to {filepath}")
            return str(filepath)

        except Exception as e:
            # Best-effort: export problems must not propagate to the caller.
            logger.exception(f"Failed to export recording {recording_id}: {e}")
            return None
|
||||
|
||||
|
||||
def initialize_export_directory():
    """
    Prepare the export base directory on startup.

    Does nothing when auto-export is disabled. Otherwise creates
    AUTO_EXPORT_DIR (including parents) and logs which content types will be
    exported. Failures are logged, not raised.
    """
    if not ENABLE_AUTO_EXPORT:
        return

    try:
        Path(AUTO_EXPORT_DIR).mkdir(parents=True, exist_ok=True)
        logger.info(f"Auto-export enabled, directory: {AUTO_EXPORT_DIR}")

        wants_transcription = bool(AUTO_EXPORT_TRANSCRIPTION)
        wants_summary = bool(AUTO_EXPORT_SUMMARY)

        if not (wants_transcription or wants_summary):
            logger.warning("Auto-export enabled but no content types selected")
        elif wants_transcription and wants_summary:
            logger.info("Exporting: transcription and summary")
        elif wants_transcription:
            logger.info("Exporting: transcription only")
        else:
            logger.info("Exporting: summary only")

    except Exception as e:
        logger.error(f"Failed to initialize export directory: {e}")
|
||||
Reference in New Issue
Block a user