Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)
This commit is contained in:
296
src/services/document.py
Normal file
296
src/services/document.py
Normal file
@@ -0,0 +1,296 @@
|
||||
"""
|
||||
Document processing and conversion services.
|
||||
"""
|
||||
|
||||
import re
|
||||
from docx import Document
|
||||
from docx.shared import Pt, RGBColor
|
||||
|
||||
|
||||
|
||||
def process_markdown_to_docx(doc, content):
    """Convert markdown content to properly formatted Word document elements.

    Elements are appended to ``doc`` in source order; nothing is returned.

    Supports:
    - Tables (consecutive pipe rows that include a ``|---|---|`` delimiter row)
    - Headings (# through ######)
    - Bold text (**text**)
    - Italic text (*text* or _text_)
    - Bold italic (***text***)
    - Inline code (`code`)
    - Code blocks (```code```), including a final block whose closing fence
      is missing
    - Strikethrough (~~text~~)
    - Links ([text](url)), rendered as blue underlined "text (url)"
    - Bullet lists (- or *)
    - Numbered lists (1. 2. 3.)
    - Blockquotes (> text)
    - Horizontal rules (---, *** or ___)

    Inline formatting is not nested: the text inside a bold/italic/code span
    is emitted verbatim with that single style.

    Args:
        doc: Document object to append to; must provide ``add_paragraph``,
            ``add_heading`` and ``add_table`` (python-docx ``Document`` API).
        content: Markdown source text.
    """
    from docx.shared import RGBColor, Pt
    from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
    from docx.oxml.ns import qn
    import re

    # ------------------------------------------------------------------
    # Font helpers
    # ------------------------------------------------------------------

    def has_non_ascii(text):
        """Return True when ``text`` cannot be encoded as pure ASCII."""
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return True
        return False

    def set_east_asian_font(run, font_name):
        """Set the East Asian (CJK) font slot of ``run``.

        Relies on ``run.font.name`` having been assigned beforehand so that
        the run properties / rFonts element is already present.
        """
        run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)

    def ensure_unicode_font(run, text):
        """Ensure the run uses a font that supports the characters in the text."""
        if has_non_ascii(text):
            # Arial is used for broad compatibility; the East Asian slot is
            # set as well so CJK text uses the same font in Word.
            run.font.name = 'Arial'
            set_east_asian_font(run, 'Arial')
        return run

    # ------------------------------------------------------------------
    # Inline formatting
    # ------------------------------------------------------------------

    def add_styled_run(para, text, bold=False, italic=False, strike=False):
        """Add ``text`` as one run carrying the requested character styles."""
        run = para.add_run(text)
        if bold:
            run.bold = True
        if italic:
            run.italic = True
        if strike:
            # Strikethrough lives on the font object; Run only offers
            # bold/italic/underline shortcuts.
            run.font.strike = True
        return ensure_unicode_font(run, text)

    def add_code_run(para, text):
        """Add inline code with a monospace font and crimson color."""
        run = para.add_run(text)
        run.font.name = 'Courier New'
        run.font.size = Pt(10)
        run.font.color.rgb = RGBColor(220, 20, 60)  # Crimson color for code
        if has_non_ascii(text):
            # Consolas as the East Asian fallback for monospace text
            set_east_asian_font(run, 'Consolas')
        return run

    def add_link_run(para, text, url):
        """Add a hyperlink-styled run (not a real hyperlink: "text (url)")."""
        full_text = f"{text} ({url})"
        run = para.add_run(full_text)
        run.font.color.rgb = RGBColor(0, 0, 255)  # Blue color for links
        run.font.underline = True
        ensure_unicode_font(run, full_text)
        return run

    # (compiled pattern, handler). Each handler is called as
    # handler(paragraph, *match.groups()), so its arity must equal the
    # pattern's group count. Order matters for ties at the same position:
    # triple asterisk must be tried before double/single.
    inline_rules = [
        (re.compile(r'\*\*\*(.*?)\*\*\*'),
         lambda p, t: add_styled_run(p, t, bold=True, italic=True)),  # Bold italic
        (re.compile(r'\*\*(.*?)\*\*'),
         lambda p, t: add_styled_run(p, t, bold=True)),  # Bold
        (re.compile(r'(?<!\*)\*(?!\*)(.*?)\*(?!\*)'),
         lambda p, t: add_styled_run(p, t, italic=True)),  # Italic with *
        (re.compile(r'\b_(.*?)_\b'),
         lambda p, t: add_styled_run(p, t, italic=True)),  # Italic with _
        (re.compile(r'~~(.*?)~~'),
         lambda p, t: add_styled_run(p, t, strike=True)),  # Strikethrough
        (re.compile(r'`([^`]+)`'), add_code_run),  # Inline code
        (re.compile(r'\[([^\]]+)\]\(([^)]+)\)'), add_link_run),  # Links
    ]

    def add_formatted_run(paragraph, text):
        """Add ``text`` to ``paragraph``, converting inline markdown to runs."""
        if not text:
            return

        remaining = text
        while remaining:
            # Pick the rule whose match starts earliest; on a tie the rule
            # listed first wins (strict "<" comparison).
            best_match = None
            best_handler = None
            for pattern, handler in inline_rules:
                found = pattern.search(remaining)
                if found and (best_match is None
                              or found.start() < best_match.start()):
                    best_match, best_handler = found, handler

            if best_match is None:
                # No more patterns, add the rest as plain text
                ensure_unicode_font(paragraph.add_run(remaining), remaining)
                break

            if best_match.start() > 0:
                prefix = remaining[:best_match.start()]
                ensure_unicode_font(paragraph.add_run(prefix), prefix)

            # Dispatch on the pattern's own groups instead of sniffing the
            # matched text for "[...](...)": a bold or code span that merely
            # contains a link must not be routed to the two-group link path.
            best_handler(paragraph, *best_match.groups())
            remaining = remaining[best_match.end():]

    # ------------------------------------------------------------------
    # Block-level helpers
    # ------------------------------------------------------------------

    # A delimiter row such as "|---|:---:|" - every cell needs at least one
    # dash, so a row of empty cells ("|   |   |") is kept as data.
    separator_re = re.compile(
        r'^\s*\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?\s*$'
    )

    def parse_table(lines, start_idx):
        """Parse a markdown table starting at the given index.

        Returns ``(rows, next_idx)`` where ``rows`` is a list of lists of
        cell strings and ``next_idx`` is the first line after the table, or
        ``(None, start_idx)`` when no table starts here. A run of lines
        containing ``|`` only counts as a table when it includes a delimiter
        row, so ordinary prose containing a pipe stays a paragraph.
        """
        if start_idx >= len(lines) or '|' not in lines[start_idx]:
            return None, start_idx

        table_data = []
        saw_separator = False
        idx = start_idx

        while idx < len(lines) and '|' in lines[idx]:
            row_line = lines[idx]
            idx += 1

            if separator_re.match(row_line):
                saw_separator = True
                continue

            cells = [cell.strip() for cell in row_line.split('|')]
            # Remove the empty cells produced by leading/trailing pipes
            if cells and not cells[0]:
                cells = cells[1:]
            if cells and not cells[-1]:
                cells = cells[:-1]
            if cells:
                table_data.append(cells)

        if table_data and saw_separator:
            return table_data, idx
        return None, start_idx

    def add_table(table_data):
        """Append a 'Table Grid' table for ``table_data`` plus a spacer paragraph."""
        # Size by the widest row so cells beyond the header width are kept.
        column_count = max(len(row) for row in table_data)
        table = doc.add_table(rows=len(table_data), cols=column_count)
        table.style = 'Table Grid'

        for row_idx, row_data in enumerate(table_data):
            row_cells = table.rows[row_idx].cells
            for cell, cell_text in zip(row_cells, row_data):
                # Reuse the paragraph a new cell already contains; adding
                # another one would leave an empty line above the content.
                p = cell.paragraphs[0]
                add_formatted_run(p, cell_text)
                # Make header row bold
                if row_idx == 0:
                    for run in p.runs:
                        run.bold = True

        doc.add_paragraph('')  # Space after table

    def add_code_block(code_lines):
        """Append a fenced code block as one monospace paragraph (no-op if empty)."""
        if not code_lines:
            return
        p = doc.add_paragraph()
        p.style = 'Normal'
        code_text = '\n'.join(code_lines)
        run = p.add_run(code_text)
        run.font.name = 'Courier New'
        run.font.size = Pt(10)
        run.font.color.rgb = RGBColor(64, 64, 64)
        if has_non_ascii(code_text):
            set_east_asian_font(run, 'Consolas')

    def add_list_item(style, indent, text):
        """Append a list paragraph, indenting 10pt per leading whitespace char."""
        p = doc.add_paragraph(style=style)
        if indent > 0:
            p.paragraph_format.left_indent = Pt(indent * 10)
        add_formatted_run(p, text)

    # ------------------------------------------------------------------
    # Main line-by-line pass
    # ------------------------------------------------------------------

    lines = content.split('\n')
    i = 0
    in_code_block = False
    code_block_content = []

    while i < len(lines):
        line = lines[i]

        # Code fences toggle code-block mode; the fence line itself is dropped
        if line.strip().startswith('```'):
            if in_code_block:
                add_code_block(code_block_content)
            in_code_block = not in_code_block
            code_block_content = []
            i += 1
            continue

        if in_code_block:
            code_block_content.append(line)
            i += 1
            continue

        # Tables may span several lines, so they advance ``i`` themselves
        table_data, end_idx = parse_table(lines, i)
        if table_data:
            add_table(table_data)
            i = end_idx
            continue

        # Every remaining construct consumes exactly one line
        line = line.rstrip()
        i += 1

        # Empty lines become empty paragraphs
        if not line:
            doc.add_paragraph('')
            continue

        # Horizontal rule
        if re.match(r'^(\*{3,}|-{3,}|_{3,})$', line.strip()):
            p = doc.add_paragraph('─' * 50)
            p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            continue

        # Headings: one to six hashes followed by a single space
        heading = re.match(r'^(#{1,6}) (.*)$', line)
        if heading:
            doc.add_heading(heading.group(2), len(heading.group(1)))
            continue

        stripped = line.lstrip()

        # Bullet points
        if stripped.startswith('- ') or stripped.startswith('* '):
            add_list_item('List Bullet', len(line) - len(stripped), stripped[2:])
            continue

        # Numbered lists: whitespace is required after the dot so a line that
        # starts with a decimal number ("3.14 is ...") stays a paragraph.
        numbered = re.match(r'^(\s*)\d+\.(?:\s+(.*))?$', line)
        if numbered:
            add_list_item('List Number', len(numbered.group(1)),
                          numbered.group(2) or '')
            continue

        # Blockquote
        if line.startswith('> '):
            p = doc.add_paragraph()
            p.paragraph_format.left_indent = Pt(30)
            add_formatted_run(p, line[2:])
            # Gray text marks the quote (this also recolors code/link runs)
            for run in p.runs:
                run.font.color.rgb = RGBColor(100, 100, 100)
            continue

        # Regular paragraph
        p = doc.add_paragraph()
        add_formatted_run(p, line)

    # An unterminated fence would otherwise silently discard its content
    if in_code_block:
        add_code_block(code_block_content)
|
||||
|
||||
# --- Database Models ---
|
||||
# --- Database Models ---
|
||||
# Models live in src/models/; this module does not import them
|
||||
|
||||
# --- Forms for Authentication ---
|
||||
# --- Custom Password Validator ---
|
||||
# password_check utility has been extracted to src/utils/security.py
|
||||
|
||||
|
||||
# --- Blueprint Registration ---
|
||||
# Import and register all blueprints for modular route organization
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user