dictia-public/src/services/document.py

"""
Document processing and conversion services.
"""

import re
from docx import Document
from docx.shared import Pt, RGBColor


def process_markdown_to_docx(doc, content):
    """Convert markdown content to properly formatted Word document elements.

    Supports:
    - Tables (markdown pipe tables)
    - Headings (# ## ###)
    - Bold text (**text**)
    - Italic text (*text* or _text_)
    - Bold italic (***text***)
    - Inline code (`code`)
    - Code blocks (```code```)
    - Strikethrough (~~text~~)
    - Links ([text](url))
    - Bullet lists (- or *)
    - Numbered lists (1. 2. 3.)
    - Horizontal rules (--- or ***)
    """
    from docx.shared import RGBColor, Pt
    from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
    from docx.oxml.ns import qn
    import re

    def ensure_unicode_font(run, text):
        """Ensure the run uses a font that supports the characters in the text."""
        # Check if text contains non-ASCII characters
        try:
            text.encode('ascii')
            # Text is pure ASCII, no special font needed
        except UnicodeEncodeError:
            # Text contains non-ASCII characters, use a font with better Unicode support
            # Use Arial for broad compatibility - it has good Unicode support on most systems
            run.font.name = 'Arial'
            # Set the East Asian font for CJK (Chinese, Japanese, Korean) text
            # This ensures proper rendering in Word
            r = run._element
            r.rPr.rFonts.set(qn('w:eastAsia'), 'Arial')
        return run

    def add_formatted_run(paragraph, text):
        """Add a run with inline formatting to a paragraph."""
        if not text:
            return

        # Pattern for all inline formatting
        # Order matters: check triple asterisk before double/single
        patterns = [
            (r'\*\*\*(.*?)\*\*\*', lambda p, t: (lambda r: (setattr(r, 'bold', True), setattr(r, 'italic', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Bold italic
            (r'\*\*(.*?)\*\*', lambda p, t: (lambda r: (setattr(r, 'bold', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Bold
            (r'(?<!\*)\*(?!\*)(.*?)\*(?!\*)', lambda p, t: (lambda r: (setattr(r, 'italic', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Italic with *
            (r'\b_(.*?)_\b', lambda p, t: (lambda r: (setattr(r, 'italic', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Italic with _
            (r'~~(.*?)~~', lambda p, t: (lambda r: (setattr(r, 'strike', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Strikethrough
            (r'`([^`]+)`', lambda p, t: add_code_run(p, t)),  # Inline code
            (r'\[([^\]]+)\]\(([^)]+)\)', lambda p, t, u: add_link_run(p, t, u)),  # Links
        ]

        def add_code_run(para, text):
            """Add inline code with monospace font and background."""
            run = para.add_run(text)
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
            run.font.color.rgb = RGBColor(220, 20, 60)  # Crimson color for code
            # Check if we need Unicode support for code
            try:
                text.encode('ascii')
            except UnicodeEncodeError:
                # Use Consolas as fallback for better Unicode support in monospace
                r = run._element
                r.rPr.rFonts.set(qn('w:eastAsia'), 'Consolas')
            return run

        def add_link_run(para, text, url):
            """Add a hyperlink-styled run (note: actual hyperlinks require more complex handling)."""
            full_text = f"{text} ({url})"
            run = para.add_run(full_text)
            run.font.color.rgb = RGBColor(0, 0, 255)  # Blue color for links
            run.font.underline = True
            ensure_unicode_font(run, full_text)
            return run

        # Process the text with all patterns
        remaining_text = text
        while remaining_text:
            earliest_match = None
            earliest_pos = len(remaining_text)
            matched_pattern = None

            # Find the earliest matching pattern
            for pattern, handler in patterns:
                match = re.search(pattern, remaining_text)
                if match and match.start() < earliest_pos:
                    earliest_match = match
                    earliest_pos = match.start()
                    matched_pattern = handler

            if earliest_match:
                # Add text before the match
                if earliest_pos > 0:
                    run = paragraph.add_run(remaining_text[:earliest_pos])
                    ensure_unicode_font(run, remaining_text[:earliest_pos])

                # Apply formatting for the matched text
                if '[' in earliest_match.group(0) and '](' in earliest_match.group(0):
                    # Special handling for links (two groups)
                    matched_pattern(paragraph, earliest_match.group(1), earliest_match.group(2))
                else:
                    matched_pattern(paragraph, earliest_match.group(1))

                # Continue with remaining text
                remaining_text = remaining_text[earliest_match.end():]
            else:
                # No more patterns, add the rest as plain text
                run = paragraph.add_run(remaining_text)
                ensure_unicode_font(run, remaining_text)
                break

    def parse_table(lines, start_idx):
        """Parse a markdown table starting at the given index."""
        if start_idx >= len(lines):
            return None, start_idx

        # Check if this looks like a table
        if '|' not in lines[start_idx]:
            return None, start_idx

        table_data = []
        idx = start_idx

        while idx < len(lines) and '|' in lines[idx]:
            # Skip separator lines
            if re.match(r'^[\s\|\-:]+$', lines[idx]):
                idx += 1
                continue

            # Parse cells
            cells = [cell.strip() for cell in lines[idx].split('|')]
            # Remove empty cells at start and end
            if cells and not cells[0]:
                cells = cells[1:]
            if cells and not cells[-1]:
                cells = cells[:-1]

            if cells:
                table_data.append(cells)
            idx += 1

        if table_data:
            return table_data, idx
        return None, start_idx

    # Split content into lines
    lines = content.split('\n')
    i = 0
    in_code_block = False
    code_block_content = []

    while i < len(lines):
        line = lines[i]

        # Handle code blocks
        if line.strip().startswith('```'):
            if not in_code_block:
                in_code_block = True
                code_block_content = []
            else:
                # End of code block - add it as preformatted text
                in_code_block = False
                if code_block_content:
                    p = doc.add_paragraph()
                    p.style = 'Normal'
                    code_text = '\n'.join(code_block_content)
                    run = p.add_run(code_text)
                    run.font.name = 'Courier New'
                    run.font.size = Pt(10)
                    run.font.color.rgb = RGBColor(64, 64, 64)
                    # Check if we need Unicode support for code blocks
                    try:
                        code_text.encode('ascii')
                    except UnicodeEncodeError:
                        r = run._element
                        r.rPr.rFonts.set(qn('w:eastAsia'), 'Consolas')
            i += 1
            continue

        if in_code_block:
            code_block_content.append(line)
            i += 1
            continue

        # Check for table
        table_data, end_idx = parse_table(lines, i)
        if table_data:
            # Create Word table
            table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
            table.style = 'Table Grid'

            # Populate table
            for row_idx, row_data in enumerate(table_data):
                for col_idx, cell_text in enumerate(row_data):
                    if col_idx < len(table.rows[row_idx].cells):
                        cell = table.rows[row_idx].cells[col_idx]
                        # Clear existing paragraphs and add new one
                        cell.text = ""
                        p = cell.add_paragraph()
                        add_formatted_run(p, cell_text)
                        # Make header row bold
                        if row_idx == 0:
                            for run in p.runs:
                                run.bold = True

            doc.add_paragraph('')  # Space after table
            i = end_idx
            continue

        line = line.rstrip()

        # Skip empty lines
        if not line:
            doc.add_paragraph('')
            i += 1
            continue

        # Horizontal rule
        if re.match(r'^(\*{3,}|-{3,}|_{3,})$', line.strip()):
            p = doc.add_paragraph('─' * 50)
            p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            i += 1
            continue

        # Headings
        if line.startswith('# '):
            doc.add_heading(line[2:], 1)
        elif line.startswith('## '):
            doc.add_heading(line[3:], 2)
        elif line.startswith('### '):
            doc.add_heading(line[4:], 3)
        elif line.startswith('#### '):
            doc.add_heading(line[5:], 4)
        # Bullet points
        elif line.lstrip().startswith('- ') or line.lstrip().startswith('* '):
            # Get the indentation level
            indent = len(line) - len(line.lstrip())
            bullet_text = line.lstrip()[2:]
            p = doc.add_paragraph(style='List Bullet')
            # Add indentation if nested
            if indent > 0:
                p.paragraph_format.left_indent = Pt(indent * 10)
            add_formatted_run(p, bullet_text)
        # Numbered lists
        elif re.match(r'^\s*\d+\.', line):
            match = re.match(r'^(\s*)(\d+)\.\s*(.*)', line)
            if match:
                indent = len(match.group(1))
                list_text = match.group(3)
                p = doc.add_paragraph(style='List Number')
                if indent > 0:
                    p.paragraph_format.left_indent = Pt(indent * 10)
                add_formatted_run(p, list_text)
        # Blockquote
        elif line.startswith('> '):
            p = doc.add_paragraph()
            p.paragraph_format.left_indent = Pt(30)
            add_formatted_run(p, line[2:])
            # Add a gray color to indicate quote
            for run in p.runs:
                run.font.color.rgb = RGBColor(100, 100, 100)
        else:
            # Regular paragraph
            p = doc.add_paragraph()
            add_formatted_run(p, line)

        i += 1

# --- Database Models ---
# --- Database Models ---
# Models have been extracted to src/models/ and imported at the top of this file

# --- Forms for Authentication ---
# --- Custom Password Validator ---
# password_check utility has been extracted to src/utils/security.py


# --- Blueprint Registration ---
# Import and register all blueprints for modular route organization