Initial release: DictIA v0.8.14-alpha (fork de Speakr, AGPL-3.0)

2026-03-16 21:47:37 +00:00
commit 42772a31ed
365 changed files with 103572 additions and 0 deletions
--- a/src/services/document.py
+++ b/src/services/document.py
@@ -0,0 +1,296 @@
+"""
+Document processing and conversion services.
+"""
+
+import re
+from docx import Document
+from docx.shared import Pt, RGBColor
+
+
+
+def process_markdown_to_docx(doc, content):
+    """Convert markdown content to properly formatted Word document elements.
+
+    Supports:
+    - Tables (markdown pipe tables)
+    - Headings (# ## ###)
+    - Bold text (**text**)
+    - Italic text (*text* or _text_)
+    - Bold italic (***text***)
+    - Inline code (`code`)
+    - Code blocks (```code```)
+    - Strikethrough (~~text~~)
+    - Links ([text](url))
+    - Bullet lists (- or *)
+    - Numbered lists (1. 2. 3.)
+    - Horizontal rules (--- or ***)
+    """
+    from docx.shared import RGBColor, Pt
+    from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
+    from docx.oxml.ns import qn
+    import re
+
+    def ensure_unicode_font(run, text):
+        """Ensure the run uses a font that supports the characters in the text."""
+        # Check if text contains non-ASCII characters
+        try:
+            text.encode('ascii')
+            # Text is pure ASCII, no special font needed
+        except UnicodeEncodeError:
+            # Text contains non-ASCII characters, use a font with better Unicode support
+            # Use Arial for broad compatibility - it has good Unicode support on most systems
+            run.font.name = 'Arial'
+            # Set the East Asian font for CJK (Chinese, Japanese, Korean) text
+            # This ensures proper rendering in Word
+            r = run._element
+            r.rPr.rFonts.set(qn('w:eastAsia'), 'Arial')
+        return run
+
+    def add_formatted_run(paragraph, text):
+        """Add a run with inline formatting to a paragraph."""
+        if not text:
+            return
+
+        # Pattern for all inline formatting
+        # Order matters: check triple asterisk before double/single
+        patterns = [
+            (r'\*\*\*(.*?)\*\*\*', lambda p, t: (lambda r: (setattr(r, 'bold', True), setattr(r, 'italic', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Bold italic
+            (r'\*\*(.*?)\*\*', lambda p, t: (lambda r: (setattr(r, 'bold', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Bold
+            (r'(?<!\*)\*(?!\*)(.*?)\*(?!\*)', lambda p, t: (lambda r: (setattr(r, 'italic', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Italic with *
+            (r'\b_(.*?)_\b', lambda p, t: (lambda r: (setattr(r, 'italic', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Italic with _
+            (r'~~(.*?)~~', lambda p, t: (lambda r: (setattr(r, 'strike', True), ensure_unicode_font(r, t)))(p.add_run(t))),  # Strikethrough
+            (r'`([^`]+)`', lambda p, t: add_code_run(p, t)),  # Inline code
+            (r'\[([^\]]+)\]\(([^)]+)\)', lambda p, t, u: add_link_run(p, t, u)),  # Links
+        ]
+
+        def add_code_run(para, text):
+            """Add inline code with monospace font and background."""
+            run = para.add_run(text)
+            run.font.name = 'Courier New'
+            run.font.size = Pt(10)
+            run.font.color.rgb = RGBColor(220, 20, 60)  # Crimson color for code
+            # Check if we need Unicode support for code
+            try:
+                text.encode('ascii')
+            except UnicodeEncodeError:
+                # Use Consolas as fallback for better Unicode support in monospace
+                r = run._element
+                r.rPr.rFonts.set(qn('w:eastAsia'), 'Consolas')
+            return run
+
+        def add_link_run(para, text, url):
+            """Add a hyperlink-styled run (note: actual hyperlinks require more complex handling)."""
+            full_text = f"{text} ({url})"
+            run = para.add_run(full_text)
+            run.font.color.rgb = RGBColor(0, 0, 255)  # Blue color for links
+            run.font.underline = True
+            ensure_unicode_font(run, full_text)
+            return run
+
+        # Process the text with all patterns
+        remaining_text = text
+        while remaining_text:
+            earliest_match = None
+            earliest_pos = len(remaining_text)
+            matched_pattern = None
+
+            # Find the earliest matching pattern
+            for pattern, handler in patterns:
+                match = re.search(pattern, remaining_text)
+                if match and match.start() < earliest_pos:
+                    earliest_match = match
+                    earliest_pos = match.start()
+                    matched_pattern = handler
+
+            if earliest_match:
+                # Add text before the match
+                if earliest_pos > 0:
+                    run = paragraph.add_run(remaining_text[:earliest_pos])
+                    ensure_unicode_font(run, remaining_text[:earliest_pos])
+
+                # Apply formatting for the matched text
+                if '[' in earliest_match.group(0) and '](' in earliest_match.group(0):
+                    # Special handling for links (two groups)
+                    matched_pattern(paragraph, earliest_match.group(1), earliest_match.group(2))
+                else:
+                    matched_pattern(paragraph, earliest_match.group(1))
+
+                # Continue with remaining text
+                remaining_text = remaining_text[earliest_match.end():]
+            else:
+                # No more patterns, add the rest as plain text
+                run = paragraph.add_run(remaining_text)
+                ensure_unicode_font(run, remaining_text)
+                break
+
+    def parse_table(lines, start_idx):
+        """Parse a markdown table starting at the given index."""
+        if start_idx >= len(lines):
+            return None, start_idx
+
+        # Check if this looks like a table
+        if '|' not in lines[start_idx]:
+            return None, start_idx
+
+        table_data = []
+        idx = start_idx
+
+        while idx < len(lines) and '|' in lines[idx]:
+            # Skip separator lines
+            if re.match(r'^[\s\|\-:]+$', lines[idx]):
+                idx += 1
+                continue
+
+            # Parse cells
+            cells = [cell.strip() for cell in lines[idx].split('|')]
+            # Remove empty cells at start and end
+            if cells and not cells[0]:
+                cells = cells[1:]
+            if cells and not cells[-1]:
+                cells = cells[:-1]
+
+            if cells:
+                table_data.append(cells)
+            idx += 1
+
+        if table_data:
+            return table_data, idx
+        return None, start_idx
+
+    # Split content into lines
+    lines = content.split('\n')
+    i = 0
+    in_code_block = False
+    code_block_content = []
+
+    while i < len(lines):
+        line = lines[i]
+
+        # Handle code blocks
+        if line.strip().startswith('```'):
+            if not in_code_block:
+                in_code_block = True
+                code_block_content = []
+            else:
+                # End of code block - add it as preformatted text
+                in_code_block = False
+                if code_block_content:
+                    p = doc.add_paragraph()
+                    p.style = 'Normal'
+                    code_text = '\n'.join(code_block_content)
+                    run = p.add_run(code_text)
+                    run.font.name = 'Courier New'
+                    run.font.size = Pt(10)
+                    run.font.color.rgb = RGBColor(64, 64, 64)
+                    # Check if we need Unicode support for code blocks
+                    try:
+                        code_text.encode('ascii')
+                    except UnicodeEncodeError:
+                        r = run._element
+                        r.rPr.rFonts.set(qn('w:eastAsia'), 'Consolas')
+            i += 1
+            continue
+
+        if in_code_block:
+            code_block_content.append(line)
+            i += 1
+            continue
+
+        # Check for table
+        table_data, end_idx = parse_table(lines, i)
+        if table_data:
+            # Create Word table
+            table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
+            table.style = 'Table Grid'
+
+            # Populate table
+            for row_idx, row_data in enumerate(table_data):
+                for col_idx, cell_text in enumerate(row_data):
+                    if col_idx < len(table.rows[row_idx].cells):
+                        cell = table.rows[row_idx].cells[col_idx]
+                        # Clear existing paragraphs and add new one
+                        cell.text = ""
+                        p = cell.add_paragraph()
+                        add_formatted_run(p, cell_text)
+                        # Make header row bold
+                        if row_idx == 0:
+                            for run in p.runs:
+                                run.bold = True
+
+            doc.add_paragraph('')  # Space after table
+            i = end_idx
+            continue
+
+        line = line.rstrip()
+
+        # Skip empty lines
+        if not line:
+            doc.add_paragraph('')
+            i += 1
+            continue
+
+        # Horizontal rule
+        if re.match(r'^(\*{3,}|-{3,}|_{3,})$', line.strip()):
+            p = doc.add_paragraph('─' * 50)
+            p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+            i += 1
+            continue
+
+        # Headings
+        if line.startswith('# '):
+            doc.add_heading(line[2:], 1)
+        elif line.startswith('## '):
+            doc.add_heading(line[3:], 2)
+        elif line.startswith('### '):
+            doc.add_heading(line[4:], 3)
+        elif line.startswith('#### '):
+            doc.add_heading(line[5:], 4)
+        # Bullet points
+        elif line.lstrip().startswith('- ') or line.lstrip().startswith('* '):
+            # Get the indentation level
+            indent = len(line) - len(line.lstrip())
+            bullet_text = line.lstrip()[2:]
+            p = doc.add_paragraph(style='List Bullet')
+            # Add indentation if nested
+            if indent > 0:
+                p.paragraph_format.left_indent = Pt(indent * 10)
+            add_formatted_run(p, bullet_text)
+        # Numbered lists
+        elif re.match(r'^\s*\d+\.', line):
+            match = re.match(r'^(\s*)(\d+)\.\s*(.*)', line)
+            if match:
+                indent = len(match.group(1))
+                list_text = match.group(3)
+                p = doc.add_paragraph(style='List Number')
+                if indent > 0:
+                    p.paragraph_format.left_indent = Pt(indent * 10)
+                add_formatted_run(p, list_text)
+        # Blockquote
+        elif line.startswith('> '):
+            p = doc.add_paragraph()
+            p.paragraph_format.left_indent = Pt(30)
+            add_formatted_run(p, line[2:])
+            # Add a gray color to indicate quote
+            for run in p.runs:
+                run.font.color.rgb = RGBColor(100, 100, 100)
+        else:
+            # Regular paragraph
+            p = doc.add_paragraph()
+            add_formatted_run(p, line)
+
+        i += 1
+
+# --- Database Models ---
+# Models have been extracted to src/models/. This module does not import
+# them: its only imports are re and python-docx.
+
+# --- Forms for Authentication ---
+# --- Custom Password Validator ---
+# password_check utility has been extracted to src/utils/security.py
+
+
+# --- Blueprint Registration ---
+# Import and register all blueprints for modular route organization
+
+
+