Source code for scitex_msword.writer

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 15:15:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/writer.py

"""
SciTeX writer document -> DOCX converter.

This module exports SciTeX documents to MS Word .docx files,
applying journal-specific styles and formatting.
"""

from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Optional

from .profiles import BaseWordProfile

# Optional import of python-docx: a failure is recorded here and re-raised from WordWriter.__init__
try:
    import docx
    from docx.document import Document as DocxDocument
    from docx.enum.style import WD_STYLE_TYPE
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.shared import Cm, Inches, Pt

    DOCX_AVAILABLE = True
    _DOCX_IMPORT_ERROR = None
except ImportError as exc:
    DOCX_AVAILABLE = False
    _DOCX_IMPORT_ERROR = exc



[docs]
class WordWriter:
    """
    Export a SciTeX writer document to a DOCX file.

    This writer handles:
    - Section headings with proper styles
    - Paragraphs with formatting
    - Figure and table captions
    - References section
    - Image embedding
    - Journal-specific template application
    """


[docs]
    def __init__(
        self,
        profile: BaseWordProfile,
        template_path: Optional[Path] = None,
    ):
        """
        Bind the writer to a Word style profile and an optional template.

        Parameters
        ----------
        profile : BaseWordProfile
            Profile that maps writer structures onto Word style names.
        template_path : Path | None
            Word file (.dotx/.docx) to start from; ``None`` means no template.

        Raises
        ------
        ImportError
            If python-docx could not be imported when this module was loaded.
        """
        if DOCX_AVAILABLE:
            self.profile = profile
            self.template_path = template_path
            return

        # Chain the import failure captured at module load so the root cause
        # stays visible in the traceback.
        message = (
            "python-docx is required for scitex.msword.WordWriter. "
            "Install it via `pip install python-docx`."
        )
        raise ImportError(message) from _DOCX_IMPORT_ERROR



[docs]
    def write(
        self,
        writer_doc: Dict[str, Any] | Any,
        path: Path,
    ) -> None:
        """
        Render a SciTeX writer document and save it as a DOCX file.

        Parameters
        ----------
        writer_doc : dict | Any
            Either a mapping holding a ``"blocks"`` list (and optionally an
            ``"images"`` list), or any iterable of block dicts.
        path : Path
            Destination of the DOCX file.
        """
        # Start from the template only when one is configured and present on
        # disk; its existing body content is removed before anything is added.
        template = self.template_path
        if not (template and Path(template).exists()):
            document = docx.Document()
        else:
            document = docx.Document(str(template))
            self._clear_document_content(document)

        # Each pre-export hook receives the document and returns its
        # replacement, so hooks are chained in profile order.
        for pre_export in self.profile.pre_export_hooks:
            writer_doc = pre_export(writer_doc)

        # A dict with "blocks" is the full document form; anything else is
        # treated as a bare iterable of blocks without images.
        has_blocks = isinstance(writer_doc, dict) and "blocks" in writer_doc
        if has_blocks:
            blocks = writer_doc["blocks"]
            images = writer_doc.get("images", [])
        else:
            blocks = list(writer_doc)
            images = []

        # Index images by hash so image blocks can resolve their payload.
        image_lookup = {}
        for image in images:
            if "hash" in image:
                image_lookup[image.get("hash")] = image

        for block in blocks:
            self._add_block(document, block, image_lookup)

        if self.profile.double_anonymous:
            self._apply_double_anonymous(document, writer_doc)

        document.save(str(path))


    def _clear_document_content(self, doc: DocxDocument) -> None:
        """Clear document content while preserving styles."""
        for element in doc.element.body[:]:
            doc.element.body.remove(element)

    def _add_block(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
        image_lookup: Dict[str, Any],
    ) -> None:
        """Add a single block to the document."""
        btype = block.get("type", "paragraph")
        text = block.get("text", "")

        if not text and btype not in ("table", "image"):
            return

        if btype == "heading":
            level = block.get("level", 1)
            self._add_heading(doc, text, level)

        elif btype == "caption":
            self._add_caption(doc, block)

        elif btype == "reference-paragraph":
            self._add_reference(doc, block)

        elif btype == "table":
            self._add_table(doc, block)

        elif btype == "image":
            self._add_image(doc, block, image_lookup)

        elif btype == "list-item":
            self._add_list_item(doc, block)

        else:
            # Default: paragraph
            self._add_paragraph(doc, text, block.get("runs"))

    def _add_heading(
        self,
        doc: DocxDocument,
        text: str,
        level: int,
    ) -> None:
        """Add a heading paragraph at the given logical level."""
        style_name = self.profile.heading_styles.get(level)

        if style_name and self._style_exists(doc, style_name):
            p = doc.add_paragraph(text)
            p.style = style_name
        else:
            # Fallback to built-in heading
            doc.add_heading(text, level=min(level, 9))

    def _add_paragraph(
        self,
        doc: DocxDocument,
        text: str,
        runs: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """Add a paragraph with optional formatted runs."""
        p = doc.add_paragraph()

        if runs:
            # Add formatted runs
            for run_data in runs:
                run = p.add_run(run_data.get("text", ""))
                if run_data.get("bold"):
                    run.bold = True
                if run_data.get("italic"):
                    run.italic = True
                if run_data.get("underline"):
                    run.underline = True
                if run_data.get("font_size"):
                    run.font.size = Pt(run_data["font_size"])
                if run_data.get("font_name"):
                    run.font.name = run_data["font_name"]
        else:
            p.add_run(text)

        # Apply normal style
        if self._style_exists(doc, self.profile.normal_style):
            try:
                p.style = self.profile.normal_style
            except Exception:
                pass

    def _add_caption(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
    ) -> None:
        """Add a figure or table caption."""
        caption_type = block.get("caption_type", "")
        number = block.get("number", "")
        caption_text = block.get("caption_text", block.get("text", ""))

        # Build caption text
        if caption_type == "figure" and number:
            full_text = f"Figure {number}. {caption_text}"
        elif caption_type == "table" and number:
            full_text = f"Table {number}. {caption_text}"
        else:
            full_text = block.get("text", caption_text)

        p = doc.add_paragraph(full_text)

        if self._style_exists(doc, self.profile.caption_style):
            try:
                p.style = self.profile.caption_style
            except Exception:
                pass

    def _add_reference(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
    ) -> None:
        """Add a reference entry."""
        ref_number = block.get("ref_number")
        ref_text = block.get("ref_text", block.get("text", ""))

        if ref_number is not None:
            full_text = f"[{ref_number}] {ref_text}"
        else:
            full_text = ref_text

        p = doc.add_paragraph(full_text)

        if self._style_exists(doc, self.profile.normal_style):
            try:
                p.style = self.profile.normal_style
            except Exception:
                pass

    def _add_table(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
    ) -> None:
        """Add a table."""
        rows = block.get("rows", [])
        if not rows:
            return

        num_rows = len(rows)
        num_cols = len(rows[0]) if rows else 0

        table = doc.add_table(rows=num_rows, cols=num_cols)
        table.style = "Table Grid"

        for i, row_data in enumerate(rows):
            row = table.rows[i]
            for j, cell_text in enumerate(row_data):
                if j < len(row.cells):
                    row.cells[j].text = str(cell_text)

    def _add_image(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
        image_lookup: Dict[str, Any],
    ) -> None:
        """Add an image."""
        image_hash = block.get("image_hash")
        image_data = block.get("data")

        if image_hash and image_hash in image_lookup:
            image_info = image_lookup[image_hash]
            image_data = image_info.get("data")

        if image_data:
            from io import BytesIO

            image_stream = BytesIO(image_data)
            width = block.get("width_inches", 5.0)
            doc.add_picture(image_stream, width=Inches(width))

    def _add_list_item(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
    ) -> None:
        """Add a list item (bullet or numbered)."""
        text = block.get("text", "")
        list_type = block.get("list_type", "bullet")

        p = doc.add_paragraph(text)

        style_key = "bullet" if list_type == "bullet" else "numbered"
        style_name = self.profile.list_styles.get(style_key)

        if style_name and self._style_exists(doc, style_name):
            try:
                p.style = style_name
            except Exception:
                pass

    def _style_exists(self, doc: DocxDocument, style_name: str) -> bool:
        """Check if a style exists in the document."""
        try:
            _ = doc.styles[style_name]
            return True
        except KeyError:
            return False

    def _apply_double_anonymous(
        self,
        doc: DocxDocument,
        writer_doc: Dict[str, Any],
    ) -> None:
        """
        Apply double-anonymous formatting.

        This removes or masks author-identifying information.
        """
        # Get author info to mask
        metadata = writer_doc.get("metadata", {})
        author = metadata.get("author", "")

        if not author:
            return

        # Search and replace author names with placeholder
        # This is a simple implementation; more sophisticated
        # masking may be needed for real use
        for para in doc.paragraphs:
            if author.lower() in para.text.lower():
                for run in para.runs:
                    if author.lower() in run.text.lower():
                        # Mask author name
                        import re

                        run.text = re.sub(
                            re.escape(author),
                            "[Author]",
                            run.text,
                            flags=re.IGNORECASE,
                        )



__all__ = ["WordWriter"]