Source code for scitex_msword.writer

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 15:15:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/writer.py

"""
SciTeX writer document -> DOCX converter.

This module exports SciTeX documents to MS Word .docx files,
applying journal-specific styles and formatting.
"""

from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Optional

from .profiles import BaseWordProfile

# Optional import of python-docx: a failure is recorded here and re-raised when WordWriter is instantiated.
try:
    import docx
    from docx.document import Document as DocxDocument
    from docx.enum.style import WD_STYLE_TYPE
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.shared import Cm, Inches, Pt

    DOCX_AVAILABLE = True
    _DOCX_IMPORT_ERROR = None
except ImportError as exc:
    DOCX_AVAILABLE = False
    _DOCX_IMPORT_ERROR = exc


class WordWriter:
    """
    Export a SciTeX writer document to a DOCX file.

    This writer handles:
    - Section headings with proper styles
    - Paragraphs with formatting
    - Figure and table captions
    - References section
    - Image embedding
    - Journal-specific template application
    """

    def __init__(
        self,
        profile: BaseWordProfile,
        template_path: Optional[Path] = None,
    ):
        """
        Parameters
        ----------
        profile : BaseWordProfile
            Mapping from writer structures to Word styles.
        template_path : Path | None
            Optional path to a Word template (.dotx/.docx) to use as base.

        Raises
        ------
        ImportError
            If python-docx could not be imported when this module was loaded.
        """
        if not DOCX_AVAILABLE:
            raise ImportError(
                "python-docx is required for scitex.msword.WordWriter. "
                "Install it via `pip install python-docx`."
            ) from _DOCX_IMPORT_ERROR

        self.profile = profile
        self.template_path = template_path

    def write(
        self,
        writer_doc: Dict[str, Any] | Any,
        path: Path,
    ) -> None:
        """
        Write a SciTeX writer document to a DOCX file.

        Parameters
        ----------
        writer_doc : dict | Any
            Either a dict with a ``"blocks"`` list (and optionally
            ``"images"`` and ``"metadata"``), or any iterable of block dicts.
        path : Path
            Output path for the DOCX file.
        """
        # Create document (from template if specified). A template path that
        # does not exist silently falls back to python-docx's default document.
        if self.template_path and Path(self.template_path).exists():
            doc = docx.Document(str(self.template_path))
            # Clear existing content but keep styles and page layout
            self._clear_document_content(doc)
        else:
            doc = docx.Document()

        # Run pre-export hooks; each hook returns the (possibly new) document
        for hook in self.profile.pre_export_hooks:
            writer_doc = hook(writer_doc)

        # Extract blocks from writer_doc. A dict is never iterated directly:
        # that would yield its keys (strings) instead of block dicts.
        if isinstance(writer_doc, dict):
            blocks = writer_doc.get("blocks") or []
            images = writer_doc.get("images") or []
        else:
            blocks = list(writer_doc)
            images = []

        # Build image lookup by hash
        image_lookup = {img["hash"]: img for img in images if "hash" in img}

        # Process each block
        for block in blocks:
            self._add_block(doc, block, image_lookup)

        # Apply double-anonymous processing if needed
        if self.profile.double_anonymous:
            self._apply_double_anonymous(doc, writer_doc)

        # Save document
        doc.save(str(path))

    def _clear_document_content(self, doc: DocxDocument) -> None:
        """
        Remove the body content of ``doc`` while preserving styles and layout.

        The section-properties element (``w:sectPr``) is kept: it carries the
        template's page size, margins and header/footer references, and
        python-docx needs at least one section to compute the available width
        when tables are added later.
        """
        body = doc.element.body
        # Iterate over a snapshot because children are removed while looping.
        for element in list(body):
            tag = element.tag
            # Non-element nodes (e.g. XML comments) have a non-string tag.
            if isinstance(tag, str) and tag.rsplit("}", 1)[-1] == "sectPr":
                continue
            body.remove(element)

    def _add_block(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
        image_lookup: Dict[str, Any],
    ) -> None:
        """
        Add a single block to the document, dispatching on ``block["type"]``.

        Blocks without any renderable content are skipped. Besides ``"text"``,
        the type-specific keys read by the handlers (``"caption_text"``,
        ``"ref_text"``, ``"runs"``) also count as content, so such blocks are
        not dropped.
        """
        btype = block.get("type", "paragraph")
        text = block.get("text", "")

        if btype in ("table", "image"):
            has_content = True
        elif btype == "caption":
            has_content = bool(text or block.get("caption_text"))
        elif btype == "reference-paragraph":
            has_content = bool(text or block.get("ref_text"))
        elif btype in ("heading", "list-item"):
            has_content = bool(text)
        else:
            has_content = bool(text or block.get("runs"))

        if not has_content:
            return

        if btype == "heading":
            self._add_heading(doc, text, block.get("level", 1))
        elif btype == "caption":
            self._add_caption(doc, block)
        elif btype == "reference-paragraph":
            self._add_reference(doc, block)
        elif btype == "table":
            self._add_table(doc, block)
        elif btype == "image":
            self._add_image(doc, block, image_lookup)
        elif btype == "list-item":
            self._add_list_item(doc, block)
        else:
            # Default: paragraph
            self._add_paragraph(doc, text, block.get("runs"))

    def _add_heading(
        self,
        doc: DocxDocument,
        text: str,
        level: int,
    ) -> None:
        """
        Add a heading paragraph at the given logical level.

        The profile's heading style for ``level`` is used when the document
        defines it; otherwise the built-in heading for that level is used.
        """
        style_name = self.profile.heading_styles.get(level)
        if style_name and self._style_exists(doc, style_name):
            p = doc.add_paragraph(text)
            p.style = style_name
            return

        # Fallback to built-in heading; python-docx accepts levels 0-9 only.
        doc.add_heading(text, level=max(0, min(level, 9)))

    def _add_paragraph(
        self,
        doc: DocxDocument,
        text: str,
        runs: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """
        Add a paragraph with optional formatted runs.

        When ``runs`` is given, each entry becomes one run and may set
        ``bold``, ``italic``, ``underline``, ``font_size`` (points) and
        ``font_name``; otherwise ``text`` is added as a single plain run.
        """
        p = doc.add_paragraph()

        if runs:
            # Add formatted runs
            for run_data in runs:
                run = p.add_run(run_data.get("text", ""))
                if run_data.get("bold"):
                    run.bold = True
                if run_data.get("italic"):
                    run.italic = True
                if run_data.get("underline"):
                    run.underline = True
                if run_data.get("font_size"):
                    run.font.size = Pt(run_data["font_size"])
                if run_data.get("font_name"):
                    run.font.name = run_data["font_name"]
        else:
            p.add_run(text)

        # Apply normal style
        self._apply_style(doc, p, self.profile.normal_style)

    def _add_caption(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
    ) -> None:
        """
        Add a figure or table caption.

        Numbered figure/table captions are rendered as ``"Figure N. ..."`` or
        ``"Table N. ..."``; any other caption uses the block's raw text,
        falling back to ``caption_text`` when ``text`` is empty.
        """
        caption_type = block.get("caption_type", "")
        number = block.get("number", "")
        raw_text = block.get("text", "")
        caption_text = block.get("caption_text") or raw_text

        # Build caption text
        if caption_type == "figure" and number:
            full_text = f"Figure {number}. {caption_text}"
        elif caption_type == "table" and number:
            full_text = f"Table {number}. {caption_text}"
        else:
            full_text = raw_text or caption_text

        p = doc.add_paragraph(full_text)
        self._apply_style(doc, p, self.profile.caption_style)

    def _add_reference(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
    ) -> None:
        """
        Add a reference entry, prefixed with ``[N]`` when ``ref_number`` is set.
        """
        ref_number = block.get("ref_number")
        ref_text = block.get("ref_text") or block.get("text", "")

        if ref_number is not None:
            full_text = f"[{ref_number}] {ref_text}"
        else:
            full_text = ref_text

        p = doc.add_paragraph(full_text)
        self._apply_style(doc, p, self.profile.normal_style)

    def _add_table(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
    ) -> None:
        """
        Add a table built from ``block["rows"]`` (a list of rows of cells).

        The table is as wide as the longest row, so ragged rows are not
        truncated; shorter rows leave their trailing cells empty.
        """
        rows = block.get("rows") or []
        if not rows:
            return

        num_cols = max(len(row_data) for row_data in rows)
        if num_cols == 0:
            return

        table = doc.add_table(rows=len(rows), cols=num_cols)
        # Custom templates may not define "Table Grid"; keep the default
        # table style in that case instead of failing.
        self._apply_style(doc, table, "Table Grid")

        for row, row_data in zip(table.rows, rows):
            # Fetch the cells once per row rather than once per cell.
            cells = row.cells
            for cell, cell_text in zip(cells, row_data):
                cell.text = str(cell_text)

    def _add_image(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
        image_lookup: Dict[str, Any],
    ) -> None:
        """
        Add an image from inline ``data`` or from ``image_lookup`` by hash.

        When ``block["image_hash"]`` is found in ``image_lookup``, that
        entry's ``"data"`` replaces the block's inline data. Nothing is added
        when no image data is available.
        """
        from io import BytesIO

        image_data = block.get("data")
        image_hash = block.get("image_hash")

        if image_hash and image_hash in image_lookup:
            image_data = image_lookup[image_hash].get("data")

        if not image_data:
            return

        width = block.get("width_inches", 5.0)
        doc.add_picture(BytesIO(image_data), width=Inches(width))

    def _add_list_item(
        self,
        doc: DocxDocument,
        block: Dict[str, Any],
    ) -> None:
        """
        Add a list item (bullet or numbered).

        Any ``list_type`` other than ``"bullet"`` is treated as numbered.
        """
        text = block.get("text", "")
        list_type = block.get("list_type", "bullet")

        p = doc.add_paragraph(text)

        style_key = "bullet" if list_type == "bullet" else "numbered"
        self._apply_style(doc, p, self.profile.list_styles.get(style_key))

    def _apply_style(
        self,
        doc: DocxDocument,
        target: Any,
        style_name: Optional[str],
    ) -> bool:
        """
        Best-effort assignment of ``style_name`` to ``target.style``.

        Returns
        -------
        bool
            True if the style was applied; False if the name is empty, the
            document does not define it, or it cannot be assigned to this
            kind of object. The target then keeps its current style.
        """
        if not style_name or not self._style_exists(doc, style_name):
            return False
        try:
            target.style = style_name
        except (KeyError, ValueError):
            # KeyError: style lookup failed; ValueError: style is of the
            # wrong type for this object. Styling is cosmetic, so keep going.
            return False
        return True

    def _style_exists(self, doc: DocxDocument, style_name: str) -> bool:
        """Check if a style exists in the document."""
        try:
            doc.styles[style_name]
        except KeyError:
            return False
        return True

    def _apply_double_anonymous(
        self,
        doc: DocxDocument,
        writer_doc: Dict[str, Any],
    ) -> None:
        """
        Apply double-anonymous formatting.

        Replaces case-insensitive occurrences of ``metadata["author"]`` with
        ``"[Author]"`` in the runs of the document's body paragraphs.

        This is a simple implementation: a name split across several runs is
        not masked, and only ``doc.paragraphs`` is scanned. Nothing is done
        when ``writer_doc`` is not a dict or carries no author.
        """
        import re

        # A plain list of blocks has no metadata to anonymise against.
        if not isinstance(writer_doc, dict):
            return

        # Get author info to mask
        metadata = writer_doc.get("metadata") or {}
        author = metadata.get("author", "")
        if not author:
            return

        # Compile once instead of once per run.
        pattern = re.compile(re.escape(author), flags=re.IGNORECASE)

        for para in doc.paragraphs:
            if not pattern.search(para.text):
                continue
            for run in para.runs:
                masked, count = pattern.subn("[Author]", run.text)
                # Only rewrite runs that actually contain the name.
                if count:
                    run.text = masked
# Public API of this module: only WordWriter is exported by `from ... import *`.
__all__ = ["WordWriter"]