# Source code for scitex_msword.reader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 15:15:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/reader.py

"""
DOCX -> SciTeX writer document converter.

This module reads MS Word .docx files and converts them into
SciTeX's intermediate document format for further processing.
"""

from __future__ import annotations

import hashlib
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from .profiles import BaseWordProfile

# Lazy import for python-docx
try:
    import docx
    from docx.document import Document as DocxDocument
    from docx.oxml.ns import qn
    from docx.shared import Inches, Pt

    DOCX_AVAILABLE = True
    _DOCX_IMPORT_ERROR = None
except ImportError as exc:
    DOCX_AVAILABLE = False
    _DOCX_IMPORT_ERROR = exc
    DocxDocument = None

# Common academic section headings for heuristic detection.
# All entries are lower-case and carry no trailing punctuation: the heading
# heuristics in WordReader lower-case the paragraph text and strip trailing
# ".", ":" and ";" before testing membership in this set.
COMMON_SECTION_HEADINGS = {
    "abstract",
    "introduction",
    "background",
    "literature review",
    "methods",
    "methodology",
    "materials and methods",
    "experimental",
    "results",
    "findings",
    "analysis",
    "discussion",
    "conclusions",
    "conclusion",
    "summary",
    # Spelling variants are listed explicitly because matching is exact.
    "acknowledgements",
    "acknowledgments",
    "acknowledgement",
    "references",
    "bibliography",
    "works cited",
    "appendix",
    "appendices",
    "supplementary",
    "supplementary material",
}

# Caption patterns for robust detection.
# Each entry is a (regex, caption_type) pair. WordReader._detect_caption tries
# them in list order against the stripped paragraph text with re.IGNORECASE and
# stops at the first match. Every regex has the same three groups:
#   group 1 - the label word (e.g. "Figure", "Fig.")
#   group 2 - the caption number (digits only)
#   group 3 - the remaining caption text after optional ".", ":" or whitespace
CAPTION_PATTERNS = [
    # Figure patterns
    (r"^(figure|fig\.?)\s*(\d+)[\.:\s]*(.*)$", "figure"),
    (r"^(scheme)\s*(\d+)[\.:\s]*(.*)$", "scheme"),
    (r"^(chart)\s*(\d+)[\.:\s]*(.*)$", "chart"),
    (r"^(graph)\s*(\d+)[\.:\s]*(.*)$", "graph"),
    (r"^(plate)\s*(\d+)[\.:\s]*(.*)$", "plate"),
    (r"^(illustration)\s*(\d+)[\.:\s]*(.*)$", "illustration"),
    # Table patterns
    (r"^(table|tbl\.?)\s*(\d+)[\.:\s]*(.*)$", "table"),
    # Equation patterns
    (r"^(equation|eq\.?)\s*(\d+)[\.:\s]*(.*)$", "equation"),
    # Listing/code patterns (both label words map to the "listing" type)
    (r"^(listing|code)\s*(\d+)[\.:\s]*(.*)$", "listing"),
    # Algorithm patterns
    (r"^(algorithm|alg\.?)\s*(\d+)[\.:\s]*(.*)$", "algorithm"),
]



# [docs]
class WordReader:
    """
    Read a DOCX file and convert it into a SciTeX writer document.

    This reader focuses on:
    - Sections (via heading styles)
    - Plain paragraphs
    - Figure/table captions (via caption style)
    - Embedded images extraction
    - References section boundary detection
    - Basic formatting (bold, italic)

    The output is a structured intermediate representation that can be
    easily fed into `scitex.writer` or exported to LaTeX/other formats.
    """


# [docs]
    def __init__(
        self,
        profile: BaseWordProfile,
        extract_images: bool = True,
    ):
        """
        Create a reader bound to one Word profile.

        Parameters
        ----------
        profile : BaseWordProfile
            Mapping between Word styles and SciTeX writer semantics.
        extract_images : bool
            Whether to extract embedded images from the document.

        Raises
        ------
        ImportError
            If python-docx could not be imported when this module was
            loaded. The original import failure is attached as the cause.
        """
        if not DOCX_AVAILABLE:
            message = (
                "python-docx is required for scitex.msword.WordReader. "
                "Install it via `pip install python-docx`."
            )
            raise ImportError(message) from _DOCX_IMPORT_ERROR

        self.extract_images = extract_images
        self.profile = profile



# [docs]
    def read(self, path: Path) -> Dict[str, Any]:
        """
        Load a DOCX file and convert it to a SciTeX writer document.

        Parameters
        ----------
        path : Path
            Path to the DOCX file.

        Returns
        -------
        dict
            SciTeX writer document structure with keys:
            - blocks: List of document blocks
            - metadata: Profile and source information, plus any core
              document properties that could be read
            - images: Extracted image data (if extract_images=True)
            - references: Parsed reference entries
            - warnings: List of conversion warnings

            The value returned is whatever the last post-import hook of the
            profile returned (or the structure above when there are none).
        """
        doc = docx.Document(str(path))

        # Source information first; document properties are merged on top.
        metadata: Dict[str, Any] = {
            "profile": self.profile.name,
            "source_file": str(path),
            "import_timestamp": datetime.now().isoformat(),
        }
        result: Dict[str, Any] = {
            "blocks": [],
            "metadata": metadata,
            "images": [],
            "references": [],
            "warnings": [],
        }
        metadata.update(self._extract_metadata(doc))

        # Body content in document order (paragraphs, tables, inline images).
        body_blocks = self._process_body(doc, result)
        result["blocks"] = body_blocks

        if self.extract_images:
            result["images"] = self._extract_images(doc, path)

        # References are derived from the blocks collected above.
        result["references"] = self._parse_references(body_blocks)

        # Each hook receives the current result and its return value
        # replaces it.
        for post_hook in self.profile.post_import_hooks:
            result = post_hook(result)

        return result


    def _extract_metadata(self, doc: DocxDocument) -> Dict[str, Any]:
        """Extract document metadata (title, author, etc.)."""
        metadata = {}
        try:
            core_props = doc.core_properties
            if core_props.title:
                metadata["title"] = core_props.title
            if core_props.author:
                metadata["author"] = core_props.author
            if core_props.subject:
                metadata["subject"] = core_props.subject
            if core_props.keywords:
                metadata["keywords"] = core_props.keywords
            if core_props.created:
                metadata["created"] = core_props.created.isoformat()
            if core_props.modified:
                metadata["modified"] = core_props.modified.isoformat()
        except Exception:
            pass  # Metadata extraction is optional
        return metadata

    def _process_body(
        self,
        doc: DocxDocument,
        result: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """
        Walk the document body in order and turn it into blocks.

        Paragraphs are delegated to `_process_paragraph` and tables to
        `_process_table`. When image extraction is enabled, an "image" block
        is emitted for every embedded picture found in a paragraph's runs,
        ahead of that paragraph's own block.

        Reference-section tracking: a heading whose text is listed in
        ``profile.reference_section_titles`` opens the section and is flagged
        with ``is_reference_header``. The next heading at the same or a
        shallower level closes it again, so content that follows the
        reference list (appendices, figure legends, ...) is not classified
        as references.

        Parameters
        ----------
        doc : DocxDocument
            The loaded python-docx document.
        result : dict
            The result structure being assembled by `read`. Accepted for
            interface compatibility; this method neither reads nor modifies
            it.

        Returns
        -------
        list of dict
            Blocks in document order, each carrying a running ``index``.
        """
        blocks: List[Dict[str, Any]] = []
        in_reference_section = False
        reference_level: Optional[int] = None
        block_index = 0

        # rel_id -> short content hash, using the same digest scheme as
        # _extract_images so image blocks can be joined with image records.
        rel_to_hash: Dict[str, str] = {}
        if self.extract_images:
            for rel_id, rel in doc.part.rels.items():
                if "image" not in rel.reltype:
                    continue
                # Linked (external) pictures have no embedded part; asking
                # for target_part on such a relationship raises.
                if getattr(rel, "is_external", False):
                    continue
                image_bytes = rel.target_part.blob
                rel_to_hash[rel_id] = hashlib.md5(image_bytes).hexdigest()[:12]

        # DrawingML namespace used to locate <a:blip> picture references.
        a_ns = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
        embed_attr = qn("r:embed")  # loop-invariant attribute name

        for element in doc.element.body:
            tag = element.tag.split("}")[-1]  # Remove namespace

            if tag == "p":
                para = docx.text.paragraph.Paragraph(element, doc)

                # Inline images are emitted before the paragraph's own block.
                if self.extract_images:
                    for run in para.runs:
                        for blip in run.element.findall(
                            ".//a:blip", namespaces=a_ns
                        ):
                            rel_id = blip.get(embed_attr)
                            if rel_id and rel_id in rel_to_hash:
                                blocks.append(
                                    {
                                        "index": block_index,
                                        "type": "image",
                                        "image_hash": rel_to_hash[rel_id],
                                        "rel_id": rel_id,
                                    }
                                )
                                block_index += 1

                block = self._process_paragraph(para, in_reference_section, block_index)
                if not block:
                    continue

                if block["type"] == "heading":
                    if block["text"] in self.profile.reference_section_titles:
                        # Entering the references section.
                        in_reference_section = True
                        reference_level = block["level"]
                        block["is_reference_header"] = True
                    elif (
                        in_reference_section
                        and reference_level is not None
                        and block["level"] <= reference_level
                    ):
                        # A sibling or parent heading ends the section;
                        # deeper sub-headings stay inside it.
                        in_reference_section = False
                        reference_level = None

                blocks.append(block)
                block_index += 1

            elif tag == "tbl":
                table = docx.table.Table(element, doc)
                blocks.append(self._process_table(table, block_index))
                block_index += 1

        return blocks

    def _process_paragraph(
        self,
        para,
        in_reference_section: bool,
        block_index: int,
    ) -> Optional[Dict[str, Any]]:
        """Process a single paragraph."""
        style_name = (para.style.name or "").strip() if para.style else ""
        text = para.text.strip()

        if not text:
            return None

        # Extract runs with formatting info
        runs = self._extract_runs(para)

        # Base block structure
        block: Dict[str, Any] = {
            "index": block_index,
            "text": text,
            "style": style_name,
            "runs": runs,
        }

        # Check for equations (OMML)
        equation_latex = self._extract_equation(para)
        if equation_latex:
            block["type"] = "equation"
            block["latex"] = equation_latex
            return block

        # Detect heading (style-based first, then heuristic)
        level = self._detect_heading(para, style_name, text, runs)
        if level is not None:
            block["type"] = "heading"
            block["level"] = level
            block["detection_method"] = (
                "style" if self._heading_level_from_style(style_name) else "heuristic"
            )
            return block

        # Detect caption (improved pattern matching)
        caption_info = self._detect_caption(style_name, text)
        if caption_info:
            block["type"] = "caption"
            block.update(caption_info)
            return block

        # Reference paragraph
        if in_reference_section:
            block["type"] = "reference-paragraph"
            ref_info = self._parse_reference_entry(text)
            block.update(ref_info)
            return block

        # List item detection
        if self._is_list_item(para):
            block["type"] = "list-item"
            list_info = self._parse_list_item(para)
            block.update(list_info)
            return block

        # Normal paragraph
        block["type"] = "paragraph"
        return block

    def _detect_heading(
        self,
        para,
        style_name: str,
        text: str,
        runs: List[Dict[str, Any]],
    ) -> Optional[int]:
        """
        Detect heading using multiple strategies:
        1. Style-based (most reliable)
        2. Font-based heuristics (bold, larger size)
        3. Content-based (known section titles)
        """
        # Strategy 1: Style-based detection
        level = self._heading_level_from_style(style_name)
        if level is not None:
            return level

        # Strategy 2: Font-based heuristics
        # Check if entire paragraph is bold and short
        text_clean = text.strip()
        if len(text_clean) < 100:  # Headings are typically short
            all_bold = all(r.get("bold") for r in runs if r.get("text", "").strip())
            if all_bold and runs:
                # Check font size - headings often larger
                avg_size = self._get_average_font_size(runs)
                if avg_size and avg_size >= 12:
                    # Check if it looks like a section heading
                    if self._looks_like_heading(text_clean):
                        return 1 if avg_size >= 14 else 2

        # Strategy 3: Content-based detection (common section titles)
        text_lower = text_clean.lower().rstrip(".:;")
        # Check numbered sections: "1. Introduction", "2.1 Methods"
        numbered_match = re.match(r"^(\d+(?:\.\d+)*)[\.:\s]+(.+)$", text_clean)
        if numbered_match:
            section_text = numbered_match.group(2).lower().strip()
            if section_text in COMMON_SECTION_HEADINGS:
                depth = numbered_match.group(1).count(".")
                return min(depth + 1, 4)

        # Check unnumbered common headings (if bold or all caps)
        if text_lower in COMMON_SECTION_HEADINGS:
            is_bold = all(r.get("bold") for r in runs if r.get("text", "").strip())
            is_all_caps = text_clean.isupper() and len(text_clean) > 3
            if is_bold or is_all_caps:
                return 1

        return None

    def _looks_like_heading(self, text: str) -> bool:
        """Check if text looks like a heading based on content patterns."""
        text_lower = text.lower().rstrip(".:;")

        # Check common section headings
        if text_lower in COMMON_SECTION_HEADINGS:
            return True

        # Check numbered sections
        if re.match(r"^\d+(?:\.\d+)*\s+\w", text):
            return True

        # All caps short text
        if text.isupper() and 3 < len(text) < 50:
            return True

        return False

    def _get_average_font_size(self, runs: List[Dict[str, Any]]) -> Optional[float]:
        """Get average font size from runs."""
        sizes = [r["font_size"] for r in runs if r.get("font_size")]
        return sum(sizes) / len(sizes) if sizes else None

    def _detect_caption(self, style_name: str, text: str) -> Optional[Dict[str, Any]]:
        """
        Detect and parse captions using multiple patterns.
        Returns caption info dict or None.
        """
        # Check by style first
        if style_name == self.profile.caption_style:
            return self._parse_caption(text)

        # Check using comprehensive patterns
        text_stripped = text.strip()
        for pattern, caption_type in CAPTION_PATTERNS:
            match = re.match(pattern, text_stripped, re.IGNORECASE)
            if match:
                return {
                    "caption_type": caption_type,
                    "number": int(match.group(2)),
                    "caption_text": match.group(3).strip(),
                }

        # Check profile-specific prefixes
        if self._is_caption(style_name, text):
            return self._parse_caption(text)

        return None

    def _extract_equation(self, para) -> Optional[str]:
        """
        Extract equation from paragraph if it contains OMML (Office Math Markup).
        Returns LaTeX representation or None.
        """
        try:
            # Check for oMath elements
            omml_ns = {
                "m": "http://schemas.openxmlformats.org/officeDocument/2006/math"
            }
            math_elements = para._element.findall(".//m:oMath", namespaces=omml_ns)

            if not math_elements:
                return None

            # Basic OMML to LaTeX conversion
            latex_parts = []
            for math_elem in math_elements:
                latex = self._omml_to_latex(math_elem)
                if latex:
                    latex_parts.append(latex)

            return " ".join(latex_parts) if latex_parts else None
        except Exception:
            return None

    def _omml_to_latex(self, math_elem) -> str:
        """
        Convert OMML element to LaTeX string.
        This is a basic converter - handles common cases.
        """
        omml_ns = {"m": "http://schemas.openxmlformats.org/officeDocument/2006/math"}

        def get_text(elem) -> str:
            """Recursively get text from element."""
            texts = []
            if elem.text:
                texts.append(elem.text)
            for child in elem:
                texts.append(get_text(child))
                if child.tail:
                    texts.append(child.tail)
            return "".join(texts)

        def convert_element(elem) -> str:
            """Convert a single OMML element to LaTeX."""
            tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag

            if tag == "r":  # Run (text)
                return get_text(elem)
            elif tag == "f":  # Fraction
                num = elem.find("m:num", namespaces=omml_ns)
                den = elem.find("m:den", namespaces=omml_ns)
                num_tex = convert_children(num) if num is not None else ""
                den_tex = convert_children(den) if den is not None else ""
                return f"\\frac{{{num_tex}}}{{{den_tex}}}"
            elif tag == "rad":  # Radical/root
                deg = elem.find("m:deg", namespaces=omml_ns)
                content = elem.find("m:e", namespaces=omml_ns)
                content_tex = convert_children(content) if content is not None else ""
                if deg is not None and get_text(deg).strip():
                    deg_tex = convert_children(deg)
                    return f"\\sqrt[{deg_tex}]{{{content_tex}}}"
                return f"\\sqrt{{{content_tex}}}"
            elif tag == "sSup":  # Superscript
                base = elem.find("m:e", namespaces=omml_ns)
                sup = elem.find("m:sup", namespaces=omml_ns)
                base_tex = convert_children(base) if base is not None else ""
                sup_tex = convert_children(sup) if sup is not None else ""
                return f"{base_tex}^{{{sup_tex}}}"
            elif tag == "sSub":  # Subscript
                base = elem.find("m:e", namespaces=omml_ns)
                sub = elem.find("m:sub", namespaces=omml_ns)
                base_tex = convert_children(base) if base is not None else ""
                sub_tex = convert_children(sub) if sub is not None else ""
                return f"{base_tex}_{{{sub_tex}}}"
            elif tag == "sSubSup":  # Sub-superscript
                base = elem.find("m:e", namespaces=omml_ns)
                sub = elem.find("m:sub", namespaces=omml_ns)
                sup = elem.find("m:sup", namespaces=omml_ns)
                base_tex = convert_children(base) if base is not None else ""
                sub_tex = convert_children(sub) if sub is not None else ""
                sup_tex = convert_children(sup) if sup is not None else ""
                return f"{base_tex}_{{{sub_tex}}}^{{{sup_tex}}}"
            elif tag == "nary":  # N-ary (sum, product, integral)
                chr_elem = elem.find(".//m:chr", namespaces=omml_ns)
                symbol = chr_elem.get(qn("m:val")) if chr_elem is not None else "∑"
                symbol_map = {"∑": "\\sum", "∏": "\\prod", "∫": "\\int", "∮": "\\oint"}
                latex_sym = symbol_map.get(symbol, symbol)
                sub = elem.find("m:sub", namespaces=omml_ns)
                sup = elem.find("m:sup", namespaces=omml_ns)
                content = elem.find("m:e", namespaces=omml_ns)
                result = latex_sym
                if sub is not None:
                    result += f"_{{{convert_children(sub)}}}"
                if sup is not None:
                    result += f"^{{{convert_children(sup)}}}"
                if content is not None:
                    result += f" {convert_children(content)}"
                return result
            elif tag == "d":  # Delimiter (parentheses, brackets)
                content = elem.find("m:e", namespaces=omml_ns)
                content_tex = convert_children(content) if content is not None else ""
                beg = elem.find(".//m:begChr", namespaces=omml_ns)
                end = elem.find(".//m:endChr", namespaces=omml_ns)
                left = beg.get(qn("m:val")) if beg is not None else "("
                right = end.get(qn("m:val")) if end is not None else ")"
                return f"\\left{left}{content_tex}\\right{right}"
            elif tag in ("e", "num", "den", "sub", "sup", "deg"):
                # Container elements - just process children
                return convert_children(elem)
            else:
                # Unknown element - try to get text
                return convert_children(elem)

        def convert_children(elem) -> str:
            """Convert all children of an element."""
            if elem is None:
                return ""
            parts = []
            for child in elem:
                parts.append(convert_element(child))
            return "".join(parts)

        return convert_element(math_elem)

    def _is_list_item(self, para) -> bool:
        """Check if paragraph is a list item."""
        try:
            # Check for numbering properties
            pPr = para._element.find(qn("w:pPr"))
            if pPr is not None:
                numPr = pPr.find(qn("w:numPr"))
                if numPr is not None:
                    return True

            # Check for bullet/number at start of text
            text = para.text.strip()
            if re.match(r"^[\u2022\u2023\u25E6\u2043\u2219•‣◦⁃∙]\s", text):
                return True
            if re.match(
                r"^(\d+[\.\):]|\([a-z]\)|\([ivxlc]+\)|[a-z][\.\)])\s",
                text,
                re.IGNORECASE,
            ):
                return True

            return False
        except Exception:
            return False

    def _parse_list_item(self, para) -> Dict[str, Any]:
        """Parse list item to extract level and content."""
        info: Dict[str, Any] = {"list_type": "unordered", "level": 0}

        try:
            pPr = para._element.find(qn("w:pPr"))
            if pPr is not None:
                numPr = pPr.find(qn("w:numPr"))
                if numPr is not None:
                    ilvl = numPr.find(qn("w:ilvl"))
                    if ilvl is not None:
                        info["level"] = int(ilvl.get(qn("w:val"), 0))

            # Detect ordered vs unordered
            text = para.text.strip()
            if re.match(r"^\d+[\.\):]\s", text):
                info["list_type"] = "ordered"
        except Exception:
            pass

        return info

    def _extract_runs(self, para) -> List[Dict[str, Any]]:
        """Extract formatted runs from a paragraph."""
        runs = []
        for run in para.runs:
            if not run.text:
                continue
            run_data = {
                "text": run.text,
                "bold": run.bold,
                "italic": run.italic,
                "underline": run.underline is not None,
            }
            if run.font.size:
                run_data["font_size"] = run.font.size.pt
            if run.font.name:
                run_data["font_name"] = run.font.name
            runs.append(run_data)
        return runs

    def _heading_level_from_style(self, style_name: str) -> Optional[int]:
        """Return heading level for a given Word style, or None."""
        for level, expected_style in self.profile.heading_styles.items():
            if style_name == expected_style:
                return level
        return None

    def _is_caption(self, style_name: str, text: str) -> bool:
        """Check if paragraph is a caption."""
        if style_name == self.profile.caption_style:
            return True

        # Check by prefix
        text_lower = text.lower()
        prefixes = (
            self.profile.figure_caption_prefixes + self.profile.table_caption_prefixes
        )
        for prefix in prefixes:
            if text_lower.startswith(prefix.lower()):
                return True
        return False

    def _parse_caption(self, text: str) -> Dict[str, Any]:
        """Parse caption text to extract figure/table number."""
        info: Dict[str, Any] = {}

        # Check figure
        for prefix in self.profile.figure_caption_prefixes:
            pattern = rf"^{re.escape(prefix)}\.?\s*(\d+)[\.:]?\s*(.*)$"
            match = re.match(pattern, text, re.IGNORECASE)
            if match:
                info["caption_type"] = "figure"
                info["number"] = int(match.group(1))
                info["caption_text"] = match.group(2).strip()
                return info

        # Check table
        for prefix in self.profile.table_caption_prefixes:
            pattern = rf"^{re.escape(prefix)}\.?\s*(\d+)[\.:]?\s*(.*)$"
            match = re.match(pattern, text, re.IGNORECASE)
            if match:
                info["caption_type"] = "table"
                info["number"] = int(match.group(1))
                info["caption_text"] = match.group(2).strip()
                return info

        info["caption_type"] = "unknown"
        info["caption_text"] = text
        return info

    def _parse_reference_entry(self, text: str) -> Dict[str, Any]:
        """Parse a reference entry to extract citation number."""
        info: Dict[str, Any] = {}

        # Try to extract numbered reference: [1], 1., (1), etc.
        patterns = [
            r"^\[(\d+)\]",  # [1] Author...
            r"^(\d+)\.",  # 1. Author...
            r"^\((\d+)\)",  # (1) Author...
        ]
        for pattern in patterns:
            match = re.match(pattern, text)
            if match:
                info["ref_number"] = int(match.group(1))
                info["ref_text"] = re.sub(pattern, "", text).strip()
                break
        else:
            info["ref_text"] = text

        return info

    def _process_table(
        self,
        table,
        block_index: int,
    ) -> Dict[str, Any]:
        """Process a table."""
        rows = []
        for row in table.rows:
            cells = []
            for cell in row.cells:
                cells.append(cell.text.strip())
            rows.append(cells)

        return {
            "index": block_index,
            "type": "table",
            "rows": rows,
            "num_rows": len(rows),
            "num_cols": len(rows[0]) if rows else 0,
        }

    def _extract_images(
        self,
        doc: DocxDocument,
        source_path: Path,
    ) -> List[Dict[str, Any]]:
        """Extract embedded images from the document."""
        images = []

        try:
            for rel_id, rel in doc.part.rels.items():
                if "image" in rel.reltype:
                    image_part = rel.target_part
                    image_bytes = image_part.blob

                    # Generate hash for deduplication
                    image_hash = hashlib.md5(image_bytes).hexdigest()[:12]

                    # Determine extension from content type
                    content_type = image_part.content_type
                    ext_map = {
                        "image/png": ".png",
                        "image/jpeg": ".jpg",
                        "image/gif": ".gif",
                        "image/tiff": ".tiff",
                        "image/bmp": ".bmp",
                    }
                    ext = ext_map.get(content_type, ".png")

                    images.append(
                        {
                            "rel_id": rel_id,
                            "hash": image_hash,
                            "content_type": content_type,
                            "extension": ext,
                            "size_bytes": len(image_bytes),
                            "data": image_bytes,  # Raw bytes
                        }
                    )
        except Exception as e:
            pass  # Image extraction is optional

        return images

    def _parse_references(
        self,
        blocks: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Extract and structure references from blocks."""
        references = []
        for block in blocks:
            if block.get("type") == "reference-paragraph":
                ref_entry = {
                    "number": block.get("ref_number"),
                    "text": block.get("ref_text", block.get("text", "")),
                    "raw": block.get("text", ""),
                }
                references.append(ref_entry)
        return references



__all__ = ["WordReader"]