Source code for scitex_msword.reader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 15:15:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/reader.py

"""
DOCX -> SciTeX writer document converter.

This module reads MS Word .docx files and converts them into
SciTeX's intermediate document format for further processing.
"""

from __future__ import annotations

import hashlib
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from .profiles import BaseWordProfile

# Optional import of python-docx.
#
# The import is attempted once, when this module is imported (it is not
# deferred until first use).  If it fails, the module still imports:
# DOCX_AVAILABLE is set to False, the original ImportError is kept in
# _DOCX_IMPORT_ERROR, and WordReader.__init__ re-raises an ImportError chained
# to it.  Note that `docx` and `qn` stay undefined in that case.
try:
    import docx
    from docx.document import Document as DocxDocument
    from docx.oxml.ns import qn
    from docx.shared import Inches, Pt

    DOCX_AVAILABLE = True
    _DOCX_IMPORT_ERROR = None
except ImportError as exc:
    DOCX_AVAILABLE = False
    _DOCX_IMPORT_ERROR = exc
    # Placeholder so the name exists even without python-docx installed.
    DocxDocument = None

# Lower-case titles of sections commonly found in academic manuscripts.
# Heading heuristics compare normalised (lower-cased) paragraph text against
# this set, so every entry must be lower-case.
COMMON_SECTION_HEADINGS = {
    # Front matter and framing
    "abstract", "introduction", "background", "literature review",
    # Methods
    "methods", "methodology", "materials and methods", "experimental",
    # Results and interpretation
    "results", "findings", "analysis", "discussion",
    # Closing sections
    "conclusions", "conclusion", "summary",
    # Acknowledgement spelling variants
    "acknowledgements", "acknowledgments", "acknowledgement",
    # Bibliography
    "references", "bibliography", "works cited",
    # Back matter
    "appendix", "appendices", "supplementary", "supplementary material",
}

# Caption detection patterns.
#
# Every pattern has the same shape: a keyword alternation (group 1), the
# caption number (group 2), optional punctuation/whitespace, and the caption
# body (group 3).  Only the keyword alternation and the resulting caption type
# differ, so the list is generated from (keywords, caption_type) pairs.
# Order matters: patterns are tried first to last.
_CAPTION_KEYWORDS = (
    # Figure-like floats
    (r"figure|fig\.?", "figure"),
    (r"scheme", "scheme"),
    (r"chart", "chart"),
    (r"graph", "graph"),
    (r"plate", "plate"),
    (r"illustration", "illustration"),
    # Tables
    (r"table|tbl\.?", "table"),
    # Equations
    (r"equation|eq\.?", "equation"),
    # Listings / code
    (r"listing|code", "listing"),
    # Algorithms
    (r"algorithm|alg\.?", "algorithm"),
)

CAPTION_PATTERNS = [
    (r"^(" + keywords + r")\s*(\d+)[\.:\s]*(.*)$", caption_type)
    for keywords, caption_type in _CAPTION_KEYWORDS
]


class WordReader:
    """
    Read a DOCX file and convert it into a SciTeX writer document.

    This reader focuses on:
    - Sections (via heading styles, with font/content heuristics as fallback)
    - Plain paragraphs
    - Figure/table captions (via caption style or text patterns)
    - Embedded images extraction
    - References section boundary detection
    - Basic formatting (bold, italic)
    - Equations stored as OMML (converted to basic LaTeX)

    The output is a structured intermediate representation that can be
    easily fed into `scitex.writer` or exported to LaTeX/other formats.
    """

    # XML namespaces used when searching the raw document XML.
    _DRAWING_NS = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
    _MATH_URI = "http://schemas.openxmlformats.org/officeDocument/2006/math"
    _MATH_NS = {"m": _MATH_URI}
    # Clark-notation name of the ``m:val`` attribute.
    _MATH_VAL = f"{{{_MATH_URI}}}val"

    # N-ary operator characters and their LaTeX commands.
    _NARY_SYMBOLS = {"∑": "\\sum", "∏": "\\prod", "∫": "\\int", "∮": "\\oint"}
    # OMML omits ``m:chr`` for the integral sign, so that is the default.
    _NARY_DEFAULT = "∫"

    # Delimiter characters that cannot follow \left / \right verbatim.
    # Commands made of letters carry a trailing space so that they do not
    # merge with the content that follows them.
    _LATEX_DELIMITERS = {
        "": ".",
        "{": "\\{",
        "}": "\\}",
        "‖": "\\|",
        "⌊": "\\lfloor ",
        "⌋": "\\rfloor ",
        "⌈": "\\lceil ",
        "⌉": "\\rceil ",
        "⟨": "\\langle ",
        "⟩": "\\rangle ",
    }

    # Content type -> file extension for extracted images.
    _IMAGE_EXTENSIONS = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/jpg": ".jpg",
        "image/gif": ".gif",
        "image/tiff": ".tiff",
        "image/bmp": ".bmp",
        "image/x-emf": ".emf",
        "image/x-wmf": ".wmf",
        "image/svg+xml": ".svg",
    }

    def __init__(
        self,
        profile: "BaseWordProfile",
        extract_images: bool = True,
    ):
        """
        Parameters
        ----------
        profile : BaseWordProfile
            Mapping between Word styles and SciTeX writer semantics.
        extract_images : bool
            Whether to extract embedded images from the document.

        Raises
        ------
        ImportError
            If python-docx is not installed.
        """
        if not DOCX_AVAILABLE:
            raise ImportError(
                "python-docx is required for scitex.msword.WordReader. "
                "Install it via `pip install python-docx`."
            ) from _DOCX_IMPORT_ERROR

        self.profile = profile
        self.extract_images = extract_images

    def read(self, path: Path) -> Dict[str, Any]:
        """
        Read a DOCX file and return a SciTeX writer document.

        Parameters
        ----------
        path : Path
            Path to the DOCX file.

        Returns
        -------
        dict
            SciTeX writer document structure with keys:
            - blocks: List of document blocks
            - metadata: Profile and source information
            - images: Extracted image data (if extract_images=True)
            - references: Parsed reference entries
            - warnings: List of conversion warnings (non-fatal problems
              met while reading metadata, equations or images)
        """
        doc = docx.Document(str(path))

        warnings: List[str] = []
        result: Dict[str, Any] = {
            "blocks": [],
            "metadata": {
                "profile": self.profile.name,
                "source_file": str(path),
                "import_timestamp": datetime.now().isoformat(),
            },
            "images": [],
            "references": [],
            "warnings": warnings,
        }

        # Document properties (title, author, ...) are optional extras.
        result["metadata"].update(self._extract_metadata(doc, warnings))

        # Paragraphs and tables, in document order.
        blocks = self._process_body(doc, result)
        result["blocks"] = blocks

        if self.extract_images:
            result["images"] = self._extract_images(doc, path, warnings)

        result["references"] = self._parse_references(blocks)

        # Hooks may replace the result wholesale, hence the re-assignment.
        for hook in self.profile.post_import_hooks:
            result = hook(result)

        return result

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _warn(warnings: Optional[List[str]], message: str) -> None:
        """Append *message* to *warnings* when a warning list was supplied."""
        if warnings is not None:
            warnings.append(message)

    @staticmethod
    def _image_hash(image_bytes: bytes) -> str:
        """Return the short content hash used to identify an image."""
        # MD5 is used for de-duplication only, not for security.
        return hashlib.md5(image_bytes).hexdigest()[:12]

    @staticmethod
    def _iter_image_parts(doc):
        """Yield ``(rel_id, image_part)`` for every embedded image.

        Linked (external) images are skipped: they have no part inside the
        package, so there is nothing to read for them.
        """
        for rel_id, rel in doc.part.rels.items():
            if "image" not in rel.reltype:
                continue
            if getattr(rel, "is_external", False):
                continue
            yield rel_id, rel.target_part

    @staticmethod
    def _all_text_runs_bold(runs: List[Dict[str, Any]]) -> bool:
        """Return True when there is visible text and all of it is bold.

        Whitespace-only runs are ignored.  A paragraph without any text run
        is *not* considered bold (``all()`` over nothing would say it is).
        """
        text_runs = [r for r in runs if r.get("text", "").strip()]
        return bool(text_runs) and all(r.get("bold") for r in text_runs)

    @staticmethod
    def _normalize_heading_text(text: str) -> str:
        """Normalise a heading for comparison.

        Removes leading section numbering ("7. ", "2.1 "), trailing
        punctuation, repeated whitespace and case differences.
        """
        cleaned = re.sub(r"^\d+(?:\.\d+)*[\.:\)]?\s+", "", text.strip())
        cleaned = cleaned.rstrip(".:; ")
        return " ".join(cleaned.split()).casefold()

    def _is_reference_heading(self, text: str) -> bool:
        """Return True if *text* is one of the profile's reference titles.

        An exact match is tried first; otherwise both sides are normalised
        so that "REFERENCES" or "7. References" are recognised as well.
        """
        titles = self.profile.reference_section_titles
        if text in titles:
            return True
        normalized = self._normalize_heading_text(text)
        return any(
            normalized == self._normalize_heading_text(title) for title in titles
        )

    def _update_reference_state(
        self,
        block: Dict[str, Any],
        ref_level: Optional[int],
    ) -> Optional[int]:
        """Track whether the reader is inside the references section.

        Parameters
        ----------
        block : dict
            A heading block (must have "text" and "level").
        ref_level : int or None
            Level of the references heading currently open, or None when
            the reader is outside the references section.

        Returns
        -------
        int or None
            The new state: the level of the open references heading, or
            None once a heading of the same or a higher rank closed it.
        """
        if self._is_reference_heading(block["text"]):
            block["is_reference_header"] = True
            return block["level"]
        # A heading at the same or a higher rank starts a new section
        # (e.g. "Appendix"); deeper headings are sub-sections of the
        # references and keep the section open.
        if ref_level is not None and block["level"] <= ref_level:
            return None
        return ref_level

    # ------------------------------------------------------------------
    # Document-level processing
    # ------------------------------------------------------------------
    def _extract_metadata(
        self,
        doc: "DocxDocument",
        warnings: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Extract document metadata (title, author, etc.).

        Extraction is best-effort: on failure the properties gathered so
        far are returned and a message is added to *warnings* (if given).
        """
        metadata: Dict[str, Any] = {}
        try:
            core_props = doc.core_properties
            for key in ("title", "author", "subject", "keywords"):
                value = getattr(core_props, key)
                if value:
                    metadata[key] = value
            for key in ("created", "modified"):
                value = getattr(core_props, key)
                if value:
                    metadata[key] = value.isoformat()
        except Exception as exc:  # metadata is optional, never fatal
            self._warn(warnings, f"Could not read document metadata: {exc}")
        return metadata

    def _process_body(
        self,
        doc: "DocxDocument",
        result: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """Process document body: paragraphs and tables.

        Walks the body in document order and returns the list of blocks.
        Inline images are emitted as "image" blocks just before the
        paragraph that contains them.  Non-fatal problems are appended to
        ``result["warnings"]`` when that list exists.
        """
        blocks: List[Dict[str, Any]] = []
        warnings = result.get("warnings")
        # Level of the open references heading; None = outside references.
        ref_level: Optional[int] = None
        block_index = 0

        # rel_id -> image hash, so image blocks can be matched with the
        # entries produced by _extract_images.
        rel_to_hash: Dict[str, str] = {}
        if self.extract_images:
            rel_to_hash = {
                rel_id: self._image_hash(part.blob)
                for rel_id, part in self._iter_image_parts(doc)
            }
        embed_attr = qn("r:embed")

        for element in doc.element.body:
            tag = element.tag.split("}")[-1]  # Remove namespace

            if tag == "p":
                para = docx.text.paragraph.Paragraph(element, doc)

                if self.extract_images:
                    for run in para.runs:
                        blips = run.element.findall(
                            ".//a:blip", namespaces=self._DRAWING_NS
                        )
                        for blip in blips:
                            rel_id = blip.get(embed_attr)
                            if rel_id and rel_id in rel_to_hash:
                                blocks.append(
                                    {
                                        "index": block_index,
                                        "type": "image",
                                        "image_hash": rel_to_hash[rel_id],
                                        "rel_id": rel_id,
                                    }
                                )
                                block_index += 1

                block = self._process_paragraph(
                    para, ref_level is not None, block_index, warnings
                )
                if block:
                    if block["type"] == "heading":
                        ref_level = self._update_reference_state(block, ref_level)
                    blocks.append(block)
                    block_index += 1

            elif tag == "tbl":
                table = docx.table.Table(element, doc)
                blocks.append(self._process_table(table, block_index))
                block_index += 1

        return blocks

    def _process_paragraph(
        self,
        para,
        in_reference_section: bool,
        block_index: int,
        warnings: Optional[List[str]] = None,
    ) -> Optional[Dict[str, Any]]:
        """Process a single paragraph.

        Classification order: equation, heading, caption, reference entry
        (only inside the references section), list item, plain paragraph.

        Returns
        -------
        dict or None
            The block, or None for a paragraph with neither text nor math.
        """
        style_name = (para.style.name or "").strip() if para.style else ""
        text = para.text.strip()

        # Math must be looked up *before* the empty-text check: the text of
        # a display equation lives in math runs, which ``para.text`` does
        # not include, so such a paragraph would otherwise be discarded.
        equation_latex = self._extract_equation(para, warnings)

        if not text and not equation_latex:
            return None

        runs = self._extract_runs(para)

        block: Dict[str, Any] = {
            "index": block_index,
            "text": text,
            "style": style_name,
            "runs": runs,
        }

        # NOTE(review): a text paragraph with inline math is also reported
        # as "equation" (historical behaviour) - confirm this is intended.
        if equation_latex:
            block["type"] = "equation"
            block["latex"] = equation_latex
            return block

        # Heading (style-based first, then heuristic)
        level = self._detect_heading(para, style_name, text, runs)
        if level is not None:
            styled = self._heading_level_from_style(style_name) is not None
            block["type"] = "heading"
            block["level"] = level
            block["detection_method"] = "style" if styled else "heuristic"
            return block

        caption_info = self._detect_caption(style_name, text)
        if caption_info:
            block["type"] = "caption"
            block.update(caption_info)
            return block

        if in_reference_section:
            block["type"] = "reference-paragraph"
            block.update(self._parse_reference_entry(text))
            return block

        if self._is_list_item(para):
            block["type"] = "list-item"
            block.update(self._parse_list_item(para))
            return block

        block["type"] = "paragraph"
        return block

    # ------------------------------------------------------------------
    # Headings
    # ------------------------------------------------------------------
    def _detect_heading(
        self,
        para,
        style_name: str,
        text: str,
        runs: List[Dict[str, Any]],
    ) -> Optional[int]:
        """
        Detect heading using multiple strategies:
        1. Style-based (most reliable)
        2. Font-based heuristics (bold, larger size)
        3. Content-based (known section titles)

        Returns the heading level, or None if the paragraph is no heading.
        """
        # Strategy 1: Style-based detection
        level = self._heading_level_from_style(style_name)
        if level is not None:
            return level

        text_clean = text.strip()
        all_bold = self._all_text_runs_bold(runs)

        # Strategy 2: short, fully bold text in a heading-like font size
        if len(text_clean) < 100 and all_bold:
            avg_size = self._get_average_font_size(runs)
            if avg_size and avg_size >= 12 and self._looks_like_heading(text_clean):
                return 1 if avg_size >= 14 else 2

        # Strategy 3: known section titles
        # Numbered sections: "1. Introduction", "2.1 Methods"
        numbered_match = re.match(r"^(\d+(?:\.\d+)*)[\.:\s]+(.+)$", text_clean)
        if numbered_match:
            section_text = numbered_match.group(2).lower().strip().rstrip(".:;")
            if section_text in COMMON_SECTION_HEADINGS:
                # "2.1" has one dot -> level 2; levels are capped at 4.
                depth = numbered_match.group(1).count(".")
                return min(depth + 1, 4)

        # Unnumbered titles need extra evidence (bold or ALL CAPS), else a
        # one-word sentence such as "Results." would become a heading.
        text_lower = text_clean.lower().rstrip(".:;")
        if text_lower in COMMON_SECTION_HEADINGS:
            is_all_caps = text_clean.isupper() and len(text_clean) > 3
            if all_bold or is_all_caps:
                return 1

        return None

    def _looks_like_heading(self, text: str) -> bool:
        """Check if text looks like a heading based on content patterns."""
        text_lower = text.lower().rstrip(".:;")

        # Known section title
        if text_lower in COMMON_SECTION_HEADINGS:
            return True

        # Numbered section ("2.1 Data")
        if re.match(r"^\d+(?:\.\d+)*\s+\w", text):
            return True

        # Short ALL-CAPS text
        return text.isupper() and 3 < len(text) < 50

    def _get_average_font_size(self, runs: List[Dict[str, Any]]) -> Optional[float]:
        """Return the mean "font_size" of the runs that declare one, or None."""
        sizes = [r["font_size"] for r in runs if r.get("font_size")]
        return sum(sizes) / len(sizes) if sizes else None

    def _heading_level_from_style(self, style_name: str) -> Optional[int]:
        """Return heading level for a given Word style, or None."""
        for level, expected_style in self.profile.heading_styles.items():
            if style_name == expected_style:
                return level
        return None

    # ------------------------------------------------------------------
    # Captions
    # ------------------------------------------------------------------
    def _detect_caption(self, style_name: str, text: str) -> Optional[Dict[str, Any]]:
        """
        Detect and parse captions using multiple patterns.

        Returns caption info dict or None.
        """
        # The caption style is the most reliable signal.
        if style_name == self.profile.caption_style:
            return self._parse_caption(text)

        # Generic patterns.  DOTALL lets the caption body span the soft
        # line breaks that appear as "\n" in the paragraph text.
        text_stripped = text.strip()
        for pattern, caption_type in CAPTION_PATTERNS:
            match = re.match(pattern, text_stripped, re.IGNORECASE | re.DOTALL)
            if match:
                return {
                    "caption_type": caption_type,
                    "number": int(match.group(2)),
                    "caption_text": match.group(3).strip(),
                }

        # Profile-specific prefixes
        if self._is_caption(style_name, text):
            return self._parse_caption(text)

        return None

    def _is_caption(self, style_name: str, text: str) -> bool:
        """Check if paragraph is a caption (by style or by text prefix)."""
        if style_name == self.profile.caption_style:
            return True

        text_lower = text.lower()
        prefixes = (
            self.profile.figure_caption_prefixes + self.profile.table_caption_prefixes
        )
        return any(text_lower.startswith(prefix.lower()) for prefix in prefixes)

    def _parse_caption(self, text: str) -> Dict[str, Any]:
        """Parse caption text to extract figure/table number.

        Returns a dict with "caption_type" ("figure", "table" or
        "unknown") and "caption_text"; "number" is present only when a
        profile prefix followed by a number was recognised.
        """
        candidates = (
            ("figure", self.profile.figure_caption_prefixes),
            ("table", self.profile.table_caption_prefixes),
        )
        for caption_type, prefixes in candidates:
            for prefix in prefixes:
                pattern = rf"^{re.escape(prefix)}\.?\s*(\d+)[\.:]?\s*(.*)$"
                # DOTALL: the caption body may contain soft line breaks.
                match = re.match(pattern, text, re.IGNORECASE | re.DOTALL)
                if match:
                    return {
                        "caption_type": caption_type,
                        "number": int(match.group(1)),
                        "caption_text": match.group(2).strip(),
                    }

        return {"caption_type": "unknown", "caption_text": text}

    # ------------------------------------------------------------------
    # Equations
    # ------------------------------------------------------------------
    def _extract_equation(
        self,
        para,
        warnings: Optional[List[str]] = None,
    ) -> Optional[str]:
        """
        Extract equation from paragraph if it contains OMML (Office Math Markup).

        Returns the LaTeX representation (several equations are joined by a
        space) or None.  Conversion is best-effort: on failure None is
        returned and a message is added to *warnings* (if given).
        """
        try:
            math_elements = para._element.findall(
                ".//m:oMath", namespaces=self._MATH_NS
            )
            latex_parts = [
                latex for latex in map(self._omml_to_latex, math_elements) if latex
            ]
        except Exception as exc:  # never let a broken equation stop the import
            self._warn(warnings, f"Could not convert equation: {exc}")
            return None
        return " ".join(latex_parts) if latex_parts else None

    def _omml_to_latex(self, math_elem) -> str:
        """
        Convert OMML element to LaTeX string.

        This is a basic converter - handles common cases: runs, fractions,
        radicals, sub/superscripts, n-ary operators and delimiters.  Any
        other element is converted by concatenating its children.
        """
        ns = self._MATH_NS
        val_attr = self._MATH_VAL

        def get_text(elem) -> str:
            """Return all text contained in *elem*, in document order."""
            return "".join(elem.itertext())

        def convert_children(elem) -> str:
            """Convert all children of an element (None -> "")."""
            if elem is None:
                return ""
            return "".join(convert_element(child) for child in elem)

        def child_tex(parent, name: str) -> str:
            """Convert the direct child ``m:<name>`` of *parent*."""
            return convert_children(parent.find(f"m:{name}", namespaces=ns))

        def delimiter(parent, prop: str, default: str) -> str:
            """Return the LaTeX form of a delimiter character.

            Only the delimiter's own properties are consulted; a
            descendant search could return a nested delimiter's character.
            """
            node = parent.find(f"m:dPr/m:{prop}", namespaces=ns)
            char = default if node is None else node.get(val_attr, "")
            return self._LATEX_DELIMITERS.get(char, char)

        def convert_element(elem) -> str:
            """Convert a single OMML element to LaTeX."""
            raw_tag = elem.tag
            # Comments / processing instructions have a non-string tag.
            tag = raw_tag.split("}")[-1] if isinstance(raw_tag, str) else ""

            if tag == "r":  # Run (text)
                return get_text(elem)

            if tag == "f":  # Fraction
                num_tex = child_tex(elem, "num")
                den_tex = child_tex(elem, "den")
                return f"\\frac{{{num_tex}}}{{{den_tex}}}"

            if tag == "rad":  # Radical/root
                deg = elem.find("m:deg", namespaces=ns)
                content_tex = child_tex(elem, "e")
                if deg is not None and get_text(deg).strip():
                    return f"\\sqrt[{convert_children(deg)}]{{{content_tex}}}"
                return f"\\sqrt{{{content_tex}}}"

            if tag == "sSup":  # Superscript
                return f"{child_tex(elem, 'e')}^{{{child_tex(elem, 'sup')}}}"

            if tag == "sSub":  # Subscript
                return f"{child_tex(elem, 'e')}_{{{child_tex(elem, 'sub')}}}"

            if tag == "sSubSup":  # Sub-superscript
                base_tex = child_tex(elem, "e")
                sub_tex = child_tex(elem, "sub")
                sup_tex = child_tex(elem, "sup")
                return f"{base_tex}_{{{sub_tex}}}^{{{sup_tex}}}"

            if tag == "nary":  # N-ary (sum, product, integral)
                chr_elem = elem.find("m:naryPr/m:chr", namespaces=ns)
                symbol = chr_elem.get(val_attr) if chr_elem is not None else None
                symbol = symbol or self._NARY_DEFAULT
                result = self._NARY_SYMBOLS.get(symbol, symbol)
                # Limits are present in the XML even when left empty;
                # skip those so no "_{}" / "^{}" is produced.
                sub_tex = child_tex(elem, "sub")
                sup_tex = child_tex(elem, "sup")
                if sub_tex:
                    result += f"_{{{sub_tex}}}"
                if sup_tex:
                    result += f"^{{{sup_tex}}}"
                content = elem.find("m:e", namespaces=ns)
                if content is not None:
                    result += f" {convert_children(content)}"
                return result

            if tag == "d":  # Delimiter (parentheses, brackets)
                # A delimiter may hold several arguments, shown with a
                # separator character between them.
                sep_elem = elem.find("m:dPr/m:sepChr", namespaces=ns)
                separator = "|" if sep_elem is None else sep_elem.get(val_attr, "")
                content_tex = separator.join(
                    convert_children(arg)
                    for arg in elem.findall("m:e", namespaces=ns)
                )
                left = delimiter(elem, "begChr", "(")
                right = delimiter(elem, "endChr", ")")
                return f"\\left{left}{content_tex}\\right{right}"

            # Containers (e, num, den, sub, sup, deg) and unknown elements:
            # just process the children.
            return convert_children(elem)

        return convert_element(math_elem)

    # ------------------------------------------------------------------
    # Lists
    # ------------------------------------------------------------------
    def _is_list_item(self, para) -> bool:
        """Check if paragraph is a list item.

        A paragraph counts as a list item when it has Word numbering
        properties, or when its text starts with a bullet character or an
        enumerator such as "1.", "(a)", "(iv)" or "b)".
        """
        try:
            pPr = para._element.find(qn("w:pPr"))
            if pPr is not None and pPr.find(qn("w:numPr")) is not None:
                return True

            text = para.text.strip()
            if re.match(r"^[\u2022\u2023\u25E6\u2043\u2219]\s", text):
                return True
            return bool(
                re.match(
                    r"^(\d+[\.\):]|\([a-z]\)|\([ivxlc]+\)|[a-z][\.\)])\s",
                    text,
                    re.IGNORECASE,
                )
            )
        except Exception:  # detection is a heuristic; failure means "no"
            return False

    def _parse_list_item(self, para) -> Dict[str, Any]:
        """Parse list item to extract level and content.

        Returns a dict with "list_type" ("ordered"/"unordered") and the
        nesting "level" (0-based, from the numbering properties).
        """
        info: Dict[str, Any] = {"list_type": "unordered", "level": 0}

        try:
            pPr = para._element.find(qn("w:pPr"))
            numPr = pPr.find(qn("w:numPr")) if pPr is not None else None
            ilvl = numPr.find(qn("w:ilvl")) if numPr is not None else None
            if ilvl is not None:
                info["level"] = int(ilvl.get(qn("w:val"), 0))

            # Only a number typed into the text marks the item as ordered;
            # the numbering definition itself is not inspected here.
            if re.match(r"^\d+[\.\):]\s", para.text.strip()):
                info["list_type"] = "ordered"
        except Exception:  # keep the defaults on malformed properties
            pass

        return info

    # ------------------------------------------------------------------
    # Runs, references, tables, images
    # ------------------------------------------------------------------
    def _extract_runs(self, para) -> List[Dict[str, Any]]:
        """Extract formatted runs from a paragraph.

        Runs without text are skipped.  "bold" and "italic" keep the
        run's own value, so None means "inherited from the style".
        """
        runs = []
        for run in para.runs:
            if not run.text:
                continue

            run_data = {
                "text": run.text,
                "bold": run.bold,
                "italic": run.italic,
                # None = inherited, False = explicitly off; only a truthy
                # value means the run itself is underlined.
                "underline": bool(run.underline),
            }

            if run.font.size:
                run_data["font_size"] = run.font.size.pt
            if run.font.name:
                run_data["font_name"] = run.font.name

            runs.append(run_data)
        return runs

    def _parse_reference_entry(self, text: str) -> Dict[str, Any]:
        """Parse a reference entry to extract citation number.

        Recognises the leading markers "[1]", "1." and "(1)".  Returns
        "ref_number" and the remaining "ref_text", or only "ref_text" (the
        unchanged text) when no marker is found.
        """
        patterns = (
            r"^\[(\d+)\]",  # [1] Author...
            r"^(\d+)\.",  # 1. Author...
            r"^\((\d+)\)",  # (1) Author...
        )
        for pattern in patterns:
            match = re.match(pattern, text)
            if match:
                return {
                    "ref_number": int(match.group(1)),
                    "ref_text": text[match.end():].strip(),
                }
        return {"ref_text": text}

    def _process_table(
        self,
        table,
        block_index: int,
    ) -> Dict[str, Any]:
        """Process a table into a block holding the stripped cell texts.

        "num_cols" is the number of cells in the first row.
        """
        rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]

        return {
            "index": block_index,
            "type": "table",
            "rows": rows,
            "num_rows": len(rows),
            "num_cols": len(rows[0]) if rows else 0,
        }

    def _extract_images(
        self,
        doc: "DocxDocument",
        source_path: Path,
        warnings: Optional[List[str]] = None,
    ) -> List[Dict[str, Any]]:
        """Extract embedded images from the document.

        Extraction is best-effort: an image that cannot be read is skipped
        and reported in *warnings* (if given) instead of aborting the
        remaining images.  Linked (external) images are not extracted.

        Parameters
        ----------
        doc : DocxDocument
            The opened document.
        source_path : Path
            Path of the source file (currently unused).
        warnings : list of str, optional
            Receives a message for every image that could not be read.
        """
        images: List[Dict[str, Any]] = []

        try:
            image_parts = list(self._iter_image_parts(doc))
        except Exception as exc:
            self._warn(warnings, f"Could not list document images: {exc}")
            return images

        for rel_id, image_part in image_parts:
            try:
                image_bytes = image_part.blob
                content_type = image_part.content_type
                images.append(
                    {
                        "rel_id": rel_id,
                        "hash": self._image_hash(image_bytes),
                        "content_type": content_type,
                        # Unknown content types fall back to ".png".
                        "extension": self._IMAGE_EXTENSIONS.get(content_type, ".png"),
                        "size_bytes": len(image_bytes),
                        "data": image_bytes,  # Raw bytes
                    }
                )
            except Exception as exc:
                self._warn(warnings, f"Could not extract image {rel_id}: {exc}")

        return images

    def _parse_references(
        self,
        blocks: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Extract and structure references from blocks.

        Every "reference-paragraph" block yields one entry with its
        "number" (None if unnumbered), cleaned "text" and "raw" text.
        """
        return [
            {
                "number": block.get("ref_number"),
                "text": block.get("ref_text", block.get("text", "")),
                "raw": block.get("text", ""),
            }
            for block in blocks
            if block.get("type") == "reference-paragraph"
        ]
# Public API of this module: only the reader class is exported.
__all__ = ["WordReader"]