# Source code for scitex_msword

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 15:15:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/__init__.py

"""
MS Word (DOCX) import/export utilities for SciTeX.

This module provides high-level functions to convert between
MS Word .docx files and SciTeX's internal writer document model.

Strategy:
---------
- Word users write text only (paragraphs, minimal formatting)
- SciTeX handles: figures, tables, references, LaTeX generation
- SciTeX JSON is the "source of truth", Word is just a view/edit layer

Typical usage:
--------------
    from scitex_msword import load_docx, save_docx, list_profiles

    # Import from Word
    doc = load_docx("input.docx", profile="generic")

    # Manipulate via scitex.writer...
    # doc.normalize()

    # Export to Word (different journal template)
    save_docx(doc, "output.docx", profile="mdpi-ijerph")

Available profiles:
-------------------
- generic: Standard Word with Heading 1/2/3
- mdpi-ijerph: MDPI IJERPH journal template
- resna-2025: RESNA 2025 scientific paper template
- iop-double-anonymous: IOP double-anonymous template
"""

from __future__ import annotations

# Resolve the installed distribution version; fall back to a local marker
# when the distribution metadata (or importlib.metadata itself) is missing.
try:
    import importlib.metadata as _metadata

    try:
        __version__ = _metadata.version("scitex-msword")
    except _metadata.PackageNotFoundError:
        # Distribution metadata is not available for "scitex-msword".
        __version__ = "0.0.0+local"
    # Keep the helper alias out of the package namespace.
    del _metadata
except ImportError:  # pragma: no cover — only on ancient Pythons
    __version__ = "0.0.0+local"

from pathlib import Path
from typing import Any, Optional

from .bold import preserve_bold_tokens
from .comments import apply_comments_as_edits, extract_comments
from .diff import diff_docx, summarize_diff
from .highlights import (
    clear_highlights,
    extract_highlights,
    mark_additions,
    mark_modifications,
)
from .profiles import BaseWordProfile, get_profile, list_profiles, register_profile
from .reader import WordReader
from .track_changes import (
    accept_all_tracked_changes,
    enable_track_changes,
    extract_tracked_changes,
    is_track_changes_enabled,
    reject_all_tracked_changes,
    wrap_as_tracked_deletion,
    wrap_as_tracked_insertion,
)
from .utils import (
    create_post_import_hook,
    link_captions_to_images,
    link_captions_to_images_by_proximity,
    normalize_section_headings,
    validate_document,
)
from .writer import WordWriter


def load_docx(
    path: str | Path,
    profile: str | None = None,
    extract_images: bool = True,
) -> dict[str, Any]:
    """
    Read a DOCX file into a SciTeX writer document.

    Parameters
    ----------
    path : str | Path
        Location of the .docx file to read.
    profile : str | None
        Name of the profile describing how Word styles are interpreted
        (e.g., "mdpi-ijerph", "resna-2025"). When None, "generic" is used.
    extract_images : bool
        When True, embedded images are extracted and references to them
        are stored in the result.

    Returns
    -------
    dict
        The SciTeX writer document structure, containing:
        - blocks: document blocks (headings, paragraphs, captions, etc.)
        - metadata: profile and source file information
        - images: extracted image references (if extract_images=True)
        - references: parsed reference entries

    Examples
    --------
    >>> from scitex_msword import load_docx
    >>> doc = load_docx("manuscript.docx", profile="mdpi-ijerph")
    >>> print(doc["metadata"]["profile"])
    'mdpi-ijerph'
    """
    # Normalize the path first so an invalid argument fails before any
    # profile lookup or reader construction happens.
    source = Path(path)
    docx_reader = WordReader(
        profile=get_profile(profile),
        extract_images=extract_images,
    )
    return docx_reader.read(source)
def save_docx(
    writer_doc: dict[str, Any] | Any,
    path: str | Path,
    profile: str | None = None,
    overwrite: bool = True,
    template_path: str | Path | None = None,
) -> Path:
    """
    Write a SciTeX writer document out as a DOCX file.

    Parameters
    ----------
    writer_doc : dict | Any
        The SciTeX writer document to export.
    path : str | Path
        Destination of the .docx file.
    profile : str | None
        Name of the profile controlling how sections, headings, figures,
        tables and references map to Word styles. When None, "generic"
        is used.
    overwrite : bool
        When False, an already existing destination raises
        FileExistsError instead of being replaced.
    template_path : str | Path | None
        Optional Word template (.dotx/.docx) used as the base document,
        which allows journal-specific formatting.

    Returns
    -------
    Path
        The path of the written .docx file.

    Raises
    ------
    FileExistsError
        If the destination exists and ``overwrite`` is False.

    Examples
    --------
    >>> from scitex_msword import save_docx
    >>> save_docx(doc, "submission_resna_2025.docx", profile="resna-2025")
    PosixPath('submission_resna_2025.docx')
    """
    destination = Path(path)

    # Refuse to clobber an existing file unless the caller opted in.
    already_present = destination.exists()
    if already_present and not overwrite:
        raise FileExistsError(f"File already exists: {destination}")

    docx_writer = WordWriter(
        profile=get_profile(profile),
        template_path=template_path,
    )
    docx_writer.write(writer_doc, destination)
    return destination
def convert_docx_to_tex(
    input_path: str | Path,
    output_path: str | Path,
    profile: str | None = None,
    *,
    image_dir: str | Path | None = None,
    link_images: bool = True,
    link_mode: str = "by-number",
    normalize_headings: bool = True,
    validate: bool = True,
) -> Path:
    """
    Convert a DOCX file directly to LaTeX.

    This is a convenience function that:
    1. Loads the DOCX file into SciTeX intermediate format
    2. (Optionally) normalizes headings
    3. (Optionally) links figure captions to images
    4. (Optionally) validates the document and adds warnings
    5. Exports to LaTeX (including figures via image_dir)

    Parameters
    ----------
    input_path : str | Path
        Path to the input .docx file.
    output_path : str | Path
        Path for the output .tex file.
    profile : str | None
        Word profile for interpreting styles
        (e.g., "resna-2025", "iop-double-anonymous"). Passed through to
        ``load_docx``.
    image_dir : str | Path | None, optional
        Directory where extracted figure image files will be saved.
        If None, the LaTeX exporter will create "<tex_stem>_figures"
        next to `output_path`.
    link_images : bool, default True
        Whether to link figure captions to extracted images so that
        LaTeX can generate \\includegraphics inside figure environments.
    link_mode : {"by-number", "by-proximity"}, default "by-number"
        Strategy for linking captions to images:
        - "by-number": Figure 1 -> first image, Figure 2 -> second image...
        - "by-proximity": assign images in document order, useful when
          figure numbers and image order don't match.
        Only consulted when `link_images` is True.
    normalize_headings : bool, default True
        If True, apply common heading normalizations
        (e.g., "intro" -> "Introduction").
    validate : bool, default True
        If True, run basic structural checks and populate
        doc["warnings"] with any issues.

    Returns
    -------
    Path
        The path to the written .tex file.

    Raises
    ------
    ValueError
        If `link_images` is True and `link_mode` is not one of the
        documented strategies.
    ImportError
        If the LaTeX exporter (``scitex_tex.export_tex``) cannot be
        imported.

    Examples
    --------
    >>> from scitex_msword import convert_docx_to_tex
    >>> convert_docx_to_tex(
    ...     "RESNA 2025 Scientific Paper Template.docx",
    ...     "manuscript.tex",
    ...     profile="resna-2025",
    ...     image_dir="figures",
    ... )
    PosixPath('manuscript.tex')
    """
    # Fail fast on a misspelled strategy: previously any unknown value
    # silently behaved like "by-number". Checked before the lazy import and
    # any file I/O so the caller gets the most relevant error first.
    valid_link_modes = ("by-number", "by-proximity")
    if link_images and link_mode not in valid_link_modes:
        raise ValueError(
            f"Unknown link_mode {link_mode!r}; "
            f"expected one of {valid_link_modes}"
        )

    # Lazy import: convert_docx_to_tex requires scitex (or scitex-tex) for the
    # final .tex serialization step. Other scitex_msword functions don't need it.
    # NOTE(review): the message below names 'scitex' / 'scitex.tex' while the
    # import targets the 'scitex_tex' module — confirm the intended install hint.
    try:
        from scitex_tex import export_tex
    except ImportError as _e:
        raise ImportError(
            "convert_docx_to_tex requires the 'scitex' package (provides scitex.tex). "
            "Install with: pip install scitex"
        ) from _e

    # 1. DOCX -> SciTeX intermediate format
    doc = load_docx(input_path, profile=profile, extract_images=True)

    # 2. Normalize headings (optional)
    if normalize_headings:
        doc = normalize_section_headings(doc)

    # 3. Link captions to images (optional; skipped when nothing was extracted)
    if link_images and doc.get("images"):
        if link_mode == "by-proximity":
            doc = link_captions_to_images_by_proximity(doc)
        else:
            # "by-number": link by figure number
            doc = link_captions_to_images(doc)

    # 4. Validate document structure (optional)
    if validate:
        doc = validate_document(doc)

    # 5. SciTeX -> LaTeX (with figures)
    return export_tex(doc, output_path, image_dir=image_dir)
# Public API of the package, grouped by feature area.
__all__ = [
    # Package version and core import/export entry points
    "__version__",
    "load_docx",
    "save_docx",
    "convert_docx_to_tex",
    # Profile registry and reader/writer classes
    "list_profiles",
    "get_profile",
    "register_profile",
    "BaseWordProfile",
    "WordReader",
    "WordWriter",
    # Post-processing utilities
    "link_captions_to_images",
    "link_captions_to_images_by_proximity",
    "normalize_section_headings",
    "validate_document",
    "create_post_import_hook",
    # Diff API (BOOST v16 dogfooding)
    "diff_docx",
    "summarize_diff",
    # Highlight API (BOOST v16 visual marking)
    "mark_additions",
    "mark_modifications",
    "extract_highlights",
    "clear_highlights",
    # Bold-token preservation (BOOST v16 Japanese keyword emphasis)
    "preserve_bold_tokens",
    # Comment extraction (+ narrow REPLACE-grammar application)
    "extract_comments",
    "apply_comments_as_edits",
    # Track-Changes API (BOOST post-v16 dogfooding)
    "enable_track_changes",
    "is_track_changes_enabled",
    "wrap_as_tracked_insertion",
    "wrap_as_tracked_deletion",
    "extract_tracked_changes",
    "accept_all_tracked_changes",
    "reject_all_tracked_changes",
]