# Source code for scitex_msword

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 15:15:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/__init__.py

"""
MS Word (DOCX) import/export utilities for SciTeX.

This module provides high-level functions to convert between
MS Word .docx files and SciTeX's internal writer document model.

Strategy:
---------
- Word users write text only (paragraphs, minimal formatting)
- SciTeX handles: figures, tables, references, LaTeX generation
- SciTeX JSON is the "source of truth", Word is just a view/edit layer

Typical usage:
--------------
    from scitex_msword import load_docx, save_docx, list_profiles

    # Import from Word
    doc = load_docx("input.docx", profile="generic")

    # Manipulate via scitex.writer...
    # doc.normalize()

    # Export to Word (different journal template)
    save_docx(doc, "output.docx", profile="mdpi-ijerph")

Available profiles:
-------------------
- generic: Standard Word with Heading 1/2/3
- mdpi-ijerph: MDPI IJERPH journal template
- resna-2025: RESNA 2025 scientific paper template
- iop-double-anonymous: IOP double-anonymous template
"""

from __future__ import annotations

# Resolve the installed distribution's version. Fall back to a local
# placeholder when the distribution metadata is missing (e.g. running from a
# source checkout) or when importlib.metadata itself cannot be imported.
try:
    import importlib.metadata as _metadata

    try:
        __version__ = _metadata.version("scitex-msword")
    except _metadata.PackageNotFoundError:
        __version__ = "0.0.0+local"
    # Keep the helper alias out of the package namespace.
    del _metadata
except ImportError:  # pragma: no cover — only on ancient Pythons
    __version__ = "0.0.0+local"

from pathlib import Path
from typing import Any, Optional

from .bold import preserve_bold_tokens
from .comments import apply_comments_as_edits, extract_comments
from .diff import diff_docx, summarize_diff
from .highlights import (
    clear_highlights,
    extract_highlights,
    mark_additions,
    mark_modifications,
)
from .profiles import BaseWordProfile, get_profile, list_profiles, register_profile
from .reader import WordReader
from ._save_document import save_document
from .tables import insert_table_after_paragraph
from .track_changes import (
    accept_all_tracked_changes,
    enable_track_changes,
    extract_tracked_changes,
    is_track_changes_enabled,
    reject_all_tracked_changes,
    save_with_track_changes_on,
    wrap_as_tracked_deletion,
    wrap_as_tracked_insertion,
)
from .utils import (
    create_post_import_hook,
    link_captions_to_images,
    link_captions_to_images_by_proximity,
    normalize_section_headings,
    validate_document,
)
from .writer import WordWriter



# [docs]
def load_docx(
    path: str | Path,
    profile: str | None = None,
    extract_images: bool = True,
) -> dict[str, Any]:
    """
    Read a DOCX file and return it as a SciTeX writer document.

    The profile is resolved with ``get_profile`` and handed to a
    ``WordReader``, which performs the actual parsing.

    Parameters
    ----------
    path : str | Path
        Location of the .docx file to read.
    profile : str | None
        Name of the profile describing how Word styles are interpreted
        (e.g., "mdpi-ijerph", "resna-2025"). If None, "generic" is used.
    extract_images : bool
        If True, embedded images are extracted and references to them
        are stored in the result.

    Returns
    -------
    dict
        A SciTeX writer document structure containing:
        - blocks: List of document blocks (headings, paragraphs, captions, etc.)
        - metadata: Profile and source file information
        - images: Extracted image references (if extract_images=True)
        - references: Parsed reference entries

    Examples
    --------
    >>> from scitex_msword import load_docx
    >>> doc = load_docx("manuscript.docx", profile="mdpi-ijerph")
    >>> doc["metadata"]["profile"]
    'mdpi-ijerph'
    """
    docx_path = Path(path)
    word_reader = WordReader(
        profile=get_profile(profile),
        extract_images=extract_images,
    )
    return word_reader.read(docx_path)




# [docs]
def save_docx(
    writer_doc: dict[str, Any] | Any,
    path: str | Path,
    profile: str | None = None,
    overwrite: bool = True,
    template_path: str | Path | None = None,
) -> Path:
    """
    Write a SciTeX writer document out as a DOCX file.

    The profile is resolved with ``get_profile`` and handed, together with
    the optional template, to a ``WordWriter`` that produces the file.

    Parameters
    ----------
    writer_doc : dict | Any
        SciTeX writer document instance to export.
    path : str | Path
        Destination of the .docx file.
    profile : str | None
        Name of the profile controlling how sections, headings, figures,
        tables and references are mapped to Word styles.
        If None, "generic" is used.
    overwrite : bool
        When False, an already existing destination is left untouched and
        FileExistsError is raised instead.
    template_path : str | Path | None
        Optional Word template (.dotx/.docx) used as the base document,
        which allows journal-specific formatting.

    Returns
    -------
    Path
        The destination path, as a ``Path`` object.

    Raises
    ------
    FileExistsError
        If the destination exists and ``overwrite`` is False.

    Examples
    --------
    >>> from scitex_msword import save_docx
    >>> save_docx(doc, "submission_resna_2025.docx", profile="resna-2025")
    PosixPath('submission_resna_2025.docx')
    """
    destination = Path(path)

    # Guard clause: refuse to clobber an existing file unless allowed to.
    already_there = destination.exists()
    if already_there and not overwrite:
        raise FileExistsError(f"File already exists: {destination}")

    WordWriter(
        profile=get_profile(profile),
        template_path=template_path,
    ).write(writer_doc, destination)
    return destination




# [docs]
def convert_docx_to_tex(
    input_path: str | Path,
    output_path: str | Path,
    profile: str | None = None,
    *,
    image_dir: str | Path | None = None,
    link_images: bool = True,
    link_mode: str = "by-number",
    normalize_headings: bool = True,
    validate: bool = True,
) -> Path:
    """
    Convert a DOCX file directly to LaTeX.

    This is a convenience function that:
    1. Loads the DOCX file into SciTeX intermediate format
    2. (Optionally) normalizes headings
    3. (Optionally) links figure captions to images
    4. (Optionally) validates the document and adds warnings
    5. Exports to LaTeX (including figures via image_dir)

    Parameters
    ----------
    input_path : str | Path
        Path to the input .docx file.
    output_path : str | Path
        Path for the output .tex file.
    profile : str | None
        Word profile for interpreting styles
        (e.g., "resna-2025", "iop-double-anonymous").
    image_dir : str | Path | None, optional
        Directory where extracted figure image files will be saved.
        If None, the LaTeX exporter will create "<tex_stem>_figures"
        next to `output_path`.
    link_images : bool, default True
        Whether to link figure captions to extracted images so that
        LaTeX can generate \\includegraphics inside figure environments.
        Linking is skipped when the loaded document has no images.
    link_mode : {"by-number", "by-proximity"}, default "by-number"
        Strategy for linking captions to images:
        - "by-number": Figure 1 -> first image, Figure 2 -> second image...
        - "by-proximity": assign images in document order, useful when
          figure numbers and image order don't match.
        Any other value falls back to "by-number" and, when `link_images`
        is True, triggers a UserWarning.
    normalize_headings : bool, default True
        If True, apply common heading normalizations
        (e.g., "intro" -> "Introduction").
    validate : bool, default True
        If True, run basic structural checks and populate
        doc["warnings"] with any issues.

    Returns
    -------
    Path
        The path to the written .tex file (the value returned by the
        LaTeX exporter).

    Raises
    ------
    ImportError
        If the LaTeX exporter (``scitex_tex.export_tex``) cannot be imported.

    Warns
    -----
    UserWarning
        If `link_images` is True and `link_mode` is not one of the two
        documented values.

    Examples
    --------
    .. code-block:: python

        from scitex_msword import convert_docx_to_tex

        convert_docx_to_tex(
            "RESNA 2025 Scientific Paper Template.docx",
            "manuscript.tex",
            profile="resna-2025",
            image_dir="figures",
        )
        # -> PosixPath('manuscript.tex')
    """
    # Function-scope import: only this function needs it.
    import warnings

    # A typo such as "by_proximity" used to select "by-number" silently.
    # Keep that fallback so existing callers still work, but make it visible.
    if link_images and link_mode not in ("by-number", "by-proximity"):
        warnings.warn(
            f"Unknown link_mode {link_mode!r}; expected 'by-number' or "
            "'by-proximity'. Falling back to 'by-number'.",
            UserWarning,
            stacklevel=2,
        )

    # Lazy import: convert_docx_to_tex requires scitex (or scitex-tex) for the
    # final .tex serialization step. Other scitex_msword functions don't need it.
    # NOTE(review): the message below names the 'scitex' package while the
    # import targets 'scitex_tex' — confirm which install hint is correct.
    try:
        from scitex_tex import export_tex
    except ImportError as _e:
        raise ImportError(
            "convert_docx_to_tex requires the 'scitex' package (provides scitex.tex). "
            "Install with: pip install scitex"
        ) from _e

    # 1. DOCX -> SciTeX intermediate format (images are always extracted so
    #    that step 3 and the exporter have something to work with)
    doc = load_docx(input_path, profile=profile, extract_images=True)

    # 2. Normalize headings (optional)
    if normalize_headings:
        doc = normalize_section_headings(doc)

    # 3. Link captions to images (optional; nothing to do without images)
    if link_images and doc.get("images"):
        if link_mode == "by-proximity":
            doc = link_captions_to_images_by_proximity(doc)
        else:
            # Default (and fallback for unknown modes): link by figure number
            doc = link_captions_to_images(doc)

    # 4. Validate document structure (optional)
    if validate:
        doc = validate_document(doc)

    # 5. SciTeX -> LaTeX (with figures)
    return export_tex(doc, output_path, image_dir=image_dir)



# Public API of the package: the names exported by
# ``from scitex_msword import *``. Every entry is defined or imported above.
__all__ = [
    # Package version and the high-level functions defined in this module
    "__version__",
    "load_docx",
    "save_docx",
    "convert_docx_to_tex",
    # Profile helpers re-exported from .profiles
    "list_profiles",
    "get_profile",
    "register_profile",
    "BaseWordProfile",
    # Reader / writer classes re-exported from .reader and .writer
    "WordReader",
    "WordWriter",
    # Utility functions for post-processing
    "link_captions_to_images",
    "link_captions_to_images_by_proximity",
    "normalize_section_headings",
    "validate_document",
    "create_post_import_hook",
    # Diff API (BOOST v16 dogfooding)
    "diff_docx",
    "summarize_diff",
    # Highlight API (BOOST v16 visual marking)
    "mark_additions",
    "mark_modifications",
    "extract_highlights",
    "clear_highlights",
    # Bold-token preservation (BOOST v16 Japanese keyword emphasis)
    "preserve_bold_tokens",
    # Comment extraction (+ narrow REPLACE-grammar application)
    "extract_comments",
    "apply_comments_as_edits",
    # Track-Changes API (BOOST post-v16 dogfooding)
    "enable_track_changes",
    "is_track_changes_enabled",
    "wrap_as_tracked_insertion",
    "wrap_as_tracked_deletion",
    "extract_tracked_changes",
    "accept_all_tracked_changes",
    "reject_all_tracked_changes",
    "save_with_track_changes_on",
    # Document-based save API (BOOST v37 dogfooding — no writer-dict round-trip)
    "save_document",
    # Table insertion API (BOOST v43 dogfooding — direct lxml <w:tbl> with TC)
    "insert_table_after_paragraph",
]