Source code for scitex_msword.utils

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 16:45:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/utils.py

"""
Utility functions for processing MS Word documents.

These functions can be used as post_import_hooks or called directly
to process document structures.
"""

from __future__ import annotations

from typing import Any, Dict, List



[docs]
def link_captions_to_images(doc: Dict[str, Any]) -> Dict[str, Any]:
    """
    Attach an ``image_hash`` to figure captions using their figure number.

    Every block with ``type == "caption"`` and ``caption_type == "figure"``
    whose ``number`` is an integer ``N`` receives the ``hash`` of the
    ``N``-th entry (1-indexed) of ``doc["images"]``. Captions without an
    integer number, or whose number falls outside the image list, are
    left untouched.

    Parameters
    ----------
    doc : dict
        SciTeX writer document with 'blocks' and 'images' keys.

    Returns
    -------
    dict
        The same document object, modified in place.

    Examples
    --------
    >>> from scitex.msword import load_docx
    >>> from scitex.msword.utils import link_captions_to_images
    >>> doc = load_docx("manuscript.docx")
    >>> doc = link_captions_to_images(doc)
    >>> # Now captions have image_hash for LaTeX export
    """

    def _is_figure_caption(candidate: Dict[str, Any]) -> bool:
        return (
            candidate.get("type") == "caption"
            and candidate.get("caption_type") == "figure"
        )

    all_blocks = doc.get("blocks", [])
    available_images = doc.get("images", [])

    # Select the captions first, then mutate them in a second pass.
    targets = list(filter(_is_figure_caption, all_blocks))

    for target in targets:
        number = target.get("number")
        if not isinstance(number, int):
            # Covers a missing number (None) as well as non-integer values.
            continue

        # Figure numbers start at 1, list positions at 0.
        position = number - 1
        if position < 0 or position >= len(available_images):
            continue

        target["image_hash"] = available_images[position].get("hash")

    return doc




[docs]
def link_captions_to_images_by_proximity(doc: Dict[str, Any]) -> Dict[str, Any]:
    """
    Link figure captions to images by document proximity.

    Image blocks (``type == "image"``) and figure captions
    (``type == "caption"``, ``caption_type == "figure"``) are located by
    their index in ``doc["blocks"]``. Captions are processed in document
    order; each one receives the ``image_hash`` of the nearest not-yet-used
    image block that precedes it, or, if there is none, of the nearest
    not-yet-used image block that follows it. Each hash is assigned to at
    most one caption.

    Image blocks without a usable (truthy) ``image_hash`` are ignored as
    candidates, so they cannot hide a valid image that is further away.

    If the document contains no image blocks at all, captions are paired
    with the entries of ``doc["images"]`` by order instead, using each
    entry's ``hash``.

    Parameters
    ----------
    doc : dict
        SciTeX writer document.

    Returns
    -------
    dict
        The same document object, with image_hash added to the captions
        that could be linked.
    """
    blocks = doc.get("blocks", [])

    # Collect image blocks and figure captions with their indices
    image_blocks = []
    figure_captions = []
    for i, block in enumerate(blocks):
        block_type = block.get("type")
        if block_type == "image":
            image_blocks.append((i, block))
        elif block_type == "caption" and block.get("caption_type") == "figure":
            figure_captions.append((i, block))

    if not image_blocks:
        # Fallback to old behavior using doc["images"] list
        images = doc.get("images", [])
        if not images:
            return doc
        for (_, caption), image in zip(figure_captions, images):
            caption["image_hash"] = image.get("hash")
        return doc

    # Only image blocks that actually carry a hash can be linked. Keeping
    # hash-less blocks in the search would let them "win" on distance while
    # contributing nothing to assign.
    candidates = []
    for img_idx, img_block in image_blocks:
        img_hash = img_block.get("image_hash")
        if img_hash:
            candidates.append((img_idx, img_hash))

    used_images = set()

    for cap_idx, caption in figure_captions:
        # candidates is in ascending block order, and so is every list
        # derived from it below.
        available = [
            (img_idx, img_hash)
            for img_idx, img_hash in candidates
            if img_hash not in used_images
        ]
        preceding = [img_hash for img_idx, img_hash in available if img_idx < cap_idx]

        if preceding:
            # Prefer images that come before the caption (typical layout);
            # the last preceding one is the closest.
            chosen = preceding[-1]
        elif available:
            # Nothing before the caption: everything left follows it, and
            # the first of those is the closest.
            chosen = available[0][1]
        else:
            continue

        caption["image_hash"] = chosen
        used_images.add(chosen)

    return doc




[docs]
def normalize_section_headings(doc: Dict[str, Any]) -> Dict[str, Any]:
    """
    Rewrite well-known top-level section titles to a canonical spelling.

    Only blocks with ``type == "heading"`` and ``level == 1`` are
    considered. Their text is stripped and lower-cased for the lookup;
    if it matches a known alias exactly, the block's ``text`` is replaced
    with the canonical title, e.g.:
    - "intro" -> "Introduction"
    - "method" -> "Methods"
    - "bibliography" -> "References"

    Headings that match no alias are left exactly as they are.

    Parameters
    ----------
    doc : dict
        SciTeX writer document.

    Returns
    -------
    dict
        The same document object, with matching headings rewritten in place.
    """
    # Canonical title -> the lower-case spellings that map onto it.
    aliases_by_title = {
        "Introduction": ("intro", "introduction"),
        "Methods": ("method", "methods"),
        "Materials and Methods": ("materials and methods",),
        "Results": ("result", "results"),
        "Discussion": ("discussion",),
        "Conclusions": ("conclusion", "conclusions"),
        "Acknowledgements": ("acknowledgement", "acknowledgements"),
        "References": ("reference", "references", "bibliography"),
    }
    canonical_for = {
        alias: title
        for title, aliases in aliases_by_title.items()
        for alias in aliases
    }

    for entry in doc.get("blocks", []):
        if entry.get("type") != "heading" or entry.get("level") != 1:
            continue

        lookup_key = entry.get("text", "").strip().lower()
        replacement = canonical_for.get(lookup_key)
        if replacement is not None:
            entry["text"] = replacement

    return doc




[docs]
def validate_document(doc: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate document structure and add warnings.

    Checks performed:
    - Missing required sections: "introduction", "methods", "results",
      "discussion" and "references" must each occur as a case-insensitive
      substring of at least one heading's text.
    - Duplicate figure numbers: a ``number`` shared by several figure
      captions is reported once. Captions without a number are ignored.
    - Missing references: reported when ``doc["references"]`` is empty or
      absent and no block has ``type == "reference-paragraph"``.

    Warnings are appended to the list already stored under
    ``doc["warnings"]`` (a new list is created when there is none), so
    validating the same document repeatedly accumulates repeated messages.

    Parameters
    ----------
    doc : dict
        SciTeX writer document.

    Returns
    -------
    dict
        The same document object with ``doc["warnings"]`` set.
    """
    blocks = doc.get("blocks", [])

    # Reuse an existing list in place; tolerate an explicit None.
    warnings = doc.get("warnings")
    if warnings is None:
        warnings = []

    # Check for required sections
    # `or ""` guards against headings whose text is present but None.
    headings = [
        (b.get("text") or "").lower() for b in blocks if b.get("type") == "heading"
    ]

    required_sections = [
        "introduction",
        "methods",
        "results",
        "discussion",
        "references",
    ]
    for section in required_sections:
        if not any(section in h for h in headings):
            warnings.append(f"Missing section: {section.title()}")

    # Check for duplicate figure numbers
    figure_numbers = [
        b.get("number")
        for b in blocks
        if b.get("type") == "caption" and b.get("caption_type") == "figure"
    ]
    seen = set()
    reported = set()
    for num in figure_numbers:
        if num is None:
            # An unnumbered caption is not a duplicate of another
            # unnumbered caption.
            continue
        if num in seen and num not in reported:
            warnings.append(f"Duplicate figure number: {num}")
            reported.add(num)
        seen.add(num)

    # Check for missing references
    references = doc.get("references", [])
    if not references:
        has_reference_block = any(
            b.get("type") == "reference-paragraph" for b in blocks
        )
        if not has_reference_block:
            warnings.append("No references found in document")

    doc["warnings"] = warnings
    return doc




[docs]
def create_post_import_hook(*functions):
    """
    Chain several document-processing functions into one hook.

    The returned callable feeds its argument to the first function, passes
    that function's return value to the second, and so on, returning the
    result of the last one. With no functions given, the hook returns its
    argument unchanged.

    Parameters
    ----------
    *functions : callable
        Functions to apply in sequence; each takes a document and returns
        a document.

    Returns
    -------
    callable
        A single hook that applies all functions in the order given.

    Examples
    --------
    >>> from scitex.msword.utils import (
    ...     link_captions_to_images,
    ...     normalize_section_headings,
    ...     create_post_import_hook,
    ... )
    >>> hook = create_post_import_hook(
    ...     link_captions_to_images,
    ...     normalize_section_headings,
    ... )
    >>> # Use with custom profile
    >>> profile.post_import_hooks = [hook]
    """

    def composite_hook(doc: Dict[str, Any]) -> Dict[str, Any]:
        # Thread the document through every stage, left to right.
        current = doc
        for stage in functions:
            current = stage(current)
        return current

    return composite_hook



# Public API of this module: the names exported by a wildcard import.
__all__ = [
    "link_captions_to_images",
    "link_captions_to_images_by_proximity",
    "normalize_section_headings",
    "validate_document",
    "create_post_import_hook",
]