Source code for scitex_msword.bold

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2026-06-02 00:00:00
# File: src/scitex_msword/bold.py
#
# Part of scitex-msword (AGPL-3.0-only). See LICENSE at the repo root.

"""
Preserve specified tokens by re-splitting runs and applying bold + font.

This module implements :func:`preserve_bold_tokens` which scans a
``python-docx`` Document, locates every occurrence of each token, and
ensures that token is its own run with ``bold=True`` and a configurable
font (default ``MS Gothic`` — used for emphasized Japanese keywords in
the BOOST 2026 application).

The implementation operates per paragraph: it reads the paragraph text,
finds token spans, then rebuilds the paragraph's runs as alternating
plain and token segments. Non-token segments take their character
formatting from the paragraph's *first* run, with bold and highlight
cleared; per-run formatting of the original text is not retained.
"""

from __future__ import annotations

import re
from typing import Any, Dict, List, Optional, Sequence, Tuple

# python-docx is treated as an optional dependency: the import outcome is
# recorded here so the failure can be reported lazily, with the original
# exception chained, by _ensure_docx_available().
try:
    from docx.document import Document as DocxDocument  # type: ignore[import-untyped]
    from docx.oxml.ns import qn  # type: ignore[import-untyped]

    DOCX_AVAILABLE = True
    _DOCX_IMPORT_ERROR: Optional[Exception] = None
except ImportError as exc:  # pragma: no cover
    DOCX_AVAILABLE = False
    _DOCX_IMPORT_ERROR = exc
    # Placeholders so the names exist and the module still imports
    # without python-docx installed.
    DocxDocument = None  # type: ignore[assignment,misc]
    qn = None  # type: ignore[assignment]


# Font face applied to matched tokens when the caller does not pass one.
DEFAULT_FONT = "MS Gothic"


def _ensure_docx_available() -> None:
    """
    Fail fast when the optional ``python-docx`` dependency is missing.

    Raises
    ------
    ImportError
        If the module-level import of ``docx`` did not succeed. The
        exception captured at import time is chained as the cause.
    """
    if DOCX_AVAILABLE:
        return
    message = (
        "python-docx is required for scitex_msword.bold. "
        "Install it via `pip install python-docx`."
    )
    raise ImportError(message) from _DOCX_IMPORT_ERROR


def _snapshot_run_format(run) -> Dict[str, Any]:
    """
    Record the character formatting of ``run`` as a plain dict.

    Parameters
    ----------
    run
        Object exposing ``bold``, ``italic``, ``underline`` and a ``font``
        with ``name``, ``size`` and ``highlight_color``.

    Returns
    -------
    dict
        Keys ``bold``, ``italic``, ``underline``, ``font_name``,
        ``font_size`` and ``highlight``. The three font-derived entries
        fall back to ``None`` when reading them raises.
    """
    snapshot: Dict[str, Any] = dict(
        bold=run.bold,
        italic=run.italic,
        underline=run.underline,
    )
    # Font properties are read best-effort: any failure while reaching
    # through ``run.font`` is recorded as "unknown" (None).
    font_fields = (
        ("font_name", "name"),
        ("font_size", "size"),
        ("highlight", "highlight_color"),
    )
    for key, attribute in font_fields:
        try:
            value = getattr(run.font, attribute)
        except Exception:
            value = None
        snapshot[key] = value
    return snapshot


def _apply_run_format(run, snap: Dict[str, Any]) -> None:
    """
    Write a snapshot produced by ``_snapshot_run_format`` back onto ``run``.

    Parameters
    ----------
    run
        Target object whose ``bold``/``italic``/``underline`` attributes
        and ``font`` properties are assigned.
    snap : dict
        Formatting snapshot. Missing keys are treated like ``None``.

    Notes
    -----
    ``None`` toggles are skipped, so an explicit ``False`` is written but
    "unset" is not. Font name and size are only written when truthy. A
    failure while setting the highlight colour is ignored.
    """
    for toggle in ("bold", "italic", "underline"):
        state = snap.get(toggle)
        if state is not None:
            setattr(run, toggle, state)

    face = snap.get("font_name")
    if face:
        run.font.name = face

    size = snap.get("font_size")
    if size:
        run.font.size = size

    highlight = snap.get("highlight")
    if highlight is None:
        return
    try:
        run.font.highlight_color = highlight
    except Exception:
        # Best-effort: a highlight value the target rejects must not
        # abort the rest of the formatting pass.
        pass


def _force_east_asian_font(run, font_name: str) -> None:
    """
    Point every font slot of ``run``'s ``w:rFonts`` element at ``font_name``.

    python-docx's ``run.font.name`` only sets the Latin face; for Japanese
    text (which BOOST keywords usually contain) Word also requires the
    East-Asian (``w:eastAsia``) attribute on the run's ``w:rFonts`` element.
    This helper sets the ``w:eastAsia``, ``w:ascii``, ``w:hAnsi`` and
    ``w:cs`` attributes so all four slots agree.

    Parameters
    ----------
    run
        A python-docx ``Run``; its underlying ``w:r`` element is edited
        in place.
    font_name : str
        Font face written into each slot.

    Notes
    -----
    Does nothing when python-docx could not be imported (``qn is None``).
    """
    if qn is None:  # pragma: no cover
        return
    rPr = run._element.get_or_add_rPr()
    # Let python-docx create a missing w:rFonts: its generated
    # get_or_add_rFonts() inserts the element at its schema-defined
    # position among the rPr children. Appending it manually would place
    # it last (e.g. after w:b), out of the order the schema prescribes.
    rFonts = rPr.get_or_add_rFonts()
    for slot in ("w:eastAsia", "w:ascii", "w:hAnsi", "w:cs"):
        rFonts.set(qn(slot), font_name)


def _find_token_spans(
    text: str, tokens: Sequence[str], *, case_sensitive: bool
) -> List[Tuple[int, int, str]]:
    """
    Locate every non-overlapping token occurrence in ``text``.

    Parameters
    ----------
    text : str
        Text to search.
    tokens : sequence of str
        Literal strings to look for; empty strings and duplicates are
        dropped.
    case_sensitive : bool
        When False the search ignores case.

    Returns
    -------
    list of (start, end, matched_text)
        Spans in left-to-right order. ``matched_text`` is the slice of
        ``text`` that matched, so in case-insensitive mode its casing is
        that of ``text``, not of the supplied token. Returns ``[]`` when
        ``text`` is empty or no usable token remains.
    """
    if not (text and tokens):
        return []

    candidates = {token for token in tokens if token}
    if not candidates:
        return []

    # Longest alternative first: at any given position the regex engine
    # tries alternatives left to right, so the longer token is preferred
    # over a shorter one that is its prefix.
    ordered = sorted(candidates, key=len, reverse=True)
    matcher = re.compile(
        "|".join(map(re.escape, ordered)),
        0 if case_sensitive else re.IGNORECASE,
    )
    return [
        (hit.start(), hit.end(), hit.group(0)) for hit in matcher.finditer(text)
    ]


def _rebuild_paragraph_with_spans(
    paragraph,
    spans: List[Tuple[int, int, str]],
    font_name: str,
    base_format: Dict[str, Any],
) -> None:
    """
    Replace ``paragraph``'s runs so each span sits in its own bold run.

    The paragraph text is cut into alternating plain / token pieces. Every
    ``w:r`` element that is a direct child of the paragraph is then removed
    and one new run per non-empty piece is appended. All new runs receive
    ``base_format``; token runs additionally get ``bold=True`` and
    ``font_name`` in every font slot.

    Parameters
    ----------
    paragraph
        python-docx ``Paragraph`` to rewrite in place.
    spans : list of (start, end, token)
        Offsets into ``paragraph.text``; expected to be sorted and
        non-overlapping (as produced by ``_find_token_spans``).
    font_name : str
        Font face for token runs.
    base_format : dict
        Snapshot applied to every rebuilt run.

    Notes
    -----
    The runs are rebuilt purely from ``paragraph.text``: all direct runs
    are removed, including any that carried more than plain text, and
    nothing other than that text is re-created. Paragraphs with empty text
    are left untouched.
    """
    full_text = paragraph.text
    if not full_text:
        return

    # Plan the pieces to emit, in document order: (text, is_token).
    pieces: List[Tuple[str, bool]] = []
    position = 0
    for start, end, _token in spans:
        if position < start:
            pieces.append((full_text[position:start], False))
        pieces.append((full_text[start:end], True))
        position = end
    trailing = full_text[position:]
    if trailing:
        pieces.append((trailing, False))

    # Drop the existing direct runs; iterate over a snapshot because the
    # parent element is mutated while we go.
    p_element = paragraph._p
    for stale_run in list(p_element.findall(qn("w:r"))):
        p_element.remove(stale_run)

    # Emit the planned pieces as fresh runs.
    for piece, is_token in pieces:
        if not piece:
            continue
        fresh_run = paragraph.add_run(piece)
        _apply_run_format(fresh_run, base_format)
        if not is_token:
            continue
        fresh_run.bold = True
        fresh_run.font.name = font_name
        _force_east_asian_font(fresh_run, font_name)



[docs]
def preserve_bold_tokens(
    document: "DocxDocument",
    tokens: Sequence[str],
    *,
    font_name: str = DEFAULT_FONT,
    case_sensitive: bool = True,
) -> "DocxDocument":
    """
    Bold-emphasize each token hit in the paragraphs of ``document``.

    Wherever a token appears inside paragraph text, the paragraph's runs
    are rebuilt so that the token sits in its own run with ``bold=True``
    and ``font.name = font_name`` (Latin + East-Asian + complex script
    slots are all set so Japanese text picks up MS Gothic in Word).

    Parameters
    ----------
    document : docx.Document
        The Document to mutate in place.
    tokens : sequence of str
        Strings to emphasize. Empty strings are ignored. Longer tokens
        take precedence on overlapping matches. A single bare ``str`` is
        treated as one token, not as a sequence of characters.
    font_name : str, default "MS Gothic"
        Font face applied to matched tokens.
    case_sensitive : bool, default True
        If False, matching is case-insensitive.

    Returns
    -------
    docx.Document
        The same Document object, mutated.

    Raises
    ------
    ImportError
        If python-docx is not installed.

    Notes
    -----
    Only the paragraphs yielded by ``document.paragraphs`` are visited.
    NOTE(review): text in tables, headers and footers is presumably not
    covered by that property — confirm against python-docx if needed.

    This implementation rewrites all runs of a paragraph when at least
    one token hits; paragraphs without hits are left untouched. The
    surrounding format (italic, underline, font name, size) is captured
    from the *first* run before rebuilding and applied to every rebuilt
    run; bold and highlight are deliberately cleared on non-token text.
    If you need finer-grained preservation, run
    :func:`preserve_bold_tokens` before other run-level edits.

    Paragraphs that report text but have no direct runs are skipped,
    because there is nothing to re-split there.

    Examples
    --------
    >>> from scitex_msword.bold import preserve_bold_tokens
    >>> preserve_bold_tokens(doc, tokens=["BOOST", "JST"])
    """
    _ensure_docx_available()

    # Normalize the tokens once, outside the paragraph loop:
    # - a bare str is itself a Sequence[str]; iterating it would turn every
    #   character into a token, so wrap it;
    # - materializing here keeps one-shot iterables usable for every
    #   paragraph instead of only the first.
    if isinstance(tokens, str):
        candidates = [tokens]
    else:
        candidates = list(tokens or ())
    token_list = [token for token in candidates if token]
    if not token_list:
        return document

    for paragraph in document.paragraphs:
        spans = _find_token_spans(
            paragraph.text, token_list, case_sensitive=case_sensitive
        )
        if not spans:
            continue

        runs = paragraph.runs
        if not runs:
            # The rebuild only removes direct runs and then appends the
            # whole paragraph text again. With no direct runs (the text
            # lives elsewhere, e.g. presumably inside a hyperlink on
            # python-docx versions whose ``text`` includes it) that would
            # duplicate the text, so leave the paragraph alone.
            continue

        # Capture base formatting from the first run (best-effort).
        base_format: Dict[str, Any] = _snapshot_run_format(runs[0])
        # Don't propagate the source run's bold/highlight onto
        # non-token segments — those should look "normal".
        base_format["bold"] = False
        base_format["highlight"] = None
        _rebuild_paragraph_with_spans(paragraph, spans, font_name, base_format)

    return document



# Public names exported by ``from scitex_msword.bold import *``.
__all__ = ["preserve_bold_tokens", "DEFAULT_FONT"]