Source code for scitex_msword.highlights

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2026-06-02 00:00:00
# File: src/scitex_msword/highlights.py
#
# Part of scitex-msword (AGPL-3.0-only). See LICENSE at the repo root.

"""
Visual-mark / highlight utilities for python-docx Documents.

This module is used by the BOOST v16 dogfooding workflow to visualize:

- *additions* by the operator (turquoise highlight, by convention)
- *modifications* to operator-supplied content (magenta highlight)

plus a generic :func:`extract_highlights` to inspect what colors are
present in a document and which runs carry them.

Color names mirror Word's ``WD_COLOR_INDEX`` enum (case-insensitive):
``turquoise``, ``pink``, ``yellow``, ``green``, ``blue``, ``red``,
``gray_25``, ``gray_50``, ``black``, ``white``, ``dark_red``,
``dark_yellow``, ``dark_blue``, ``teal``, ``violet``, and ``auto``.

For convenience the following BOOST-v16 aliases are also accepted:

- ``magenta`` -> ``pink`` (Word's bright-pink highlight)
- ``purple``  -> ``violet``
- ``cyan``    -> ``turquoise``
"""

from __future__ import annotations

from collections import defaultdict
from typing import Any, Dict, Iterable, List, Optional, Tuple

try:
    from docx.document import Document as DocxDocument  # type: ignore[import-untyped]
    from docx.enum.text import WD_COLOR_INDEX  # type: ignore[import-untyped]

    DOCX_AVAILABLE = True
    _DOCX_IMPORT_ERROR: Optional[Exception] = None
except ImportError as exc:  # pragma: no cover
    DOCX_AVAILABLE = False
    _DOCX_IMPORT_ERROR = exc
    WD_COLOR_INDEX = None  # type: ignore[assignment]
    DocxDocument = None  # type: ignore[assignment,misc]


# Default highlight colors for the two kinds of marks, following the BOOST v16
# convention described in the module docstring. Both are color *names*; they
# are translated to WD_COLOR_INDEX members only when a highlight is applied.
ADDITION_COLOR = "turquoise"
# "magenta" is an alias that _color_map() registers for WD_COLOR_INDEX.PINK.
MODIFICATION_COLOR = "magenta"

# Map lowercase color name -> WD_COLOR_INDEX member. Besides the enum member
# names it also holds underscore-free spellings and the BOOST aliases.
# Starts empty and is filled by _color_map() on first use, so importing this
# module does not require python-docx.
_COLOR_NAME_MAP: Dict[str, Any] = {}


def _ensure_docx_available() -> None:
    """Raise ``ImportError`` if python-docx failed to import at module load.

    The raised error is chained to the exception captured by the guarded
    import at the top of the module, so the original cause stays visible.
    """
    if DOCX_AVAILABLE:
        return
    message = (
        "python-docx is required for scitex_msword.highlights. "
        "Install it via `pip install python-docx`."
    )
    raise ImportError(message) from _DOCX_IMPORT_ERROR


def _color_map() -> Dict[str, Any]:
    """Return the name -> ``WD_COLOR_INDEX`` lookup, building it on first call.

    The module-level ``_COLOR_NAME_MAP`` dict is filled in place and returned,
    so later calls reuse the same populated mapping. Keys are lowercase enum
    member names, their underscore-free spellings, and the BOOST v16 aliases.

    Raises
    ------
    ImportError
        If python-docx is not installed and the mapping has not been built.
    """
    registry = _COLOR_NAME_MAP
    if not registry:
        _ensure_docx_available()
        for wd_member in WD_COLOR_INDEX:  # type: ignore[union-attr]
            canonical = wd_member.name.lower()
            registry[canonical] = wd_member
            # Accept "gray25" as well as "gray_25".
            compact = canonical.replace("_", "")
            if compact != canonical:
                registry[compact] = wd_member
        # Friendly aliases used by the BOOST v16 workflow.
        for alias, member_name in (
            ("magenta", "PINK"),
            ("purple", "VIOLET"),
            ("cyan", "TURQUOISE"),
        ):
            registry[alias] = getattr(WD_COLOR_INDEX, member_name)
    return registry


def _resolve_color(color):
    """Turn a color specification into a value usable as a highlight color.

    * ``None`` resolves to ``WD_COLOR_INDEX.AUTO``.
    * A string is stripped, lowercased, has ``-`` replaced by ``_`` and is
      then looked up in the color map; unknown names raise ``ValueError``.
    * Anything else is returned untouched (e.g. an enum member passed in
      directly by the caller).
    """
    if color is None:
        _ensure_docx_available()
        return WD_COLOR_INDEX.AUTO  # type: ignore[union-attr]
    if not isinstance(color, str):
        return color

    normalized = color.strip().lower().replace("-", "_")
    lookup = _color_map()
    if normalized in lookup:
        return lookup[normalized]
    raise ValueError(
        f"Unknown highlight color: {color!r}. "
        f"Known: {sorted(set(lookup))[:12]}..."
    )


def _iter_target_runs(document: "DocxDocument", targets: Iterable[Tuple[int, int]]):
    """Yield ``(run, (paragraph_idx, run_idx))`` for each target that exists.

    A target whose paragraph index or run index falls outside
    ``0 <= index < length`` (negative indices included) is skipped without
    raising.
    """
    all_paragraphs = list(document.paragraphs)
    paragraph_count = len(all_paragraphs)
    for target in targets:
        para_idx, run_idx = target
        if not (0 <= para_idx < paragraph_count):
            continue
        candidate_runs = all_paragraphs[para_idx].runs
        if not (0 <= run_idx < len(candidate_runs)):
            continue
        yield candidate_runs[run_idx], (para_idx, run_idx)


def _apply_highlight(
    document: "DocxDocument",
    runs: Iterable[Tuple[int, int]],
    color,
) -> "DocxDocument":
    """Set the highlight of every existing target run to ``color``.

    ``color`` is resolved once, before any run is touched, so an unknown
    color name raises ``ValueError`` without modifying the document.
    ``runs`` holds ``(paragraph_idx, run_idx)`` pairs; pairs that do not
    point at an existing run are ignored. The document is mutated in place
    and returned.
    """
    _ensure_docx_available()
    highlight = _resolve_color(color)
    for target_run, _position in _iter_target_runs(document, runs):
        target_run.font.highlight_color = highlight
    return document


def mark_additions(
    document: "DocxDocument",
    runs: Iterable[Tuple[int, int]],
    color: str = ADDITION_COLOR,
) -> "DocxDocument":
    """
    Highlight the runs that the operator (or agent) *added* to the document.

    Parameters
    ----------
    document : docx.Document
        The Document to mutate in place.
    runs : iterable of (paragraph_idx, run_idx)
        Targets to highlight. Out-of-range (including negative) indices are
        skipped silently.
    color : str, default "turquoise"
        Color name. See module docstring for the supported palette.

    Returns
    -------
    docx.Document
        The same Document object, mutated.

    Raises
    ------
    ImportError
        If python-docx is not installed.
    ValueError
        If ``color`` is a string that is not a known color name.

    Examples
    --------
    >>> from scitex_msword.highlights import mark_additions
    >>> doc = mark_additions(doc, [(3, 0), (5, 2)])  # default turquoise
    """
    return _apply_highlight(document, runs, color)


def mark_modifications(
    document: "DocxDocument",
    runs: Iterable[Tuple[int, int]],
    color: str = MODIFICATION_COLOR,
) -> "DocxDocument":
    """
    Highlight the runs that the operator (or agent) *modified* in the document.

    Parameters
    ----------
    document : docx.Document
        The Document to mutate in place.
    runs : iterable of (paragraph_idx, run_idx)
        Targets to highlight. Out-of-range (including negative) indices are
        skipped silently.
    color : str, default "magenta"
        Color name. See module docstring for the supported palette.

    Returns
    -------
    docx.Document
        The same Document object, mutated.

    Raises
    ------
    ImportError
        If python-docx is not installed.
    ValueError
        If ``color`` is a string that is not a known color name.

    Examples
    --------
    >>> from scitex_msword.highlights import mark_modifications
    >>> doc = mark_modifications(doc, [(7, 1)])  # default magenta
    """
    return _apply_highlight(document, runs, color)


def _run_highlight_name(run) -> Optional[str]:
    """Return the lowercase highlight color name for the run, or None.

    ``None`` is returned when the highlight cannot be read, when the run has
    no highlight, or when the highlight is ``auto`` / ``inherited`` (neither
    is treated as a visible mark).
    """
    try:
        hl = run.font.highlight_color
    except Exception:
        # Deliberate best-effort: a run whose highlight cannot be read is
        # reported as "not highlighted" instead of aborting a document scan.
        return None
    if hl is None:
        return None
    # WD_COLOR_INDEX members expose .name; str() includes the int code,
    # which is unhelpful for bucketing.
    name = getattr(hl, "name", str(hl)).split(".")[-1].lower()
    # Strip any trailing " (N)" leftover defensively.
    name = name.split(" ")[0]
    if name in ("auto", "inherited"):
        return None
    return name


def extract_highlights(
    document: "DocxDocument",
    by_color: bool = True,
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Extract highlighted runs from a document, grouped by color.

    Parameters
    ----------
    document : docx.Document
        The document to scan.
    by_color : bool, default True
        If True (default), return ``{color_name: [run_info, ...]}``.
        If False, return a single ``{"all": [run_info, ...]}`` bucket
        with each entry's ``color`` field populated.

    Returns
    -------
    dict[str, list[dict]]
        Mapping from color name to a list of run info dicts of shape::

            {"paragraph": int, "run": int, "text": str, "color": str}

        When no run is highlighted the result is an empty dict, regardless
        of ``by_color``.

    Raises
    ------
    ImportError
        If python-docx is not installed.

    Examples
    --------
    >>> from scitex_msword.highlights import extract_highlights
    >>> by_color = extract_highlights(doc)
    >>> by_color.get("turquoise", [])
    [{'paragraph': 3, 'run': 0, 'text': '...', 'color': 'turquoise'}]
    """
    _ensure_docx_available()
    buckets: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for pi, para in enumerate(document.paragraphs):
        for ri, run in enumerate(para.runs):
            name = _run_highlight_name(run)
            if not name:
                # No highlight, or an auto/inherited one: nothing to report.
                continue
            entry = {
                "paragraph": pi,
                "run": ri,
                "text": run.text,
                "color": name,
            }
            bucket = name if by_color else "all"
            buckets[bucket].append(entry)
    # Hand back a plain dict so missing keys raise instead of auto-creating.
    return dict(buckets)


def clear_highlights(
    document: "DocxDocument",
    colors: Optional[Iterable[str]] = None,
) -> "DocxDocument":
    """
    Remove highlights from all runs (optionally only for the listed colors).

    Parameters
    ----------
    document : docx.Document
        Document to mutate in place.
    colors : iterable of str, optional
        If provided, only runs with one of these highlight colors are
        cleared. Names are matched case-insensitively; aliases (``magenta``,
        ``cyan``, ``purple``) and underscore-free spellings (``gray25``) are
        resolved to the canonical color name before matching. A single
        string is treated as one color name. Unknown names are ignored.
        If ``None`` (default), every highlighted run is cleared.

    Returns
    -------
    docx.Document
        The same Document object, mutated.

    Raises
    ------
    ImportError
        If python-docx is not installed.
    """
    _ensure_docx_available()
    target = None
    if colors is not None:
        if isinstance(colors, str):
            # A bare string would otherwise be iterated character by character.
            colors = [colors]
        cmap = _color_map()
        target = set()
        for c in colors:
            key = c.strip().lower().replace("-", "_")
            member = cmap.get(key)
            # Runs report the canonical enum name (e.g. "pink"), so aliases
            # such as "magenta" must be canonicalized or they never match.
            # Unknown names are kept as-is and simply match nothing.
            target.add(member.name.lower() if member is not None else key)
    for para in document.paragraphs:
        for run in para.runs:
            name = _run_highlight_name(run)
            if name and (target is None or name in target):
                # AUTO is reported as "no highlight" by _run_highlight_name.
                run.font.highlight_color = WD_COLOR_INDEX.AUTO  # type: ignore[union-attr]
    return document


# Public API of this module: the two default color names and the four
# document-level helpers.
__all__ = [
    "ADDITION_COLOR",
    "MODIFICATION_COLOR",
    "mark_additions",
    "mark_modifications",
    "extract_highlights",
    "clear_highlights",
]