#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2026-06-02 00:00:00
# File: src/scitex_msword/track_changes.py
#
# Part of scitex-msword (AGPL-3.0-only). See LICENSE at the repo root.
"""
Track-Changes (revision) utilities for python-docx Documents.
This module surfaces the OOXML revision primitives so agents can:
1. Toggle Word's "Track Changes" switch (``<w:trackChanges/>`` in
``word/settings.xml``) via :func:`enable_track_changes`.
2. Wrap agent edits as ``<w:ins>`` / ``<w:del>`` revisions
(:func:`wrap_as_tracked_insertion`, :func:`wrap_as_tracked_deletion`).
3. Extract all tracked changes (:func:`extract_tracked_changes`).
4. Accept / reject all changes in bulk
(:func:`accept_all_tracked_changes`, :func:`reject_all_tracked_changes`).
OOXML refs: ``w:trackChanges`` (ECMA-376 §17.15.1.86), ``w:ins``
(§17.13.5.18), ``w:del`` (§17.13.5.14), ``w:delText`` (§17.13.5.15).
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any, List, Optional, Sequence
try:
from docx.document import Document as DocxDocument # type: ignore[import-untyped]
from docx.oxml.ns import qn # type: ignore[import-untyped]
from docx.text.paragraph import Paragraph # type: ignore[import-untyped]
from docx.text.run import Run # type: ignore[import-untyped]
from lxml import etree
DOCX_AVAILABLE = True
_DOCX_IMPORT_ERROR: Optional[Exception] = None
except ImportError as exc: # pragma: no cover
DOCX_AVAILABLE = False
_DOCX_IMPORT_ERROR = exc
DocxDocument = None # type: ignore[assignment,misc]
Paragraph = None # type: ignore[assignment,misc]
Run = None # type: ignore[assignment,misc]
qn = None # type: ignore[assignment]
etree = None # type: ignore[assignment]
# WordprocessingML "main" namespace URI. Used throughout this module to build
# Clark-notation (``{uri}localname``) tag and attribute names for ``w:`` nodes.
_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
def _ensure_docx_available() -> None:
    """Raise ``ImportError`` if the guarded python-docx / lxml import failed.

    The original import failure is chained as the cause so the caller can see
    why the optional dependency could not be loaded.
    """
    if DOCX_AVAILABLE:
        return
    message = (
        "python-docx (and lxml) are required for scitex_msword.track_changes. "
        "Install via `pip install python-docx`."
    )
    raise ImportError(message) from _DOCX_IMPORT_ERROR
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _settings_element(document: "DocxDocument"):
"""Return the lxml ``<w:settings>`` element for the document."""
return document.settings.element
def _now_iso() -> str:
"""UTC ISO-8601 timestamp at second precision (Word-friendly)."""
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
def _make_w_element(tag_local_name: str, **attrs):
    """Build a ``w:<tag>`` lxml element whose attributes are ``w:``-namespaced.

    Each keyword becomes a ``w:<keyword>`` attribute with its value passed
    through ``str()``; keywords whose value is ``None`` are left out.
    """
    ns_prefix = f"{{{_W_NS}}}"
    element = etree.Element(ns_prefix + tag_local_name)
    for name, raw_value in attrs.items():
        if raw_value is not None:
            element.set(ns_prefix + name, str(raw_value))
    return element
def _scan_max_revision_id(document: "DocxDocument") -> int:
    """Return the highest integer ``w:id`` on any ``<w:ins>`` / ``<w:del>``.

    Only elements under the document body are scanned. Missing or
    non-numeric ids are ignored, and the result is never below 0.
    """
    revision_tags = (f"{{{_W_NS}}}ins", f"{{{_W_NS}}}del")
    id_attr = qn("w:id")
    highest = 0
    for node in document.element.body.iter():
        if node.tag not in revision_tags:
            continue
        value = node.get(id_attr)
        if value is None:
            continue
        try:
            highest = max(highest, int(value))
        except (TypeError, ValueError):
            # A malformed id contributes nothing to the maximum.
            pass
    return highest
def _resolve_runs(
paragraph: "Paragraph",
runs: Sequence[Any],
) -> List["Run"]:
"""Resolve Run objects / run indices into a list of paragraph Runs."""
all_runs = list(paragraph.runs)
elems = [r._r for r in all_runs]
resolved: List[Run] = []
for item in runs:
if isinstance(item, int):
if 0 <= item < len(all_runs):
resolved.append(all_runs[item])
else:
elem = getattr(item, "_r", None) or getattr(item, "element", None)
if elem is not None and elem in elems:
resolved.append(item)
return resolved
def _wrap_runs_in_element(
    paragraph: "Paragraph",
    target_runs: Sequence["Run"],
    wrapper_tag: str,
    attrs: dict,
):
    """Give every run its own ``<w:wrapper_tag>`` parent and return the wrappers.

    A separate wrapper is created per run, each built from the same ``attrs``
    (so all wrappers from one call carry identical attribute values). Runs
    whose ``<w:r>`` has no parent are skipped. ``paragraph`` is accepted for
    signature symmetry and is not used here.
    """
    created = []
    for run in target_runs:
        run_el = run._r
        container = run_el.getparent()
        if container is not None:
            position = container.index(run_el)
            shell = _make_w_element(wrapper_tag, **attrs)
            # Put the wrapper where the run was, then move the run inside it.
            container.insert(position, shell)
            container.remove(run_el)
            shell.append(run_el)
            created.append(shell)
    return created
def _next_revision_id(paragraph: "Paragraph", explicit: Optional[int]) -> int:
"""Resolve the ``w:id`` for a new revision, defaulting to max+1."""
if explicit is not None:
return int(explicit)
try:
document = paragraph.part.document # type: ignore[attr-defined]
except Exception:
document = None
if document is None:
return 1
return _scan_max_revision_id(document) + 1
# ---------------------------------------------------------------------------
# API 1: enable_track_changes / is_track_changes_enabled
# ---------------------------------------------------------------------------
[docs]
# Local names of the ``<w:settings>`` children that the CT_Settings schema
# sequence places *before* ``<w:trackRevisions>``. Used to insert the switch
# at a schema-valid position instead of appending it at the end.
_SETTINGS_BEFORE_TRACK_REVISIONS = (
    "writeProtection",
    "view",
    "zoom",
    "removePersonalInformation",
    "removeDateAndTime",
    "doNotDisplayPageBoundaries",
    "displayBackgroundShape",
    "printPostScriptOverText",
    "printFractionalCharacterWidth",
    "printFormsData",
    "embedTrueTypeFonts",
    "embedSystemFonts",
    "saveSubsetFonts",
    "saveFormsData",
    "mirrorMargins",
    "alignBordersAndEdges",
    "bordersDoNotSurroundHeader",
    "bordersDoNotSurroundFooter",
    "gutterAtTop",
    "hideSpellingErrors",
    "hideGrammaticalErrors",
    "activeWritingStyle",
    "proofState",
    "formsDesign",
    "attachedTemplate",
    "linkStyles",
    "stylePaneFormatFilter",
    "stylePaneSortMethod",
    "documentType",
    "mailMerge",
    "revisionView",
)


def _track_revisions_on(element) -> bool:
    """Interpret an on/off element: a missing ``w:val`` means "on"."""
    value = element.get(qn("w:val"))
    if value is None:
        return True
    return value.strip().lower() not in ("0", "false", "off")


def enable_track_changes(
    document: "DocxDocument",
    enabled: bool = True,
) -> "DocxDocument":
    """
    Toggle Word's "Track Changes" switch on the document.

    The switch is the ``<w:trackRevisions/>`` element of
    ``word/settings.xml``. ``<w:trackChanges>`` is not part of the
    WordprocessingML settings schema, so any such element left behind by
    earlier versions of this function is removed.

    Parameters
    ----------
    document : docx.Document
        The Document to mutate in place.
    enabled : bool, default True
        ``True`` leaves exactly one ``<w:trackRevisions/>`` element (no
        ``w:val``, i.e. "on") at its schema position; ``False`` removes
        every such element.

    Returns
    -------
    docx.Document
        The same Document object (chainable).
    """
    _ensure_docx_available()
    settings_el = _settings_element(document)
    # Start from a clean slate: drop duplicates, ``w:val="false"`` variants
    # and the invalid legacy ``w:trackChanges`` element alike.
    for tag in ("w:trackRevisions", "w:trackChanges"):
        for stale in settings_el.findall(qn(tag)):
            settings_el.remove(stale)
    if enabled:
        preceding = {
            f"{{{_W_NS}}}{name}" for name in _SETTINGS_BEFORE_TRACK_REVISIONS
        }
        # Insert right after the last child that must precede the switch, so
        # the xsd:sequence order of <w:settings> is respected.
        insert_at = 0
        for position, child in enumerate(settings_el):
            if child.tag in preceding:
                insert_at = position + 1
        settings_el.insert(insert_at, _make_w_element("trackRevisions"))
    return document


def is_track_changes_enabled(document: "DocxDocument") -> bool:
    """Return True iff settings.xml has ``<w:trackRevisions>`` switched on.

    An element carrying ``w:val`` of ``0`` / ``false`` / ``off`` counts as
    disabled, as does a settings part without the element.
    """
    _ensure_docx_available()
    switch = _settings_element(document).find(qn("w:trackRevisions"))
    return switch is not None and _track_revisions_on(switch)
# ---------------------------------------------------------------------------
# API 2: wrap_as_tracked_insertion
# ---------------------------------------------------------------------------
[docs]
def wrap_as_tracked_insertion(
    paragraph: "Paragraph",
    runs: Sequence[Any],
    author: str = "agent",
    date: Optional[str] = None,
    w_id: Optional[int] = None,
) -> List[Any]:
    """
    Mark runs of ``paragraph`` as tracked insertions.

    Every selected run is moved into its own ``<w:ins>`` element so that
    Word presents it as content "inserted by <author>" that can be accepted
    or rejected. All wrappers created by one call carry the same ``w:id``,
    ``w:author`` and ``w:date``.

    Parameters
    ----------
    paragraph : docx.text.paragraph.Paragraph
        Paragraph that owns the runs to wrap.
    runs : sequence of Run or int
        Runs to wrap, given as Run objects or 0-based indices.
    author : str, default "agent"
        Value written to ``w:author``.
    date : str, optional
        ISO-8601 string for ``w:date``; the current UTC time is used when
        omitted (or empty).
    w_id : int, optional
        Explicit revision id; when ``None`` it is auto-assigned (max+1).

    Returns
    -------
    list
        The new ``<w:ins>`` lxml elements; empty when no run was selected.
    """
    _ensure_docx_available()
    selected = _resolve_runs(paragraph, runs)
    if selected:
        revision_attrs = dict(
            id=_next_revision_id(paragraph, w_id),
            author=author,
            date=date or _now_iso(),
        )
        return _wrap_runs_in_element(paragraph, selected, "ins", revision_attrs)
    return []
# ---------------------------------------------------------------------------
# API 3: wrap_as_tracked_deletion
# ---------------------------------------------------------------------------
[docs]
def wrap_as_tracked_deletion(
    paragraph: "Paragraph",
    runs: Sequence[Any],
    author: str = "agent",
    date: Optional[str] = None,
    w_id: Optional[int] = None,
) -> List[Any]:
    """
    Mark runs of ``paragraph`` as tracked deletions.

    Every selected run is moved into its own ``<w:del>`` element, and each
    ``<w:t>`` inside it is renamed to ``<w:delText>`` so that Word shows
    the text as struck-through deleted content. All wrappers created by one
    call carry the same ``w:id``, ``w:author`` and ``w:date``.

    Parameters
    ----------
    paragraph : docx.text.paragraph.Paragraph
        Paragraph that owns the runs to wrap.
    runs : sequence of Run or int
        Runs to wrap, given as Run objects or 0-based indices.
    author : str, default "agent"
        Value written to ``w:author``.
    date : str, optional
        ISO-8601 string for ``w:date``; the current UTC time is used when
        omitted (or empty).
    w_id : int, optional
        Explicit revision id; when ``None`` it is auto-assigned (max+1).

    Returns
    -------
    list
        The new ``<w:del>`` lxml elements; empty when no run was selected.
    """
    _ensure_docx_available()
    selected = _resolve_runs(paragraph, runs)
    if not selected:
        return []
    revision_attrs = dict(
        id=_next_revision_id(paragraph, w_id),
        author=author,
        date=date or _now_iso(),
    )
    del_wrappers = _wrap_runs_in_element(paragraph, selected, "del", revision_attrs)
    text_tag, deleted_text_tag = qn("w:t"), qn("w:delText")
    for del_el in del_wrappers:
        # Snapshot the matches before renaming; the iterator filters by tag.
        text_nodes = list(del_el.iter(text_tag))
        for node in text_nodes:
            node.tag = deleted_text_tag
    return del_wrappers
# ---------------------------------------------------------------------------
# API 4: extract_tracked_changes
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# API 5: accept_all / reject_all
# ---------------------------------------------------------------------------
def _unwrap_element(elem) -> None:
"""Replace ``elem`` in its parent with its own children, in order."""
parent = elem.getparent()
if parent is None:
return
idx = parent.index(elem)
for offset, child in enumerate(list(elem)):
elem.remove(child)
parent.insert(idx + offset, child)
parent.remove(elem)
[docs]
def accept_all_tracked_changes(document: "DocxDocument") -> "DocxDocument":
    """
    Accept every tracked insertion and deletion in the document body.

    Each ``<w:ins>`` is replaced by its own children (the inserted content
    stays), while each ``<w:del>`` is dropped together with everything
    inside it. Only ``<w:ins>`` / ``<w:del>`` elements found under the body
    are processed.

    Parameters
    ----------
    document : docx.Document
        The Document to mutate in place.

    Returns
    -------
    docx.Document
        The same Document, mutated.
    """
    _ensure_docx_available()
    insertion_tag = f"{{{_W_NS}}}ins"
    deletion_tag = f"{{{_W_NS}}}del"
    # Collect the revisions up front; the tree is modified while processing.
    revisions = [
        node
        for node in document.element.body.iter()
        if node.tag == insertion_tag or node.tag == deletion_tag
    ]
    for node in revisions:
        owner = node.getparent()
        if owner is None:
            continue
        if node.tag == deletion_tag:
            owner.remove(node)
        else:
            _unwrap_element(node)
    return document
[docs]
def reject_all_tracked_changes(document: "DocxDocument") -> "DocxDocument":
    """
    Reject every tracked insertion and deletion in the document body.

    Each ``<w:ins>`` is dropped together with its content. Each ``<w:del>``
    has its ``<w:delText>`` descendants renamed back to ``<w:t>`` and is
    then replaced by its own children, restoring the original text. Only
    ``<w:ins>`` / ``<w:del>`` elements found under the body are processed.

    Parameters
    ----------
    document : docx.Document
        The Document to mutate in place.

    Returns
    -------
    docx.Document
        The same Document, mutated.
    """
    _ensure_docx_available()
    insertion_tag = f"{{{_W_NS}}}ins"
    deletion_tag = f"{{{_W_NS}}}del"
    text_tag = qn("w:t")
    deleted_text_tag = f"{{{_W_NS}}}delText"
    # Collect the revisions up front; the tree is modified while processing.
    revisions = [
        node
        for node in document.element.body.iter()
        if node.tag == insertion_tag or node.tag == deletion_tag
    ]
    for node in revisions:
        owner = node.getparent()
        if owner is None:
            continue
        if node.tag == insertion_tag:
            owner.remove(node)
            continue
        # Snapshot the matches before renaming; the iterator filters by tag.
        for deleted_text in list(node.iter(deleted_text_tag)):
            deleted_text.tag = text_tag
        _unwrap_element(node)
    return document
# Names exported by ``from <module> import *``.
# NOTE(review): ``extract_tracked_changes`` is listed below, but no definition
# of it is visible in this portion of the file (the "API 4" section is empty).
# Confirm it is defined somewhere in the module; if it is not, a star-import
# of this module fails with AttributeError.
__all__ = [
    "enable_track_changes",
    "is_track_changes_enabled",
    "wrap_as_tracked_insertion",
    "wrap_as_tracked_deletion",
    "extract_tracked_changes",
    "accept_all_tracked_changes",
    "reject_all_tracked_changes",
]