Source code for scitex_msword.comments
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2026-06-02 00:00:00
# File: src/scitex_msword/comments.py
#
# Part of scitex-msword (AGPL-3.0-only). See LICENSE at the repo root.
"""
Comment extraction and (limited) application for python-docx Documents.
Word stores comments in ``word/comments.xml`` inside the .docx ZIP, while
the comment *anchors* (i.e. the ranges the comment refers to) live in
``word/document.xml`` as ``commentRangeStart`` / ``commentRangeEnd``
sibling elements with a ``w:id`` attribute matching the comment.
This module exposes :func:`extract_comments` for reading them back into
a structured list and :func:`apply_comments_as_edits` which honors a
narrow "REPLACE:" comment grammar to perform automated edits.
"""
from __future__ import annotations
import re
import zipfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from xml.etree import ElementTree as ET
try:
import docx # type: ignore[import-untyped]
from docx.document import Document as DocxDocument # type: ignore[import-untyped]
from docx.oxml.ns import qn # type: ignore[import-untyped]
DOCX_AVAILABLE = True
_DOCX_IMPORT_ERROR: Optional[Exception] = None
except ImportError as exc: # pragma: no cover
DOCX_AVAILABLE = False
_DOCX_IMPORT_ERROR = exc
DocxDocument = None # type: ignore[assignment,misc]
qn = None # type: ignore[assignment]
# Namespace URI used to qualify WordprocessingML element and attribute names
# (written as ``{uri}local`` when matching ElementTree/lxml tags below).
_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Prefix map handed to ``findall`` so that "w:..." paths resolve to _W_NS.
_NS = {"w": _W_NS}
# Grammar for apply_comments_as_edits — intentionally narrow.
# Group 1 captures the replacement payload with surrounding whitespace
# trimmed; the keyword is case-insensitive and DOTALL lets the payload
# contain newlines.
_REPLACE_RE = re.compile(r"^\s*REPLACE\s*:\s*(.+?)\s*$", re.IGNORECASE | re.DOTALL)
def _ensure_docx_available() -> None:
    """Fail fast when the optional python-docx dependency is missing.

    Raises
    ------
    ImportError
        If ``DOCX_AVAILABLE`` is false. The error is chained to the
        exception recorded in ``_DOCX_IMPORT_ERROR`` at import time.
    """
    if DOCX_AVAILABLE:
        return
    message = (
        "python-docx is required for scitex_msword.comments. "
        "Install it via `pip install python-docx`."
    )
    raise ImportError(message) from _DOCX_IMPORT_ERROR
def _docx_path(doc: Union[str, Path, "DocxDocument"]) -> Optional[Path]:
"""Return the on-disk path backing a Document, or None if in-memory."""
if isinstance(doc, (str, Path)):
return Path(doc)
pkg = getattr(doc, "part", None)
if pkg is None:
return None
pkg_pkg = getattr(pkg, "package", None)
if pkg_pkg is None:
return None
pkg_path = getattr(pkg_pkg, "_path", None)
if pkg_path:
return Path(pkg_path)
return None
def _read_comments_xml(source: Union[str, Path, "DocxDocument"]) -> Optional[bytes]:
"""Return the raw word/comments.xml bytes, or None if absent."""
if isinstance(source, (str, Path)):
zip_path = Path(source)
else:
# In-memory Document: read straight from the OPC part.
try:
for part in source.part.package.iter_parts():
if part.partname == "/word/comments.xml":
return part.blob
except Exception:
pass
path = _docx_path(source)
if path is None:
return None
zip_path = path
if not zip_path.exists():
return None
try:
with zipfile.ZipFile(zip_path) as zf:
if "word/comments.xml" not in zf.namelist():
return None
return zf.read("word/comments.xml")
except zipfile.BadZipFile:
return None
def _parse_comments_xml(blob: bytes) -> Dict[str, Dict[str, Any]]:
    """Parse ``word/comments.xml`` into a mapping keyed by comment id.

    Parameters
    ----------
    blob : bytes
        Raw XML of the comments part.

    Returns
    -------
    dict[str, dict]
        ``{comment_id: {"id", "author", "date", "text"}}``. ``"id"`` is an
        ``int`` when the attribute consists only of decimal digits and the
        original string otherwise. ``"author"`` and ``"date"`` default to
        ``""``. ``"text"`` is the concatenation of every ``w:t`` node found
        under the comment, with no separator inserted.

    Notes
    -----
    Parsing errors from ``ElementTree.fromstring`` propagate to the caller.
    NOTE(review): the blob may come from an untrusted .docx and the stdlib
    parser is not hardened against malicious XML — confirm this is acceptable.
    """

    def attr_key(local: str) -> str:
        # Use python-docx's ``qn`` when it was imported, otherwise build the
        # Clark-notation name by hand.
        return qn(f"w:{local}") if qn is not None else f"{{{_W_NS}}}{local}"

    id_key = attr_key("id")
    author_key = attr_key("author")
    date_key = attr_key("date")
    text_tag = f"{{{_W_NS}}}t"

    comments: Dict[str, Dict[str, Any]] = {}
    for node in ET.fromstring(blob).findall("w:comment", _NS):
        raw_id = node.get(id_key)
        if raw_id is None:
            continue
        cid = str(raw_id)
        comments[cid] = {
            # ``isdecimal`` (not ``isdigit``): ``isdigit`` is also true for
            # characters such as "²" that ``int`` rejects with ValueError.
            "id": int(cid) if cid.isdecimal() else raw_id,
            "author": node.get(author_key) or "",
            "date": node.get(date_key) or "",
            "text": "".join(t.text for t in node.iter(text_tag) if t.text),
        }
    return comments
def _scan_anchors(
    document: "DocxDocument",
) -> Dict[str, Dict[str, Any]]:
    """
    Locate the comment anchor ranges in ``document.paragraphs``.

    Every element under each paragraph is visited in iteration order. A
    ``commentRangeStart`` opens a range for its ``w:id``, a
    ``commentRangeEnd`` with a matching open id closes it, and the text of
    every ``w:t`` seen in between is appended to each open range.

    Returns
    -------
    dict[str, dict]
        ``{comment_id_str: {"anchor_text": str,
        "paragraph_range": [start_index, end_index]}}`` for ranges that
        were both opened and closed. Ranges that never close are omitted.
    """
    paragraph_list = list(document.paragraphs)
    id_key = qn("w:id") if qn is not None else f"{{{_W_NS}}}id"
    start_tag = f"{{{_W_NS}}}commentRangeStart"
    end_tag = f"{{{_W_NS}}}commentRangeEnd"
    text_tag = f"{{{_W_NS}}}t"

    found: Dict[str, Dict[str, Any]] = {}
    # Paragraph index at which each still-open comment range began.
    started_at: Dict[str, int] = {}
    # Text fragments gathered so far for each comment id.
    fragments: Dict[str, List[str]] = {}

    for index, para in enumerate(paragraph_list):
        for node in para._p.iter():
            tag = node.tag

            if tag == text_tag:
                # Text contributes to every range that is currently open.
                if node.text and started_at:
                    for open_id in started_at:
                        fragments[open_id].append(node.text)
                continue

            if tag != start_tag and tag != end_tag:
                continue

            cid = node.get(id_key)
            if cid is None:
                continue

            if tag == start_tag:
                started_at[cid] = index
                fragments.setdefault(cid, [])
            elif cid in started_at:
                first_paragraph = started_at.pop(cid)
                collected = "".join(fragments.pop(cid, []))
                found[cid] = {
                    "anchor_text": collected,
                    "paragraph_range": [first_paragraph, index],
                }
    return found
[docs]
def extract_comments(
    document: Union[str, Path, "DocxDocument"],
) -> List[Dict[str, Any]]:
    """
    Read the Word comments of a .docx file or an open Document.

    Parameters
    ----------
    document : str | Path | docx.Document
        Path to the .docx or an already-open Document. A path is opened
        with ``docx.Document`` in order to scan the comment anchors.

    Returns
    -------
    list[dict]
        One entry per comment::

            {"id": int | str,
             "author": str,
             "date": str,            # ISO timestamp string, may be empty
             "text": str,            # comment body
             "anchor_text": str,     # text the comment is anchored to
             "paragraph_range": [start, end]}

        ``anchor_text`` and ``paragraph_range`` default to ``""`` and
        ``[None, None]`` when no in-document anchor can be located.
        Entries with integer ids come first in ascending order, followed
        by entries whose id stayed a string. An empty list is returned
        when no comments part could be read.

    Raises
    ------
    ImportError
        If python-docx is not installed.

    Examples
    --------
    >>> from scitex_msword.comments import extract_comments
    >>> comments = extract_comments("boost-v16.docx")
    >>> [c["text"] for c in comments]
    ['Please rephrase this', 'REPLACE: Use the new wording']
    """
    _ensure_docx_available()

    blob = _read_comments_xml(document)
    if not blob:
        return []
    parsed = _parse_comments_xml(blob)

    # Anchor scanning needs an open Document.
    doc_obj = (
        docx.Document(str(document))
        if isinstance(document, (str, Path))
        else document
    )
    anchors = _scan_anchors(doc_obj)

    records: List[Dict[str, Any]] = []
    for cid, meta in parsed.items():
        located = anchors.get(cid, {})
        record = {key: meta[key] for key in ("id", "author", "date", "text")}
        record["anchor_text"] = located.get("anchor_text", "")
        record["paragraph_range"] = located.get("paragraph_range", [None, None])
        records.append(record)

    # Integer ids sort before string ids, each group in its natural order.
    return sorted(records, key=lambda rec: (isinstance(rec["id"], str), rec["id"]))
def _replace_in_paragraph(paragraph, anchor: str, replacement: str) -> bool:
"""Replace ``anchor`` text in ``paragraph`` with ``replacement``.
Returns True if a replacement occurred.
"""
text = paragraph.text
if anchor not in text:
return False
new_text = text.replace(anchor, replacement, 1)
# Rebuild paragraph runs with the new text, keeping first run's formatting.
first_run_fmt: Dict[str, Any] = {}
if paragraph.runs:
r = paragraph.runs[0]
first_run_fmt = {
"bold": r.bold,
"italic": r.italic,
"underline": r.underline,
}
# Clear existing runs.
for r in list(paragraph._p.findall(qn("w:r"))):
paragraph._p.remove(r)
new_run = paragraph.add_run(new_text)
if first_run_fmt.get("bold") is not None:
new_run.bold = first_run_fmt["bold"]
if first_run_fmt.get("italic") is not None:
new_run.italic = first_run_fmt["italic"]
if first_run_fmt.get("underline") is not None:
new_run.underline = first_run_fmt["underline"]
return True
[docs]
def apply_comments_as_edits(
    document: "DocxDocument",
    *,
    comments: Optional[List[Dict[str, Any]]] = None,
    grammar: str = "replace",
) -> Dict[str, Any]:
    """
    Apply comments to the document body using a narrow grammar.

    Only the ``REPLACE:`` grammar is currently supported, i.e. a comment
    whose body matches ``r"^\\s*REPLACE\\s*:\\s*(.+?)\\s*$"`` is interpreted
    as "replace this comment's anchor text with the trailing payload".
    Other comments are ignored. The replacement is attempted paragraph by
    paragraph inside the comment's ``paragraph_range`` and stops at the
    first paragraph that contains the anchor text.

    Parameters
    ----------
    document : docx.Document
        The Document to mutate in place.
    comments : list[dict], optional
        Pre-extracted comments (as returned by :func:`extract_comments`).
        If ``None``, the comments are read from ``document`` directly.
        Entries whose ``text``/``anchor_text`` is missing or ``None``, or
        whose ``paragraph_range`` is missing, ``None`` or not a pair of
        integers, are skipped rather than raising.
    grammar : str, default "replace"
        Reserved for future expansion. Currently only ``"replace"``
        is recognised.

    Returns
    -------
    dict
        Summary: ``{"applied": int, "skipped": int, "details": [...]}``.
        Each detail is ``{"id", "status"}`` plus a ``"reason"`` for skipped
        comments: ``"no-grammar-match"``, ``"no-anchor"``, ``"no-range"``
        or ``"anchor-not-found"``.

    Raises
    ------
    ImportError
        If python-docx is not installed.
    ValueError
        If ``grammar`` is anything other than ``"replace"``.

    Examples
    --------
    >>> from scitex_msword.comments import apply_comments_as_edits
    >>> summary = apply_comments_as_edits(doc)
    >>> summary["applied"]
    2
    """
    _ensure_docx_available()
    if grammar != "replace":
        raise ValueError(
            f"Unsupported grammar: {grammar!r}. Only 'replace' is implemented."
        )
    if comments is None:
        comments = extract_comments(document)

    paragraphs = list(document.paragraphs)
    applied = 0
    skipped = 0
    details: List[Dict[str, Any]] = []

    for c in comments:
        cid = c.get("id")
        reason: Optional[str] = None

        # ``or ""`` also covers an explicit ``None`` value, not just a
        # missing key, so caller-built dicts cannot crash the regex.
        m = _REPLACE_RE.match(c.get("text") or "")
        anchor = c.get("anchor_text") or ""
        try:
            start, end = c.get("paragraph_range") or (None, None)
        except (TypeError, ValueError):
            # Not a two-item sequence: treat it as "no usable range".
            start = end = None

        if not m:
            reason = "no-grammar-match"
        elif not anchor:
            reason = "no-anchor"
        elif not (isinstance(start, int) and isinstance(end, int)):
            reason = "no-range"
        else:
            replacement = m.group(1)
            # Clamp the range to the paragraphs that actually exist; ``any``
            # stops at the first paragraph where the replacement succeeds.
            first = max(0, start)
            stop = min(end + 1, len(paragraphs))
            if not any(
                _replace_in_paragraph(paragraphs[pi], anchor, replacement)
                for pi in range(first, stop)
            ):
                reason = "anchor-not-found"

        if reason is None:
            applied += 1
            details.append({"id": cid, "status": "applied"})
        else:
            skipped += 1
            details.append({"id": cid, "status": "skipped", "reason": reason})

    return {"applied": applied, "skipped": skipped, "details": details}
# Names exported by ``from scitex_msword.comments import *``.
__all__ = ["extract_comments", "apply_comments_as_edits"]