Source code for scitex_msword.utils

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 16:45:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/msword/utils.py

"""
Utility functions for processing MS Word documents.

These functions can be used as post_import_hooks or called directly
to process document structures.
"""

from __future__ import annotations

from typing import Any, Dict, List










def normalize_section_headings(doc: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize level-1 section headings to standard academic titles.

    Heading text is compared case-insensitively, ignoring surrounding
    whitespace and collapsing internal whitespace runs, against a fixed
    table of common variants, e.g.:

    - "intro" -> "Introduction"
    - "method" -> "Methods"
    - "bibliography" -> "References"

    Only blocks with ``type == "heading"`` and ``level == 1`` are considered.
    Headings whose text is not a string, or does not match the table, are
    left untouched.

    Parameters
    ----------
    doc : dict
        SciTeX writer document. Its ``"blocks"`` entry, when present, is
        iterated as a sequence of dict-like blocks.

    Returns
    -------
    dict
        The same ``doc`` object, with matching headings rewritten in place.
    """
    # Lower-cased, single-spaced variant -> canonical title.
    normalizations = {
        "intro": "Introduction",
        "introduction": "Introduction",
        "method": "Methods",
        "methods": "Methods",
        "materials and methods": "Materials and Methods",
        "result": "Results",
        "results": "Results",
        "discussion": "Discussion",
        "conclusion": "Conclusions",
        "conclusions": "Conclusions",
        "acknowledgement": "Acknowledgements",
        "acknowledgements": "Acknowledgements",
        "reference": "References",
        "references": "References",
        "bibliography": "References",
    }

    # `or []` also covers an explicit `"blocks": None`.
    for block in doc.get("blocks") or []:
        if block.get("type") != "heading" or block.get("level") != 1:
            continue

        text = block.get("text")
        if not isinstance(text, str):
            # Missing or None text: nothing to normalize, and calling
            # str methods on it would raise.
            continue

        # str.split() with no argument strips the ends and splits on any
        # whitespace run, so "Materials  and\tMethods" still matches.
        key = " ".join(text.split()).lower()
        canonical = normalizations.get(key)
        if canonical is not None:
            block["text"] = canonical

    return doc
def validate_document(doc: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate document structure and record warnings on the document.

    Checks performed, in this order:

    - Missing required sections: a warning for each of "introduction",
      "methods", "results", "discussion" and "references" that is not a
      case-insensitive substring of any heading's text.
    - Duplicate figure numbers: one warning per number that occurs on more
      than one figure caption. Captions without a number are ignored.
    - No references: a warning when ``doc["references"]`` is empty or
      missing and there is no ``"reference-paragraph"`` block.

    Parameters
    ----------
    doc : dict
        SciTeX writer document.

    Returns
    -------
    dict
        The same ``doc`` object, with new messages appended to its
        ``"warnings"`` list (created if absent).
    """
    blocks = doc.get("blocks") or []

    # Reuse the existing list object so earlier warnings are kept.
    warnings = doc.get("warnings")
    if warnings is None:
        warnings = []

    # --- Required sections -------------------------------------------------
    # Headings with non-string text (e.g. None) cannot match any section.
    headings = [
        b["text"].lower()
        for b in blocks
        if b.get("type") == "heading" and isinstance(b.get("text"), str)
    ]
    required_sections = (
        "introduction",
        "methods",
        "results",
        "discussion",
        "references",
    )
    for section in required_sections:
        if not any(section in heading for heading in headings):
            warnings.append(f"Missing section: {section.title()}")

    # --- Duplicate figure numbers ------------------------------------------
    seen = set()
    reported = set()
    for block in blocks:
        if block.get("type") != "caption" or block.get("caption_type") != "figure":
            continue
        number = block.get("number")
        if number is None:
            # Unnumbered captions are not duplicates of one another.
            continue
        if number in seen and number not in reported:
            warnings.append(f"Duplicate figure number: {number}")
            # Report each duplicated number once, however often it repeats.
            reported.add(number)
        seen.add(number)

    # --- References --------------------------------------------------------
    if not doc.get("references", []):
        has_reference_block = any(
            b.get("type") == "reference-paragraph" for b in blocks
        )
        if not has_reference_block:
            warnings.append("No references found in document")

    doc["warnings"] = warnings
    return doc
def create_post_import_hook(*functions):
    """
    Chain several document-processing functions into one post_import_hook.

    The returned hook feeds the document through each function in the order
    given, passing every function's return value on to the next one. With no
    functions supplied, the hook returns its argument unchanged.

    Parameters
    ----------
    *functions : callable
        Callables taking a document and returning a document, applied
        left to right.

    Returns
    -------
    callable
        A single hook equivalent to applying all ``functions`` in sequence.

    Examples
    --------
    >>> from scitex.msword.utils import (
    ...     link_captions_to_images,
    ...     normalize_section_headings,
    ...     create_post_import_hook,
    ... )
    >>> hook = create_post_import_hook(
    ...     link_captions_to_images,
    ...     normalize_section_headings,
    ... )
    >>> # Use with custom profile
    >>> profile.post_import_hooks = [hook]
    """

    def composite_hook(doc: Dict[str, Any]) -> Dict[str, Any]:
        # Thread the document through each stage; a stage's output is the
        # next stage's input.
        current = doc
        for stage in functions:
            current = stage(current)
        return current

    return composite_hook
# Names exported by `from <this module> import *`.
# NOTE(review): the two `link_captions_*` functions are not defined in the
# portion of the module shown here — confirm they exist elsewhere in the file.
__all__ = [
    "link_captions_to_images",
    "link_captions_to_images_by_proximity",
    "normalize_section_headings",
    "validate_document",
    "create_post_import_hook",
]