Source code for scitex_tex._export

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2025-12-11 16:00:00
# File: /home/ywatanabe/proj/scitex-code/src/scitex/tex/_export.py

"""
Export SciTeX writer documents to LaTeX format.

This module converts the intermediate document format (from scitex.msword
or scitex.writer) into LaTeX source files.
"""

from __future__ import annotations

import os
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Journal-specific document class configurations
JOURNAL_PRESETS = {
    "article": {
        "document_class": "article",
        "class_options": [],
        "required_packages": [],
    },
    "ieee": {
        "document_class": "IEEEtran",
        "class_options": ["conference"],
        "required_packages": ["cite", "amsmath", "algorithmic"],
    },
    "elsevier": {
        "document_class": "elsarticle",
        "class_options": ["preprint", "12pt"],
        "required_packages": ["lineno", "hyperref"],
    },
    "springer": {
        "document_class": "svjour3",
        "class_options": ["smallextended"],
        "required_packages": [],
    },
    "aps": {
        "document_class": "revtex4-2",
        "class_options": ["aps", "prl", "preprint"],
        "required_packages": [],
    },
    "mdpi": {
        "document_class": "article",
        "class_options": [],
        "required_packages": ["mdpi"],
    },
    "acm": {
        "document_class": "acmart",
        "class_options": ["sigconf"],
        "required_packages": [],
    },
}


[docs] def export_tex( writer_doc: Dict[str, Any], output_path: str | Path, document_class: str = "article", packages: Optional[List[str]] = None, preamble: Optional[str] = None, image_dir: Optional[str | Path] = None, export_images: bool = True, journal_preset: Optional[str] = None, class_options: Optional[List[str]] = None, use_bibtex: bool = False, ) -> Path: """ Export a SciTeX writer document to LaTeX format. Parameters ---------- writer_doc : dict SciTeX writer document structure containing: - blocks: List of document blocks (headings, paragraphs, captions, etc.) - metadata: Document metadata (title, author, etc.) - images: Image references with binary data - references: Bibliography entries output_path : str | Path Output path for the .tex file. document_class : str LaTeX document class (article, report, book, etc.). Overridden if journal_preset is specified. packages : list[str] | None Additional LaTeX packages to include. preamble : str | None Additional preamble content. image_dir : str | Path | None Directory to save extracted images. If None, uses "{output_stem}_figures/" next to the output .tex file. Set export_images=False to skip image export. export_images : bool Whether to export images to files. Default True. journal_preset : str | None Use a journal-specific preset: "ieee", "elsevier", "springer", "aps", "mdpi", "acm". Sets document_class and required packages. class_options : list[str] | None Document class options (e.g., ["12pt", "twocolumn"]). use_bibtex : bool If True, generate \\bibliography{} instead of thebibliography. Creates a .bib file alongside the .tex file. Returns ------- Path The path to the written .tex file. Examples -------- >>> from scitex.msword import load_docx >>> from scitex.tex import export_tex >>> doc = load_docx("manuscript.docx") >>> export_tex(doc, "manuscript.tex") PosixPath('manuscript.tex') >>> # Export for IEEE conference >>> export_tex(doc, "manuscript.tex", journal_preset="ieee") >>> # Export with custom image directory >>> export_tex(doc, "manuscript.tex", image_dir="./figures") """ output_path = Path(output_path) # Apply journal preset if specified effective_class = document_class effective_options = class_options or [] extra_packages = [] if journal_preset and journal_preset in JOURNAL_PRESETS: preset = JOURNAL_PRESETS[journal_preset] effective_class = preset["document_class"] effective_options = preset["class_options"] + (class_options or []) extra_packages = preset["required_packages"] # Extract components from writer_doc blocks = writer_doc.get("blocks", []) metadata = writer_doc.get("metadata", {}) references = writer_doc.get("references", []) images = writer_doc.get("images", []) # Handle image export image_map: Dict[str, str] = {} # hash -> relative path if export_images and images: if image_dir is None: image_dir = output_path.parent / f"{output_path.stem}_figures" else: image_dir = Path(image_dir) image_dir.mkdir(parents=True, exist_ok=True) image_map = _write_images_to_dir(images, image_dir, output_path.parent) # Combine packages all_packages = extra_packages + (packages or []) # Build LaTeX content latex_content = _build_latex_document( blocks=blocks, metadata=metadata, references=references, document_class=effective_class, class_options=effective_options, packages=all_packages if all_packages else None, preamble=preamble, image_map=image_map, use_bibtex=use_bibtex, output_stem=output_path.stem, ) # Write to file output_path.write_text(latex_content, encoding="utf-8") # Generate .bib file if using bibtex if use_bibtex and references: bib_path = output_path.with_suffix(".bib") bib_content = _generate_bibtex(references) bib_path.write_text(bib_content, encoding="utf-8") return output_path
def _generate_bibtex(references: List[Dict[str, Any]]) -> str: """Generate BibTeX content from references.""" entries = [] for ref in references: num = ref.get("number", len(entries) + 1) text = ref.get("text", ref.get("raw", "")) # Basic entry - in practice, would parse author/title/year entry = f"""@misc{{ref{num}, note = {{{text}}} }}""" entries.append(entry) return "\n\n".join(entries) def _write_images_to_dir( images: List[Dict[str, Any]], image_dir: Path, tex_parent: Path, ) -> Dict[str, str]: """ Write images to directory and return hash->relative_path mapping. Parameters ---------- images : list List of image dicts with 'hash', 'extension', 'data' keys. image_dir : Path Directory to write images to. tex_parent : Path Parent directory of the .tex file (for relative paths). Returns ------- dict Mapping from image hash to relative path for LaTeX. """ image_map = {} fig_counter = 0 for img in images: img_hash = img.get("hash") ext = img.get("extension", ".png") data = img.get("data") if data is None or img_hash is None: continue # Skip duplicates (same hash = same image content) if img_hash in image_map: continue fig_counter += 1 filename = f"fig_{fig_counter}{ext}" filepath = image_dir / filename # Write image data filepath.write_bytes(data) # Store relative path from tex file location try: rel_path = filepath.relative_to(tex_parent) except ValueError: rel_path = filepath image_map[img_hash] = str(rel_path) return image_map def _build_latex_document( blocks: List[Dict[str, Any]], metadata: Dict[str, Any], references: List[Dict[str, Any]], document_class: str, class_options: Optional[List[str]] = None, packages: Optional[List[str]] = None, preamble: Optional[str] = None, image_map: Optional[Dict[str, str]] = None, use_bibtex: bool = False, output_stem: str = "document", ) -> str: """Build complete LaTeX document content.""" if image_map is None: image_map = {} lines = [] # Document class with options if class_options: opts = ",".join(class_options) lines.append(f"\\documentclass[{opts}]{{{document_class}}}") else: lines.append(f"\\documentclass{{{document_class}}}") lines.append("") # Default packages default_packages = [ "inputenc", "fontenc", "amsmath", "amssymb", "graphicx", "hyperref", ] # Package options package_options = { "inputenc": "utf8", "fontenc": "T1", } for pkg in default_packages: opt = package_options.get(pkg) if opt: lines.append(f"\\usepackage[{opt}]{{{pkg}}}") else: lines.append(f"\\usepackage{{{pkg}}}") # Additional packages if packages: for pkg in packages: if pkg not in default_packages: lines.append(f"\\usepackage{{{pkg}}}") lines.append("") # Metadata if metadata.get("title"): title = _escape_latex(metadata["title"]) lines.append(f"\\title{{{title}}}") if metadata.get("author"): author = _escape_latex(metadata["author"]) lines.append(f"\\author{{{author}}}") lines.append("") # Additional preamble if preamble: lines.append(preamble) lines.append("") # Begin document lines.append("\\begin{document}") lines.append("") # Title if metadata.get("title"): lines.append("\\maketitle") lines.append("") # Track list state for proper itemize/enumerate environments in_list = False list_type = None # Process blocks for i, block in enumerate(blocks): btype = block.get("type") # Handle list transitions if btype == "list-item": item_list_type = block.get("list_type", "unordered") if not in_list: env = "enumerate" if item_list_type == "ordered" else "itemize" lines.append(f"\\begin{{{env}}}") in_list = True list_type = item_list_type elif in_list: # Close list environment env = "enumerate" if list_type == "ordered" else "itemize" lines.append(f"\\end{{{env}}}") lines.append("") in_list = False list_type = None block_latex = _convert_block_to_latex(block, image_map) if block_latex: lines.append(block_latex) # Close any open list if in_list: env = "enumerate" if list_type == "ordered" else "itemize" lines.append(f"\\end{{{env}}}") lines.append("") # References section if references: lines.append("") if use_bibtex: lines.append(f"\\bibliographystyle{{plain}}") lines.append(f"\\bibliography{{{output_stem}}}") else: lines.append("\\begin{thebibliography}{99}") for ref in references: ref_latex = _convert_reference_to_latex(ref) if ref_latex: lines.append(ref_latex) lines.append("\\end{thebibliography}") # End document lines.append("") lines.append("\\end{document}") return "\n".join(lines) def _convert_block_to_latex( block: Dict[str, Any], image_map: Optional[Dict[str, str]] = None, ) -> Optional[str]: """Convert a single block to LaTeX.""" if image_map is None: image_map = {} btype = block.get("type", "paragraph") text = block.get("text", "") if not text and btype not in ("table", "image", "caption", "equation"): return None if btype == "heading": return _convert_heading(block) elif btype == "paragraph": return _convert_paragraph(block) elif btype == "caption": return _convert_caption(block, image_map) elif btype == "table": return _convert_table(block) elif btype == "image": return _convert_image(block, image_map) elif btype == "list-item": return _convert_list_item(block) elif btype == "equation": return _convert_equation(block) elif btype == "reference-paragraph": # Skip - handled separately in references section return None else: # Default: treat as paragraph return _escape_latex(text) + "\n" def _convert_equation(block: Dict[str, Any]) -> str: """Convert an equation block to LaTeX.""" latex = block.get("latex", "") text = block.get("text", "") if latex: # Use the converted LaTeX from OMML return f"\\begin{{equation}}\n{latex}\n\\end{{equation}}\n" elif text: # Fallback: wrap text in equation environment return f"\\begin{{equation}}\n{_escape_latex(text)}\n\\end{{equation}}\n" return "" def _convert_heading(block: Dict[str, Any]) -> str: """Convert a heading block to LaTeX.""" level = block.get("level", 1) text = _escape_latex(block.get("text", "")) # Map heading levels to LaTeX commands level_commands = { 1: "section", 2: "subsection", 3: "subsubsection", 4: "paragraph", 5: "subparagraph", } command = level_commands.get(level, "paragraph") return f"\\{command}{{{text}}}\n" def _convert_paragraph(block: Dict[str, Any]) -> str: """Convert a paragraph block to LaTeX.""" runs = block.get("runs", []) if runs: # Build paragraph from formatted runs parts = [] for run in runs: run_text = _escape_latex(run.get("text", "")) if run.get("bold"): run_text = f"\\textbf{{{run_text}}}" if run.get("italic"): run_text = f"\\textit{{{run_text}}}" if run.get("underline"): run_text = f"\\underline{{{run_text}}}" parts.append(run_text) return "".join(parts) + "\n" else: return _escape_latex(block.get("text", "")) + "\n" def _convert_caption( block: Dict[str, Any], image_map: Optional[Dict[str, str]] = None, ) -> str: """Convert a caption block to LaTeX figure/table environment.""" if image_map is None: image_map = {} caption_type = block.get("caption_type", "") number = block.get("number", "") caption_text = _escape_latex(block.get("caption_text", block.get("text", ""))) image_hash = block.get("image_hash") if caption_type == "figure": # Check if we have an associated image image_path = None if image_hash and image_hash in image_map: image_path = image_map[image_hash] lines = [ "\\begin{figure}[htbp]", "\\centering", ] if image_path: # Remove extension for includegraphics image_path_no_ext = ( image_path.rsplit(".", 1)[0] if "." in image_path else image_path ) lines.append( f"\\includegraphics[width=0.8\\textwidth]{{{image_path_no_ext}}}" ) else: lines.append(f"% Image placeholder for Figure {number}") lines.extend( [ f"\\caption{{{caption_text}}}", f"\\label{{fig:{number}}}", "\\end{figure}", "", ] ) return "\n".join(lines) elif caption_type == "table": # Table captions - typically above the table return f"% Table {number}: {caption_text}\n" else: return f"% Caption: {caption_text}\n" def _convert_image( block: Dict[str, Any], image_map: Optional[Dict[str, str]] = None, ) -> str: """Convert an image block to LaTeX includegraphics.""" if image_map is None: image_map = {} image_hash = block.get("image_hash") or block.get("hash") width = block.get("width", "0.8\\textwidth") if image_hash and image_hash in image_map: image_path = image_map[image_hash] # Remove extension for includegraphics image_path_no_ext = ( image_path.rsplit(".", 1)[0] if "." in image_path else image_path ) lines = [ "\\begin{figure}[htbp]", "\\centering", f"\\includegraphics[width={width}]{{{image_path_no_ext}}}", "\\end{figure}", "", ] return "\n".join(lines) return "% Image placeholder\n" def _convert_table(block: Dict[str, Any]) -> str: """Convert a table block to LaTeX.""" rows = block.get("rows", []) if not rows: return "" num_cols = len(rows[0]) if rows else 0 col_spec = "|" + "c|" * num_cols lines = [ "\\begin{table}[htbp]", "\\centering", f"\\begin{{tabular}}{{{col_spec}}}", "\\hline", ] for i, row in enumerate(rows): escaped_cells = [_escape_latex(str(cell)) for cell in row] lines.append(" & ".join(escaped_cells) + " \\\\") lines.append("\\hline") lines.extend( [ "\\end{tabular}", "\\end{table}", "", ] ) return "\n".join(lines) def _convert_list_item(block: Dict[str, Any]) -> str: """Convert a list item to LaTeX.""" text = _escape_latex(block.get("text", "")) return f"\\item {text}\n" def _convert_reference_to_latex(ref: Dict[str, Any]) -> str: """Convert a reference entry to LaTeX bibitem.""" number = ref.get("number") text = _escape_latex(ref.get("text", ref.get("raw", ""))) if number: return f"\\bibitem{{ref{number}}} {text}" else: return f"\\bibitem{{}} {text}" def _escape_latex(text: str) -> str: """Escape special LaTeX characters.""" if not text: return "" # Characters that need escaping in LaTeX replacements = [ ("\\", "\\textbackslash{}"), ("&", "\\&"), ("%", "\\%"), ("$", "\\$"), ("#", "\\#"), ("_", "\\_"), ("{", "\\{"), ("}", "\\}"), ("~", "\\textasciitilde{}"), ("^", "\\textasciicircum{}"), ] # Apply replacements (order matters - backslash first) result = text for old, new in replacements: # Skip if already escaped if old == "\\": # Don't escape existing LaTeX commands result = re.sub(r"(?<!\\)\\(?![a-zA-Z{])", new, result) else: result = result.replace(old, new) return result
[docs] @dataclass class CompileResult: """Result of LaTeX compilation. Attributes ---------- success : bool Whether compilation succeeded. pdf_path : Path | None Path to generated PDF, or None if failed. exit_code : int Process exit code. stdout : str Standard output from compiler. stderr : str Standard error from compiler. log_content : str Content of .log file if available. errors : list[str] Extracted error messages. warnings : list[str] Extracted warning messages. """ success: bool pdf_path: Optional[Path] exit_code: int stdout: str stderr: str log_content: str = "" errors: List[str] = None warnings: List[str] = None def __post_init__(self): if self.errors is None: self.errors = [] if self.warnings is None: self.warnings = []
[docs] def compile_tex( tex_path: str | Path, output_dir: Optional[str | Path] = None, compiler: str = "pdflatex", runs: int = 2, clean: bool = True, timeout: int = 120, ) -> CompileResult: """ Compile a LaTeX file to PDF. Parameters ---------- tex_path : str | Path Path to the .tex file. output_dir : str | Path | None Output directory for PDF. If None, uses same directory as tex file. compiler : str LaTeX compiler to use: "pdflatex", "xelatex", "lualatex", or "latexmk". Default is "pdflatex". runs : int Number of compilation passes (for references/ToC). Default is 2. Ignored if compiler is "latexmk". clean : bool Remove auxiliary files (.aux, .log, .out, etc.) after compilation. Default is True. timeout : int Timeout in seconds for each compilation pass. Default is 120. Returns ------- CompileResult Compilation result with success status, PDF path, and logs. Examples -------- >>> from scitex.tex import compile_tex >>> result = compile_tex("manuscript.tex") >>> if result.success: ... print(f"PDF created: {result.pdf_path}") ... else: ... print(f"Errors: {result.errors}") >>> # Use latexmk for automatic multi-pass compilation >>> result = compile_tex("manuscript.tex", compiler="latexmk") Notes ----- Requires LaTeX to be installed on the system (texlive, miktex, etc.). """ tex_path = Path(tex_path).absolute() if not tex_path.exists(): return CompileResult( success=False, pdf_path=None, exit_code=1, stdout="", stderr=f"File not found: {tex_path}", errors=[f"File not found: {tex_path}"], ) # Determine output directory if output_dir is None: output_dir = tex_path.parent else: output_dir = Path(output_dir).absolute() output_dir.mkdir(parents=True, exist_ok=True) # Check if compiler is available compiler_cmd = shutil.which(compiler) if compiler_cmd is None: return CompileResult( success=False, pdf_path=None, exit_code=127, stdout="", stderr=f"Compiler not found: {compiler}", errors=[f"Compiler not found: {compiler}. Install texlive or miktex."], ) # Build command if compiler == "latexmk": cmd = [ compiler, "-pdf", "-interaction=nonstopmode", f"-output-directory={output_dir}", str(tex_path), ] runs = 1 # latexmk handles multi-pass else: cmd = [ compiler, "-interaction=nonstopmode", "-halt-on-error", f"-output-directory={output_dir}", str(tex_path), ] # Run compilation stdout_all = [] stderr_all = [] exit_code = 0 for run_num in range(runs): try: result = subprocess.run( cmd, cwd=tex_path.parent, capture_output=True, text=True, timeout=timeout, ) stdout_all.append(f"=== Pass {run_num + 1} ===\n{result.stdout}") stderr_all.append(result.stderr) exit_code = result.returncode # If compilation failed, don't continue if exit_code != 0: break except subprocess.TimeoutExpired: return CompileResult( success=False, pdf_path=None, exit_code=124, stdout="\n".join(stdout_all), stderr=f"Compilation timed out after {timeout} seconds", errors=[f"Compilation timed out after {timeout} seconds"], ) except Exception as e: return CompileResult( success=False, pdf_path=None, exit_code=1, stdout="\n".join(stdout_all), stderr=str(e), errors=[str(e)], ) # Check for output PDF pdf_name = tex_path.stem + ".pdf" pdf_path = output_dir / pdf_name # Read log file for detailed errors/warnings log_path = output_dir / (tex_path.stem + ".log") log_content = "" errors = [] warnings = [] if log_path.exists(): try: log_content = log_path.read_text(encoding="utf-8", errors="replace") errors, warnings = _parse_latex_log(log_content) except Exception: pass # Clean auxiliary files if clean: aux_extensions = [ ".aux", ".log", ".out", ".toc", ".lof", ".lot", ".bbl", ".blg", ".fls", ".fdb_latexmk", ".synctex.gz", ] for ext in aux_extensions: aux_file = output_dir / (tex_path.stem + ext) if aux_file.exists(): try: aux_file.unlink() except Exception: pass success = exit_code == 0 and pdf_path.exists() return CompileResult( success=success, pdf_path=pdf_path if pdf_path.exists() else None, exit_code=exit_code, stdout="\n".join(stdout_all), stderr="\n".join(stderr_all), log_content=log_content, errors=errors, warnings=warnings, )
def _parse_latex_log(log_content: str) -> Tuple[List[str], List[str]]: """Parse LaTeX log file for errors and warnings.""" errors = [] warnings = [] lines = log_content.split("\n") for i, line in enumerate(lines): # Error patterns if line.startswith("!"): # Collect multi-line error message error_lines = [line] for j in range(i + 1, min(i + 5, len(lines))): if lines[j].startswith("l.") or lines[j].strip() == "": break error_lines.append(lines[j]) errors.append(" ".join(error_lines)) elif "Error:" in line or "Fatal error" in line: errors.append(line.strip()) # Warning patterns elif "Warning:" in line: warnings.append(line.strip()) elif "Underfull" in line or "Overfull" in line: warnings.append(line.strip()) return errors, warnings __all__ = ["export_tex", "compile_tex", "CompileResult"]