Source code for scitex_notebook._convert

#!/usr/bin/env python3
"""Convert Jupyter notebooks to SciTeX-compatible Python scripts."""

from __future__ import annotations

import re
from pathlib import Path
from typing import List, Union

from ._compile import compile_notebook
from ._parse import get_code_cells, parse_notebook

# IPython magic patterns to strip (any line whose first non-blank char is % or !)
_MAGIC_RE = re.compile(r"^\s*[%!].*$", re.MULTILINE)

# Import statement pattern (matches: import x, import x as y, import x.y.z as w, from x import y)
_IMPORT_RE = re.compile(
    r"^(?:import\s+\S+(?:\s+as\s+\S+)?|from\s+\S+\s+import\s+.+)$", re.MULTILINE
)

# Every replacement below appends a trailing "# was: ..." comment, so a call may
# only be rewritten when nothing but an optional ";" and/or comment follows it
# on the same line. Otherwise the rest of the statement (e.g. ".dropna()" or
# "[0]") would end up inside that comment and silently disappear from the code.
_STATEMENT_END = r"(?=[ \t]*;?[ \t]*(?:#.*)?$)"

# Common notebook patterns to convert to SciTeX equivalents.
# Each entry is (compiled pattern, replacement template for ``pattern.sub``).
_CONVERSIONS = [
    # plt.show() → stx.io.save(fig, "figure.png")
    (
        re.compile(r"plt\.show\(\)" + _STATEMENT_END, re.MULTILINE),
        '# stx.io.save(fig, "figure.png")  # was: plt.show()',
    ),
    # plt.savefig("...") → stx.io.save(fig, "...")
    (
        re.compile(r'plt\.savefig\((["\'].*?["\'])\)' + _STATEMENT_END, re.MULTILINE),
        r"stx.io.save(fig, \1)  # was: plt.savefig",
    ),
    # df.to_csv("...") → stx.io.save(df, "...")
    (
        re.compile(r'(\w+)\.to_csv\((["\'].*?["\'])\)' + _STATEMENT_END, re.MULTILINE),
        r"stx.io.save(\1, \2)  # was: .to_csv",
    ),
    # pd.read_csv("...") → stx.io.load("...")
    (
        re.compile(r'pd\.read_csv\((["\'].*?["\'])\)' + _STATEMENT_END, re.MULTILINE),
        r'stx.io.load(\1)  # was: pd.read_csv',
    ),
    # np.save("...", arr) → stx.io.save(arr, "...")
    # The end-of-statement lookahead also makes the lazy second group run up to
    # the call's final ")", so nested calls such as arr.reshape(2, 3) stay whole.
    (
        re.compile(
            r'np\.save\((["\'].*?["\'])\s*,\s*(.+?)\)' + _STATEMENT_END,
            re.MULTILINE,
        ),
        r"stx.io.save(\2, \1)  # was: np.save",
    ),
    # np.load("...") → stx.io.load("...")
    (
        re.compile(r'np\.load\((["\'].*?["\'])\)' + _STATEMENT_END, re.MULTILINE),
        r'stx.io.load(\1)  # was: np.load',
    ),
]



[docs]
def convert_notebook(
    path: Union[str, Path],
    output: Union[str, Path, None] = None,
    order: str = "cell",
    mode: str = "per_cell",
) -> str:
    """Convert a .ipynb notebook to a .py script with @stx.session.

    Parameters
    ----------
    path : str or Path
        Path to the .ipynb file.
    output : str or Path, optional
        Output .py file path. If None, returns string only. Missing parent
        directories are created and the file is written as UTF-8.
    order : str
        Cell ordering: "cell" (notebook order) or "dag" (execution order
        from clew DB timestamps). Only consulted when ``mode`` is "per_cell".
    mode : str
        Conversion mode:
        - "per_cell": Each code cell becomes a separate @stx.session function (default).
        - "unified": All cells merged into a single @stx.session main() function.
          Markdown cells become comments, imports are hoisted, and common
          notebook patterns (plt.show, pd.read_csv, etc.) are converted to
          SciTeX equivalents (stx.io.save/load).

    Returns
    -------
    str
        The generated Python script content.

    Raises
    ------
    ValueError
        If ``mode`` is not "per_cell" or "unified", or if ``order`` is not
        "cell" or "dag" in per-cell mode.
    """
    # Reject unknown modes up front: previously a typo such as "unifed"
    # silently fell through to per-cell conversion.
    if mode not in ("per_cell", "unified"):
        raise ValueError(
            f"Invalid mode: {mode!r}. Must be 'per_cell' or 'unified'."
        )

    path = Path(path)

    if mode == "unified":
        script = _convert_unified(path)
    elif order == "cell":
        script = _convert_cell_order(path)
    elif order == "dag":
        script = _convert_dag_order(path)
    else:
        raise ValueError(f"Invalid order: {order!r}. Must be 'cell' or 'dag'.")

    if output is not None:
        output = Path(output)
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(script, encoding="utf-8")

    return script



def _convert_unified(path: Path) -> str:
    """Render the whole notebook as one ``@stx.session``-decorated ``main()``.

    Cells are visited in the order ``parse_notebook`` returns them:

    - non-empty markdown cells are emitted as ``#`` comments inside ``main()``;
    - code cells are stripped of IPython magics, their import lines are
      collected for the module header, and the remaining code is passed
      through the SciTeX pattern conversions and indented into ``main()``;
    - any other cell type is ignored.

    Collected imports are de-duplicated (first occurrence wins); matplotlib
    and scitex imports are dropped from the header. ``plt`` is added to the
    injected parameters when any code cell mentions ``plt.`` or ``matplotlib``.
    """
    hoisted: List[str] = []
    body: List[str] = []
    uses_matplotlib = False

    for cell in parse_notebook(path):
        kind = cell["cell_type"]

        if kind == "markdown":
            text = cell["source"].strip()
            if text:
                # Blank line before and after each comment block.
                body.append("")
                body.extend(f"    # {row}" for row in text.splitlines())
                body.append("")
            continue

        if kind != "code":
            continue

        cleaned = _clean_source(cell["source"])
        if not cleaned.strip():
            continue

        found_imports, code = _separate_imports(cleaned)
        hoisted.extend(found_imports)

        if code.strip():
            converted = _apply_conversions(code)
            body.append("")
            # Blank lines stay truly empty (no trailing indentation).
            body.extend(
                f"    {row}" if row.strip() else "" for row in converted.splitlines()
            )

        # Checked on the cleaned cell text, imports included.
        if "plt." in cleaned or "matplotlib" in cleaned:
            uses_matplotlib = True

    # dict.fromkeys de-duplicates while keeping first-seen order.
    header_imports = [
        stmt
        for stmt in dict.fromkeys(hoisted)
        if not stmt.startswith(("import matplotlib", "from matplotlib"))
        and "matplotlib.pyplot" not in stmt
        and "scitex" not in stmt
    ]

    script = [
        "#!/usr/bin/env python3",
        f'"""Converted from {path.name} using scitex notebook convert --mode unified."""',
        "",
        "import scitex as stx",
        *header_imports,
        "",
        "",
        "@stx.session(seed=42)",
        "def main(",
        "    CONFIG=stx.INJECTED,",
        "    logger=stx.INJECTED,",
    ]
    if uses_matplotlib:
        script.append("    plt=stx.INJECTED,")
    script += [
        "):",
        f'    """Analysis pipeline converted from {path.name}."""',
        *body,
        "",
        "    return 0",
        "",
        "",
        'if __name__ == "__main__":',
        "    main()",
        "",
        "# EOF",
        "",
    ]

    return "\n".join(script)


def _separate_imports(source: str) -> tuple:
    """Separate hoistable import statements from body code.

    Only *module-level* import statements of the cell are hoisted, and each
    one is hoisted whole. Imports nested in ``try``/``if``/``def`` blocks
    stay in the body, so removing them cannot leave an empty (invalid) block.
    Parenthesised or backslash-continued imports are moved with all of their
    lines instead of only the first one.

    The cell is parsed with ``ast``. If it is not valid Python (for example
    leftover IPython syntax), the function falls back to matching unindented
    lines against ``_IMPORT_RE``.

    Parameters
    ----------
    source : str
        Source code of a single cell.

    Returns
    -------
    tuple
        ``(imports, body)``: ``imports`` is a list of import statement
        strings in source order (a multi-line import is one string containing
        newlines); ``body`` is the remaining lines joined with newlines.
    """
    import ast  # function-scope import keeps the module's import block unchanged

    lines = source.splitlines()
    imports = []

    try:
        statements = ast.parse(source).body
    except (SyntaxError, ValueError):
        statements = None

    if statements is None:
        # Best-effort fallback for unparseable cells: line-based matching,
        # restricted to unindented lines so nested imports are left in place.
        body_lines = []
        for line in lines:
            stripped = line.strip()
            if not line[:1].isspace() and _IMPORT_RE.match(stripped):
                imports.append(stripped)
            else:
                body_lines.append(line)
        return imports, "\n".join(body_lines)

    # 0-based inclusive (first, last) row span of every top-level statement.
    # end_lineno is absent on very old interpreters; treat as single-line there.
    spans = [
        (node.lineno - 1, (getattr(node, "end_lineno", None) or node.lineno) - 1)
        for node in statements
    ]

    hoisted_rows = set()
    for pos, node in enumerate(statements):
        if not isinstance(node, (ast.Import, ast.ImportFrom)):
            continue
        first, last = spans[pos]

        # "import os; x = 1": the row is shared with another statement, so it
        # cannot be moved as a whole; leave it in the body.
        shares_prev = pos > 0 and spans[pos - 1][1] >= first
        shares_next = pos + 1 < len(spans) and spans[pos + 1][0] <= last
        if shares_prev or shares_next:
            continue

        # Guard against str.splitlines() and the parser disagreeing on line
        # boundaries (exotic line separators): never remove a non-import row.
        if last >= len(lines) or not lines[first].startswith(("import", "from")):
            continue

        imports.append("\n".join(lines[first : last + 1]).strip())
        hoisted_rows.update(range(first, last + 1))

    body_lines = [line for row, line in enumerate(lines) if row not in hoisted_rows]
    return imports, "\n".join(body_lines)


def _apply_conversions(source: str) -> str:
    """Rewrite known notebook idioms in ``source`` to their SciTeX form.

    Every ``(pattern, replacement)`` rule in ``_CONVERSIONS`` is applied in
    list order, each one operating on the output of the previous rule.
    """
    converted = source
    for rule in _CONVERSIONS:
        regex, template = rule
        converted = regex.sub(template, converted)
    return converted


def _convert_cell_order(path: Path) -> str:
    """Emit one ``@stx.session`` function per code cell, in notebook order.

    Each non-empty code cell (after magic stripping) becomes a function named
    ``cell_NN`` (NN = the cell's zero-padded ``index``) that ends with
    ``return 0`` and is called immediately after its definition. Cells that
    are empty after cleaning are skipped.
    """
    cells = get_code_cells(path)
    script = _script_header(path)

    for cell in cells:
        code = _clean_source(cell["source"])
        if code.strip():
            name = f"cell_{cell['index']:02d}"
            # Indent the cell body; blank lines stay truly empty.
            indented = [
                f"    {row}" if row.strip() else "" for row in code.splitlines()
            ]
            script += [
                "",
                "@stx.session",
                f"def {name}():",
                *indented,
                "    return 0",
                "",
                f"{name}()",
                "",
            ]

    return "\n".join(script)


def _convert_dag_order(path: Path) -> str:
    """Convert the notebook following its recorded execution order.

    The notebook is compiled with ``compile_notebook``; when the result has a
    non-empty ``execution_order`` its ``to_script()`` output is returned.
    Without any execution history the plain cell-order conversion is used.
    """
    result = compile_notebook(path)

    if result.execution_order:
        return result.to_script()

    # Nothing recorded: fall back to notebook order.
    return _convert_cell_order(path)


def _clean_source(source: str) -> str:
    """Return ``source`` with every ``_MAGIC_RE`` match (IPython ``%``/``!`` lines) removed."""
    without_magics = _MAGIC_RE.sub("", source)
    return without_magics


def _script_header(path: Path) -> List[str]:
    """Build the opening lines of a per-cell script.

    Returns the shebang, a module docstring naming the source notebook file,
    a blank line, the ``scitex`` import, and a trailing blank line.
    """
    module_doc = f'"""Converted from {path.name}."""'
    header = ["#!/usr/bin/env python3", module_doc, "", "import scitex as stx", ""]
    return header


# EOF