Source code for scitex_clew._hash

#!/usr/bin/env python3
# Timestamp: "2026-02-01 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-python/src/scitex/verify/_hash.py
"""File and directory hashing utilities for verification."""

from __future__ import annotations

import hashlib
from pathlib import Path
from typing import Dict, Union


def hash_file(
    path: Union[str, Path],
    algorithm: str = "sha256",
    chunk_size: int = 8192,
) -> str:
    """
    Compute a truncated hexadecimal hash of a file's contents.

    The file is opened in binary mode and fed to the hasher piece by piece,
    so the whole file never has to be held in memory at once.

    Parameters
    ----------
    path : str or Path
        Path to the file to hash
    algorithm : str, optional
        Algorithm name passed to ``hashlib.new`` (default: sha256)
    chunk_size : int, optional
        Number of bytes requested per read (default: 8192). Must not be 0.

    Returns
    -------
    str
        Hexadecimal hash string (first 32 characters)

    Raises
    ------
    ValueError
        If ``chunk_size`` is 0
    FileNotFoundError
        If ``path`` does not exist

    Examples
    --------
    >>> hash_file("data.csv")
    'a1b2c3d4e5f6...'
    """
    # read(0) returns b"" immediately, so the loop below would never run and
    # the function would silently return the digest of empty input.
    if chunk_size == 0:
        raise ValueError("chunk_size must not be 0")

    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    hasher = hashlib.new(algorithm)
    with open(path, "rb") as f:
        # An empty bytes object marks end-of-file and ends the loop.
        while chunk := f.read(chunk_size):
            hasher.update(chunk)

    # Only the first 32 hex characters are kept as the hash.
    return hasher.hexdigest()[:32]
def hash_directory(
    path: Union[str, Path],
    pattern: str = "*",
    recursive: bool = True,
    algorithm: str = "sha256",
) -> Dict[str, str]:
    """
    Compute hashes for all files in a directory.

    Parameters
    ----------
    path : str or Path
        Directory path
    pattern : str, optional
        Glob pattern for files (default: "*")
    recursive : bool, optional
        Whether to search recursively (default: True)
    algorithm : str, optional
        Hash algorithm (default: sha256)

    Returns
    -------
    dict
        Mapping of paths (relative to ``path``) to hashes, inserted in
        sorted path order. Only regular files are included.

    Raises
    ------
    NotADirectoryError
        If ``path`` is not a directory

    Examples
    --------
    >>> hash_directory("./data/")
    {'input.csv': 'a1b2...', 'config.yaml': 'c3d4...'}
    """
    path = Path(path)
    if not path.is_dir():
        raise NotADirectoryError(f"Not a directory: {path}")

    glob_method = path.rglob if recursive else path.glob

    # Glob results come back in filesystem-dependent order; sorting makes the
    # insertion order of the returned dict reproducible between runs.
    return {
        str(file_path.relative_to(path)): hash_file(file_path, algorithm=algorithm)
        for file_path in sorted(glob_method(pattern))
        if file_path.is_file()  # directories also match the pattern; skip them
    }
def hash_files(
    paths: list[Union[str, Path]],
    algorithm: str = "sha256",
) -> Dict[str, str]:
    """
    Compute hashes for a list of files.

    Entries that are not existing regular files are skipped silently.

    Parameters
    ----------
    paths : list of str or Path
        List of file paths
    algorithm : str, optional
        Hash algorithm (default: sha256)

    Returns
    -------
    dict
        Mapping of paths (as strings) to hashes
    """
    hashes = {}
    for path in paths:
        path = Path(path)
        # is_file() is already False for a missing path, so no separate
        # exists() check is needed.
        if path.is_file():
            hashes[str(path)] = hash_file(path, algorithm=algorithm)
    return hashes


def combine_hashes(hashes: Dict[str, str], algorithm: str = "sha256") -> str:
    """
    Combine multiple hashes into a single hash.

    Creates a deterministic combined hash from a dictionary of hashes.

    Parameters
    ----------
    hashes : dict
        Mapping of names to hashes
    algorithm : str, optional
        Hash algorithm (default: sha256)

    Returns
    -------
    str
        Combined hash (first 32 characters)

    Examples
    --------
    >>> hashes = {'input.csv': 'a1b2...', 'script.py': 'c3d4...'}
    >>> combine_hashes(hashes)
    'e5f6g7h8...'
    """
    hasher = hashlib.new(algorithm)

    # Sort by key so the result does not depend on dict insertion order.
    # NOTE: entries are fed as "key:value" with no separator between them;
    # this byte layout is kept as-is so existing combined hashes stay valid.
    for key in sorted(hashes):
        hasher.update(f"{key}:{hashes[key]}".encode())

    return hasher.hexdigest()[:32]


def verify_hash(
    path: Union[str, Path],
    expected_hash: str,
    algorithm: str = "sha256",
) -> bool:
    """
    Verify that a file matches an expected hash.

    Parameters
    ----------
    path : str or Path
        Path to the file
    expected_hash : str
        Expected hash value; may be a truncated prefix of the file's hash
    algorithm : str, optional
        Hash algorithm (default: sha256)

    Returns
    -------
    bool
        True if hash matches, False otherwise (including when the file is
        missing or ``expected_hash`` is empty)
    """
    # An empty expected hash would prefix-match every digest, so it must
    # never count as a successful verification.
    if not expected_hash:
        return False

    try:
        actual_hash = hash_file(path, algorithm=algorithm)
    except FileNotFoundError:
        return False

    # Compare only the length of expected_hash (may be truncated)
    return actual_hash[: len(expected_hash)] == expected_hash


# EOF