Source code for scitex_stats.tests.normality._test_ks_2samp

#!/usr/bin/env python3
# Timestamp: "2025-10-01 17:30:00 (ywatanabe)"
# File: scitex_stats/tests/normality/_test_ks_2samp.py
# ----------------------------------------
from __future__ import annotations

import os

__FILE__ = __file__
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------


r"""
Functionalities:
  - Perform two-sample Kolmogorov-Smirnov test (compare two empirical distributions)
  - Generate CDF comparison plots
  - Support flexible output formats (dict or DataFrame)

Dependencies:
  - packages: numpy, pandas, scipy, matplotlib

IO:
  - input: Two samples (arrays or Series)
  - output: Test results (dict or DataFrame) and optional figure
"""

"""Imports"""
import argparse  # noqa: E402
from typing import Literal, Optional, Union  # noqa: E402

import matplotlib.axes  # noqa: E402
import matplotlib.pyplot as _mpl_plt  # noqa: E402
import matplotlib.pyplot as plt  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
from scipy import stats  # noqa: E402

from scitex_stats._logging import getLogger
from scitex_stats._utils._formatters import fmt_stat, fmt_sym  # noqa: E402

logger = getLogger(__name__)

"""Functions"""


def test_ks_2samp(
    x: Union[np.ndarray, pd.Series, str],
    y: Union[np.ndarray, pd.Series, str],
    var_x: str = "x",
    var_y: str = "y",
    alternative: Literal["two-sided", "less", "greater"] = "two-sided",
    alpha: float = 0.05,
    plot: bool = False,
    ax: Optional[matplotlib.axes.Axes] = None,
    data: Union[pd.DataFrame, str, None] = None,
    return_as: Literal["dict", "dataframe"] = "dict",
    decimals: int = 3,
    verbose: bool = False,
) -> Union[dict, pd.DataFrame]:
    r"""
    Perform two-sample Kolmogorov-Smirnov test.

    Parameters
    ----------
    x, y : arrays or Series
        Two samples to compare. NaN values are dropped before testing.
        When `data` is given, strings are resolved as column names.
    var_x, var_y : str
        Labels for samples
    alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
        Alternative hypothesis (forwarded to ``scipy.stats.ks_2samp``)
    alpha : float, default 0.05
        Significance level
    plot : bool, default False
        Whether to generate CDF comparison plot
    ax : matplotlib.axes.Axes, optional
        Axes to draw the CDF comparison on. Passing an axes enables
        plotting even when ``plot=False``. Without it, ``plot=True`` creates
        a new two-panel figure (CDFs and histograms).
    data : DataFrame, str, or None, optional
        DataFrame or CSV path. When provided, string values for x/y
        are resolved as column names (seaborn-style).
    return_as : {'dict', 'dataframe'}, default 'dict'
        Output format. Any other value is handed to ``convert_results``.
    decimals : int, default 3
        Number of decimal places for rounding the statistic and p-value
    verbose : bool, default False
        Whether to log the test outcome

    Returns
    -------
    results : dict or DataFrame
        Test results including:
        - test_method: 'Kolmogorov-Smirnov test (2-sample)'
        - statistic: KS D-statistic (rounded)
        - stat_symbol: 'D'
        - n_x, n_y: Sample sizes after NaN removal
        - var_x, var_y: Variable labels
        - pvalue: p-value (rounded)
        - stars: Significance stars
        - alpha: Significance level used
        - significant: Whether null hypothesis is rejected (p < alpha)
        - same_distribution: Negation of ``significant``
        - H0: Null hypothesis description

        The figure is not returned. When this function creates one it
        stays the current pyplot figure (``plt.gcf()``).

    Notes
    -----
    The two-sample Kolmogorov-Smirnov test compares the ECDFs of two samples.

    **Null Hypothesis (H0)**: Both samples come from the same distribution

    **Test Statistic D**:

    .. math::
        D = \sup_x |F_{n_1}(x) - F_{n_2}(x)|

    Where F_{n_1} and F_{n_2} are the empirical CDFs.

    **Advantages**:
    - Distribution-free (non-parametric)
    - Tests entire distribution, not just location
    - Can detect differences in location, scale, or shape

    **Disadvantages**:
    - Less powerful than t-test when assumptions are met
    - Most sensitive to differences near the center of distributions
    - Less sensitive to tail differences

    **When to use**:
    - Comparing two independent samples
    - No assumptions about distribution shape
    - Want to test overall distribution equality (not just means)
    - Alternative to t-test when normality violated

    **Comparison with other tests**:
    - vs t-test: More robust, less powerful
    - vs Mann-Whitney U: Tests different hypotheses (distribution vs median)
    - vs Brunner-Munzel: KS tests full distribution, BM tests P(X>Y)

    Examples
    --------
    >>> # Two samples from same distribution (outcome depends on the draw)
    >>> x = np.random.normal(0, 1, 100)
    >>> y = np.random.normal(0, 1, 100)
    >>> result = test_ks_2samp(x, y)
    >>> result['same_distribution']  # doctest: +SKIP
    True

    >>> # Two samples from different distributions
    >>> x = np.random.normal(0, 1, 100)
    >>> y = np.random.normal(2, 1, 100)
    >>> result = test_ks_2samp(x, y)
    >>> result['significant']
    True
    """
    # Resolve column names from DataFrame (seaborn-style data= parameter)
    if data is not None:
        from scitex_stats._utils._csv_support import resolve_columns

        resolved = resolve_columns(data, x=x, y=y)
        x, y = resolved["x"], resolved["y"]

    from scitex_stats._utils._formatters import p2stars
    from scitex_stats._utils._normalizers import convert_results, force_dataframe

    # Coerce to float first: np.isnan raises TypeError on object-dtype input
    # (e.g. a Series holding None), whereas float coercion maps None to NaN.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    # Perform two-sample KS test
    ks_result = stats.ks_2samp(x, y, alternative=alternative)
    d_stat = float(ks_result.statistic)
    pvalue = float(ks_result.pvalue)

    # Decide on the unrounded p-value so rounding cannot flip the outcome
    rejected = pvalue < alpha

    # Compile results
    result = {
        "test_method": "Kolmogorov-Smirnov test (2-sample)",
        "statistic": round(d_stat, decimals),
        "stat_symbol": "D",
        "n_x": len(x),
        "n_y": len(y),
        "var_x": var_x,
        "var_y": var_y,
        "pvalue": round(pvalue, decimals),
        "stars": p2stars(pvalue),
        "alpha": alpha,
        "significant": rejected,
        "same_distribution": not rejected,
        "H0": "Both samples come from the same distribution",
    }

    # Log results if verbose
    if verbose:
        logger.info(
            f"KS test (2-sample): D = {d_stat:.3f}, p = {pvalue:.4f} {p2stars(pvalue)}"
        )
        logger.info(f"Same distribution: {not rejected}")

    # A caller-supplied axes implies plotting; otherwise honour `plot`
    if ax is not None:
        _plot_ks_2samp_simple(x, y, var_x, var_y, result, ax)
    elif plot:
        _, axes = _mpl_plt.subplots(1, 2, figsize=(14, 6))
        _plot_ks_2samp_full(x, y, var_x, var_y, result, axes)

    # Convert to requested format
    if return_as == "dataframe":
        return force_dataframe(result)
    if return_as != "dict":
        return convert_results(result, return_as=return_as)
    return result


def _ecdf(values):
    """Return the sorted sample and its empirical CDF heights (1/n, ..., 1)."""
    ordered = np.sort(values)
    heights = np.arange(1, len(values) + 1) / len(values)
    return ordered, heights


def _plot_ks_2samp_full(x, y, var_x, var_y, result, axes):
    """Create 2-panel CDF comparison plot for two-sample KS test.

    Panel ``axes[0]`` shows both ECDFs with the result text box; panel
    ``axes[1]`` shows density histograms of both samples.
    """
    # Plot 1: CDF comparison (same drawing as the single-axes variant)
    _plot_ks_2samp_simple(x, y, var_x, var_y, result, axes[0])

    # Plot 2: Overlapping histograms
    hist_ax = axes[1]
    for sample, label in ((x, var_x), (y, var_y)):
        # Semi-transparent bars so the second sample does not hide the first
        hist_ax.hist(sample, bins="auto", density=True, alpha=0.6, label=label)
    hist_ax.set_xlabel("Value")
    hist_ax.set_ylabel("Density")
    hist_ax.set_title("Histogram")
    hist_ax.legend()


def _plot_ks_2samp_simple(x, y, var_x, var_y, result, ax):
    """Create single CDF comparison plot on provided axes."""
    from scitex_stats._plot_helpers import stats_text_box

    # Plot both ECDFs as step functions
    for sample, label in ((x, var_x), (y, var_y)):
        ordered, heights = _ecdf(sample)
        ax.step(ordered, heights, where="post", label=label)

    ax.set_xlabel("Value")
    ax.set_ylabel("Cumulative Probability")
    ax.set_title("KS Test (2-sample)")
    ax.legend()

    # Add text with results
    stats_text_box(
        ax,
        [
            fmt_stat("D", result["statistic"]),
            fmt_stat("p", result["pvalue"], fmt=".4f", stars=result["stars"]),
            f"Same dist: {result['same_distribution']}",
            f"{fmt_sym('n_x')} = {result['n_x']}, {fmt_sym('n_y')} = {result['n_y']}",
        ],
    )


"""Main function"""


def main(args):
    """Demonstrate two-sample Kolmogorov-Smirnov test functionality.

    Runs three seeded examples, saves the figure of the third to
    ``./ks_2samp_example.jpg`` and exports all results to
    ``./ks_2samp_results.xlsx``. ``args`` is accepted but not read here.
    Returns 0.
    """
    logger.info("Demonstrating two-sample Kolmogorov-Smirnov test")

    # Set random seed
    np.random.seed(42)

    # Example 1: Two-sample test - same distribution
    logger.info("\n=== Example 1: Two-sample KS test (same distribution) ===")
    x1 = np.random.normal(0, 1, 100)
    y1 = np.random.normal(0, 1, 100)
    result1 = test_ks_2samp(x1, y1, var_x="Sample 1", var_y="Sample 2", verbose=True)

    # Example 2: Two-sample test - different means
    logger.info("\n=== Example 2: Two-sample KS test (different means) ===")
    x2 = np.random.normal(0, 1, 100)
    y2 = np.random.normal(2, 1, 100)
    result2 = test_ks_2samp(x2, y2, var_x="Group A", var_y="Group B", verbose=True)

    # Example 3: Two-sample test with visualization
    logger.info("\n=== Example 3: Two-sample KS test with visualization ===")
    x3 = np.random.normal(5, 1, 80)
    y3 = np.random.exponential(2, 80)
    result3 = test_ks_2samp(
        x3, y3, var_x="Normal", var_y="Exponential", plot=True, verbose=True
    )
    # test_ks_2samp does not return the figure; it is the current pyplot figure
    plt.gcf().savefig("./ks_2samp_example.jpg")
    plt.close("all")

    # Example 4: Export results
    logger.info("\n=== Example 4: Export results ===")
    from scitex_stats._utils._normalizers import force_dataframe

    test_results = [result1, result2, result3]
    df = force_dataframe(test_results)
    logger.info(f"\nDataFrame shape: {df.shape}")
    # NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) for
    # .xlsx output -- confirm it is available in the target environment.
    df.to_excel("./ks_2samp_results.xlsx", index=False)
    logger.info("Results exported to ./ks_2samp_results.xlsx")

    return 0


def parse_args(argv=None):
    """Parse command line arguments.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. ``None`` (the default) makes argparse read
        ``sys.argv[1:]``, which is the previous behaviour.
    """
    parser = argparse.ArgumentParser(
        description="Demonstrate two-sample Kolmogorov-Smirnov test"
    )
    parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
    return parser.parse_args(argv)


def run_main():
    """Run main without the scitex umbrella session helpers."""
    import matplotlib

    # Non-interactive backend: the demo only writes the figure to disk
    matplotlib.use("Agg")
    args = parse_args()
    return main(args)


if __name__ == "__main__":
    # Propagate main's return value as the process exit status
    raise SystemExit(run_main())

# EOF