"""
Clean isotonic binning implementation for the binlearn architecture.
This module provides IsotonicBinning that inherits from SupervisedBinningBase.
Uses isotonic regression to find optimal cut points that preserve monotonic relationships.
"""
from __future__ import annotations
from typing import Any
import numpy as np
from sklearn.isotonic import IsotonicRegression
from ..base import SupervisedBinningBase
from ..config import apply_config_defaults
from ..utils import (
BinEdgesDict,
ConfigurationError,
FittingError,
create_param_dict_for_config,
resolve_n_bins_parameter,
validate_bin_number_parameter,
)
# pylint: disable=too-many-ancestors
class IsotonicBinning(SupervisedBinningBase):
    """Isotonic regression-based monotonic binning implementation using clean architecture.

    Creates bins using isotonic regression to find optimal cut points that preserve
    monotonic relationships between features and targets. The transformer fits an
    isotonic (monotonic, non-decreasing or non-increasing) function to the data and
    identifies significant changes in this function to determine bin boundaries.

    This method is particularly valuable for cases where domain knowledge suggests
    a monotonic relationship between features and targets, such as risk modeling,
    credit scoring, or any application where preserving order relationships is critical.
    The isotonic regression ensures that the average target values within bins
    maintain the specified monotonic relationship.

    The algorithm works by:

    1. Sorting data by feature values
    2. Fitting an isotonic regression model to preserve monotonicity
    3. Identifying cut points where significant changes occur in the fitted function
    4. Creating bins that respect both the monotonic constraint and the minimum
       samples requirement

    When insufficient variability is found in the fitted isotonic function, the algorithm
    creates a single bin or falls back to simple boundary definitions.

    This implementation follows the clean binlearn architecture with straight inheritance,
    dynamic column resolution, and parameter reconstruction capabilities.

    Args:
        max_bins: Maximum number of bins to create. Controls the granularity of binning.
            Can be an integer or a string expression like 'sqrt', 'log2', etc. for
            dynamic calculation based on data size. If None, uses configuration default.
        min_samples_per_bin: Minimum number of samples required per bin. Ensures
            statistical significance of bins. Must be positive integer. If None,
            uses configuration default.
        increasing: Whether to enforce increasing monotonicity (True) or decreasing
            monotonicity (False). True means higher feature values correspond to
            higher target values. If None, uses configuration default.
        y_min: Minimum value for the fitted isotonic function output. Clips the
            fitted values to be at least this value. If None, no minimum constraint.
        y_max: Maximum value for the fitted isotonic function output. Clips the
            fitted values to be at most this value. If None, no maximum constraint.
        min_change_threshold: Minimum relative change in fitted values required to
            create a new bin boundary. The change is measured as a fraction of the
            total range (max - min) of the fitted values. Controls sensitivity to
            function changes. Must be positive. If None, uses configuration default.
        clip: Whether to clip values outside the fitted range to the nearest bin edge.
            If None, uses configuration default.
        preserve_dataframe: Whether to preserve pandas DataFrame structure in transform
            operations. If None, uses configuration default.
        guidance_columns: Column specification for target/guidance data used in
            supervised binning. Can be column names, indices, or callable selector.
        bin_edges: Pre-computed bin edges for reconstruction. Should not be provided
            during normal usage.
        bin_representatives: Pre-computed bin representatives for reconstruction.
            Should not be provided during normal usage.
        class_: Class name for reconstruction compatibility. Internal use only.
        module_: Module name for reconstruction compatibility. Internal use only.

    Attributes:
        max_bins: Maximum number of bins to create
        min_samples_per_bin: Minimum samples required per bin
        increasing: Whether monotonicity is increasing or decreasing
        y_min: Minimum constraint for fitted values
        y_max: Maximum constraint for fitted values
        min_change_threshold: Threshold for significant changes in fitted function

    Example:
        >>> import numpy as np
        >>> from binlearn.methods import IsotonicBinning
        >>>
        >>> # Create data with monotonic relationship
        >>> np.random.seed(42)
        >>> X = np.random.uniform(0, 10, 1000).reshape(-1, 1)
        >>> # Target increases monotonically with some noise
        >>> y = 2 * X.flatten() + np.random.normal(0, 1, 1000)
        >>>
        >>> # Initialize isotonic binning
        >>> binner = IsotonicBinning(
        ...     max_bins=5,
        ...     min_samples_per_bin=50,
        ...     increasing=True,
        ...     min_change_threshold=0.05
        ... )
        >>>
        >>> # Fit with target data
        >>> binner.fit(X, y)
        >>> X_binned = binner.transform(X)
        >>>
        >>> # Check monotonic preservation
        >>> bin_means = []
        >>> for bin_idx in range(len(binner.bin_edges_[0]) - 1):
        ...     bin_mask = X_binned[:, 0] == bin_idx
        ...     bin_means.append(np.mean(y[bin_mask]))
        >>> print("Bin target means:", bin_means)  # Should be monotonically increasing

    Note:
        - Requires target/guidance data for supervised learning of monotonic relationships
        - Preserves monotonic relationship between features and average target values
          within bins
        - Particularly useful for risk modeling, scoring, and ranking applications
        - Handles constant features and insufficient variability gracefully
        - Infinite feature values are replaced by finite surrogates just outside the
          finite data range before the isotonic model is fitted
        - Each column is processed independently with its corresponding target data
        - The fitted isotonic models are stored and can be used for analysis

    See Also:
        Chi2Binning: Statistical significance-based supervised binning
        TreeBinning: Decision tree-based supervised binning
        SupervisedBinningBase: Base class for supervised binning methods

    References:
        Robertson, T., Wright, F. T., & Dykstra, R. L. (1988). Order Restricted Statistical
        Inference.
    """
# pylint: disable=too-many-arguments,too-many-positional-arguments,too-many-locals
def __init__(
    self,
    max_bins: int | str | None = None,
    min_samples_per_bin: int | None = None,
    increasing: bool | None = None,
    y_min: float | None = None,
    y_max: float | None = None,
    min_change_threshold: float | None = None,
    clip: bool | None = None,
    preserve_dataframe: bool | None = None,
    guidance_columns: Any = None,
    *,
    bin_edges: BinEdgesDict | None = None,
    bin_representatives: BinEdgesDict | None = None,
    class_: str | None = None,  # pylint: disable=unused-argument
    module_: str | None = None,  # pylint: disable=unused-argument
):
    """Set up an isotonic binner, filling unspecified options from configuration.

    The configurable options are collected, passed through
    ``apply_config_defaults("isotonic", ...)`` and the resolved values are stored
    on the instance. Reconstruction-only arguments bypass the configuration
    layer and are handed to the base class unchanged.

    Args:
        max_bins: Upper bound on the number of bins per column. Either an
            integer or a string expression such as ``'sqrt'`` or ``'log2'``
            that is resolved against the data size at fit time. If None,
            uses the configuration default.
        min_samples_per_bin: Minimum number of samples required between two
            consecutive cut points. Must be a positive integer. If None,
            uses the configuration default.
        increasing: True to fit a non-decreasing function, False for a
            non-increasing one. If None, uses the configuration default.
        y_min: Lower bound passed to the isotonic regression, or None for
            no lower bound.
        y_max: Upper bound passed to the isotonic regression, or None for
            no upper bound. Must be greater than ``y_min`` when both are set.
        min_change_threshold: Minimum change in the fitted value, as a
            fraction of the fitted range, that justifies a new bin boundary.
            Must be positive. If None, uses the configuration default.
        clip: Forwarded to the base class; whether out-of-range values are
            clipped to the nearest bin edge. If None, uses the configuration
            default.
        preserve_dataframe: Forwarded to the base class; whether DataFrame
            structure is kept in transform output. If None, uses the
            configuration default.
        guidance_columns: Specification of the target/guidance column(s)
            used for supervised fitting.
        bin_edges: Pre-computed bin edges, used only for reconstruction.
        bin_representatives: Pre-computed bin representatives, used only
            for reconstruction.
        class_: Accepted for reconstruction compatibility and ignored.
        module_: Accepted for reconstruction compatibility and ignored.

    Example:
        >>> binner = IsotonicBinning(
        ...     max_bins=6,
        ...     min_samples_per_bin=50,
        ...     increasing=False,
        ...     y_min=0.0,
        ...     y_max=1.0,
        ...     guidance_columns=['risk_score']
        ... )
        >>>
        >>> # Rely on configuration defaults for everything else
        >>> binner = IsotonicBinning(guidance_columns='target')

    Note:
        When the resolved configuration has no entry for an option, the
        fallbacks are ``max_bins=10``, ``min_samples_per_bin=5``,
        ``increasing=True`` and ``min_change_threshold=0.01``.
    """
    # Only options that the configuration system may override are collected
    # here; guidance_columns and the reconstruction arguments are excluded.
    configurable = create_param_dict_for_config(
        max_bins=max_bins,
        min_samples_per_bin=min_samples_per_bin,
        increasing=increasing,
        y_min=y_min,
        y_max=y_max,
        min_change_threshold=min_change_threshold,
        clip=clip,
        preserve_dataframe=preserve_dataframe,
    )
    settings = apply_config_defaults("isotonic", configurable)

    # Fitted isotonic regression model per column identifier.
    self._isotonic_models: dict[Any, IsotonicRegression] = {}

    # Method-specific options must be in place before the base initialiser runs.
    self.max_bins = settings.get("max_bins", 10)
    self.min_samples_per_bin = settings.get("min_samples_per_bin", 5)
    self.increasing = settings.get("increasing", True)
    self.y_min = settings.get("y_min", None)
    self.y_max = settings.get("y_max", None)
    self.min_change_threshold = settings.get("min_change_threshold", 0.01)

    SupervisedBinningBase.__init__(
        self,
        clip=settings.get("clip"),
        preserve_dataframe=settings.get("preserve_dataframe"),
        guidance_columns=guidance_columns,
        bin_edges=bin_edges,
        bin_representatives=bin_representatives,
    )
def _validate_params(self) -> None:
    """Check every isotonic-specific option and raise on the first invalid one.

    Runs the base-class validation first, then checks ``max_bins``,
    ``min_samples_per_bin``, ``increasing``, ``y_min``/``y_max`` and
    ``min_change_threshold`` in that order.

    Raises:
        ConfigurationError: If any option has the wrong type or an
            out-of-range value.
    """
    SupervisedBinningBase._validate_params(self)
    validate_bin_number_parameter(self.max_bins, param_name="max_bins")

    min_samples = self.min_samples_per_bin
    if not isinstance(min_samples, int) or min_samples < 1:
        raise ConfigurationError(
            "min_samples_per_bin must be a positive integer",
            suggestions=["Example: min_samples_per_bin=5"],
        )

    if not isinstance(self.increasing, bool):
        raise ConfigurationError(
            "increasing must be a boolean value",
            suggestions=["Use increasing=True for increasing monotonicity"],
        )

    # Each bound is optional; when given it has to be numeric.
    bound_checks = (
        (self.y_min, "y_min must be a number or None", "Example: y_min=0.0"),
        (self.y_max, "y_max must be a number or None", "Example: y_max=1.0"),
    )
    for bound, message, hint in bound_checks:
        if bound is None or isinstance(bound, (int, float)):
            continue
        raise ConfigurationError(message, suggestions=[hint])

    # With both bounds present they must describe a non-empty interval.
    if self.y_min is not None and self.y_max is not None:
        if self.y_min >= self.y_max:
            raise ConfigurationError(
                "y_min must be less than y_max",
                suggestions=["Example: y_min=0.0, y_max=1.0"],
            )

    threshold = self.min_change_threshold
    if not isinstance(threshold, (int, float)) or threshold <= 0:
        raise ConfigurationError(
            "min_change_threshold must be a positive number",
            suggestions=["Example: min_change_threshold=0.01"],
        )
def _calculate_bins(
    self,
    x_col: np.ndarray[Any, Any],
    col_id: Any,
    guidance_data: np.ndarray[Any, Any] | None = None,
) -> tuple[list[float], list[float]]:
    """Compute isotonic bin edges and representatives for one column.

    Checks that guidance data is present, matches the feature column in
    length and that there are enough samples, then delegates the actual
    binning to ``_create_isotonic_bins``.

    Args:
        x_col: Feature values of the column being binned.
        col_id: Column identifier, used in error messages and model storage.
        guidance_data: Target values for the column; mandatory for this method.

    Returns:
        Tuple of (bin_edges, bin_representatives).

    Raises:
        FittingError: If ``guidance_data`` is None, or the column has fewer
            than ``min_samples_per_bin`` samples.
        ValueError: If the guidance data and the feature column differ in length.
    """
    if guidance_data is None:
        raise FittingError(f"Column {col_id}: guidance_data is required for isotonic binning")

    targets = self._prepare_target_values(guidance_data)
    n_targets = len(targets)
    n_features = len(x_col)

    if n_targets != n_features:
        raise ValueError(
            f"Column {col_id}: Guidance data length ({n_targets}) "
            f"does not match feature data length ({n_features})"
        )

    if n_features < self.min_samples_per_bin:
        raise FittingError(
            f"Column {col_id}: Insufficient data points ({n_features}) "
            f"for isotonic binning. Need at least {self.min_samples_per_bin}."
        )

    return self._create_isotonic_bins(x_col, targets, col_id)
# pylint: disable=too-many-locals
def _create_isotonic_bins(
    self, x_col: np.ndarray[Any, Any], y_col: np.ndarray[Any, Any], col_id: Any
) -> tuple[list[float], list[float]]:
    """Create bins for one column by fitting an isotonic regression.

    Steps:

    1. Resolve ``max_bins`` against the number of samples.
    2. Replace infinite feature values by finite surrogates placed just
       outside the finite data range.
    3. Short-circuit constant features with a single narrow bin.
    4. Sort by feature value, fit the isotonic model and store it under
       ``col_id`` in ``self._isotonic_models``.
    5. Derive cut points from the fitted values and turn them into bin
       edges and representatives.

    Args:
        x_col: Feature values for the column.
        y_col: Numeric target values aligned with ``x_col``.
        col_id: Column identifier, used for error messages and as the key
            under which the fitted model is stored.

    Returns:
        Tuple of (bin_edges, bin_representatives).

    Raises:
        ValueError: If every feature value is infinite, or if fitting the
            isotonic regression fails for any reason (the original
            exception is chained as the cause).
    """
    resolved_max_bins = resolve_n_bins_parameter(
        self.max_bins, data_shape=(len(x_col), 1), param_name="max_bins"
    )

    # Infinite values are handled before the constant-feature check so that a
    # column such as [1, 1, inf] is not mistaken for a constant one.
    if np.any(np.isinf(x_col)):
        finite_mask = np.isfinite(x_col)
        if not np.any(finite_mask):
            raise ValueError(f"Column {col_id}: All feature values are infinite")

        finite_min = np.min(x_col[finite_mask])
        finite_max = np.max(x_col[finite_mask])
        # Surrogates sit 10% of the finite range (at least 1.0) beyond the extremes.
        margin = max(abs(finite_max - finite_min) * 0.1, 1.0)
        x_col = np.where(x_col == -np.inf, finite_min - margin, x_col)
        x_col = np.where(x_col == np.inf, finite_max + margin, x_col)

    # A constant feature cannot be split: return one narrow bin around the value.
    if len(np.unique(x_col)) == 1:
        x_val = float(x_col[0])
        return ([x_val - 0.1, x_val + 0.1], [x_val])

    # IsotonicRegression expects 1D inputs; sort so fitted values follow x order.
    sort_indices = np.argsort(x_col)
    x_sorted = x_col[sort_indices].flatten()
    y_sorted = y_col[sort_indices].flatten()

    try:
        isotonic_model = IsotonicRegression(
            increasing=self.increasing,
            y_min=self.y_min,
            y_max=self.y_max,
            out_of_bounds="clip",
        )
        y_fitted = isotonic_model.fit_transform(x_sorted, y_sorted)
    except Exception as e:  # pylint: disable=broad-exception-caught
        # Any fitting failure is reported uniformly as a ValueError for this column.
        raise ValueError(f"Column {col_id}: Isotonic regression failed: {e}") from e

    # Keep the fitted model so it can be inspected after fitting.
    self._isotonic_models[col_id] = isotonic_model

    cut_points = self._find_cut_points(x_sorted, y_fitted, resolved_max_bins)
    return self._create_bins_from_cuts(x_sorted, y_fitted, cut_points, col_id)
def _prepare_target_values(self, y_values: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
"""Prepare target values for isotonic regression.
Args:
y_values: Raw target values (may be 2D with shape (n_samples, 1))
Returns:
Processed target values suitable for isotonic regression (1D array)
"""
# Flatten if 2D with single column (guidance_data format)
if y_values.ndim == 2 and y_values.shape[1] == 1:
y_values_flat = y_values.flatten()
else:
y_values_flat = y_values
# Convert to float for isotonic regression
y_processed = y_values_flat.astype(float)
return y_processed
def _find_cut_points(
self,
x_sorted: np.ndarray[Any, Any],
y_fitted: np.ndarray[Any, Any],
max_bins: int, # pylint: disable=unused-argument
) -> list[int]:
"""Find cut points based on changes in fitted isotonic function.
Identifies locations where the fitted function has significant changes
that warrant creating new bin boundaries.
Args:
x_sorted: Sorted feature values
y_fitted: Fitted isotonic regression values
max_bins: Maximum number of bins allowed (resolved from string)
Returns:
Indices of cut points in the sorted arrays
"""
_ = x_sorted
cut_indices = [0] # Always start with first point
if len(y_fitted) <= 1:
return cut_indices
# Calculate relative changes in fitted values
y_range = np.max(y_fitted) - np.min(y_fitted)
if y_range == 0:
return cut_indices
# Find points with significant changes
for i in range(1, len(y_fitted)):
# Check if there's a significant change from the last cut point
last_cut_idx = cut_indices[-1]
y_change = abs(y_fitted[i] - y_fitted[last_cut_idx])
relative_change = y_change / y_range
# Check if we have enough samples since last cut
samples_since_cut = i - last_cut_idx
if (
relative_change >= self.min_change_threshold
and samples_since_cut >= self.min_samples_per_bin
and len(cut_indices) < max_bins - 1 # Ensure we don't exceed max_bins
):
cut_indices.append(i)
return cut_indices
def _create_bins_from_cuts(
self,
x_sorted: np.ndarray[Any, Any],
y_fitted: np.ndarray[Any, Any], # pylint: disable=unused-argument
cut_indices: list[int],
col_id: Any, # pylint: disable=unused-argument
) -> tuple[list[float], list[float]]:
"""Create bin edges and representatives from cut points.
Args:
x_sorted: Sorted feature values
y_fitted: Fitted isotonic regression values
cut_indices: Indices of cut points
col_id: Column identifier for error reporting
Returns:
Tuple of (bin_edges, bin_representatives)
"""
if len(cut_indices) == 1:
# Only one cut point - create single bin
x_min, x_max = float(np.min(x_sorted)), float(np.max(x_sorted))
if x_min == x_max:
x_max = x_min + 1.0
return [x_min, x_max], [(x_min + x_max) / 2]
# Create bin edges
bin_edges = []
bin_representatives = []
for i, cut_idx in enumerate(cut_indices):
if i == 0:
# First bin edge
bin_edges.append(float(x_sorted[cut_idx]))
else:
# Find midpoint between consecutive cut points for bin boundary
prev_cut_idx = cut_indices[i - 1]
if cut_idx > prev_cut_idx:
midpoint = (x_sorted[prev_cut_idx] + x_sorted[cut_idx]) / 2
bin_edges.append(float(midpoint))
# Representative is the mean of feature values in this bin
bin_x_values = x_sorted[prev_cut_idx:cut_idx]
bin_representative = float(np.mean(bin_x_values))
bin_representatives.append(bin_representative)
# Add final bin edge and representative
bin_edges.append(float(x_sorted[-1]))
if len(cut_indices) > 1:
final_bin_x = x_sorted[cut_indices[-1] :]
final_representative = float(np.mean(final_bin_x))
bin_representatives.append(final_representative)
else:
bin_representatives.append(float(np.mean(x_sorted)))
return bin_edges, bin_representatives