Source code for binlearn.methods._equal_width_minimum_weight_binning

"""
Equal Width Minimum Weight binning implementation for the clean binlearn architecture.

This module provides EqualWidthMinimumWeightBinning that inherits from SupervisedBinningBase.
Uses equal-width bins with minimum weight constraints from guidance data.
"""

from __future__ import annotations

import warnings
from typing import Any

import numpy as np

from ..base import SupervisedBinningBase
from ..config import apply_config_defaults
from ..utils import (
    BinEdgesDict,
    ConfigurationError,
    DataQualityWarning,
    FittingError,
    create_equal_width_bins,
    create_param_dict_for_config,
    resolve_n_bins_parameter,
    validate_bin_number_for_calculation,
    validate_bin_number_parameter,
)


# pylint: disable=too-many-ancestors
class EqualWidthMinimumWeightBinning(SupervisedBinningBase):
    """Equal-width binning with a minimum weight constraint.

    Creates bins of equal width across the range of each feature, then merges
    adjacent bins so that every resulting bin holds at least ``minimum_weight``
    total weight from the guidance column. This combines the interpretability
    of equal-width binning with a weight-based adequacy constraint.

    The algorithm works column by column:

    1. Build ``n_bins`` equal-width bins over ``bin_range`` (or the data range).
    2. Sum the guidance weights that fall into each initial bin.
    3. Sweep left to right, accumulating initial bins until the running weight
       reaches ``minimum_weight``, then close the merged bin.
    4. If the right-most initial bins are left over without reaching the
       minimum, fold them into the preceding merged bin.
    5. If the total weight never reaches the minimum, emit a
       ``DataQualityWarning`` and return one bin spanning the whole range.

    Args:
        n_bins: Initial number of equal-width bins before weight-based merging.
            May be an integer or a string expression such as ``'sqrt'`` or
            ``'log2'``. The final number of bins may be smaller. If None, the
            configuration default is used.
        minimum_weight: Minimum total weight required per bin. Must be
            positive. If None, the configuration default is used.
        bin_range: Optional ``(min, max)`` range for binning. If None, the
            data's min/max values are used.
        clip: Whether to clip values outside the fitted range to the nearest
            bin edge. If None, the configuration default is used.
        preserve_dataframe: Whether to preserve pandas DataFrame structure in
            transform operations. If None, the configuration default is used.
        guidance_columns: Column specification for the weight/guidance data.
        bin_edges: Pre-computed bin edges for reconstruction. Should not be
            provided during normal usage.
        bin_representatives: Pre-computed bin representatives for
            reconstruction. Should not be provided during normal usage.
        class_: Class name for reconstruction compatibility. Internal use only.
        module_: Module name for reconstruction compatibility. Internal use
            only.

    Attributes:
        n_bins: Initial number of bins before merging.
        minimum_weight: Minimum weight requirement per bin.
        bin_range: Optional fixed range for binning.

    Example:
        >>> import numpy as np
        >>> from binlearn.methods import EqualWidthMinimumWeightBinning
        >>>
        >>> # Create sample data with weights
        >>> np.random.seed(42)
        >>> X = np.random.uniform(0, 100, 1000).reshape(-1, 1)
        >>> weights = np.random.exponential(2.0, 1000)
        >>>
        >>> # Initialize with minimum weight constraint
        >>> binner = EqualWidthMinimumWeightBinning(
        ...     n_bins=10,
        ...     minimum_weight=50.0,
        ...     guidance_columns='weight'
        ... )
        >>>
        >>> # Fit with weight data
        >>> binner.fit(X, weights.reshape(-1, 1))
        >>> X_binned = binner.transform(X)
        >>>
        >>> # Check bin weights
        >>> edges = binner.bin_edges_[0]
        >>> for i, (left, right) in enumerate(zip(edges[:-1], edges[1:])):
        ...     mask = (
        ...         (X >= left) & (X < right)
        ...         if i < len(edges) - 2
        ...         else (X >= left) & (X <= right)
        ...     )
        ...     bin_weight = np.sum(weights[mask.flatten()])
        ...     print(f"Bin {i}: [{left:.1f}, {right:.1f}] weight: {bin_weight:.1f}")

    Note:
        - Requires guidance data containing weight values for each sample.
        - The final number of bins may be less than ``n_bins`` due to merging.
        - All weights must be non-negative (negative weights raise ValueError).
        - Samples outside ``bin_range`` contribute no weight to any bin.
        - Each column is processed independently with its weight data.

    See Also:
        EqualWidthBinning: Standard equal-width binning without weight constraints
        EqualFrequencyBinning: Equal-frequency binning for balanced sample counts
        SupervisedBinningBase: Base class for supervised binning methods
    """

    # pylint: disable=too-many-arguments,too-many-positional-arguments
    def __init__(
        self,
        n_bins: int | str | None = None,
        minimum_weight: float | None = None,
        bin_range: tuple[float, float] | None = None,
        clip: bool | None = None,
        preserve_dataframe: bool | None = None,
        guidance_columns: Any = None,
        *,
        bin_edges: BinEdgesDict | None = None,
        bin_representatives: BinEdgesDict | None = None,
        class_: (  # pylint: disable=unused-argument
            str | None
        ) = None,  # For reconstruction compatibility
        module_: (  # pylint: disable=unused-argument
            str | None
        ) = None,  # For reconstruction compatibility
    ):
        """Initialize Equal Width Minimum Weight binning.

        Configuration defaults registered under the
        ``"equal_width_minimum_weight"`` method name are applied to every
        configurable parameter left as None. The resolved values are stored on
        the instance and the base class is initialized with them.

        Args:
            n_bins: Initial number of equal-width bins (integer, or a string
                expression such as ``'sqrt'``). Falls back to 10 when neither
                the caller nor the configuration supplies a value.
            minimum_weight: Minimum total weight required per bin. Falls back
                to 1.0 when neither the caller nor the configuration supplies
                a value.
            bin_range: Optional ``(min_value, max_value)`` with min < max. If
                None, the data's actual range is used at fit time.
            clip: Whether to clip transformed values outside the fitted range.
                If None, the configuration default is used.
            preserve_dataframe: Whether to preserve pandas DataFrame structure
                in transform operations. If None, the configuration default is
                used.
            guidance_columns: Column specification for the weight data. Passed
                to the base class unchanged (never taken from configuration).
            bin_edges: Pre-computed bin edges for reconstruction. Internal use.
            bin_representatives: Pre-computed representatives for
                reconstruction. Internal use.
            class_: Class name for reconstruction compatibility. Unused here.
            module_: Module name for reconstruction compatibility. Unused here.

        Example:
            >>> binner = EqualWidthMinimumWeightBinning(
            ...     n_bins=8,
            ...     minimum_weight=100.0,
            ...     guidance_columns='sample_weight'
            ... )
            >>>
            >>> # Custom range with tighter weight requirements
            >>> binner = EqualWidthMinimumWeightBinning(
            ...     n_bins=12,
            ...     minimum_weight=50.0,
            ...     bin_range=(0, 1000),
            ...     guidance_columns=['weight_column']
            ... )

        Note:
            Parameter checks live in ``_validate_params``; presumably the
            base-class initializer invokes it (not visible from this module).
        """
        # Use standardized initialization pattern
        user_params = create_param_dict_for_config(
            n_bins=n_bins,
            minimum_weight=minimum_weight,
            bin_range=bin_range,
            clip=clip,
            preserve_dataframe=preserve_dataframe,
        )

        # Apply configuration defaults for equal_width_minimum_weight method
        resolved_params = apply_config_defaults("equal_width_minimum_weight", user_params)

        # Store method-specific parameters before the parent initializer runs,
        # so they are available if the parent validates during construction.
        self.n_bins = resolved_params.get("n_bins", 10)
        self.minimum_weight = resolved_params.get("minimum_weight", 1.0)
        self.bin_range = resolved_params.get("bin_range", None)

        # Initialize parent with resolved parameters
        # (never-configurable params are passed through as-is)
        SupervisedBinningBase.__init__(
            self,
            clip=resolved_params.get("clip"),
            preserve_dataframe=resolved_params.get("preserve_dataframe"),
            guidance_columns=guidance_columns,  # Never configurable
            bin_edges=bin_edges,  # Never configurable
            bin_representatives=bin_representatives,  # Never configurable
        )

    def _validate_params(self) -> None:
        """Validate Equal Width Minimum Weight binning parameters.

        Raises:
            ConfigurationError: If ``minimum_weight`` is not a positive number,
                or if ``bin_range`` is not a two-element tuple/list of numbers
                with min < max. ``n_bins`` and the inherited parameters are
                checked by the shared utility and the parent class.
        """
        # Call parent validation
        SupervisedBinningBase._validate_params(self)

        # Validate n_bins using centralized utility
        validate_bin_number_parameter(self.n_bins, param_name="n_bins")

        # Validate minimum_weight parameter
        if not isinstance(self.minimum_weight, int | float) or self.minimum_weight <= 0:
            raise ConfigurationError(
                "minimum_weight must be a positive number",
                suggestions=["Example: minimum_weight=1.0"],
            )

        # Validate bin_range parameter (optional)
        if self.bin_range is None:
            return

        if not isinstance(self.bin_range, list | tuple) or len(self.bin_range) != 2:
            raise ConfigurationError(
                "bin_range must be a tuple/list of two numbers (min, max)",
                suggestions=["Example: bin_range=(0, 100)"],
            )

        min_val, max_val = self.bin_range
        if not isinstance(min_val, int | float) or not isinstance(max_val, int | float):
            raise ConfigurationError(
                "bin_range values must be numbers",
                suggestions=["Example: bin_range=(0.0, 100.0)"],
            )

        if min_val >= max_val:
            raise ConfigurationError(
                "bin_range minimum must be less than maximum",
                suggestions=["Example: bin_range=(0, 100) where 0 < 100"],
            )

    def _calculate_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        guidance_data: np.ndarray[Any, Any] | None = None,
    ) -> tuple[list[float], list[float]]:
        """Calculate equal-width bins with minimum weight constraint for one column.

        Args:
            x_col: Preprocessed column data (from base class).
            col_id: Column identifier, used in error and warning messages.
            guidance_data: Weight values for each data point (required). Only
                its first column is read.

        Returns:
            Tuple of (bin_edges, bin_representatives).

        Raises:
            FittingError: If ``guidance_data`` is None.
        """
        # Require guidance data for supervised binning
        if guidance_data is None:
            raise FittingError(
                f"Column {col_id}: EqualWidthMinimumWeightBinning requires guidance_data "
                "to calculate weights for minimum weight constraint"
            )

        # Validate n_bins for calculation and resolve string expressions
        # against the number of samples in this column.
        validate_bin_number_for_calculation(self.n_bins, param_name="n_bins")
        resolved_n_bins = resolve_n_bins_parameter(
            self.n_bins, data_shape=(len(x_col), 1), param_name="n_bins"
        )

        # Extract the single weight column (guaranteed to have shape
        # (n_samples, 1) by SupervisedBinningBase)
        weights = guidance_data[:, 0]

        return self._create_equal_width_minimum_weight_bins(
            x_col, weights, col_id, resolved_n_bins
        )

    # pylint: disable=too-many-locals
    def _create_equal_width_minimum_weight_bins(
        self,
        x_col: np.ndarray[Any, Any],
        weights: np.ndarray[Any, Any],
        col_id: Any,
        n_bins: int,
    ) -> tuple[list[float], list[float]]:
        """Create equal-width bins and merge them to satisfy the minimum weight.

        Args:
            x_col: Preprocessed column data (no NaN/inf values).
            weights: Weight values for each data point.
            col_id: Column identifier for error reporting.
            n_bins: Number of initial bins to create.

        Returns:
            Tuple of (bin_edges, bin_representatives); representatives are the
            midpoints of the merged bins.

        Raises:
            ValueError: If any weight is negative.

        Note:
            The data is already preprocessed by the base class, so NaN/inf
            values are not handled here.
        """
        # Check for negative weights
        if np.any(weights < 0):
            raise ValueError(
                f"Column {col_id}: guidance_data contains negative values. "
                "All weights must be non-negative."
            )

        # Determine the range for binning
        if self.bin_range is not None:
            min_val, max_val = self.bin_range
        else:
            min_val, max_val = float(np.min(x_col)), float(np.max(x_col))

        # Handle constant data: a single bin with a small extension around the
        # value. A fixed 1e-8 is lost to rounding for large magnitudes (it is
        # below one ULP from roughly 1e8 upwards), which would produce a
        # zero-width bin, so use at least a few ULPs of the value instead.
        if min_val == max_val:
            epsilon = max(1e-8, 4.0 * float(np.spacing(abs(float(min_val)))))
            return [min_val - epsilon, min_val + epsilon], [min_val]

        # Create initial equal-width bins using standardized utility
        if self.bin_range is not None:
            initial_edges = create_equal_width_bins(x_col, n_bins, data_range=self.bin_range)
        else:
            initial_edges = create_equal_width_bins(x_col, n_bins)

        # Sum the weights in each initial bin. Bins are half-open [left, right)
        # except the last, which also includes its right edge. Iterating over
        # the edges actually returned keeps indexing safe regardless of how
        # many edges the helper produced.
        last_index = len(initial_edges) - 2
        bin_weights = []
        for i, (left_edge, right_edge) in enumerate(
            zip(initial_edges[:-1], initial_edges[1:])
        ):
            if i == last_index:
                mask = (x_col >= left_edge) & (x_col <= right_edge)
            else:
                mask = (x_col >= left_edge) & (x_col < right_edge)
            bin_weights.append(np.sum(weights[mask]))

        # Merge bins with insufficient weight
        merged_edges, _ = self._merge_underweight_bins(
            list(initial_edges), bin_weights, col_id
        )

        # Create representatives as bin centers
        representatives = [
            (left_edge + right_edge) / 2
            for left_edge, right_edge in zip(merged_edges[:-1], merged_edges[1:])
        ]

        return merged_edges, representatives

    def _merge_underweight_bins(
        self,
        edges: list[float],
        bin_weights: list[float],
        col_id: Any,
    ) -> tuple[list[float], list[float]]:
        """Merge adjacent bins until every bin meets the minimum weight.

        Sweeps the initial bins left to right, accumulating weight and closing
        a merged bin as soon as the running total reaches
        ``self.minimum_weight``. Trailing bins that never reach the minimum
        are folded into the last closed bin, so no underweight bin is left at
        the right end. If no bin can be closed at all, a warning is emitted
        and a single bin covering the full range is returned.

        Args:
            edges: Initial bin edges (expected length ``len(bin_weights) + 1``).
            bin_weights: Weight in each initial bin.
            col_id: Column identifier for warnings.

        Returns:
            Tuple of (merged_edges, merged_weights).
        """
        if len(edges) <= 2:
            # Only one bin, can't merge further
            return edges, bin_weights

        merged_edges = [edges[0]]  # Start with first edge
        merged_weights: list[float] = []
        pending_weight = 0.0
        # Tracks whether initial bins were consumed since the last close. The
        # weight alone cannot tell, because trailing bins may weigh zero.
        has_pending = False

        for right_edge, weight in zip(edges[1:], bin_weights):
            pending_weight += weight
            has_pending = True

            if pending_weight >= self.minimum_weight:
                # Close current merged bin
                merged_edges.append(right_edge)
                merged_weights.append(pending_weight)
                pending_weight = 0.0
                has_pending = False

        # Total weight never reached the minimum: fall back to one bin.
        if not merged_weights:
            warnings.warn(
                f"Column {col_id}: No bins meet minimum weight requirement "
                f"({self.minimum_weight}). Creating single bin with total weight "
                f"{sum(bin_weights)}.",
                DataQualityWarning,
                stacklevel=2,
            )
            # Return single bin with all data
            return [edges[0], edges[-1]], [sum(bin_weights)]

        # Underweight remainder at the right end: extend the last closed bin
        # over it instead of emitting a bin that violates the constraint.
        if has_pending:
            merged_edges[-1] = edges[-1]
            merged_weights[-1] += pending_weight

        return merged_edges, merged_weights