Source code for binlearn.base._flexible_binning_base

"""
Clean flexible binning base class for V2 architecture.
"""

from __future__ import annotations

from abc import abstractmethod
from typing import Any

import numpy as np

from ..utils import (
    BinEdgesDict,
    ColumnList,
    ConfigurationError,
    FlexibleBinSpec,
    transform_value_to_flexible_bin,
    validate_bin_representatives_format,
)
from ._general_binning_base import GeneralBinningBase


# pylint: disable=too-many-ancestors
class FlexibleBinningBase(GeneralBinningBase):
    """Base class for flexible binning methods that support mixed bin types.

    This class extends GeneralBinningBase to provide specialized functionality
    for flexible binning methods. Unlike traditional interval-based binning,
    flexible binning supports mixed bin types within the same feature,
    including singleton bins (exact value matching) and interval bins (range
    matching) in any combination.

    Flexible binning is particularly useful for:
        - Categorical features with numeric representations
        - Mixed data types requiring different binning strategies per value
        - Custom binning schemes that don't fit traditional interval patterns
        - Data with important singleton values that should be preserved exactly

    Key Features:
        - Mixed bin types: combine singleton and interval bins in one feature
        - Custom bin specifications: define bins as exact values or ranges
        - Automatic representative generation: numeric representatives are
          derived from a provided ``bin_spec`` when none are given

    Attributes:
        bin_spec: Constructor parameter. Dictionary mapping column identifiers
            to lists of flexible bin definitions. Each bin is either a scalar
            (singleton) or a 2-tuple (interval).
        bin_representatives: Constructor parameter. Dictionary mapping column
            identifiers to lists of numeric representative values, one per bin.
        bin_spec_: Working/fitted bin specification (empty dict until a
            specification is provided or fitted).
        bin_representatives_: Working/fitted representatives (empty dict until
            provided, auto-generated, or fitted).

    Example:
        >>> # Example of flexible bin specification
        >>> bin_spec = {
        ...     'mixed_feature': [
        ...         42,          # Singleton bin: exactly value 42
        ...         (10, 20),    # Interval bin: range [10, 20]
        ...         'special',   # Categorical singleton
        ...         (100, 200)   # Another interval
        ...     ]
        ... }

    Note:
        - This is an abstract base class - use concrete implementations like
          ManualFlexibleBinning
        - Auto-generated representatives are the midpoint for interval bins
          and the float-converted value for singleton bins; singletons that
          cannot be converted to float get the placeholder 0.0
        - The public fit/transform interface comes from GeneralBinningBase
          (defined outside this module)
        - Subclasses must implement the abstract ``_calculate_flexible_bins``
          method
    """

    # pylint: disable=too-many-arguments
    def __init__(
        self,
        preserve_dataframe: bool | None = None,
        fit_jointly: bool | None = None,
        guidance_columns: Any = None,
        *,
        bin_spec: FlexibleBinSpec | None = None,
        bin_representatives: BinEdgesDict | None = None,
    ) -> None:
        """Initialize flexible binning base class.

        Args:
            preserve_dataframe: Forwarded unchanged to GeneralBinningBase.
                Controls whether DataFrame inputs are returned as DataFrames.
            fit_jointly: Forwarded unchanged to GeneralBinningBase. Selects
                between joint and per-column fitting.
            guidance_columns: Forwarded unchanged to GeneralBinningBase.
                Column(s) that guide binning but are not binned themselves;
                a single identifier, a list of identifiers, or None.
            bin_spec: Pre-defined flexible bin specification as a dictionary
                mapping column identifiers to lists of bin definitions. Each
                bin definition is either a scalar (singleton bin) or a 2-tuple
                (interval bin).
            bin_representatives: Pre-defined representative values for each
                bin as a dictionary mapping column identifiers to lists of
                numeric values. If None while ``bin_spec`` is given,
                representatives are generated from ``bin_spec``.

        Raises:
            ConfigurationError: If ``bin_spec`` is not a dictionary, if the
                representatives fail format validation, or if an interval bin
                has endpoints that cannot be averaged numerically.

        Example:
            >>> # Initialize with custom bin specification
            >>> bin_spec = {
            ...     'feature1': [10, (20, 30), 40],
            ...     'feature2': ['A', 'B', (1, 5)]
            ... }
            >>> binner = ConcreteFlexibleBinner(bin_spec=bin_spec)
            >>>
            >>> # Initialize for automatic fitting
            >>> binner = ConcreteFlexibleBinner(fit_jointly=True)

        Note:
            - When ``bin_spec`` is provided, ``bin_spec_`` and
              ``bin_representatives_`` are populated during construction
            - Parameters are validated immediately via ``_validate_params``
        """
        # Explicit parent call (rather than super()) is kept from the original
        # so the initialisation order under multiple inheritance is unchanged.
        GeneralBinningBase.__init__(
            self,
            preserve_dataframe=preserve_dataframe,
            fit_jointly=fit_jointly,
            guidance_columns=guidance_columns,
        )

        # Store flexible-specific parameters exactly as given
        self.bin_spec = bin_spec
        self.bin_representatives = bin_representatives

        # Working fitted attributes
        self.bin_spec_: FlexibleBinSpec = {}
        self.bin_representatives_: BinEdgesDict = {}

        # Initialize sklearn attributes to avoid W0201 warnings
        self._feature_names_in: list[Any] | None = None
        self._n_features_in: int = 0

        # Configure fitted attributes for the base class
        self._fitted_attributes = ["bin_spec_", "bin_representatives_"]

        # Validate parameters early
        self._validate_params()

    @staticmethod
    def _representative_for_bin(spec_item: Any) -> float:
        """Return the numeric representative for one flexible bin definition.

        Args:
            spec_item: A single bin definition: a 2-tuple (interval bin) or
                any non-tuple value (singleton bin).

        Returns:
            The interval midpoint for a 2-tuple, the float-converted value for
            a singleton, or the placeholder 0.0 for a singleton that cannot be
            converted to float and for tuples whose length is not 2.

        Raises:
            ValueError: If a 2-tuple's endpoints cannot be averaged
                numerically (for example string endpoints).
        """
        if isinstance(spec_item, tuple):
            if len(spec_item) != 2:
                # Fallback for unexpected tuple formats
                return 0.0
            lower, upper = spec_item
            try:
                return float((lower + upper) / 2)
            except (TypeError, ValueError) as exc:
                # Report as ValueError so _validate_params converts it to
                # ConfigurationError instead of leaking a raw TypeError.
                raise ValueError(
                    f"Interval bin {spec_item!r} must have numeric endpoints"
                ) from exc

        try:
            return float(spec_item)
        except (ValueError, TypeError):
            # Non-numeric singleton bins get a placeholder representative
            return 0.0

    # pylint: disable=too-many-branches
    def _validate_params(self) -> None:
        """Validate flexible binning parameters and initialize fitted attributes.

        Runs the parent validation first, then processes a provided
        ``bin_spec``: it is copied to ``bin_spec_`` and representatives are
        either taken from ``bin_representatives`` (after format validation) or
        generated from the specification.

        Raises:
            ConfigurationError: If ``bin_spec`` is not a dictionary, if
                ``validate_bin_representatives_format`` raises ``ValueError``,
                or if an interval bin has non-numeric endpoints.

        Note:
            - Only columns whose specification is a ``list`` receive
              auto-generated representatives; other columns are skipped
            - Interval bins use the midpoint; singleton bins use the value
              converted to float, or 0.0 when conversion fails
            - sklearn attributes are set once both ``bin_spec_`` and
              ``bin_representatives_`` are non-empty
        """
        # Call parent validation
        GeneralBinningBase._validate_params(self)

        # Process provided bin specifications.
        # NOTE(review): the nesting below was reconstructed from a flattened
        # source listing; confirm that representatives are only processed
        # when bin_spec is provided.
        try:
            if self.bin_spec is not None:
                # For now, just check it's a dictionary
                if not isinstance(self.bin_spec, dict):
                    raise ValueError("bin_spec must be a dictionary")
                self.bin_spec_ = self.bin_spec

                if self.bin_representatives is not None:
                    validate_bin_representatives_format(self.bin_representatives)
                    self.bin_representatives_ = self.bin_representatives
                elif self.bin_spec_:
                    # bin_spec holds mixed values (singletons and interval
                    # tuples) but representatives must all be numeric.
                    self.bin_representatives_ = {
                        col: [self._representative_for_bin(item) for item in spec]
                        for col, spec in self.bin_spec_.items()
                        if isinstance(spec, list)
                    }

                # Complete specifications: set sklearn attributes
                if self.bin_spec_ and self.bin_representatives_:
                    self._set_sklearn_attributes_from_specs()
        except ValueError as e:
            raise ConfigurationError(str(e)) from e

    def _set_sklearn_attributes_from_specs(self) -> None:
        """Set sklearn-style attributes from the flexible bin specification.

        Populates ``_feature_names_in`` and ``_n_features_in`` from the keys
        of ``bin_spec_`` followed by any guidance columns not already present.

        Note:
            - Column identifiers come from ``bin_spec_`` in insertion order
            - ``guidance_columns`` that is not a ``list`` is treated as a
              single column identifier
            - Guidance columns already among the binning columns are not
              added twice
            - Nothing is changed when ``bin_spec_`` is None
        """
        if self.bin_spec_ is None:
            return

        # Start from the binned columns, then append guidance columns
        all_features = list(self.bin_spec_.keys())

        if self.guidance_columns is not None:
            if isinstance(self.guidance_columns, list):
                guidance_cols = self.guidance_columns
            else:
                guidance_cols = [self.guidance_columns]

            for col in guidance_cols:
                if col not in all_features:
                    all_features.append(col)

        # Set sklearn attributes
        self._feature_names_in = all_features
        self._n_features_in = len(all_features)

    def _fit_per_column_independently(
        self,
        X: np.ndarray[Any, Any],
        columns: ColumnList,
        guidance_data: np.ndarray[Any, Any] | None = None,
        **fit_params: Any,
    ) -> None:
        """Fit flexible binning parameters independently for each column.

        Resets ``bin_spec_`` and ``bin_representatives_`` and fills them with
        the result of ``_calculate_flexible_bins`` for every column.

        Args:
            X: Input data array indexed as ``X[:, i]`` for the i-th entry of
                ``columns``.
            columns: Column identifiers, in the same order as the columns of
                ``X``; used as keys of the fitted dictionaries.
            guidance_data: Optional guidance data. The same object is passed
                to every column's bin calculation (it is not sliced per
                column).
            **fit_params: Accepted for interface compatibility. This
                implementation does not forward them to
                ``_calculate_flexible_bins``.

        Note:
            - Each column is checked with ``_validate_numeric_data`` before
              bins are calculated (helper defined outside this module)
            - Columns are processed one at a time without sharing information
        """
        self.bin_spec_ = {}
        self.bin_representatives_ = {}

        for i, col in enumerate(columns):
            x_col = X[:, i]

            # Validate numeric data
            self._validate_numeric_data(x_col, col)

            # Same guidance_data for all columns (not indexed per column)
            bin_values, representatives = self._calculate_flexible_bins(
                x_col, col, guidance_data
            )
            self.bin_spec_[col] = bin_values
            self.bin_representatives_[col] = representatives

    def _fit_jointly_across_columns(
        self, X: np.ndarray[Any, Any], columns: ColumnList, **fit_params: Any
    ) -> None:
        """Fit flexible binning parameters jointly across all columns.

        The default implementation performs no true joint fitting: it
        delegates to ``_fit_per_column_independently`` with no guidance data.

        Args:
            X: Input data array whose columns correspond to ``columns``.
            columns: Column identifiers, in the same order as the columns of
                ``X``.
            **fit_params: Forwarded to ``_fit_per_column_independently``.

        Note:
            - Subclasses can override this method to implement joint fitting
        """
        # For flexible binning, joint fitting is the same as per-column
        # fitting unless overridden by specific implementations.
        self._fit_per_column_independently(X, columns, None, **fit_params)

    def _transform_columns_to_bins(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform data columns to bin indices using flexible bin mapping.

        Each value is mapped to a bin index by
        ``transform_value_to_flexible_bin`` using the fitted definitions of
        its column.

        Args:
            X: Two-dimensional input data array to transform.
            columns: Column identifiers, in the same order as the columns of
                ``X``; resolved against the keys of ``bin_spec_``.

        Returns:
            Integer array shaped like ``X`` holding bin indices. For empty
            input (``X.size == 0``) an empty array of shape
            ``(X.shape[0], 0)`` with NumPy's default dtype is returned
            instead.

        Raises:
            ValueError: If the number of columns in ``X`` differs from the
                number of fitted bin specifications.

        Note:
            - Matching semantics (singleton vs. interval comparison and the
              index used for unmatched or missing values) are defined by
              ``transform_value_to_flexible_bin``, not by this method
            - Column keys are resolved by ``_get_column_key`` (defined outside
              this module); presumably it raises for unknown columns
        """
        if X.size == 0:
            return np.empty((X.shape[0], 0))

        # Input must have as many columns as there are bin specifications
        if X.shape[1] != len(self.bin_spec_):
            raise ValueError(
                f"Input data has {X.shape[1]} columns but bin specifications "
                f"are provided for {len(self.bin_spec_)} columns"
            )

        result = np.empty_like(X, dtype=int)
        available_keys = list(self.bin_spec_.keys())

        for i, col in enumerate(columns):
            # Find the right bin specification for this column
            key = self._get_column_key(col, available_keys, i)
            bin_defs = self.bin_spec_[key]

            # Transform this column value by value
            for row_idx, value in enumerate(X[:, i]):
                result[row_idx, i] = transform_value_to_flexible_bin(value, bin_defs)

        return result

    def _inverse_transform_bins_to_values(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform bin indices back to representative values.

        Args:
            X: Two-dimensional array of bin indices; each column is cast to
                ``int`` before lookup.
            columns: Column identifiers, in the same order as the columns of
                ``X``; resolved against the keys of ``bin_representatives_``.

        Returns:
            Float array shaped like ``X`` holding the representative of each
            bin index. For empty input (``X.size == 0``) an empty array of
            shape ``(X.shape[0], 0)`` is returned instead.

        Note:
            - Indices are clipped to ``[0, n_bins - 1]``, so negative indices
              (such as a missing-value marker) map to the first bin's
              representative and too-large indices map to the last one
            - Column keys are resolved by ``_get_column_key`` (defined outside
              this module)
        """
        if X.size == 0:
            return np.empty((X.shape[0], 0))

        result = np.empty_like(X, dtype=float)
        available_keys = list(self.bin_representatives_.keys())

        for i, col in enumerate(columns):
            # Get the right representatives using column key resolution
            key = self._get_column_key(col, available_keys, i)
            representatives = np.array(self.bin_representatives_[key])

            bin_indices = X[:, i].astype(int)
            # Clip indices to valid range
            bin_indices = np.clip(bin_indices, 0, len(representatives) - 1)
            result[:, i] = representatives[bin_indices]

        return result

    @abstractmethod
    def _calculate_flexible_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        guidance_data: np.ndarray[Any, Any] | None = None,
    ) -> tuple[list[Any], list[Any]]:
        """Calculate flexible bin values and representatives for a column.

        For flexible binning, this typically identifies unique values or
        patterns rather than creating fixed intervals.

        Args:
            x_col: Column data to analyze.
            col_id: Column identifier.
            guidance_data: Optional guidance data; the per-column fitting
                loop passes the same object for every column.

        Returns:
            Tuple of (bin_values, representatives) where:
                - bin_values: List of values that define the bins
                - representatives: List of representative values for each bin
        """