"""
Clean interval binning base class for V2 architecture.
This module provides interval-based binning functionality that inherits from GeneralBinningBase.
"""
from __future__ import annotations
from abc import abstractmethod
from typing import Any
import numpy as np
from ..config import get_config
from ..utils import (
BinEdgesDict,
ColumnList,
ConfigurationError,
FittingError,
default_representatives,
validate_bin_edges_format,
validate_bin_representatives_format,
validate_bins,
)
from ._general_binning_base import GeneralBinningBase
# pylint: disable=too-many-ancestors,too-many-instance-attributes
class IntervalBinningBase(GeneralBinningBase):
    """Interval-based binning functionality inheriting from GeneralBinningBase.

    This abstract base class provides specialized functionality for binning methods
    that create discrete intervals from continuous data. It extends GeneralBinningBase
    with interval-specific features like bin edge management, representative value
    storage, and out-of-range value handling.

    Key Features:
    - Interval boundary (bin edges) management and validation
    - Representative value storage (defaults generated from the edges when
      only edges are supplied)
    - Clamping of out-of-range values into the first/last bin
    - sklearn-style fitted attributes (``bin_edges_``, ``bin_representatives_``)
    - Early parameter validation in the constructor

    The class manages two core concepts:
    - Bin edges: Define interval boundaries [a, b, c] creating bins [a,b) and [b,c]
    - Representatives: Values that represent each bin

    Parameters:
    -----------
    clip : bool, optional
        Whether to clip out-of-range values to the nearest bin boundaries.
        If None, uses the global configuration default (``default_clip``).
        When True:
        - Values below minimum edge are assigned to first bin
        - Values above maximum edge are assigned to last bin
        NOTE(review): the transform implemented in this class clamps the
        computed bin indices to the valid range regardless of ``clip``, so
        out-of-range values currently land in the first/last bin in both
        modes. Confirm whether special out-of-range indices were intended
        for ``clip=False``.
    preserve_dataframe : bool, optional
        Forwarded unchanged to GeneralBinningBase.
    fit_jointly : bool, optional
        Forwarded unchanged to GeneralBinningBase.
    guidance_columns : Any, optional
        Forwarded unchanged to GeneralBinningBase. Also used here to extend
        the recorded feature list when bins are pre-specified.
    bin_edges : BinEdgesDict, optional
        Pre-specified bin edges as a dictionary mapping column identifiers to
        edge lists. If provided, they are validated in the constructor and
        stored as ``bin_edges_``.
    bin_representatives : BinEdgesDict, optional
        Pre-specified bin representatives as a dictionary mapping column identifiers
        to representative value lists. If provided, they are validated against
        ``bin_edges`` in the constructor and stored as ``bin_representatives_``.

    Attributes:
    -----------
    clip : bool
        Whether to clip out-of-range values to bin boundaries.
    bin_edges : BinEdgesDict | None
        Pre-specified bin edges (input parameter).
    bin_representatives : BinEdgesDict | None
        Pre-specified bin representatives (input parameter).
    bin_edges_ : BinEdgesDict
        Working bin edges: dictionary mapping each column to its list of bin
        boundary values. Populated from ``bin_edges`` or by fitting.
    bin_representatives_ : BinEdgesDict
        Working bin representatives: dictionary mapping each column to its
        list of representative values. Populated from ``bin_representatives``,
        from default representatives, or by fitting.

    Note:
    -----
    This is an abstract base class. Concrete implementations must provide the
    abstract method _calculate_bins() to define how bin edges are computed
    from input data for their specific binning algorithm.
    """

    # pylint: disable=too-many-arguments,too-many-positional-arguments
    def __init__(
        self,
        clip: bool | None = None,
        preserve_dataframe: bool | None = None,
        fit_jointly: bool | None = None,
        guidance_columns: Any = None,
        *,
        bin_edges: BinEdgesDict | None = None,
        bin_representatives: BinEdgesDict | None = None,
    ) -> None:
        """Initialize interval binning base with configuration and validation.

        Initializes the parent class, resolves the ``clip`` default from the
        global configuration, stores the interval-specific parameters and
        validates them immediately.

        Args:
            clip: Whether to clip out-of-range values to bin boundaries.
                If None, ``get_config().default_clip`` is used.
            preserve_dataframe: Passed to GeneralBinningBase.
            fit_jointly: Passed to GeneralBinningBase.
            guidance_columns: Passed to GeneralBinningBase.
            bin_edges: Pre-specified bin edges. If provided, they are validated
                and become ``bin_edges_`` during construction.
            bin_representatives: Pre-specified bin representatives. If provided,
                they are validated against ``bin_edges`` during construction.

        Raises:
            TypeError: If ``clip`` (after applying the configuration default)
                is not a bool.
            ConfigurationError: If processing the pre-specified bins raises a
                ``ValueError``.

        Note:
            ``bin_edges_`` and ``bin_representatives_`` start as empty
            dictionaries. They are filled right away when bins are
            pre-specified, and otherwise during fitting.
        """
        # Initialize parent (explicit call kept as in the original design)
        GeneralBinningBase.__init__(
            self,
            preserve_dataframe=preserve_dataframe,
            fit_jointly=fit_jointly,
            guidance_columns=guidance_columns,
        )

        # Load configuration defaults
        config = get_config()
        if clip is None:
            clip = config.default_clip

        # Store interval-specific parameters
        self.clip = clip
        self.bin_edges = bin_edges
        self.bin_representatives = bin_representatives

        # Working fitted attributes
        self.bin_edges_: BinEdgesDict = {}
        self.bin_representatives_: BinEdgesDict = {}

        # Initialize sklearn attributes to avoid W0201 warnings
        self._feature_names_in: list[Any] | None = None
        self._n_features_in: int = 0

        # Configure fitted attributes for the base class
        self._fitted_attributes = ["bin_edges_", "bin_representatives_"]

        # Validate parameters early
        self._validate_params()

    def _validate_params(self) -> None:
        """Validate interval binning parameters and adopt pre-specified bins.

        Runs the parent validation, checks the type of ``clip`` and, when
        ``bin_edges`` / ``bin_representatives`` were supplied, validates them
        and stores them as ``bin_edges_`` / ``bin_representatives_``. When only
        edges are available, default representatives are generated for every
        column.

        Raises:
            TypeError: If ``clip`` is not a bool.
            ConfigurationError: If any bin validation step raises ``ValueError``.
        """
        # Call parent validation
        GeneralBinningBase._validate_params(self)

        # Validate clip parameter (TypeError is intentionally not wrapped below)
        if not isinstance(self.clip, bool):
            raise TypeError("clip must be a boolean")

        # Process provided bin specifications
        try:
            if self.bin_edges is not None:
                validate_bin_edges_format(self.bin_edges)
                self.bin_edges_ = self.bin_edges

            if self.bin_representatives is not None:
                validate_bin_representatives_format(self.bin_representatives, self.bin_edges)
                self.bin_representatives_ = self.bin_representatives
                # Validate compatibility between edges and representatives
                validate_bins(self.bin_edges_, self.bin_representatives_)
            elif self.bin_edges_:
                # Only edges are known: generate default representatives
                self.bin_representatives_ = {
                    col: default_representatives(list(edges))
                    for col, edges in self.bin_edges_.items()
                }

            # Complete specifications are available: derive the feature
            # bookkeeping attributes from them.
            if self.bin_edges_ and self.bin_representatives_:
                self._set_sklearn_attributes_from_specs()
        except ValueError as e:
            raise ConfigurationError(str(e)) from e

    def _set_sklearn_attributes_from_specs(self) -> None:
        """Set ``_feature_names_in`` / ``_n_features_in`` from bin specifications.

        The feature list consists of the keys of ``bin_edges_`` followed by any
        guidance columns that are not already among those keys.
        """
        if self.bin_edges_ is None:
            return

        # Column names/indices come from the bin_edges_ keys
        all_features = list(self.bin_edges_.keys())

        if self.guidance_columns is not None:
            # A non-list guidance specification is treated as a single column
            guidance_cols = (
                self.guidance_columns
                if isinstance(self.guidance_columns, list)
                else [self.guidance_columns]
            )
            # Add guidance columns that aren't already in binning columns
            for col in guidance_cols:
                if col not in all_features:
                    all_features.append(col)

        # Set sklearn attributes
        self._feature_names_in = all_features
        self._n_features_in = len(all_features)

    def _fit_per_column_independently(
        self,
        X: np.ndarray[Any, Any],
        columns: ColumnList,
        guidance_data: np.ndarray[Any, Any] | None = None,
        **fit_params: Any,
    ) -> None:
        """Fit binning parameters independently for each column.

        Args:
            X: 2-D array; column ``i`` is fitted under the identifier
                ``columns[i]``.
            columns: Column identifiers, positionally aligned with ``X``.
            guidance_data: Optional guidance array. The same array is passed
                to ``_calculate_bins`` for every column (not indexed per column).
            **fit_params: Accepted but not used by this implementation.

        Raises:
            FittingError: If a column contains only NaN values.
        """
        # Build the results locally and publish them only after every column
        # succeeded, so a failure part-way through does not leave the
        # estimator with half-populated fitted attributes.
        fitted_edges: BinEdgesDict = {}
        fitted_representatives: BinEdgesDict = {}

        for i, col in enumerate(columns):
            # Validate and preprocess numeric data
            x_col_processed = self._validate_and_preprocess_column(X[:, i], col)
            edges, representatives = self._calculate_bins(x_col_processed, col, guidance_data)
            fitted_edges[col] = edges
            fitted_representatives[col] = representatives

        self.bin_edges_ = fitted_edges
        self.bin_representatives_ = fitted_representatives

    def _fit_jointly_across_columns(
        self, X: np.ndarray[Any, Any], columns: ColumnList, **fit_params: Any
    ) -> None:
        """Fit binning parameters jointly across all columns.

        For interval binning, joint fitting is the same as per-column fitting
        since intervals don't depend on other columns. This delegates to
        ``_fit_per_column_independently`` with no guidance data.
        """
        self._fit_per_column_independently(X, columns, None, **fit_params)

    def _transform_columns_to_bins(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform columns to bin indices.

        Args:
            X: 2-D array of values to bin.
            columns: Column identifiers, positionally aligned with ``X``.

        Returns:
            An integer array shaped like ``X`` holding, per column, bin indices
            in ``[0, len(edges) - 2]``. NaN and infinite inputs are assigned to
            the last bin. If ``X`` has no elements, an empty array with
            ``X.shape[0]`` rows and zero columns is returned.

        Raises:
            ValueError: If the number of columns in ``X`` differs from the
                number of columns with bin specifications.
        """
        if X.size == 0:
            return np.empty((X.shape[0], 0))

        # Validate that input has same number of columns as bin specifications
        if X.shape[1] != len(self.bin_edges_):
            raise ValueError(
                f"Input data has {X.shape[1]} columns but bin specifications "
                f"are provided for {len(self.bin_edges_)} columns"
            )

        result = np.empty_like(X, dtype=int)
        available_keys = list(self.bin_edges_.keys())

        for i, col in enumerate(columns):
            # Resolve which stored key belongs to this column; the helper is
            # not defined in this class (presumably inherited - verify).
            key = self._get_column_key(col, available_keys, i)
            edges = np.array(self.bin_edges_[key])
            last_bin = len(edges) - 2
            column_data = X[:, i]

            # Remember special values (NaN, +/-inf) before any clipping
            is_special = ~np.isfinite(column_data)

            # Apply clipping if enabled.
            # NOTE(review): indices are clamped below regardless of ``clip``,
            # so for finite values this step does not change the final result.
            # Confirm the intended ``clip=False`` semantics.
            if self.clip:
                column_data = np.clip(column_data, edges[0], edges[-1])

            # Digitize to get bin indices, then force them into the valid range
            bin_indices = np.digitize(column_data, edges) - 1
            bin_indices = np.clip(bin_indices, 0, last_bin)

            # Handle special values - assign to last bin
            bin_indices[is_special] = last_bin
            result[:, i] = bin_indices

        return result

    def _inverse_transform_bins_to_values(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform bin indices to representative values.

        Args:
            X: 2-D array of bin indices (cast to ``int`` per column).
            columns: Column identifiers, positionally aligned with ``X``.

        Returns:
            A float array shaped like ``X`` where each index is replaced by the
            representative of its bin. Indices outside the valid range are
            clipped to the first/last representative. If ``X`` has no elements,
            an empty array with ``X.shape[0]`` rows and zero columns is returned.
        """
        if X.size == 0:
            return np.empty((X.shape[0], 0))

        result = np.empty_like(X, dtype=float)
        available_keys = list(self.bin_representatives_.keys())

        for i, col in enumerate(columns):
            # Resolve which stored key belongs to this column
            key = self._get_column_key(col, available_keys, i)
            representatives = np.array(self.bin_representatives_[key])

            # Clip indices to valid range before the lookup
            bin_indices = np.clip(X[:, i].astype(int), 0, len(representatives) - 1)
            result[:, i] = representatives[bin_indices]

        return result

    def _validate_and_preprocess_column(
        self, x_col: np.ndarray[Any, Any], col_id: Any
    ) -> np.ndarray[Any, Any]:
        """Validate column data for interval binning.

        Args:
            x_col: Raw column data
            col_id: Column identifier for error messages

        Returns:
            The original column data (unchanged)

        Raises:
            FittingError: If column contains only NaN values
        """
        # Check for all-NaN column
        if np.all(np.isnan(x_col)):
            raise FittingError(f"Column {col_id} contains only NaN values. Cannot perform binning.")
        return x_col

    @abstractmethod
    def _calculate_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        guidance_data: np.ndarray[Any, Any] | None = None,
    ) -> tuple[list[float], list[float]]:
        """Calculate bin edges and representatives for a column.

        Subclasses must implement this method to define their binning strategy.

        Args:
            x_col: Column data as returned by ``_validate_and_preprocess_column``.
            col_id: Identifier of the column being fitted.
            guidance_data: Optional guidance array, or None. The per-column
                fitting loop passes the same array for every column.

        Returns:
            A ``(edges, representatives)`` tuple. The fitting loop stores the
            two items in ``bin_edges_[col_id]`` and
            ``bin_representatives_[col_id]`` respectively.
        """