Source code for binlearn.methods._equal_width_binning

"""
Clean equal width binning implementation for  architecture.

This module provides EqualWidthBinning that inherits from IntervalBinningBase.
"""

from __future__ import annotations

from typing import Any

import numpy as np

from ..base import IntervalBinningBase
from ..config import apply_config_defaults
from ..utils import (
    BinEdgesDict,
    create_equal_width_bins,
    create_param_dict_for_config,
    validate_positive_integer,
    validate_range_parameter,
)


# pylint: disable=too-many-ancestors
[docs] class EqualWidthBinning(IntervalBinningBase): """Equal width binning implementation for creating uniform interval bins. This class implements equal width binning, one of the most fundamental and widely-used discretization methods. It divides the range of continuous values into a specified number of bins, each having the same width (range span). The method is simple, interpretable, and works well when data is uniformly distributed across the value range. Equal width binning is particularly effective for: - Uniformly distributed data where equal intervals make intuitive sense - Creating interpretable bins with consistent spacing - Baseline discretization before trying more sophisticated methods - Applications where maintaining consistent bin widths is important Key Features: - Creates bins with identical width (max_value - min_value) / n_bins - Handles custom value ranges via bin_range parameter - Automatic bin representative calculation as interval midpoints - Inherits all interval binning capabilities (clipping, format preservation, etc.) - Supports both automatic range detection and user-specified ranges Algorithm: 1. Determine value range (from data or user-specified bin_range) 2. Create n_bins equally-spaced intervals across the range 3. Assign representative values as bin centers 4. Transform values by finding their containing interval Parameters: n_bins: Number of bins to create. Must be a positive integer. Default value can be configured globally via binlearn.config. bin_range: Optional custom range as (min, max) tuple. If provided, bins are created within this range regardless of actual data range. Useful for ensuring consistent binning across datasets. Attributes: bin_edges_: Dictionary mapping column identifiers to lists of bin edges after fitting. Each edge list contains n_bins + 1 values. bin_representatives_: Dictionary mapping column identifiers to lists of bin representatives (typically bin centers). Example: >>> import numpy as np >>> from binlearn.methods import EqualWidthBinning >>> >>> # Basic equal width binning >>> X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0], [4.0, 40.0]]) >>> binner = EqualWidthBinning(n_bins=3) >>> binner.fit(X) >>> X_binned = binner.transform(X) >>> print(X_binned) # [[0, 0], [1, 1], [2, 2], [2, 2]] >>> >>> # With custom range >>> binner_custom = EqualWidthBinning(n_bins=2, bin_range=(0.0, 10.0)) >>> binner_custom.fit(X) >>> # Bins are created in [0, 10] range regardless of actual data range Note: - Only works with numeric data - non-numeric columns will raise errors - Constant columns (same value everywhere) are handled by creating single bin - Outliers can significantly affect bin boundaries in automatic range mode - Consider using bin_range parameter for consistent binning across datasets - Inherits clipping behavior and DataFrame preservation from IntervalBinningBase """ # pylint: disable=too-many-arguments,too-many-positional-arguments
[docs] def __init__( self, n_bins: int | None = None, bin_range: tuple[float, float] | None = None, clip: bool | None = None, preserve_dataframe: bool | None = None, fit_jointly: bool | None = None, guidance_columns: Any = None, *, bin_edges: BinEdgesDict | None = None, bin_representatives: BinEdgesDict | None = None, class_: ( # pylint: disable=unused-argument str | None ) = None, # For reconstruction compatibility module_: ( # pylint: disable=unused-argument str | None ) = None, # For reconstruction compatibility ): """Initialize equal width binning with configuration and parameters. Sets up the equal width binning method with user-specified parameters and configuration defaults. The method integrates with binlearn's global configuration system while allowing parameter-specific overrides. Args: n_bins: Number of equal-width bins to create for each column. Must be a positive integer. If None, uses the global configuration default for the 'equal_width' method, typically 5. bin_range: Optional custom range as (min, max) tuple within which to create bins. If provided, bins are created within this range regardless of the actual data range. Useful for ensuring consistent binning across different datasets. If None, range is determined automatically from the data during fitting. clip: Whether to clip out-of-range values to the nearest bin boundary during transformation. If True, values outside the range are assigned to the nearest edge bin. If False, they receive special out-of-range indices. If None, uses global configuration default. preserve_dataframe: Whether to preserve DataFrame format in outputs when input is a DataFrame. If None, uses global configuration default. fit_jointly: Whether to fit all columns together (not applicable for equal width binning as it's inherently per-column). If None, uses global configuration default. guidance_columns: Additional columns to include in input validation but not to bin. Not typically used for equal width binning. If None, no guidance columns are expected. bin_edges: Pre-computed bin edges for reconstruction/deserialization. If provided, no fitting is performed and these edges are used directly. Should be a dictionary mapping column identifiers to lists of edge values. bin_representatives: Pre-computed bin representatives for reconstruction. Must be provided together with bin_edges. Should be a dictionary mapping column identifiers to lists of representative values. class_: Class name for reconstruction compatibility (ignored during normal initialization). module_: Module name for reconstruction compatibility (ignored during normal initialization). Example: >>> # Basic initialization with defaults >>> binner = EqualWidthBinning() >>> >>> # Custom number of bins and range >>> binner = EqualWidthBinning(n_bins=10, bin_range=(0.0, 100.0)) >>> >>> # Reconstruction from saved parameters >>> binner = EqualWidthBinning( ... n_bins=5, ... bin_edges={'col1': [0, 1, 2, 3, 4, 5]}, ... bin_representatives={'col1': [0.5, 1.5, 2.5, 3.5, 4.5]} ... ) Note: - Parameters integrate with binlearn's global configuration system - None values allow configuration defaults to take effect - Pre-computed edges and representatives enable object reconstruction - Class follows sklearn's estimator interface conventions """ # Use standardized initialization pattern user_params = create_param_dict_for_config( n_bins=n_bins, bin_range=bin_range, clip=clip, preserve_dataframe=preserve_dataframe, fit_jointly=fit_jointly, ) # Apply configuration defaults params = apply_config_defaults("equal_width", user_params) # Store equal width specific parameters self.n_bins = params.get("n_bins", 5) self.bin_range = params.get("bin_range", bin_range) # Initialize parent with resolved config parameters IntervalBinningBase.__init__( self, clip=params.get("clip"), preserve_dataframe=params.get("preserve_dataframe"), fit_jointly=params.get("fit_jointly"), guidance_columns=guidance_columns, bin_edges=bin_edges, bin_representatives=bin_representatives, )
def _validate_params(self) -> None: """Validate equal width binning specific parameters. Performs comprehensive validation of equal width binning parameters to ensure they meet the method's requirements. This includes checking parameter types, ranges, and logical consistency. Raises: ConfigurationError: If any parameter is invalid: - n_bins is not a positive integer - bin_range is not a valid (min, max) tuple with min < max Note: - Calls parent class validation first for inherited parameters - Validates that n_bins is appropriate for creating meaningful bins - Ensures bin_range, if provided, defines a valid interval - Called automatically during initialization """ # Call parent validation IntervalBinningBase._validate_params(self) # Validate n_bins using standardized validator validate_positive_integer(self.n_bins, "n_bins") # Validate bin_range using standardized validator validate_range_parameter(self.bin_range, "bin_range") def _calculate_bins( self, x_col: np.ndarray[Any, Any], col_id: Any, guidance_data: np.ndarray[Any, Any] | None = None, ) -> tuple[list[float], list[float]]: """Calculate equal width bin edges and representatives for a single column. This method implements the core equal width binning algorithm by determining the appropriate value range and creating evenly-spaced intervals within that range. Args: x_col: Preprocessed column data as numpy array. The data has been validated and cleaned by the parent class, containing only finite numeric values. col_id: Column identifier for error messages and logging. Can be string, integer, or other hashable type. guidance_data: Optional guidance data (not used in equal width binning but included for interface compatibility). Ignored in this method. Returns: Tuple containing: - bin_edges: List of n_bins + 1 edge values defining the bin boundaries - bin_representatives: List of n_bins representative values (bin centers) Example: >>> x_col = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) >>> edges, reps = binner._calculate_bins(x_col, 'feature1') >>> # With n_bins=3: edges might be [1.0, 2.33, 3.67, 5.0] >>> # reps might be [1.67, 3.0, 4.33] Note: - Uses bin_range parameter if provided, otherwise determines range from data - Creates exactly n_bins intervals with equal width - Representatives are calculated as interval midpoints - Handles edge cases like constant data (handled by parent class) """ # Use the new equal-width utility edges = create_equal_width_bins( data=x_col, n_bins=self.n_bins, data_range=self.bin_range, add_epsilon=True ) # Create representatives as bin centers representatives = [(edges[i] + edges[i + 1]) / 2 for i in range(self.n_bins)] return list(edges), representatives