# Source code for binlearn.base._interval_binning_base

"""
Clean interval binning base class for V2 architecture.

This module provides interval-based binning functionality that inherits from GeneralBinningBase.
"""

from __future__ import annotations

from abc import abstractmethod
from typing import Any

import numpy as np

from ..config import get_config
from ..utils import (
    BinEdgesDict,
    ColumnList,
    ConfigurationError,
    FittingError,
    default_representatives,
    validate_bin_edges_format,
    validate_bin_representatives_format,
    validate_bins,
)
from ._general_binning_base import GeneralBinningBase


# pylint: disable=too-many-ancestors,too-many-instance-attributes
class IntervalBinningBase(GeneralBinningBase):
    """Interval-based binning functionality inheriting from GeneralBinningBase.

    This abstract base class provides specialized functionality for binning methods
    that create discrete intervals from continuous data. It extends GeneralBinningBase
    with interval-specific features like bin edge management, representative value
    calculation, and out-of-range value handling.

    Key Features:
    - Interval boundary (bin edges) management and validation
    - Representative value calculation and storage
    - Clipping behavior for out-of-range values
    - sklearn-compatible fitted attributes
    - Comprehensive parameter validation

    The class manages two core concepts:
    - Bin edges: Define interval boundaries [a, b, c] creating bins [a,b) and [b,c]
    - Representatives: Values that represent each bin (typically centers or means)

    Parameters:
    -----------
    clip : bool, optional
        Whether to clip out-of-range values to the nearest bin boundaries.
        If None, uses the global configuration default. When True:
        - Values below minimum edge are assigned to first bin
        - Values above maximum edge are assigned to last bin
        When False, out-of-range values get special indices (BELOW_RANGE, ABOVE_RANGE).

    preserve_dataframe : bool, optional
        Inherited from GeneralBinningBase. Whether to preserve DataFrame format.

    fit_jointly : bool, optional
        Inherited from GeneralBinningBase. Whether to fit columns jointly.

    guidance_columns : GuidanceColumns, optional
        Inherited from GeneralBinningBase. Guidance column specification.

    bin_edges : BinEdgesDict, optional
        Pre-specified bin edges as a dictionary mapping column identifiers to
        edge lists. If provided, the fitting process will validate and use these
        edges instead of computing them from data.

    bin_representatives : BinEdgesDict, optional
        Pre-specified bin representatives as a dictionary mapping column identifiers
        to representative value lists. If provided, validates consistency with bin_edges.

    Attributes:
    -----------
    clip : bool
        Whether to clip out-of-range values to bin boundaries.

    bin_edges : BinEdgesDict | None
        Pre-specified bin edges (input parameter).

    bin_representatives : BinEdgesDict | None
        Pre-specified bin representatives (input parameter).

    bin_edges_ : BinEdgesDict
        Fitted bin edges after calling fit(). Dictionary mapping each column
        to its list of bin boundary values.

    bin_representatives_ : BinEdgesDict
        Fitted bin representatives after calling fit(). Dictionary mapping each
        column to its list of representative values.

    Note:
    -----
    This is an abstract base class. Concrete implementations must provide the
    abstract method _calculate_bins() to define how bin edges are computed
    from input data for their specific binning algorithm.
    """

    # pylint: disable=too-many-arguments,too-many-positional-arguments

    def __init__(
        self,
        clip: bool | None = None,
        preserve_dataframe: bool | None = None,
        fit_jointly: bool | None = None,
        guidance_columns: Any = None,
        *,
        bin_edges: BinEdgesDict | None = None,
        bin_representatives: BinEdgesDict | None = None,
    ):
        """Create the transformer, resolve defaults and validate the arguments.

        ``preserve_dataframe``, ``fit_jointly`` and ``guidance_columns`` are
        forwarded unchanged to ``GeneralBinningBase.__init__``. ``clip`` falls
        back to ``get_config().default_clip`` when it is ``None``. Validation
        runs as the last step of construction, so an invalid configuration is
        reported here rather than at fit time.

        Args:
            clip: Whether out-of-range values are clipped to the outer bin
                edges. ``None`` selects the global configuration default.
            preserve_dataframe: Forwarded to ``GeneralBinningBase``.
            fit_jointly: Forwarded to ``GeneralBinningBase``.
            guidance_columns: Forwarded to ``GeneralBinningBase``.
            bin_edges: Optional mapping from column identifier to bin edges.
                When given, it is validated and adopted as ``bin_edges_``.
            bin_representatives: Optional mapping from column identifier to
                representative values. Only consulted when ``bin_edges`` is
                given as well.

        Raises:
            TypeError: If the resolved ``clip`` value is not a ``bool``.
            ConfigurationError: If checking ``bin_edges`` /
                ``bin_representatives`` raises ``ValueError``.

        Note:
            ``bin_edges_`` and ``bin_representatives_`` start out as empty
            dictionaries; validation replaces them when ``bin_edges`` is
            supplied. The parent validation may raise further exceptions that
            are not visible from this class.
        """
        GeneralBinningBase.__init__(
            self,
            preserve_dataframe=preserve_dataframe,
            fit_jointly=fit_jointly,
            guidance_columns=guidance_columns,
        )

        # The configuration is only consulted for its default when clip is unset.
        settings = get_config()
        self.clip = settings.default_clip if clip is None else clip

        # Constructor parameters are kept as given.
        self.bin_edges = bin_edges
        self.bin_representatives = bin_representatives

        # Fitted state, empty until validation or fitting fills it in.
        self.bin_edges_: BinEdgesDict = {}
        self.bin_representatives_: BinEdgesDict = {}

        # Declared up front so the attributes always exist (avoids pylint W0201).
        self._feature_names_in: list[Any] | None = None
        self._n_features_in: int = 0

        # Names of the attributes that make up the fitted state.
        self._fitted_attributes = ["bin_edges_", "bin_representatives_"]

        # Must come last: validation reads everything assigned above.
        self._validate_params()


    def _validate_params(self) -> None:
        """Check ``clip`` and absorb any user-supplied bin specification.

        The parent validation runs first, then ``clip`` must be a ``bool``.
        When ``bin_edges`` was supplied it is format-checked and adopted as
        ``bin_edges_``. ``bin_representatives`` is adopted the same way when
        given and cross-checked against the edges; otherwise representatives
        are derived per column with ``default_representatives``. If both
        fitted mappings end up non-empty, the sklearn feature attributes are
        filled in from them.

        Raises:
            TypeError: If ``clip`` is not a ``bool``.
            ConfigurationError: If any of the bin checks raises ``ValueError``.
        """
        GeneralBinningBase._validate_params(self)

        if not isinstance(self.clip, bool):
            raise TypeError("clip must be a boolean")

        # Without user-supplied edges there is nothing further to validate.
        if self.bin_edges is None:
            return

        try:
            validate_bin_edges_format(self.bin_edges)
            self.bin_edges_ = self.bin_edges

            if self.bin_representatives is None:
                if self.bin_edges_:
                    # No representatives given: derive them from the edges.
                    self.bin_representatives_ = {}
                    for column, column_edges in self.bin_edges_.items():
                        self.bin_representatives_[column] = default_representatives(
                            list(column_edges)
                        )
            else:
                validate_bin_representatives_format(self.bin_representatives, self.bin_edges)
                self.bin_representatives_ = self.bin_representatives

                # Edges and representatives have to agree with each other.
                validate_bins(self.bin_edges_, self.bin_representatives_)

            # A complete specification is enough to fill in the sklearn attributes.
            if self.bin_edges_ and self.bin_representatives_:
                self._set_sklearn_attributes_from_specs()

        except ValueError as err:
            raise ConfigurationError(str(err)) from err

    def _set_sklearn_attributes_from_specs(self) -> None:
        """Set sklearn attributes from bin specifications."""
        if self.bin_edges_ is not None:
            # Get column names/indices from bin_edges
            binning_columns = list(self.bin_edges_.keys())

            # Add guidance columns if specified
            all_features = binning_columns.copy()
            if self.guidance_columns is not None:
                guidance_cols = (
                    [self.guidance_columns]
                    if not isinstance(self.guidance_columns, list)
                    else self.guidance_columns
                )
                # Add guidance columns that aren't already in binning columns
                for col in guidance_cols:
                    if col not in all_features:
                        all_features.append(col)

            # Set sklearn attributes
            self._feature_names_in = all_features
            self._n_features_in = len(all_features)

    def _fit_per_column_independently(
        self,
        X: np.ndarray[Any, Any],
        columns: ColumnList,
        guidance_data: np.ndarray[Any, Any] | None = None,
        **fit_params: Any,
    ) -> None:
        """Fit binning parameters independently for each column."""
        self.bin_edges_ = {}
        self.bin_representatives_ = {}

        for i, col in enumerate(columns):
            x_col = X[:, i]

            # Validate and preprocess numeric data
            x_col_processed = self._validate_and_preprocess_column(x_col, col)

            # Use the same guidance_data for all columns (not indexed per column)
            edges, representatives = self._calculate_bins(x_col_processed, col, guidance_data)
            self.bin_edges_[col] = edges
            self.bin_representatives_[col] = representatives

    def _fit_jointly_across_columns(
        self, X: np.ndarray[Any, Any], columns: ColumnList, **fit_params: Any
    ) -> None:
        """Fit binning parameters jointly across all columns."""
        # For interval binning, joint fitting is the same as per-column fitting
        # since intervals don't depend on other columns
        self._fit_per_column_independently(X, columns, None, **fit_params)

    def _transform_columns_to_bins(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform columns to bin indices."""
        if X.size == 0:
            return np.empty((X.shape[0], 0))

        # Validate that input has same number of columns as bin specifications
        if X.shape[1] != len(self.bin_edges_):
            raise ValueError(
                f"Input data has {X.shape[1]} columns but bin specifications "
                f"are provided for {len(self.bin_edges_)} columns"
            )

        result = np.empty_like(X, dtype=int)
        available_keys = list(self.bin_edges_.keys())

        for i, col in enumerate(columns):
            # Get the right bin specification using column key resolution
            key = self._get_column_key(col, available_keys, i)
            edges = np.array(self.bin_edges_[key])
            column_data = X[:, i]

            # Handle special values (NaN, inf)
            is_special = np.isnan(column_data) | np.isinf(column_data)

            # Apply clipping if enabled
            if self.clip:
                column_data = np.clip(column_data, edges[0], edges[-1])

            # Digitize to get bin indices
            bin_indices = np.digitize(column_data, edges) - 1

            # Ensure bin indices are in valid range
            bin_indices = np.clip(bin_indices, 0, len(edges) - 2)

            # Handle special values - assign to last bin
            bin_indices[is_special] = len(edges) - 2

            result[:, i] = bin_indices

        return result

    def _inverse_transform_bins_to_values(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform bin indices to representative values."""
        if X.size == 0:
            return np.empty((X.shape[0], 0))

        result = np.empty_like(X, dtype=float)
        available_keys = list(self.bin_representatives_.keys())

        for i, col in enumerate(columns):
            # Get the right bin specification using column key resolution
            key = self._get_column_key(col, available_keys, i)
            representatives = np.array(self.bin_representatives_[key])
            bin_indices = X[:, i].astype(int)

            # Clip indices to valid range
            bin_indices = np.clip(bin_indices, 0, len(representatives) - 1)

            result[:, i] = representatives[bin_indices]

        return result

    def _validate_and_preprocess_column(
        self, x_col: np.ndarray[Any, Any], col_id: Any
    ) -> np.ndarray[Any, Any]:
        """Validate column data for interval binning.

        Args:
            x_col: Raw column data
            col_id: Column identifier for error messages

        Returns:
            The original column data (unchanged)

        Raises:
            FittingError: If column contains only NaN values
        """
        # Check for all-NaN column
        if np.all(np.isnan(x_col)):
            raise FittingError(f"Column {col_id} contains only NaN values. Cannot perform binning.")

        return x_col

    @abstractmethod
    def _calculate_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        guidance_data: np.ndarray[Any, Any] | None = None,
    ) -> tuple[list[float], list[float]]:
        """Calculate bin edges and representatives for a single column.

        Subclasses must implement this method to define their binning strategy.
        During per-column fitting it is called once per column, and the two
        returned lists are stored in ``bin_edges_`` and ``bin_representatives_``
        under ``col_id``.

        Args:
            x_col: Column data as returned by ``_validate_and_preprocess_column``.
            col_id: Identifier of the column being fitted.
            guidance_data: Optional guidance data. The per-column fit passes the
                same object for every column; joint fitting passes ``None``.

        Returns:
            A ``(bin_edges, bin_representatives)`` tuple of two lists of floats.
        """