Source code for binlearn.base._flexible_binning_base

"""
Clean flexible binning base class for V2 architecture.
"""

from __future__ import annotations

from abc import abstractmethod
from typing import Any

import numpy as np

from ..utils import (
    BinEdgesDict,
    ColumnList,
    ConfigurationError,
    FlexibleBinSpec,
    transform_value_to_flexible_bin,
    validate_bin_representatives_format,
)
from ._general_binning_base import GeneralBinningBase


# pylint: disable=too-many-ancestors

[docs]
class FlexibleBinningBase(GeneralBinningBase):
    """Base class for flexible binning methods that support mixed bin types.

    This class extends GeneralBinningBase to provide specialized functionality for
    flexible binning methods. Unlike traditional interval-based binning, flexible
    binning supports mixed bin types within the same feature, including singleton
    bins (exact value matching) and interval bins (range matching) in any combination.

    Flexible binning is particularly useful for:
    - Categorical features with numeric representations
    - Mixed data types requiring different binning strategies per value
    - Custom binning schemes that don't fit traditional interval patterns
    - Data with important singleton values that should be preserved exactly

    Key Features:
    - Mixed bin types: Combine singleton and interval bins in the same feature
    - Custom bin specifications: Define bins as either exact values or ranges
    - Automatic representative generation: Creates numeric representatives for mixed bins
    - Flexible transformation: Handles both numeric and non-numeric data appropriately

    Attributes:
        bin_spec: Dictionary mapping column identifiers to lists of flexible bin
            definitions. Each bin can be either a scalar (singleton) or tuple (interval).
        bin_representatives: Dictionary mapping column identifiers to lists of
            numeric representative values for each bin. Auto-generated if not provided.

    Example:
        >>> # Example of flexible bin specification
        >>> bin_spec = {
        ...     'mixed_feature': [
        ...         42,           # Singleton bin: exactly value 42
        ...         (10, 20),     # Interval bin: range [10, 20]
        ...         'special',    # Categorical singleton
        ...         (100, 200)    # Another interval
        ...     ]
        ... }

    Note:
        - This is an abstract base class - use concrete implementations like ManualFlexibleBinning
        - Bin representatives are automatically generated as midpoints for intervals and
          preserved values for singletons (with numeric conversion where possible)
        - Inherits all functionality from GeneralBinningBase including fit/transform interface
        - Subclasses must implement the abstract _calculate_flexible_bins method
    """

    # pylint: disable=too-many-arguments

[docs]
    def __init__(
        self,
        preserve_dataframe: bool | None = None,
        fit_jointly: bool | None = None,
        guidance_columns: Any = None,
        *,
        bin_spec: FlexibleBinSpec | None = None,
        bin_representatives: BinEdgesDict | None = None,
    ):
        """Create the binner and, when a bin specification is given, pre-load it.

        The three generic options are forwarded to ``GeneralBinningBase.__init__``.
        The two flexible-binning arguments are stored on the instance under their
        own names and then checked by ``_validate_params``, which also fills the
        fitted attributes when a specification was supplied.

        Args:
            preserve_dataframe: Forwarded unchanged to the parent constructor,
                which defines its meaning and default handling.
            fit_jointly: Forwarded unchanged to the parent constructor.
            guidance_columns: Forwarded unchanged to the parent constructor.
                When a bin specification is supplied, these columns are also
                appended to the recorded input feature names.
            bin_spec: Optional mapping from column identifier to a list of bin
                definitions, where a tuple denotes an interval bin and any other
                value a singleton bin. Keyword-only.
            bin_representatives: Optional mapping from column identifier to a
                list of representative values, one per bin. Keyword-only. When
                omitted while ``bin_spec`` is given, representatives are derived
                from ``bin_spec`` during validation.

        Raises:
            ConfigurationError: Raised from ``_validate_params`` when ``bin_spec``
                is not a dictionary or validation fails with a ``ValueError``.

        Example:
            >>> # Pre-configured bins: singleton 10, interval (20, 30), singleton 40
            >>> binner = ConcreteFlexibleBinner(bin_spec={'feature1': [10, (20, 30), 40]})
            >>>
            >>> # No specification: bins are determined later by fitting
            >>> binner = ConcreteFlexibleBinner(fit_jointly=True)

        Note:
            - ``bin_spec`` and ``bin_representatives`` keep the caller's objects;
              the trailing-underscore attributes hold the working/fitted state.
            - Validation runs last, so every attribute it touches already exists.
        """
        # Generic binning options are owned by the parent class.
        parent_options = {
            "preserve_dataframe": preserve_dataframe,
            "fit_jointly": fit_jointly,
            "guidance_columns": guidance_columns,
        }
        GeneralBinningBase.__init__(self, **parent_options)

        # Constructor arguments are kept verbatim under their own names.
        self.bin_spec, self.bin_representatives = bin_spec, bin_representatives

        # Fitted counterparts start empty; validation or fitting fills them.
        self.bin_spec_: FlexibleBinSpec = {}
        self.bin_representatives_: BinEdgesDict = {}

        # Declared here so the sklearn bookkeeping attributes always exist
        # (also keeps pylint W0201 quiet).
        self._feature_names_in: list[Any] | None = None
        self._n_features_in: int = 0

        # Names of the attributes the base class treats as fitted state.
        self._fitted_attributes = ["bin_spec_", "bin_representatives_"]

        # Fail fast on bad configuration and pre-load any supplied bins.
        self._validate_params()


    # pylint: disable=too-many-branches
    def _validate_params(self) -> None:
        """Validate flexible binning parameters and initialize fitted attributes.

        Runs the parent validation first, then, if ``bin_spec`` was supplied,
        copies it into ``bin_spec_`` and either adopts the supplied
        ``bin_representatives`` or derives numeric representatives from the
        specification. When both fitted attributes end up non-empty, the sklearn
        feature bookkeeping is set via ``_set_sklearn_attributes_from_specs``.

        Raises:
            ConfigurationError: If ``bin_spec`` is not a dictionary, if
                ``validate_bin_representatives_format`` rejects the supplied
                representatives with a ``ValueError``, or if an interval bin has
                endpoints from which no numeric midpoint can be computed.

        Note:
            - Nothing beyond the parent validation happens when ``bin_spec`` is None;
              in particular ``bin_representatives`` is not inspected in that case.
            - Singleton bins use ``float(value)``; values that cannot be converted
              get the placeholder representative 0.0.
            - Two-element tuples are interval bins and use their midpoint.
            - Tuples of any other length get the placeholder representative 0.0.
            - Columns whose specification is not a list get no auto-generated
              representatives.
        """
        # Parent validation runs outside the try block, so whatever it raises
        # propagates unchanged.
        GeneralBinningBase._validate_params(self)

        if self.bin_spec is None:
            return

        def _representative_for(col: Any, spec_item: Any) -> float:
            """Return the numeric representative of one bin definition."""
            if not isinstance(spec_item, tuple):
                # Singleton bin: the value itself, or a placeholder when it is
                # not convertible to float (e.g. a categorical label).
                try:
                    return float(spec_item)
                except (ValueError, TypeError):
                    return 0.0
            if len(spec_item) != 2:
                # Fallback for unexpected tuple formats.
                return 0.0
            low, high = spec_item
            try:
                # Interval bin: midpoint of the two endpoints.
                return float((low + high) / 2)
            except (TypeError, ValueError) as exc:
                # Previously a TypeError escaped here unconverted; raise a
                # ValueError so the handler below reports it as a
                # ConfigurationError like every other invalid specification.
                raise ValueError(
                    f"Interval bin {spec_item!r} for column {col!r} must have "
                    "numeric endpoints to derive a representative"
                ) from exc

        try:
            # For now, just check it's a dictionary
            if not isinstance(self.bin_spec, dict):
                raise ValueError("bin_spec must be a dictionary")
            self.bin_spec_ = self.bin_spec

            if self.bin_representatives is not None:
                validate_bin_representatives_format(self.bin_representatives)
                self.bin_representatives_ = self.bin_representatives
            elif self.bin_spec_:
                # bin_spec mixes singletons and interval tuples, but the
                # representatives must all be numeric.
                self.bin_representatives_ = {
                    col: [_representative_for(col, item) for item in spec]
                    for col, spec in self.bin_spec_.items()
                    if isinstance(spec, list)
                }

            # With complete specifications, record the sklearn feature metadata.
            if self.bin_spec_ and self.bin_representatives_:
                self._set_sklearn_attributes_from_specs()

        except ValueError as e:
            raise ConfigurationError(str(e)) from e

    def _set_sklearn_attributes_from_specs(self) -> None:
        """Set sklearn-compatible attributes from flexible bin specifications.

        This method configures the sklearn-compatible attributes (_feature_names_in
        and _n_features_in) based on the provided bin specifications and guidance
        columns. It ensures compatibility with sklearn's metadata routing and
        feature inspection APIs.

        Note:
            - Extracts column identifiers from bin_spec_ as primary features
            - Adds guidance_columns to the feature list if provided
            - Handles both single guidance column and list of guidance columns
            - Avoids duplicate columns when guidance columns overlap with binning columns
            - Sets _feature_names_in to the complete list of input columns
            - Sets _n_features_in to the total number of expected input features
        """
        if self.bin_spec_ is not None:
            # Get column names/indices from bin_spec
            binning_columns = list(self.bin_spec_.keys())

            # Add guidance columns if specified
            all_features = binning_columns.copy()
            if self.guidance_columns is not None:
                guidance_cols = (
                    [self.guidance_columns]
                    if not isinstance(self.guidance_columns, list)
                    else self.guidance_columns
                )
                # Add guidance columns that aren't already in binning columns
                for col in guidance_cols:
                    if col not in all_features:
                        all_features.append(col)

            # Set sklearn attributes
            self._feature_names_in = all_features
            self._n_features_in = len(all_features)

    def _fit_per_column_independently(
        self,
        X: np.ndarray[Any, Any],
        columns: ColumnList,
        guidance_data: np.ndarray[Any, Any] | None = None,
        **fit_params: Any,
    ) -> None:
        """Fit flexible binning parameters independently for each column.

        This method implements independent column fitting for flexible binning,
        where each column is analyzed separately to determine its optimal bin
        structure. This is the default fitting strategy for most flexible
        binning methods.

        Args:
            X: Input data array with shape (n_samples, n_features) where each
                column will be fitted independently.
            columns: List of column identifiers corresponding to the columns in X.
                Used for bin specification keys and error messages.
            guidance_data: Optional guidance data array that can influence binning
                decisions but is not binned itself. Same guidance data is provided
                to all columns during fitting.
            **fit_params: Additional parameters passed to the underlying bin
                calculation method (_calculate_flexible_bins).

        Note:
            - Creates separate bin specifications for each column in self.bin_spec_
            - Creates separate representatives for each column in self.bin_representatives_
            - Validates that each column contains numeric data before processing
            - Delegates actual bin calculation to the abstract _calculate_flexible_bins method
            - Each column is processed independently without sharing information
        """
        self.bin_spec_ = {}
        self.bin_representatives_ = {}

        for i, col in enumerate(columns):
            x_col = X[:, i]

            # Validate numeric data
            self._validate_numeric_data(x_col, col)

            # Use the same guidance_data for all columns (not indexed per column)
            edges, representatives = self._calculate_flexible_bins(x_col, col, guidance_data)
            self.bin_spec_[col] = edges
            self.bin_representatives_[col] = representatives

    def _fit_jointly_across_columns(
        self, X: np.ndarray[Any, Any], columns: ColumnList, **fit_params: Any
    ) -> None:
        """Fit flexible binning parameters jointly across all columns.

        This method implements joint fitting for flexible binning, where information
        from all columns is considered together during bin determination. For most
        flexible binning methods, this defaults to independent fitting unless
        overridden by specific implementations.

        Args:
            X: Input data array with shape (n_samples, n_features) where all
                columns will be considered together during fitting.
            columns: List of column identifiers corresponding to the columns in X.
                Used for bin specification keys and error messages.
            **fit_params: Additional parameters passed to the underlying fitting logic.

        Note:
            - Default implementation delegates to _fit_per_column_independently
            - Subclasses can override this method to implement true joint fitting
            - Joint fitting might consider correlations or dependencies between columns
            - The choice between joint and independent fitting is controlled by fit_jointly
                parameter
        """
        # For flexible binning, joint fitting is typically the same as per-column fitting
        # unless overridden by specific implementations
        self._fit_per_column_independently(X, columns, None, **fit_params)

    def _transform_columns_to_bins(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform data columns to bin indices using flexible bin mapping.

        This method performs the core transformation operation for flexible binning,
        mapping each value in the input data to its corresponding bin index. It
        handles both singleton and interval bins within the same column.

        Args:
            X: Input data array with shape (n_samples, n_features) to transform.
                Each column should correspond to a column that was fitted.
            columns: List of column identifiers corresponding to the columns in X.
                Used to match with the fitted bin specifications.

        Returns:
            Transformed array with same shape as input, containing integer bin indices.
            Values that don't match any bin are assigned MISSING_VALUE (-1).

        Raises:
            ValueError: If the number of input columns doesn't match the number of
                fitted bin specifications.

        Note:
            - Uses transform_value_to_flexible_bin utility for individual value transformation
            - Handles missing values and out-of-range values by assigning MISSING_VALUE (-1)
            - Each value is compared against all bin definitions for its column
            - For singleton bins, uses exact equality comparison
            - For interval bins, uses inclusive range comparison [start, end]
            - Returns the first matching bin index if multiple bins could match
        """
        if X.size == 0:
            return np.empty((X.shape[0], 0))

        # Validate that input has same number of columns as bin specifications
        if X.shape[1] != len(self.bin_spec_):
            raise ValueError(
                f"Input data has {X.shape[1]} columns but bin specifications "
                f"are provided for {len(self.bin_spec_)} columns"
            )

        result = np.empty_like(X, dtype=int)
        available_keys = list(self.bin_spec_.keys())

        for i, col in enumerate(columns):
            # Find the right bin specification - this will raise ValueError for missing columns
            key = self._get_column_key(col, available_keys, i)
            bin_defs = self.bin_spec_[key]

            # Transform this column
            col_data = X[:, i]

            for row_idx, value in enumerate(col_data):
                # Use utility function for transformation
                result[row_idx, i] = transform_value_to_flexible_bin(value, bin_defs)

        return result

    def _inverse_transform_bins_to_values(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform bin indices back to representative values.

        This method performs the inverse transformation for flexible binning,
        mapping bin indices back to their corresponding representative values.
        It's used to reconstruct approximate original values from bin indices.

        Args:
            X: Input array with shape (n_samples, n_features) containing integer
                bin indices to be transformed back to representative values.
            columns: List of column identifiers corresponding to the columns in X.
                Used to match with the fitted bin representatives.

        Returns:
            Array with same shape as input, containing float representative values
            for each bin index. Invalid indices are clipped to valid range.

        Note:
            - Uses the bin_representatives_ fitted during training
            - Invalid bin indices (< 0 or >= n_bins) are clipped to valid range
            - For flexible binning, representatives are typically:
              - The original singleton value for singleton bins
              - The interval midpoint for interval bins
            - Output is always float type for consistency, even if original values were integers
            - Missing values (MISSING_VALUE indices) are clipped to first bin representative
        """
        if X.size == 0:
            return np.empty((X.shape[0], 0))

        result = np.empty_like(X, dtype=float)
        available_keys = list(self.bin_representatives_.keys())

        for i, col in enumerate(columns):
            # Get the right bin specification using column key resolution
            key = self._get_column_key(col, available_keys, i)
            representatives = np.array(self.bin_representatives_[key])
            bin_indices = X[:, i].astype(int)

            # Clip indices to valid range
            bin_indices = np.clip(bin_indices, 0, len(representatives) - 1)

            result[:, i] = representatives[bin_indices]

        return result

    @abstractmethod
    def _calculate_flexible_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        guidance_data: np.ndarray[Any, Any] | None = None,
    ) -> tuple[list[Any], list[Any]]:
        """Calculate the bin definitions and representatives for one column.

        Abstract hook that concrete flexible binners must implement. It is
        called once per column by ``_fit_per_column_independently``, which
        stores the first returned list in ``bin_spec_`` and the second in
        ``bin_representatives_`` under the column's identifier.

        Args:
            x_col: Values of the single column being fitted.
            col_id: Identifier of that column.
            guidance_data: Optional guidance data. The caller passes the same
                object for every column; it is None when fitting goes through
                ``_fit_jointly_across_columns``.

        Returns:
            Tuple of (bin_values, representatives) where:
            - bin_values: List of bin definitions for the column
            - representatives: List of representative values, presumably one
              per bin definition (not enforced here)
        """