# Source code for binlearn.base._supervised_binning_base

"""
Clean supervised binning base class for V2 architecture.

This module provides supervised binning functionality that inherits from IntervalBinningBase.
For binning methods that use target/label information to optimize bin boundaries.
"""

from __future__ import annotations

import warnings
from typing import Any

import numpy as np

from ..utils import (
    ArrayLike,
    BinEdgesDict,
    ColumnList,
    DataQualityWarning,
    ValidationError,
)
from ._interval_binning_base import IntervalBinningBase


# pylint: disable=too-many-ancestors

# [docs]
class SupervisedBinningBase(IntervalBinningBase):
    """Base class for supervised binning methods that use target information.

    This class extends IntervalBinningBase to provide specialized functionality for
    supervised binning methods. These methods use target variable information (y)
    to optimize bin boundaries, typically aiming to create bins with homogeneous
    target distributions or maximize predictive power.

    Supervised binning is particularly effective for:
    - Binary classification with continuous predictors
    - Regression tasks where binning should preserve target relationships
    - Feature selection and engineering based on target correlation
    - Creating interpretable bins aligned with target behavior

    Key Features:
    - Target-aware bin boundary optimization
    - Built-in target data validation and preprocessing
    - Feature-target pair validation for data quality
    - Automatic handling of supervised learning constraints
    - Integration with guidance column requirements for targets

    Constraints:
    - Does not support joint fitting across multiple features (fit_jointly=False)
    - Requires exactly one guidance column to serve as the target variable
    - Target data must be provided during fit() call
    - Feature and target data must have the same number of rows; rows whose target
      value is NaN or infinite are dropped by _validate_feature_target_pair

    Attributes:
        All attributes from IntervalBinningBase plus:
        - Target-specific validation and preprocessing capabilities
        - Enhanced error handling for supervised learning scenarios

    Example:
        >>> # Supervised binning for binary classification
        >>> X = np.array([[1.2, 2.3], [3.4, 4.5], [5.6, 6.7]])
        >>> y = np.array([0, 1, 0])  # Binary target
        >>>
        >>> binner = ConcreteSupervisedBinner()
        >>> binner.fit(X, guidance_data=y)
        >>> X_binned = binner.transform(X)

    Note:
        - This is an abstract base class - use concrete implementations like Chi2Binning
        - Inherits all interval binning functionality (bin edges, representatives, etc.)
        - Target data is passed via guidance_data parameter in fit() method
        - Subclasses must implement _calculate_bin_edges with target-aware logic
    """

    # pylint: disable=too-many-arguments

    # [docs]
    def __init__(
        self,
        clip: bool | None = None,
        preserve_dataframe: bool | None = None,
        guidance_columns: Any = None,
        *,
        bin_edges: BinEdgesDict | None = None,
        bin_representatives: BinEdgesDict | None = None,
    ) -> None:
        """Initialize the supervised binning base by delegating to IntervalBinningBase.

        Every argument is forwarded unchanged to ``IntervalBinningBase.__init__``;
        the only thing this initializer decides itself is ``fit_jointly=False``.
        The parameter descriptions below restate the parent class's contract,
        which is not visible in this module.

        Args:
            clip: Whether to clip out-of-range values during transformation to the
                nearest bin boundary. If None, the parent presumably falls back to
                a global default (defined outside this module).
            preserve_dataframe: Whether to preserve the original DataFrame format
                during transformation. If None, the parent presumably falls back to
                a global configuration default (defined outside this module).
            guidance_columns: Column identifier for the target variable. Supervised
                binning expects a single target; a list with more than one entry
                triggers a DataQualityWarning in ``_validate_params``. May be None
                when the target is passed via ``guidance_data`` in ``fit()``.
            bin_edges: Pre-defined bin edges as a dictionary mapping column identifiers
                to lists of edge values. Keyword-only.
            bin_representatives: Pre-defined representative values for each bin as a
                dictionary mapping column identifiers to lists of values. Keyword-only.

        Raises:
            Whatever ``IntervalBinningBase.__init__`` raises. This method performs no
            validation of its own.
            NOTE(review): an earlier docstring said a ValidationError is raised for
            multiple guidance columns; the code visible here only warns - confirm
            against the parent class.

        Note:
            - ``fit_jointly`` is not exposed: supervised binning always fits each
              column independently.
            - Target data should be provided via the ``guidance_data`` parameter of
              ``fit()``.
        """
        # The parent initializer is called explicitly (not via super()) with
        # fit_jointly pinned to False, because supervised binning has no joint mode.
        IntervalBinningBase.__init__(
            self,
            clip=clip,
            preserve_dataframe=preserve_dataframe,
            fit_jointly=False,  # Supervised binning always processes columns independently
            guidance_columns=guidance_columns,
            bin_edges=bin_edges,
            bin_representatives=bin_representatives,
        )


    def _validate_params(self) -> None:
        """Run parent parameter validation, then flag multi-target configurations.

        After delegating to ``IntervalBinningBase._validate_params``, this method
        inspects ``self.guidance_columns``. A list naming more than one column is
        tolerated but reported, since supervised binning is designed around a
        single target variable.

        Warns:
            DataQualityWarning: If ``guidance_columns`` is a list with more than
                one entry.

        Note:
            - Only ``list`` values are inspected; ``None`` and any other type pass
              through silently.
            - Nothing is raised here beyond what the parent validation raises.
        """
        IntervalBinningBase._validate_params(self)

        # Guard clauses: nothing to check unless a multi-element list was configured.
        if self.guidance_columns is None:
            return
        if not isinstance(self.guidance_columns, list):
            return
        if len(self.guidance_columns) <= 1:
            return

        warnings.warn(
            "Supervised binning typically works best with a single target column. "
            "Multiple guidance columns may lead to unexpected behavior.",
            DataQualityWarning,
            stacklevel=2,
        )


    # [docs]
    def validate_guidance_data(
        self, guidance_data: ArrayLike, name: str = "guidance_data"
    ) -> np.ndarray[Any, Any]:
        """Validate guidance (target) data and return it as a single-column 2D array.

        The checks performed are structural only: the data must not be None, must
        be 1D or 2D, must not be empty, and must contain exactly one column. The
        values themselves (NaN, inf, dtype) are not inspected here.

        Args:
            guidance_data: Raw guidance/target data. Either a 1D array-like of
                shape (n_samples,) or a 2D array-like of shape (n_samples, 1).
                Non-ndarray inputs are converted with ``np.array``.
            name: Name used in error messages for better debugging context.

        Returns:
            The guidance data as an ndarray of shape (n_samples, 1). A 2D ndarray
            input that already has this shape is returned as-is (no copy).

        Raises:
            ValidationError: If the data is None, is a scalar (0D) or has more
                than two dimensions, is empty, or has more than one column.
        """
        if guidance_data is None:
            raise ValidationError(f"{name} cannot be None for supervised binning")

        # Convert to numpy array if needed
        if not isinstance(guidance_data, np.ndarray):
            guidance_data = np.array(guidance_data)

        # Ensure 2D shape. Anything that is neither 1D nor 2D is rejected here,
        # including 0D scalars: without this guard a scalar would reach the
        # shape[1] lookup below and fail with a bare IndexError.
        if guidance_data.ndim == 1:
            guidance_data = guidance_data.reshape(-1, 1)
        elif guidance_data.ndim != 2:
            raise ValidationError(f"{name} must be 1D or 2D array, got {guidance_data.ndim}D")

        # Check for empty data
        if guidance_data.size == 0:
            raise ValidationError(f"{name} cannot be empty")

        # Supervised binning requires exactly one guidance column
        if guidance_data.shape[1] != 1:
            raise ValidationError(
                f"Supervised binning requires exactly one target column, "
                f"got {guidance_data.shape[1]} columns"
            )

        return guidance_data  # type: ignore[no-any-return]


    def _validate_feature_target_pair(
        self,
        feature_data: np.ndarray[Any, Any],
        target_data: np.ndarray[Any, Any],
        col_id: Any,
    ) -> tuple[np.ndarray[Any, Any], np.ndarray[Any, Any]]:
        """Validate and clean feature-target data pairs for supervised binning.

        This method ensures that feature and target data are compatible for supervised
        binning by checking shapes and removing rows with invalid target values. The
        feature data should already be preprocessed by the parent class, but target
        data may still contain missing or invalid values that need handling.

        Args:
            feature_data: Preprocessed feature column data that has been validated
                and cleaned by the parent class. Should contain only finite values.
            target_data: Raw target/guidance data that may contain NaN or infinite
                values. Must have the same number of rows as feature_data.
            col_id: Column identifier used for generating informative error and
                warning messages.

        Returns:
            Tuple containing:
            - cleaned_feature_data: Feature data with rows removed where target had invalid values
            - cleaned_target_data: Target data with invalid values (NaN, inf) removed
            Both arrays have the same length and correspond row-wise.

        Raises:
            ValidationError: If feature and target data have mismatched lengths.

        Warns:
            DataQualityWarning: If a significant number of rows are removed due to
                invalid target values (>5% of data OR >5 rows), or if insufficient
                valid data remains after cleaning.

        Example:
            >>> feature = np.array([1.0, 2.0, 3.0, 4.0])
            >>> target = np.array([0.0, np.nan, 1.0, 0.0])
            >>> clean_feat, clean_targ = binner._validate_feature_target_pair(
            ...     feature, target, 'col1'
            ... )
            >>> # Result: clean_feat = [1.0, 3.0, 4.0], clean_targ = [0.0, 1.0, 0.0]

        Note:
            - Feature data is assumed to be already cleaned by parent class validation
            - Only removes rows based on target data validity
            - Maintains row correspondence between feature and target data
            - Warns if data quality may be compromised by excessive missing values
            - Requires at least 2 valid samples to proceed with binning
        """
        if len(feature_data) != len(target_data):
            raise ValidationError(
                f"Feature and target data must have same length for column {col_id}. "
                f"Got feature: {len(feature_data)}, target: {len(target_data)}"
            )

        # Remove rows where target has missing values (feature data is already preprocessed)
        target_valid = ~(np.isnan(target_data.flatten()) | np.isinf(target_data.flatten()))

        cleaned_feature = feature_data[target_valid]
        cleaned_target = target_data[target_valid]

        # Warn if missing values were removed, but only if some valid data remains
        # and if a significant portion was removed (more than 5% OR more than 5 rows)
        removed_count = len(feature_data) - len(cleaned_feature)
        if removed_count > 0 and len(cleaned_feature) >= 2:
            removal_ratio = removed_count / len(feature_data)
            if removal_ratio > 0.05 or removed_count > 5:
                warnings.warn(
                    f"Column {col_id}: Removed {removed_count} rows with missing values "
                    "in target data. "
                    f"Using {len(cleaned_feature)} valid samples for binning.",
                    DataQualityWarning,
                    stacklevel=2,
                )
            else:
                # Explicit else branch: missing values removed but below warning thresholds
                # This makes the 157->166 branch explicitly testable
                pass

        # Check if we have sufficient data after cleaning
        if len(cleaned_feature) < 2:
            warnings.warn(
                f"Column {col_id} has insufficient valid data points ({len(cleaned_feature)}) "
                "after removing missing values. Results may be unreliable.",
                DataQualityWarning,
                stacklevel=2,
            )

        return cleaned_feature, cleaned_target

    def _fit_per_column_independently(
        self,
        X: np.ndarray[Any, Any],
        columns: ColumnList,
        guidance_data: np.ndarray[Any, Any] | None = None,
        **fit_params: Any,
    ) -> None:
        """Validate the target, then let the parent fit each column independently.

        Supervised binning cannot proceed without a target, so a missing
        ``guidance_data`` is rejected up front. Otherwise the target is passed
        through ``validate_guidance_data`` once and the validated result is
        handed to the parent class's per-column fitting routine together with
        the untouched features, column identifiers and extra fit parameters.

        Args:
            X: Input feature data array; forwarded unchanged to the parent.
            columns: Column identifiers corresponding to the columns in X;
                forwarded unchanged to the parent.
            guidance_data: Target/label data. Must not be None and must pass
                ``validate_guidance_data`` (1D, or 2D with exactly one column).
            **fit_params: Additional keyword arguments forwarded to the parent
                implementation.

        Raises:
            ValidationError: If ``guidance_data`` is None, or if
                ``validate_guidance_data`` rejects it.
        """
        # Fail fast with a supervised-specific message before any validation work.
        if guidance_data is None:
            raise ValidationError("Supervised binning requires guidance data (target labels)")

        # The same validated single-column target is shared by every feature column.
        super()._fit_per_column_independently(
            X,
            columns,
            self.validate_guidance_data(guidance_data, "guidance_data"),
            **fit_params,
        )