# Source code for binlearn.methods._isotonic_binning

"""
Clean Isotonic binning implementation for the binlearn architecture.

This module provides IsotonicBinning that inherits from SupervisedBinningBase.
Uses isotonic regression to find optimal cut points that preserve monotonic relationships.
"""

from __future__ import annotations

from typing import Any

import numpy as np
from sklearn.isotonic import IsotonicRegression

from ..base import SupervisedBinningBase
from ..config import apply_config_defaults
from ..utils import (
    BinEdgesDict,
    ConfigurationError,
    FittingError,
    create_param_dict_for_config,
    resolve_n_bins_parameter,
    validate_bin_number_parameter,
)


# pylint: disable=too-many-ancestors
class IsotonicBinning(SupervisedBinningBase):
    """Isotonic regression-based monotonic binning implementation using clean architecture.

    Creates bins using isotonic regression to find optimal cut points that preserve
    monotonic relationships between features and targets. The transformer fits an
    isotonic (monotonic, non-decreasing or non-increasing) function to the data and
    identifies significant changes in this function to determine bin boundaries.

    This method is particularly valuable for cases where domain knowledge suggests
    a monotonic relationship between features and targets, such as risk modeling,
    credit scoring, or any application where preserving order relationships is critical.
    The isotonic regression ensures that the average target values within bins
    maintain the specified monotonic relationship.

    The algorithm works by:
    1. Sorting data by feature values
    2. Fitting an isotonic regression model to preserve monotonicity
    3. Identifying cut points where significant changes occur in the fitted function
    4. Creating bins that respect both the monotonic constraint and the minimum samples requirement

    When insufficient variability is found in the fitted isotonic function, the algorithm
    creates a single bin or falls back to simple boundary definitions.

    This implementation follows the clean binlearn architecture with straight inheritance,
    dynamic column resolution, and parameter reconstruction capabilities.

    Args:
        max_bins: Maximum number of bins to create. Controls the granularity of binning.
            Can be an integer or a string expression like 'sqrt', 'log2', etc. for
            dynamic calculation based on data size. If None, uses configuration default.
        min_samples_per_bin: Minimum number of samples required per bin. Ensures
            statistical significance of bins. Must be positive integer. If None,
            uses configuration default.
        increasing: Whether to enforce increasing monotonicity (True) or decreasing
            monotonicity (False). True means higher feature values correspond to
            higher target values. If None, uses configuration default.
        y_min: Minimum value for the fitted isotonic function output. Clips the
            fitted values to be at least this value. If None, no minimum constraint.
        y_max: Maximum value for the fitted isotonic function output. Clips the
            fitted values to be at most this value. If None, no maximum constraint.
        min_change_threshold: Minimum relative change in fitted values required to
            create a new bin boundary. Controls sensitivity to function changes.
            Must be positive. If None, uses configuration default.
        clip: Whether to clip values outside the fitted range to the nearest bin edge.
            If None, uses configuration default.
        preserve_dataframe: Whether to preserve pandas DataFrame structure in transform
            operations. If None, uses configuration default.
        guidance_columns: Column specification for target/guidance data used in
            supervised binning. Can be column names, indices, or callable selector.
        bin_edges: Pre-computed bin edges for reconstruction. Should not be provided
            during normal usage.
        bin_representatives: Pre-computed bin representatives for reconstruction.
            Should not be provided during normal usage.
        class_: Class name for reconstruction compatibility. Internal use only.
        module_: Module name for reconstruction compatibility. Internal use only.

    Attributes:
        max_bins: Maximum number of bins to create
        min_samples_per_bin: Minimum samples required per bin
        increasing: Whether monotonicity is increasing or decreasing
        y_min: Minimum constraint for fitted values
        y_max: Maximum constraint for fitted values
        min_change_threshold: Threshold for significant changes in fitted function

    Example:
        >>> import numpy as np
        >>> from binlearn.methods import IsotonicBinning
        >>>
        >>> # Create data with monotonic relationship
        >>> np.random.seed(42)
        >>> X = np.random.uniform(0, 10, 1000).reshape(-1, 1)
        >>> # Target increases monotonically with some noise
        >>> y = 2 * X.flatten() + np.random.normal(0, 1, 1000)
        >>>
        >>> # Initialize isotonic binning
        >>> binner = IsotonicBinning(
        ...     max_bins=5,
        ...     min_samples_per_bin=50,
        ...     increasing=True,
        ...     min_change_threshold=0.05
        ... )
        >>>
        >>> # Fit with target data
        >>> binner.fit(X, y)
        >>> X_binned = binner.transform(X)
        >>>
        >>> # Check monotonic preservation
        >>> bin_means = []
        >>> for bin_idx in range(len(binner.bin_edges_[0]) - 1):
        ...     bin_mask = X_binned[:, 0] == bin_idx
        ...     bin_means.append(np.mean(y[bin_mask]))
        >>> print("Bin target means:", bin_means)  # Should be monotonically increasing

    Note:
        - Requires target/guidance data for supervised learning of monotonic relationships
        - Preserves monotonic relationship between features and average target values within bins
        - Particularly useful for risk modeling, scoring, and ranking applications
        - Handles constant features and insufficient variability gracefully
        - Each column is processed independently with its corresponding target data
        - The fitted isotonic models are stored and can be used for analysis

    See Also:
        Chi2Binning: Statistical significance-based supervised binning
        TreeBinning: Decision tree-based supervised binning
        SupervisedBinningBase: Base class for supervised binning methods

    References:
        Robertson, T., Wright, F. T., & Dykstra, R. L. (1988). Order Restricted Statistical
            Inference.
    """

    # pylint: disable=too-many-arguments,too-many-positional-arguments,too-many-locals

    def __init__(
        self,
        max_bins: int | str | None = None,
        min_samples_per_bin: int | None = None,
        increasing: bool | None = None,
        y_min: float | None = None,
        y_max: float | None = None,
        min_change_threshold: float | None = None,
        clip: bool | None = None,
        preserve_dataframe: bool | None = None,
        guidance_columns: Any = None,
        *,
        bin_edges: BinEdgesDict | None = None,
        bin_representatives: BinEdgesDict | None = None,
        # The two parameters below exist only for reconstruction compatibility.
        class_: str | None = None,  # pylint: disable=unused-argument
        module_: str | None = None,  # pylint: disable=unused-argument
    ):
        """Set up an isotonic binner from user arguments and configuration defaults.

        Configurable arguments are merged with the ``"isotonic"`` configuration
        section; the method-specific results are stored on the instance and the
        remaining ones are forwarded to ``SupervisedBinningBase.__init__``.

        Args:
            max_bins: Upper limit on the number of bins per column. Either an
                integer or a string expression (e.g. ``'sqrt'``, ``'log2'``) that
                is resolved against the data size at fit time. Falls back to 10
                if the resolved configuration has no entry.
            min_samples_per_bin: Minimum number of samples per bin; must be a
                positive integer. Falls back to 5 if the resolved configuration
                has no entry.
            increasing: ``True`` to fit a non-decreasing relationship between
                feature and target, ``False`` for a non-increasing one. Falls
                back to ``True`` if the resolved configuration has no entry.
            y_min: Lower bound handed to the isotonic regression, or ``None``
                for no lower bound.
            y_max: Upper bound handed to the isotonic regression, or ``None``
                for no upper bound. Must exceed ``y_min`` when both are given.
            min_change_threshold: Minimum change of the fitted function, relative
                to its total range, that justifies a new bin boundary. Must be
                positive. Falls back to 0.01 if the resolved configuration has
                no entry.
            clip: Whether out-of-range values are clipped to the nearest bin
                during transform; resolved from configuration when ``None``.
            preserve_dataframe: Whether transform keeps the pandas DataFrame
                structure; resolved from configuration when ``None``.
            guidance_columns: Specification of the target/guidance column(s)
                used for supervised fitting. Passed to the base class unchanged.
            bin_edges: Pre-computed bin edges, intended for reconstructing a
                fitted instance. Passed to the base class unchanged.
            bin_representatives: Pre-computed bin representatives, intended for
                reconstructing a fitted instance. Passed to the base class
                unchanged.
            class_: Accepted for reconstruction compatibility; ignored here.
            module_: Accepted for reconstruction compatibility; ignored here.

        Example:
            >>> # Increasing relationship with explicit granularity settings
            >>> binner = IsotonicBinning(
            ...     max_bins=8,
            ...     min_samples_per_bin=30,
            ...     increasing=True,
            ...     min_change_threshold=0.02
            ... )
            >>>
            >>> # Decreasing relationship with bounded fitted values
            >>> binner = IsotonicBinning(
            ...     max_bins=6,
            ...     min_samples_per_bin=50,
            ...     increasing=False,
            ...     y_min=0.0,
            ...     y_max=1.0,
            ...     guidance_columns=['risk_score']
            ... )
            >>>
            >>> # Rely on configuration defaults
            >>> binner = IsotonicBinning(guidance_columns='target')

        Note:
            - ``guidance_columns``, ``bin_edges`` and ``bin_representatives`` are
              never taken from configuration.
            - The method-specific attributes are assigned before the base-class
              initializer runs; presumably that initializer triggers
              ``_validate_params`` -- confirm against ``SupervisedBinningBase``.
        """
        # Only configurable arguments take part in the configuration merge.
        explicit_params = create_param_dict_for_config(
            max_bins=max_bins,
            min_samples_per_bin=min_samples_per_bin,
            increasing=increasing,
            y_min=y_min,
            y_max=y_max,
            min_change_threshold=min_change_threshold,
            clip=clip,
            preserve_dataframe=preserve_dataframe,
        )
        settings = apply_config_defaults("isotonic", explicit_params)

        # Method-specific settings, with hard-coded fallbacks for missing entries.
        self.max_bins = settings.get("max_bins", 10)
        self.min_samples_per_bin = settings.get("min_samples_per_bin", 5)
        self.increasing = settings.get("increasing", True)
        self.y_min = settings.get("y_min")
        self.y_max = settings.get("y_max")
        self.min_change_threshold = settings.get("min_change_threshold", 0.01)

        # Fitted isotonic regressors, keyed by column identifier.
        self._isotonic_models: dict[Any, IsotonicRegression] = {}

        # Hand the shared settings and the never-configurable arguments to the base class.
        SupervisedBinningBase.__init__(
            self,
            clip=settings.get("clip"),
            preserve_dataframe=settings.get("preserve_dataframe"),
            guidance_columns=guidance_columns,
            bin_edges=bin_edges,
            bin_representatives=bin_representatives,
        )


    def _validate_params(self) -> None:
        """Check the isotonic-specific settings and reject invalid values.

        Runs the base-class validation first, then checks, in this order:
        ``max_bins`` (through the shared bin-number validator),
        ``min_samples_per_bin``, ``increasing``, ``y_min`` and ``y_max``
        (individually, then relative to each other) and ``min_change_threshold``.

        Raises:
            ConfigurationError: For the first setting checked in this method
                that is invalid.
        """
        SupervisedBinningBase._validate_params(self)
        validate_bin_number_parameter(self.max_bins, param_name="max_bins")

        min_samples = self.min_samples_per_bin
        if not isinstance(min_samples, int) or min_samples < 1:
            raise ConfigurationError(
                "min_samples_per_bin must be a positive integer",
                suggestions=["Example: min_samples_per_bin=5"],
            )

        if not isinstance(self.increasing, bool):
            raise ConfigurationError(
                "increasing must be a boolean value",
                suggestions=["Use increasing=True for increasing monotonicity"],
            )

        # Each bound is optional, but must be numeric when supplied.
        lower, upper = self.y_min, self.y_max
        for bound_name, bound_value, example in (
            ("y_min", lower, "0.0"),
            ("y_max", upper, "1.0"),
        ):
            if bound_value is not None and not isinstance(bound_value, int | float):
                raise ConfigurationError(
                    f"{bound_name} must be a number or None",
                    suggestions=[f"Example: {bound_name}={example}"],
                )

        # The bounds must describe a non-empty interval when both are present.
        if lower is not None and upper is not None and lower >= upper:
            raise ConfigurationError(
                "y_min must be less than y_max",
                suggestions=["Example: y_min=0.0, y_max=1.0"],
            )

        threshold = self.min_change_threshold
        if not isinstance(threshold, int | float) or threshold <= 0:
            raise ConfigurationError(
                "min_change_threshold must be a positive number",
                suggestions=["Example: min_change_threshold=0.01"],
            )

    def _calculate_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        guidance_data: np.ndarray[Any, Any] | None = None,
    ) -> tuple[list[float], list[float]]:
        """Calculate isotonic regression-based bins for a single column.

        Uses isotonic regression to fit a monotonic function to the feature-target
        relationship, then identifies cut points based on significant changes in
        the fitted function.

        Args:
            x_col: Preprocessed column data (from base class)
            col_id: Column identifier for error reporting
            guidance_data: Target/guidance data for supervised binning (required)

        Returns:
            Tuple of (bin_edges, bin_representatives)

        Raises:
            FittingError: If guidance_data is None or insufficient data for binning
        """
        # Require guidance data for supervised binning
        if guidance_data is None:
            raise FittingError(f"Column {col_id}: guidance_data is required for isotonic binning")

        # Prepare guidance data for processing
        guidance_data_numeric = self._prepare_target_values(guidance_data)

        # Validate guidance data shape matches feature data
        if len(guidance_data_numeric) != len(x_col):
            raise ValueError(
                f"Column {col_id}: Guidance data length ({len(guidance_data_numeric)}) "
                f"does not match feature data length ({len(x_col)})"
            )

        # Check if we have sufficient data
        if len(x_col) < self.min_samples_per_bin:
            raise FittingError(
                f"Column {col_id}: Insufficient data points ({len(x_col)}) "
                f"for isotonic binning. Need at least {self.min_samples_per_bin}."
            )

        # Create isotonic binning
        return self._create_isotonic_bins(x_col, guidance_data_numeric, col_id)

    # pylint: disable=too-many-locals
    def _create_isotonic_bins(
        self, x_col: np.ndarray[Any, Any], y_col: np.ndarray[Any, Any], col_id: Any
    ) -> tuple[list[float], list[float]]:
        """Fit an isotonic regression for one column and turn it into bins.

        Steps: resolve ``max_bins`` for this column, substitute finite stand-ins
        for infinite feature values, short-circuit constant columns, fit the
        isotonic model on the feature-sorted data, remember the model, and derive
        the bins from the cut points of the fitted function.

        Args:
            x_col: Feature values of the column.
            y_col: Target values aligned with ``x_col``.
            col_id: Column identifier; used in error messages and as the key
                under which the fitted model is stored.

        Returns:
            Tuple of (bin_edges, bin_representatives)

        Raises:
            ValueError: If every feature value is infinite, or if fitting the
                isotonic regression raises any exception (which is chained).

        Note:
            A constant column returns a single bin of width 0.2 centred on the
            value, and no model is fitted or stored for it.
        """
        bin_limit = resolve_n_bins_parameter(
            self.max_bins, data_shape=(len(x_col), 1), param_name="max_bins"
        )

        # Infinite features are replaced first so the constant check sees finite values.
        if np.isinf(x_col).any():
            finite_mask = np.isfinite(x_col)
            if not finite_mask.any():
                raise ValueError(f"Column {col_id}: All feature values are infinite")

            finite_values = x_col[finite_mask]
            lowest = np.min(finite_values)
            highest = np.max(finite_values)

            # Stand-ins sit 10% of the finite range (at least 1.0) beyond the extremes.
            margin = max(abs(highest - lowest) * 0.1, 1.0)
            x_col = np.where(x_col == -np.inf, lowest - margin, x_col)
            x_col = np.where(x_col == np.inf, highest + margin, x_col)

        # A column with one distinct value gets a fixed-width bin around that value.
        if np.unique(x_col).size == 1:
            center = float(x_col[0])
            return ([center - 0.1, center + 0.1], [center])

        # Order both arrays by feature value and make them 1D for sklearn.
        order = np.argsort(x_col)
        x_sorted = x_col[order].flatten()
        y_sorted = y_col[order].flatten()

        try:
            isotonic_model = IsotonicRegression(
                increasing=self.increasing,
                y_min=self.y_min,
                y_max=self.y_max,
                out_of_bounds="clip",
            )
            y_fitted = isotonic_model.fit_transform(x_sorted, y_sorted)
        except Exception as e:  # pylint: disable=broad-exception-caught
            raise ValueError(f"Column {col_id}: Isotonic regression failed: {e}") from e

        # Keep the fitted model available per column.
        self._isotonic_models[col_id] = isotonic_model

        cut_points = self._find_cut_points(x_sorted, y_fitted, bin_limit)
        return self._create_bins_from_cuts(x_sorted, y_fitted, cut_points, col_id)

    def _prepare_target_values(self, y_values: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
        """Prepare target values for isotonic regression.

        Args:
            y_values: Raw target values (may be 2D with shape (n_samples, 1))

        Returns:
            Processed target values suitable for isotonic regression (1D array)
        """
        # Flatten if 2D with single column (guidance_data format)
        if y_values.ndim == 2 and y_values.shape[1] == 1:
            y_values_flat = y_values.flatten()
        else:
            y_values_flat = y_values

        # Convert to float for isotonic regression
        y_processed = y_values_flat.astype(float)

        return y_processed

    def _find_cut_points(
        self,
        x_sorted: np.ndarray[Any, Any],
        y_fitted: np.ndarray[Any, Any],
        max_bins: int,  # pylint: disable=unused-argument
    ) -> list[int]:
        """Find cut points based on changes in fitted isotonic function.

        Identifies locations where the fitted function has significant changes
        that warrant creating new bin boundaries.

        Args:
            x_sorted: Sorted feature values
            y_fitted: Fitted isotonic regression values
            max_bins: Maximum number of bins allowed (resolved from string)

        Returns:
            Indices of cut points in the sorted arrays
        """
        _ = x_sorted

        cut_indices = [0]  # Always start with first point

        if len(y_fitted) <= 1:
            return cut_indices

        # Calculate relative changes in fitted values
        y_range = np.max(y_fitted) - np.min(y_fitted)
        if y_range == 0:
            return cut_indices

        # Find points with significant changes
        for i in range(1, len(y_fitted)):
            # Check if there's a significant change from the last cut point
            last_cut_idx = cut_indices[-1]
            y_change = abs(y_fitted[i] - y_fitted[last_cut_idx])
            relative_change = y_change / y_range

            # Check if we have enough samples since last cut
            samples_since_cut = i - last_cut_idx

            if (
                relative_change >= self.min_change_threshold
                and samples_since_cut >= self.min_samples_per_bin
                and len(cut_indices) < max_bins - 1  # Ensure we don't exceed max_bins
            ):
                cut_indices.append(i)

        return cut_indices

    def _create_bins_from_cuts(
        self,
        x_sorted: np.ndarray[Any, Any],
        y_fitted: np.ndarray[Any, Any],  # pylint: disable=unused-argument
        cut_indices: list[int],
        col_id: Any,  # pylint: disable=unused-argument
    ) -> tuple[list[float], list[float]]:
        """Create bin edges and representatives from cut points.

        Args:
            x_sorted: Sorted feature values
            y_fitted: Fitted isotonic regression values
            cut_indices: Indices of cut points
            col_id: Column identifier for error reporting

        Returns:
            Tuple of (bin_edges, bin_representatives)
        """
        if len(cut_indices) == 1:
            # Only one cut point - create single bin
            x_min, x_max = float(np.min(x_sorted)), float(np.max(x_sorted))
            if x_min == x_max:
                x_max = x_min + 1.0
            return [x_min, x_max], [(x_min + x_max) / 2]

        # Create bin edges
        bin_edges = []
        bin_representatives = []

        for i, cut_idx in enumerate(cut_indices):
            if i == 0:
                # First bin edge
                bin_edges.append(float(x_sorted[cut_idx]))
            else:
                # Find midpoint between consecutive cut points for bin boundary
                prev_cut_idx = cut_indices[i - 1]
                if cut_idx > prev_cut_idx:
                    midpoint = (x_sorted[prev_cut_idx] + x_sorted[cut_idx]) / 2
                    bin_edges.append(float(midpoint))

                    # Representative is the mean of feature values in this bin
                    bin_x_values = x_sorted[prev_cut_idx:cut_idx]
                    bin_representative = float(np.mean(bin_x_values))
                    bin_representatives.append(bin_representative)

        # Add final bin edge and representative
        bin_edges.append(float(x_sorted[-1]))
        if len(cut_indices) > 1:
            final_bin_x = x_sorted[cut_indices[-1] :]
            final_representative = float(np.mean(final_bin_x))
            bin_representatives.append(final_representative)
        else:
            bin_representatives.append(float(np.mean(x_sorted)))

        return bin_edges, bin_representatives