# Source code for binlearn.base._general_binning_base

"""
Clean general binning base class for V2 architecture.

This module provides the core binning orchestration logic with guidance support.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any

import numpy as np
from sklearn.base import TransformerMixin

from ..config import get_config
from ..utils import ArrayLike, BinningError, ColumnList, GuidanceColumns
from ._data_handling_base import DataHandlingBase


# pylint: disable=too-many-ancestors

class GeneralBinningBase(
    ABC,
    DataHandlingBase,
    TransformerMixin,  # type: ignore[misc,unused-ignore]
):
    """Clean binning base class focusing on orchestration and guidance logic.

    This abstract base class provides the core infrastructure for all binning
    transformers in the binlearn library. It orchestrates the binning process,
    handles guidance column separation, and manages the interaction between
    fitting and transformation phases.

    The class supports two main fitting strategies:
    - Per-column independent fitting: Each column is binned independently
    - Joint fitting: All columns are considered together for binning decisions

    Key Features:
    - Guidance column support for supervised and semi-supervised binning
    - Flexible fitting strategies (independent vs joint)
    - DataFrame format preservation
    - Comprehensive error handling and validation
    - sklearn-compatible transformer interface

    Parameters:
    -----------
    preserve_dataframe : bool, optional
        Whether to preserve the original DataFrame format in output. If None,
        uses the global configuration default. When True, pandas/polars
        DataFrames are returned as DataFrames; otherwise numpy arrays.

    fit_jointly : bool, optional
        Whether to fit all columns jointly rather than independently. If None,
        uses the global configuration default. When True, all binning columns
        are considered together; when False, each column is binned independently.

    guidance_columns : GuidanceColumns, optional
        Specification of columns to use for guidance (supervision). Can be:
        - None: No guidance columns (unsupervised binning)
        - Column identifier: Single guidance column
        - List of identifiers: Multiple guidance columns
        Only a ``list`` is treated as a collection; any other non-None value
        (including a tuple) is treated as one single column identifier.
        Integer identifiers are interpreted as positional indices into the
        input columns when the input is split into binning and guidance parts.
        Incompatible with fit_jointly=True.

    Attributes:
    -----------
    preserve_dataframe : bool
        Whether to preserve DataFrame format in output. Holds the resolved
        value (the configuration default when None was passed).

    fit_jointly : bool
        Whether to fit columns jointly or independently. Holds the resolved
        value (the configuration default when None was passed).

    guidance_columns : GuidanceColumns
        Specification of guidance columns for supervision, stored exactly as
        it was passed to the constructor.

    Note:
    -----
    This is an abstract base class and cannot be instantiated directly.
    Concrete implementations must provide the abstract methods for specific
    binning algorithms.

    The class enforces mutual exclusivity between fit_jointly=True and
    guidance_columns to prevent conflicting binning strategies. The check is
    performed in the constructor and again during parameter validation, and a
    ValueError is raised in both places.
    """


    def __init__(
        self,
        preserve_dataframe: bool | None = None,
        fit_jointly: bool | None = None,
        guidance_columns: GuidanceColumns = None,
    ):
        """Create the transformer and resolve its configuration.

        Both base initializers are invoked explicitly, after which any option
        left as None is filled in from the global configuration object
        returned by ``get_config()``. The resolved values are then checked for
        compatibility and stored on the instance.

        Args:
            preserve_dataframe: Whether output should keep the DataFrame format
                of the input. None means "use the global configuration value".
            fit_jointly: Whether all columns are fitted together. None means
                "use the global configuration value".
            guidance_columns: Guidance column specification for supervised
                binning. Must be None whenever joint fitting is enabled.

        Raises:
            ValueError: If guidance_columns is given while the resolved
                fit_jointly value is truthy; the two options are mutually
                exclusive.

        Note:
            No binning/guidance column lists are stored here. They are derived
            on demand from ``feature_names_in_`` and ``guidance_columns``.
        """
        DataHandlingBase.__init__(self)
        TransformerMixin.__init__(self)

        # Fall back to the global configuration for every option left unset.
        defaults = get_config()
        resolved_preserve = (
            defaults.preserve_dataframe if preserve_dataframe is None else preserve_dataframe
        )
        resolved_jointly = defaults.fit_jointly if fit_jointly is None else fit_jointly

        # Guidance works per column, so it cannot be combined with joint fitting.
        if guidance_columns is not None and resolved_jointly:
            raise ValueError(
                "guidance_columns and fit_jointly=True are incompatible. "
                "Use either guidance_columns for per-record guidance OR "
                "fit_jointly=True for global fitting, but not both."
            )

        self.preserve_dataframe = resolved_preserve
        self.fit_jointly = resolved_jointly
        self.guidance_columns = guidance_columns


    def fit(self, X: Any, y: Any = None, **fit_params: Any) -> GeneralBinningBase:
        """Fit the transformer, dispatching to joint or per-column fitting.

        The method validates the estimator parameters, rejects the
        combination of joint fitting with explicit guidance data, validates
        the input and records its feature information, splits the input into
        binning and guidance parts, and finally calls the fitting hook that
        matches ``fit_jointly``.

        Args:
            X: Input data to fit on. Can be:
                - pandas.DataFrame: Column names are preserved
                - polars.DataFrame: Column names are preserved
                - numpy.ndarray: Numeric column indices are used
                - array-like: Converted to numpy array
            y: Optional target values. Only consulted on the per-column path,
                and only when neither guidance columns nor ``guidance_data``
                supply guidance.
            **fit_params: Extra parameters forwarded to the fitting hook.
                ``guidance_data`` is recognised here: it conflicts with
                fit_jointly=True and is removed from the forwarded parameters
                on the per-column path.

        Returns:
            self: The fitted binning transformer instance.

        Raises:
            ValueError: If fit_jointly is enabled together with a non-None
                ``guidance_data``, if no column is left for binning, or if any
                exception other than the ones listed below occurs (it is
                wrapped with the prefix "Failed to fit binning model: ").
            BinningError: Propagated unchanged from the fitting steps.
            RuntimeError: Propagated unchanged from the fitting steps
                (this includes NotImplementedError).

        Example:
            >>> from binlearn import EqualWidthBinning
            >>> import pandas as pd
            >>> X = pd.DataFrame({'feature1': [1, 2, 3, 4, 5], 'feature2': [10, 20, 30, 40, 50]})
            >>> binner = EqualWidthBinning(n_bins=3)
            >>> binner.fit(X)
            EqualWidthBinning(...)

        Note:
            Guidance precedence on the per-column path is decided by
            ``_resolve_guidance_data_priority``.
        """
        try:
            self._validate_params()

            # Joint fitting has no notion of per-column guidance.
            if self.fit_jointly and fit_params.get("guidance_data") is not None:
                raise ValueError(
                    "Cannot use both fit_jointly=True and guidance_data parameter. "
                    "These are mutually exclusive: fit_jointly uses all data together, "
                    "while guidance_data provides separate guidance per column."
                )

            self._validate_and_prepare_input(X, "X")
            self._extract_and_validate_feature_info(X, reset=True)

            (
                X_binning,
                X_guidance,
                binning_cols,
                _unused_guidance_cols,
            ) = self._separate_binning_and_guidance_columns(X)

            # There must be something left to bin once guidance is removed.
            if not binning_cols:
                if self.guidance_columns is None:
                    raise ValueError("No columns available for binning.")
                raise ValueError(
                    "All columns are specified as guidance_columns. "
                    "At least one column must be available for binning."
                )

            if self.fit_jointly:
                self._fit_jointly_across_columns(X_binning, binning_cols, **fit_params)
                return self

            # Per-column path: strip guidance_data so it is not forwarded twice.
            explicit_guidance = fit_params.pop("guidance_data", None)
            resolved_guidance = self._resolve_guidance_data_priority(
                X_guidance, explicit_guidance, y
            )
            self._fit_per_column_independently(
                X_binning, binning_cols, resolved_guidance, **fit_params
            )
            return self

        except (BinningError, ValueError, RuntimeError, NotImplementedError):
            # Already meaningful to callers; let them through untouched.
            raise
        except Exception as exc:
            raise ValueError(f"Failed to fit binning model: {str(exc)}") from exc



    def transform(self, X: Any) -> Any:
        """Apply the fitted binning to new data.

        After checking that the transformer is fitted and validating the
        input, the input is split into binning and guidance parts. Only the
        binning part is passed to ``_transform_columns_to_bins``; the result
        is then formatted like the input according to ``preserve_dataframe``.

        Args:
            X: Input data to transform. Must have the same structure as the
                data used during fitting (same number of columns). Can be:
                - pandas.DataFrame: Column names should match training data
                - polars.DataFrame: Column names should match training data
                - numpy.ndarray: Must have same number of columns as training
                - array-like: Converted to numpy array

        Returns:
            The transformed binning columns, formatted by
            ``_format_output_like_input``. Guidance columns are not part of
            the output. In the guided case with no binning column present, an
            integer array of shape (n_samples, 0) is formatted instead.

        Raises:
            RuntimeError: Propagated unchanged (e.g. from the fitted check).
            BinningError: Propagated unchanged from the transformation.
            ValueError: For every other exception, including ValueError
                itself, re-raised with the prefix "Failed to transform data: ".

        Example:
            >>> # After fitting
            >>> X_new = pd.DataFrame({'feature1': [1.5, 2.5], 'feature2': [15, 25]})
            >>> X_binned = binner.transform(X_new)
            >>> print(X_binned)
            [[0, 0], [1, 1]]  # Bin indices
        """
        try:
            self._check_fitted()
            self._validate_and_prepare_input(X, "X")

            X_binning, _, binning_cols, _ = self._separate_binning_and_guidance_columns(X)

            # Unguided input is always transformed; guided input only when at
            # least one binning column exists (otherwise an empty result is used).
            if self.guidance_columns is None or X_binning.shape[1] > 0:
                binned = self._transform_columns_to_bins(X_binning, binning_cols)
            else:
                binned = np.empty((X_binning.shape[0], 0), dtype=int)

            return self._format_output_like_input(
                binned, X, binning_cols, self.preserve_dataframe
            )

        except (BinningError, RuntimeError):
            raise
        except Exception as exc:
            raise ValueError(f"Failed to transform data: {str(exc)}") from exc



    def inverse_transform(self, X: Any) -> Any:
        """Map bin indices back to representative values.

        The input is validated and converted with ``_prepare_input``; for
        guided binning its column count is compared against the number of
        binning columns. The conversion itself is delegated to
        ``_inverse_transform_bins_to_values`` and the result is formatted like
        the input according to ``preserve_dataframe``.

        Args:
            X: Data containing bin indices. Should contain only binning
                columns (no guidance columns). Can be:
                - pandas.DataFrame: Column names should match binning columns
                - polars.DataFrame: Column names should match binning columns
                - numpy.ndarray: Must have same number of binning columns
                - array-like: Converted to numpy array

        Returns:
            The inverse transformed data as produced by
            ``_format_output_like_input``.

        Raises:
            RuntimeError: Propagated unchanged (e.g. from the fitted check).
            BinningError: Propagated unchanged from the inverse transformation.
            ValueError: For every other exception, including the column-count
                mismatch detected here, re-raised with the prefix
                "Failed to inverse transform data: ".

        Example:
            >>> # After fitting and transforming
            >>> X_binned = [[0, 1], [1, 0], [2, 2]]  # Bin indices
            >>> X_reconstructed = binner.inverse_transform(X_binned)
            >>> print(X_reconstructed)
            [[0.5, 1.5], [1.5, 0.5], [2.5, 2.5]]  # Representative values

        Note:
            The column-count check is only performed when guidance_columns is
            set; the expected count comes from
            ``_get_feature_count(include_guidance=False)``.
        """
        try:
            self._check_fitted()
            self._validate_and_prepare_input(X, "X")

            values, column_ids = self._prepare_input(X)

            guided = self.guidance_columns is not None
            if guided:
                # Guided models were fitted on more columns than they bin.
                n_expected = self._get_feature_count(include_guidance=False)
                n_received = len(column_ids)
                if n_received != n_expected:
                    raise ValueError(
                        f"Input for inverse_transform should have {n_expected} "
                        f"columns (binning columns only), got {n_received}"
                    )

            restored = self._inverse_transform_bins_to_values(values, column_ids)
            return self._format_output_like_input(
                restored, X, column_ids, self.preserve_dataframe
            )

        except (BinningError, RuntimeError):
            raise
        except Exception as exc:
            raise ValueError(f"Failed to inverse transform data: {str(exc)}") from exc


    def _resolve_guidance_data_priority(
        self, X_guidance: np.ndarray[Any, Any] | None, external_guidance: Any, y: Any
    ) -> np.ndarray[Any, Any] | None | Any:
        """Resolve guidance data with clear priority order.

        Priority: X_guidance > external_guidance > y

        Args:
            X_guidance: Guidance columns from input X.
            external_guidance: Explicit guidance_data parameter.
            y: Target values (sklearn convenience).

        Returns:
            Resolved guidance data array or None.
        """
        if X_guidance is not None:
            return X_guidance

        if external_guidance is not None:
            return external_guidance

        if y is not None:
            y_array = np.asarray(y)
            if y_array.ndim == 1:
                y_array = y_array.reshape(-1, 1)
            # mypy doesn't understand that np.asarray returns the right type
            return y_array

        return None

    def _normalize_guidance_columns(
        self, guidance_cols: list[Any], columns: ColumnList
    ) -> list[Any]:
        """Normalize guidance columns from various formats to column names.

        This method handles the conversion of integer indices to column names,
        making the logic testable and reusable.

        Args:
            guidance_cols: List of guidance column identifiers (integers or strings)
            columns: Available column names

        Returns:
            List of normalized guidance column names

        Raises:
            ValueError: If column index is out of range
        """
        normalized_guidance_cols = []
        for col in guidance_cols:
            if isinstance(col, int):
                if 0 <= col < len(columns):
                    normalized_guidance_cols.append(columns[col])
                else:
                    raise ValueError(
                        f"Column index {col} is out of range for {len(columns)} columns"
                    )
            else:
                normalized_guidance_cols.append(col)  # This is line 239 equivalent

        return normalized_guidance_cols

    def _separate_binning_and_guidance_columns(
        self, X: ArrayLike
    ) -> tuple[np.ndarray[Any, Any], np.ndarray[Any, Any] | None, ColumnList, ColumnList | None]:
        """Split the input into the part to bin and the part used as guidance.

        The input is converted with ``_prepare_input``. Without a guidance
        specification everything is a binning column. Otherwise the
        specification is wrapped into a list (unless it already is one),
        integer entries are resolved to column names, and each input column is
        assigned to the guidance or the binning side by name membership.

        Args:
            X: Input data, possibly containing both binning and guidance columns.

        Returns:
            Tuple of (X_binning, X_guidance, binning_columns, guidance_columns).
            Without a guidance specification, X_guidance and guidance_columns
            are None. With one, guidance_columns is a list (possibly empty),
            X_guidance is None when no input column matched, and X_binning is
            an empty array of shape (n_samples, 0) when no binning column is left.

        Raises:
            ValueError: If an integer guidance identifier is out of range for
                the input columns.
        """
        data, column_ids = self._prepare_input(X)

        spec = self.guidance_columns
        if spec is None:
            return data, None, column_ids, None

        requested = spec if isinstance(spec, list) else [spec]
        guidance_names = self._normalize_guidance_columns(requested, column_ids)

        # One membership test per input column decides which side it goes to.
        is_guidance = [name in guidance_names for name in column_ids]

        guidance_positions = [pos for pos, flag in enumerate(is_guidance) if flag]
        binning_positions = [pos for pos, flag in enumerate(is_guidance) if not flag]
        guidance_column_names = [name for name, flag in zip(column_ids, is_guidance) if flag]
        binning_column_names = [name for name, flag in zip(column_ids, is_guidance) if not flag]

        if binning_positions:
            X_binning = data[:, binning_positions]
        else:
            X_binning = np.empty((data.shape[0], 0))

        X_guidance = data[:, guidance_positions] if guidance_positions else None

        # Nothing is cached on the instance; callers recompute this when needed.
        return X_binning, X_guidance, binning_column_names, guidance_column_names

    def _get_feature_count(self, include_guidance: bool = True) -> int:
        """Get feature count with optional guidance exclusion."""
        n_features = getattr(self, "_n_features_in", 0)

        if not include_guidance and self.guidance_columns is not None:
            # Compute guidance column count dynamically
            guidance_cols = (
                [self.guidance_columns]
                if not isinstance(self.guidance_columns, list)
                else self.guidance_columns
            )
            return n_features - len(guidance_cols)

        return n_features

    def _get_binning_columns(self) -> list[Any] | None:
        """Compute binning columns dynamically from feature_names_in_ and guidance_columns."""
        if (
            not hasattr(self, "feature_names_in_")
            or getattr(self, "feature_names_in_", None) is None
        ):
            return None

        # At this point we know feature_names_in_ exists and is not None
        all_features = list(self.feature_names_in_)  # type: ignore[arg-type]

        if self.guidance_columns is None:
            return all_features

        # Normalize guidance_columns to list
        guidance_cols = (
            [self.guidance_columns]
            if not isinstance(self.guidance_columns, list)
            else self.guidance_columns
        )

        # Return features that are not guidance columns (guidance columns are used but not binned)
        return [col for col in all_features if col not in guidance_cols]

    def _get_column_key(self, target_col: Any, available_keys: ColumnList, col_index: int) -> Any:
        """Get the appropriate key for looking up bin specifications.

        Handles column key resolution with fallback strategies for
        different column identifier formats (names vs indices).

        Args:
            target_col: The target column identifier to find.
            available_keys: List of available keys in bin specifications.
            col_index: Index position of the column.

        Returns:
            The key to use for bin specification lookup.

        Raises:
            ValueError: If no matching key can be found.
        """
        # First try exact match
        if target_col in available_keys:
            return target_col

        # Handle feature_N -> N mapping for numpy array inputs
        if isinstance(target_col, str) and target_col.startswith("feature_"):
            try:
                feature_index = int(target_col.split("_")[1])
                if feature_index in available_keys:
                    return feature_index
            except (ValueError, IndexError):
                pass

        # Handle N -> feature_N mapping
        if isinstance(target_col, int):
            feature_name = f"feature_{target_col}"
            if feature_name in available_keys:
                return feature_name

        # Try index-based fallback
        if col_index < len(available_keys):
            return available_keys[col_index]

        # No match found
        raise ValueError(f"No bin specification found for column {target_col} (index {col_index})")

    def _validate_params(self) -> None:
        """Check the binning-specific parameters after the inherited checks.

        Raises:
            TypeError: If ``preserve_dataframe`` or ``fit_jointly`` is neither
                None nor a bool, or if ``guidance_columns`` is neither None nor
                a list, tuple, int or str.
            ValueError: If ``guidance_columns`` is set while ``fit_jointly``
                is truthy.
        """
        super()._validate_params()

        preserve = self.preserve_dataframe
        if not (preserve is None or isinstance(preserve, bool)):
            raise TypeError("preserve_dataframe must be a boolean or None")

        jointly = self.fit_jointly
        if not (jointly is None or isinstance(jointly, bool)):
            raise TypeError("fit_jointly must be a boolean or None")

        guidance = self.guidance_columns
        if guidance is None:
            return

        if not isinstance(guidance, (list, tuple, int, str)):
            raise TypeError("guidance_columns must be list, tuple, int, str, or None")

        # Guidance is applied per column, which joint fitting cannot honour.
        if self.fit_jointly:
            raise ValueError(
                "fit_jointly=True cannot be used with guidance_columns. "
                "Guidance-based fitting requires per-column processing."
            )


    def get_input_columns(self) -> ColumnList | None:
        """Return the columns to use for data preparation.

        This implementation reports the binning columns computed by
        ``_get_binning_columns``. Derived classes may override it to supply
        different column information.

        Returns:
            The binning columns, or None if they are not available yet.
        """
        input_columns = self._get_binning_columns()
        return input_columns


    # Abstract methods for subclasses - renamed for clarity
    @abstractmethod
    def _fit_per_column_independently(
        self,
        X: np.ndarray[Any, Any],
        columns: ColumnList,
        guidance_data: ArrayLike | None = None,
        **fit_params: Any,
    ) -> None:
        """Fit binning parameters independently for each column.

        This abstract method must be implemented by concrete binning classes to
        define how each column is binned independently. This is the default
        fitting strategy when fit_jointly=False.

        Args:
            X: Input data array containing only the columns to be binned.
                Shape: (n_samples, n_binning_columns).
            columns: List of column identifiers corresponding to the columns in X.
                Used for error messages and result storage.
            guidance_data: Optional guidance data for supervised binning methods.
                Can be target values (y) or additional guidance information.
                Shape should be compatible with X for supervised methods.
            **fit_params: Additional algorithm-specific fitting parameters passed
                from the fit() method.

        Raises:
            NotImplementedError: This is an abstract method that must be implemented
                by concrete subclasses.

        Note:
            Implementations should store the fitted binning parameters (bin edges,
            representatives, etc.) in instance attributes for later use during
            transformation.
        """
        raise NotImplementedError("Subclasses must implement _fit_per_column_independently")

    @abstractmethod
    def _fit_jointly_across_columns(
        self, X: np.ndarray[Any, Any], columns: ColumnList, **fit_params: Any
    ) -> None:
        """Fit binning parameters jointly across all columns.

        This abstract method must be implemented by concrete binning classes to
        define how all columns are considered together for binning decisions.
        This enables more sophisticated binning strategies that consider
        inter-column relationships.

        Args:
            X: Input data array containing all columns to be binned together.
                Shape: (n_samples, n_binning_columns).
            columns: List of column identifiers corresponding to the columns in X.
                Used for error messages and result storage.
            **fit_params: Additional algorithm-specific fitting parameters passed
                from the fit() method.

        Raises:
            NotImplementedError: This is an abstract method that must be implemented
                by concrete subclasses.

        Note:
            Joint fitting is incompatible with guidance_columns and guidance_data
            parameters. Implementations should consider relationships between
            columns when determining binning parameters.
        """
        raise NotImplementedError("Subclasses must implement _fit_jointly_across_columns")

    @abstractmethod
    def _transform_columns_to_bins(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Transform columns to bin indices using fitted parameters.

        This abstract method must be implemented by concrete binning classes to
        define how continuous values are converted to discrete bin indices
        during the transformation phase.

        Args:
            X: Input data array to transform. Contains continuous values that
                need to be converted to bin indices. Shape: (n_samples, n_columns).
            columns: List of column identifiers corresponding to the columns in X.
                Used for accessing the appropriate fitted binning parameters.

        Returns:
            Transformed data array where continuous values are replaced with
            discrete bin indices. Shape: (n_samples, n_columns).
            Bin indices should be integers where:
            - 0 to n_bins-1: Valid bin indices
            - MISSING_VALUE (-1): Missing/NaN values
            - BELOW_RANGE (-3): Values below binning range
            - ABOVE_RANGE (-2): Values above binning range

        Raises:
            NotImplementedError: This is an abstract method that must be implemented
                by concrete subclasses.

        Note:
            Implementations should handle missing values and out-of-range values
            appropriately using the framework's special index constants.
        """
        raise NotImplementedError("Subclasses must implement _transform_columns_to_bins")

    @abstractmethod
    def _inverse_transform_bins_to_values(
        self, X: np.ndarray[Any, Any], columns: ColumnList
    ) -> np.ndarray[Any, Any]:
        """Inverse transform from bin indices to representative values.

        Args:
            X: Binned data to inverse transform.
            columns: Column identifiers.

        Returns:
            Data with representative values.
        """
        raise NotImplementedError("Subclasses must implement _inverse_transform_bins_to_values")