Source code for binlearn.methods._tree_binning

"""
Clean tree binning implementation for the binlearn architecture.

This module provides TreeBinning that inherits from SupervisedBinningBase.
Uses decision tree splits to find optimal cut points based on guidance data.
"""

from __future__ import annotations

from typing import Any

import numpy as np
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from ..base import SupervisedBinningBase
from ..config import apply_config_defaults, get_config
from ..utils import BinEdgesDict, ConfigurationError, FittingError, create_param_dict_for_config


# pylint: disable=too-many-ancestors

[docs]
class TreeBinning(SupervisedBinningBase):
    """Tree-based supervised binning implementation using clean architecture.

    Creates bins using decision tree splits guided by a target column. This method
    fits a decision tree to predict the guidance column from the features to be
    binned, then uses the tree's split thresholds to define bin boundaries that
    optimize the tree's ability to separate different target values.

    The decision tree learning algorithm automatically identifies the most informative
    split points for distinguishing between different target values, making this
    approach particularly effective for supervised learning tasks. The resulting bins
    correspond to the decision tree's internal nodes, creating intervals that maximize
    the separation of target classes or minimize target variance.

    This approach is especially valuable when:
    - The relationship between features and targets is complex and non-linear
    - Domain knowledge about optimal split points is limited
    - Automatic feature discretization is needed for downstream models
    - Interpretable binning rules are desired (tree splits are easy to understand)

    The method supports both classification and regression tasks, automatically
    selecting the appropriate decision tree variant based on the task type. The
    fitted trees are stored and can be accessed for analysis of feature importance
    and split decisions.

    This implementation follows the clean binlearn architecture with straight inheritance,
    dynamic column resolution, and parameter reconstruction capabilities.

    Args:
        task_type: Type of supervised task - either 'classification' or 'regression'.
            Determines whether to use DecisionTreeClassifier or DecisionTreeRegressor.
            If None, uses configuration default.
        tree_params: Dictionary of parameters to pass to the sklearn DecisionTree.
            Common parameters include max_depth, min_samples_split, min_samples_leaf,
            random_state. If None, uses configuration default or sensible defaults.
        clip: Whether to clip values outside the fitted range to the nearest bin edge.
            If None, uses configuration default.
        preserve_dataframe: Whether to preserve pandas DataFrame structure in transform
            operations. If None, uses configuration default.
        guidance_columns: Column specification for target/guidance data used in
            supervised binning. Can be column names, indices, or callable selector.
        bin_edges: Pre-computed bin edges for reconstruction. Should not be provided
            during normal usage.
        bin_representatives: Pre-computed bin representatives for reconstruction.
            Should not be provided during normal usage.
        class_: Class name for reconstruction compatibility. Internal use only.
        module_: Module name for reconstruction compatibility. Internal use only.

    Attributes:
        task_type: Type of supervised task ('classification' or 'regression')
        tree_params: Parameters passed to the decision tree
        _fitted_trees: Dictionary storing fitted tree models per column
        _tree_importance: Dictionary storing feature importance per column
        _tree_template: Template tree used for cloning during fitting

    Example:
        >>> import numpy as np
        >>> from binlearn.methods import TreeBinning
        >>> from sklearn.datasets import make_classification
        >>>
        >>> # Create sample classification data
        >>> X, y = make_classification(n_samples=1000, n_features=1, n_redundant=0, random_state=42)
        >>>
        >>> # Initialize tree binning for classification
        >>> binner = TreeBinning(
        ...     task_type='classification',
        ...     tree_params={'max_depth': 4, 'min_samples_leaf': 50, 'random_state': 42}
        ... )
        >>>
        >>> # Fit with target data
        >>> binner.fit(X, y)
        >>> X_binned = binner.transform(X)
        >>>
        >>> # Analyze tree splits
        >>> print(f"Number of bins: {len(binner.bin_edges_[0]) - 1}")
        >>> print(f"Split points: {binner.bin_edges_[0][1:-1]}")  # Exclude data bounds
        >>>
        >>> # Access fitted tree for analysis
        >>> tree = binner._fitted_trees[0]
        >>> print(f"Tree depth: {tree.tree_.max_depth}")

    Note:
        - Requires target/guidance data for supervised learning of optimal split points
        - Automatically selects DecisionTreeClassifier or DecisionTreeRegressor based on task_type
        - Split thresholds from the tree become the bin boundaries
        - Supports all sklearn DecisionTree parameters through tree_params
        - Fitted trees are stored and accessible for further analysis
        - Each column is processed independently with its corresponding target data
        - Handles both classification and regression tasks seamlessly

    See Also:
        Chi2Binning: Statistical significance-based supervised binning
        IsotonicBinning: Monotonic relationship preserving supervised binning
        SupervisedBinningBase: Base class for supervised binning methods

    References:
        Breiman, L., Friedman, J., Stone, C. J., & Olshen, R. A. (1984).
        Classification and regression trees.
    """

    # pylint: disable=too-many-arguments,too-many-positional-arguments

[docs]
    def __init__(
        self,
        task_type: str | None = None,
        tree_params: dict[str, Any] | None = None,
        clip: bool | None = None,
        preserve_dataframe: bool | None = None,
        guidance_columns: Any = None,
        *,
        bin_edges: BinEdgesDict | None = None,
        bin_representatives: BinEdgesDict | None = None,
        class_: (  # pylint: disable=unused-argument
            str | None
        ) = None,  # For reconstruction compatibility
        module_: (  # pylint: disable=unused-argument
            str | None
        ) = None,  # For reconstruction compatibility
    ):
        """Set up tree-based binning and build the decision tree template.

        Resolves the user-supplied settings against the "supervised"
        configuration section, validates the task type, initializes the
        per-column tree storage, delegates the shared options to
        ``SupervisedBinningBase`` and finally creates the tree template.

        Args:
            task_type: Kind of supervised task, either ``'classification'``
                (a ``DecisionTreeClassifier`` template is built) or
                ``'regression'`` (a ``DecisionTreeRegressor`` template is
                built). When the resolved settings carry no value,
                ``'classification'`` is used.
            tree_params: Keyword arguments for the sklearn decision tree
                constructor, e.g. ``max_depth``, ``min_samples_split``,
                ``min_samples_leaf``, ``random_state``. They override the
                defaults applied when the template is created.
            clip: Forwarded to the base class. Per the class documentation it
                controls clipping of out-of-range values; None defers to the
                configuration default.
            preserve_dataframe: Forwarded to the base class. Per the class
                documentation it controls whether DataFrame structure is kept
                on transform; None defers to the configuration default.
            guidance_columns: Specification of the target/guidance column(s),
                forwarded unchanged to the base class.
            bin_edges: Pre-computed bin edges, forwarded to the base class.
                Intended for reconstruction rather than normal usage.
            bin_representatives: Pre-computed bin representatives, forwarded
                to the base class. Intended for reconstruction.
            class_: Accepted for reconstruction compatibility; not used here.
            module_: Accepted for reconstruction compatibility; not used here.

        Raises:
            ConfigurationError: If the resolved ``task_type`` is neither
                ``'classification'`` nor ``'regression'``, or if the tree
                constructor rejects ``tree_params``.

        Example:
            >>> binner = TreeBinning(
            ...     task_type='regression',
            ...     tree_params={'max_depth': 3, 'min_samples_split': 10},
            ...     guidance_columns=['continuous_target'],
            ... )
        """
        # Gather the explicitly passed options and let the "supervised"
        # configuration section supply anything left unspecified.
        # NOTE(review): the exact merge rules live in the config helpers.
        settings = apply_config_defaults(
            "supervised",
            create_param_dict_for_config(
                task_type=task_type,
                tree_params=tree_params,
                clip=clip,
                preserve_dataframe=preserve_dataframe,
            ),
        )

        # Method-specific parameters are assigned ahead of the base-class
        # initializer (presumably it may inspect them while validating).
        self.task_type = settings.get("task_type", "classification")
        self.tree_params = settings.get("tree_params", None)

        # Reject unsupported task types early.
        if self.task_type not in ("classification", "regression"):
            raise ConfigurationError(
                f"task_type must be 'classification' or 'regression', got '{self.task_type}'"
            )

        # Per-column storage filled in during fitting, plus the (not yet
        # built) template estimator.
        self._fitted_trees: dict[Any, Any] = {}
        self._tree_importance: dict[Any, float] = {}
        self._tree_template: DecisionTreeClassifier | DecisionTreeRegressor | None = None

        # Shared binning options are handled by the base class.
        SupervisedBinningBase.__init__(
            self,
            clip=settings.get("clip"),
            preserve_dataframe=settings.get("preserve_dataframe"),
            guidance_columns=guidance_columns,
            bin_edges=bin_edges,
            bin_representatives=bin_representatives,
        )

        # The template is built last, once the base class is initialized.
        self._create_tree_template()


    def _validate_params(self) -> None:
        """Run the base-class checks, then verify the tree-specific parameters.

        Raises:
            ConfigurationError: If ``tree_params`` is set but is not a dictionary.
        """
        # Shared parameter validation first.
        SupervisedBinningBase._validate_params(self)

        # ``None`` means "use defaults"; a dict is the only other accepted form.
        params = self.tree_params
        if params is None or isinstance(params, dict):
            return

        raise ConfigurationError(
            "tree_params must be a dictionary",
            suggestions=["Example: tree_params={'max_depth': 3, 'min_samples_leaf': 5}"],
        )

    def _create_tree_template(self) -> None:
        """Create tree template with merged parameters."""
        if self._tree_template is not None:
            return

        # Create simple tree template with default parameters
        default_params = {
            "max_depth": 3,
            "min_samples_leaf": 1,
            "min_samples_split": 2,
            "random_state": None,
        }

        # Merge user params with defaults
        merged_params = {**default_params, **(self.tree_params or {})}

        # Initialize the appropriate tree model template
        try:
            if self.task_type == "classification":
                self._tree_template = DecisionTreeClassifier(**merged_params)
            else:  # regression
                self._tree_template = DecisionTreeRegressor(**merged_params)
        except TypeError as e:
            raise ConfigurationError(
                f"Invalid tree_params: {str(e)}",
                suggestions=[
                    "Check that all tree_params are valid DecisionTree parameters",
                    "Common parameters: max_depth, min_samples_split,"
                    " min_samples_leaf, random_state",
                ],
            ) from e

    # pylint: disable=too-many-locals
    def _calculate_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        guidance_data: np.ndarray[Any, Any] | None = None,
    ) -> tuple[list[float], list[float]]:
        """Calculate bins using decision tree splits for a single column.

        Fits a clone of the tree template to predict the guidance data from the
        feature column, extracts the tree's split thresholds, and combines them
        with the column's minimum and maximum to form the bin edges. The fitted
        tree is stored under ``col_id`` via ``_store_tree_info``.

        Args:
            x_col: Column data to bin (already preprocessed by the base class,
                per the caller's contract).
            col_id: Column identifier, used in error messages and as the key
                under which the fitted tree is stored.
            guidance_data: Target/guidance data for supervised binning (required).

        Returns:
            Tuple of ``(bin_edges, bin_representatives)``. Representatives are
            the midpoints of consecutive edges. For a (near-)constant column
            the edges are ``[value - eps, value + eps]`` with
            ``eps = 10 * float_tolerance``.

        Raises:
            FittingError: If ``guidance_data`` is None, if fewer samples than
                ``min_samples_split`` remain after cleaning, if the tree
                template has not been created, or if fitting the tree fails.
        """
        # Require guidance data for supervised binning
        if guidance_data is None:
            raise FittingError(f"Column {col_id}: guidance_data is required for tree binning")

        # Validate and clean feature-target pairs via the base-class helper
        # (presumably drops rows with NaN/inf targets - see that helper).
        x_col_clean, guidance_clean = self._validate_feature_target_pair(
            x_col, guidance_data, col_id
        )

        # Check for insufficient data after cleaning
        min_samples_split = (self.tree_params or {}).get("min_samples_split", 2)
        if len(x_col_clean) < min_samples_split:
            raise FittingError(
                f"Column {col_id}: Insufficient data points ({len(x_col_clean)}) "
                f"for tree binning after cleaning. Need at least {min_samples_split}."
            )

        # Checked outside the try block so this setup problem is reported as-is
        # rather than being re-wrapped as a "failed to fit" error with
        # unrelated suggestions.
        if self._tree_template is None:
            raise FittingError("Tree template not initialized")

        # Fit a fresh clone of the template so columns never share tree state.
        try:
            tree = clone(self._tree_template)
            # sklearn expects a 2D feature matrix: one row per sample, one column.
            tree.fit(x_col_clean.reshape(-1, 1), guidance_clean)
        except Exception as e:  # pylint: disable=broad-exception-caught
            # Any failure while cloning/fitting is surfaced as a FittingError
            # with the original exception chained as the cause.
            raise FittingError(
                f"Column {col_id}: Failed to fit decision tree: {str(e)}",
                suggestions=[
                    "Check if your target values are valid for the chosen task_type",
                    "Try adjusting tree_params (e.g., reduce max_depth)",
                    "Ensure you have enough data for the tree parameters",
                ],
            ) from e

        # Extract split points from the tree
        split_points = self._extract_split_points(tree, x_col_clean)

        # Store tree information for later access
        self._store_tree_info(tree, col_id)

        # Data bounds become the outermost bin edges.
        data_min: float = float(np.min(x_col_clean))
        data_max: float = float(np.max(x_col_clean))

        config = get_config()
        if abs(data_max - data_min) <= config.float_tolerance:
            # Constant column: a single bin with a small margin on either
            # side, using a margin larger than the comparison tolerance.
            constant_value = data_min  # Same as data_max
            eps = config.float_tolerance * 10
            bin_edges = [constant_value - eps, constant_value + eps]
        else:
            # Combine data bounds with the sorted split points, then drop
            # edges that coincide within float tolerance.
            all_edges = [data_min] + sorted(split_points) + [data_max]
            bin_edges = self._filter_duplicate_edges(all_edges)

        # Representatives are the midpoints of consecutive edges.
        representatives = [
            (lower + upper) / 2 for lower, upper in zip(bin_edges[:-1], bin_edges[1:])
        ]

        return bin_edges, representatives

    def _filter_duplicate_edges(self, all_edges: list[float]) -> list[float]:
        """Drop edges that repeat the previously kept edge within float tolerance.

        Edges are visited in the given order. The first edge is always kept;
        each later edge is kept only when it differs from the most recently
        kept edge by more than the configured ``float_tolerance``.

        Args:
            all_edges: Candidate edge values, in the order they should be kept.

        Returns:
            New list containing the retained edges.
        """
        config = get_config()
        kept: list[float] = []
        for candidate in all_edges:
            is_first = not kept
            # Compare against the last retained edge, not the previous candidate.
            if is_first or abs(candidate - kept[-1]) > config.float_tolerance:
                kept.append(candidate)
        return kept

    def _extract_split_points(self, tree: Any, x_data: np.ndarray[Any, Any]) -> list[float]:
        """Extract split points from a fitted decision tree.

        Args:
            tree: Fitted decision tree model
            x_data: Training data used to fit the tree

        Returns:
            List of unique split threshold values extracted from the tree
        """
        _ = x_data

        split_points = []

        # Access the tree structure
        tree_structure = tree.tree_
        feature = tree_structure.feature
        threshold = tree_structure.threshold

        # Extract thresholds for splits on our single feature (index 0)
        for node_id in range(tree_structure.node_count):
            if feature[node_id] == 0:  # Split on our feature
                split_points.append(float(threshold[node_id]))

        return split_points

    def _store_tree_info(self, tree: Any, col_id: Any) -> None:
        """Store tree information for later access.

        Args:
            tree: Fitted decision tree model
            col_id: Column identifier
        """
        self._fitted_trees[col_id] = tree

        # Calculate and store feature importance (always 1.0 for single feature)
        self._tree_importance[col_id] = 1.0