# Source code for binlearn.methods._gaussian_mixture_binning

"""
Gaussian Mixture binning implementation for the clean binlearn architecture.

This module provides GaussianMixtureBinning that inherits from IntervalBinningBase.
Uses Gaussian Mixture Model clustering to find natural probabilistic bin boundaries.
"""

from __future__ import annotations

from typing import Any

import numpy as np
from sklearn.exceptions import ConvergenceWarning
from sklearn.mixture import GaussianMixture

from ..base import IntervalBinningBase
from ..config import apply_config_defaults
from ..utils import (
    BinEdgesDict,
    ConfigurationError,
    apply_equal_width_fallback,
    create_param_dict_for_config,
    resolve_n_bins_parameter,
    validate_bin_number_for_calculation,
    validate_bin_number_parameter,
)
from ..utils._parameter_validation import validate_random_state


# pylint: disable=too-many-ancestors
class GaussianMixtureBinning(IntervalBinningBase):
    """Gaussian Mixture Model (GMM) clustering-based binning.

    For each column a one-dimensional ``sklearn.mixture.GaussianMixture`` is
    fitted to the finite values. The fitted component means are sorted and
    used as bin representatives; the bin edges are the data minimum, the
    midpoints between adjacent sorted means, and the data maximum.

    This approach targets data with multiple modes or natural clustering.
    When GMM fitting raises, or produces means outside the observed data
    range, the method either falls back to equal-width binning
    (``allow_fallback=True``) or raises ``ConfigurationError``
    (``allow_fallback=False``).

    Args:
        n_components: Number of Gaussian components to fit, which determines
            the number of bins. Either an integer or a string expression
            (e.g. ``'sqrt'``, ``'log2'``) resolved against the data size at
            fit time. If None, the configuration default is used (10 when
            the configuration provides none).
        random_state: Random seed passed to ``GaussianMixture``. If None,
            results may vary between runs due to random initialization.
        allow_fallback: Whether to fall back to equal-width binning when GMM
            fitting fails. If None, the configuration default is used
            (True when the configuration provides none).
        clip: Forwarded to ``IntervalBinningBase``. If None, uses the
            configuration default.
        preserve_dataframe: Forwarded to ``IntervalBinningBase``. If None,
            uses the configuration default.
        fit_jointly: Forwarded to ``IntervalBinningBase``. Bin calculation
            in this class operates on one column at a time. If None, uses
            the configuration default.
        bin_edges: Pre-computed bin edges for reconstruction. Should not be
            provided during normal usage.
        bin_representatives: Pre-computed bin representatives for
            reconstruction. Should not be provided during normal usage.
        class_: Class name for reconstruction compatibility. Unused.
        module_: Module name for reconstruction compatibility. Unused.

    Attributes:
        n_components: Number of mixture components to fit.
        random_state: Random seed for reproducible results.
        allow_fallback: Whether to fall back to equal-width binning on
            failure.

    Example:
        >>> import numpy as np
        >>> from binlearn.methods import GaussianMixtureBinning
        >>>
        >>> # Create sample data with multiple modes
        >>> np.random.seed(42)
        >>> data = np.concatenate([
        ...     np.random.normal(-2, 0.5, 200),  # First mode
        ...     np.random.normal(1, 0.8, 300),   # Second mode
        ...     np.random.normal(4, 0.3, 150)    # Third mode
        ... ])
        >>>
        >>> # Initialize GMM binning with 3 components
        >>> binner = GaussianMixtureBinning(n_components=3, random_state=42)
        >>>
        >>> # Fit and transform
        >>> X = data.reshape(-1, 1)
        >>> binner.fit(X)
        >>> X_binned = binner.transform(X)
        >>>
        >>> # Check identified components
        >>> print(f"Number of bins: {len(binner.bin_edges_[0]) - 1}")
        >>> print(f"Bin representatives: {binner.bin_representatives_[0]}")

    Note:
        - Component means become the bin representatives.
        - Bin boundaries are placed at midpoints between adjacent component
          means (not at the GMM posterior decision boundaries).
        - Requires at least ``n_components`` finite values per column;
          otherwise ``ValueError`` is raised (no fallback in that case).
        - Uses ``covariance_type="full"``.

    See Also:
        KMeansBinning: Alternative clustering-based binning with hard
            cluster assignments.
        DBSCANBinning: Density-based clustering for irregularly shaped
            clusters.
        EqualWidthBinning: Simple equal-width interval binning.
    """

    # pylint: disable=too-many-arguments,too-many-positional-arguments
    def __init__(
        self,
        n_components: int | str | None = None,
        random_state: int | None = None,
        allow_fallback: bool | None = None,
        clip: bool | None = None,
        preserve_dataframe: bool | None = None,
        fit_jointly: bool | None = None,
        *,
        bin_edges: BinEdgesDict | None = None,
        bin_representatives: BinEdgesDict | None = None,
        class_: str | None = None,  # pylint: disable=unused-argument
        module_: str | None = None,  # pylint: disable=unused-argument
    ):
        """Initialize Gaussian Mixture binning.

        Collects the user-supplied parameters, lets the ``"gaussian_mixture"``
        configuration fill in any that were left as None, stores the
        method-specific ones on the instance and forwards the rest to
        ``IntervalBinningBase``.

        Args:
            n_components: Number of Gaussian components to fit. An integer,
                or a string expression (``'sqrt'``, ``'log2'``, ...) resolved
                at fit time. If None, uses the configuration default.
            random_state: Random seed for GMM fitting. If None, results may
                vary between runs.
            allow_fallback: Whether to fall back to equal-width binning when
                GMM fitting fails. If None, uses the configuration default.
            clip: Forwarded to the base class. If None, uses the
                configuration default.
            preserve_dataframe: Forwarded to the base class. If None, uses
                the configuration default.
            fit_jointly: Forwarded to the base class. If None, uses the
                configuration default.
            bin_edges: Pre-computed bin edges dictionary for reconstruction.
                Internal use only.
            bin_representatives: Pre-computed representatives dictionary for
                reconstruction. Internal use only.
            class_: Accepted for reconstruction compatibility; ignored.
            module_: Accepted for reconstruction compatibility; ignored.

        Example:
            >>> # Standard initialization with 5 components
            >>> binner = GaussianMixtureBinning(n_components=5, random_state=42)
            >>>
            >>> # Dynamic component count based on data size
            >>> binner = GaussianMixtureBinning(n_components='sqrt', random_state=123)
            >>>
            >>> # Use configuration defaults
            >>> binner = GaussianMixtureBinning()
        """
        # Gather user-supplied values in the shape the config layer expects.
        user_params = create_param_dict_for_config(
            n_components=n_components,
            random_state=random_state,
            allow_fallback=allow_fallback,
            clip=clip,
            preserve_dataframe=preserve_dataframe,
            fit_jointly=fit_jointly,
        )

        # Let the configuration fill in anything the caller did not specify.
        resolved_params = apply_config_defaults("gaussian_mixture", user_params)

        # Method-specific parameters, with hard defaults as a last resort.
        self.n_components = resolved_params.get("n_components", 10)
        self.random_state = resolved_params.get("random_state", None)
        self.allow_fallback = resolved_params.get("allow_fallback", True)

        # Explicit base-class call (rather than super()) is kept from the
        # original implementation.
        IntervalBinningBase.__init__(
            self,
            clip=resolved_params.get("clip"),
            preserve_dataframe=resolved_params.get("preserve_dataframe"),
            fit_jointly=resolved_params.get("fit_jointly"),
            guidance_columns=None,
            bin_edges=bin_edges,
            bin_representatives=bin_representatives,
        )

    def _validate_params(self) -> None:
        """Validate Gaussian Mixture binning parameters.

        Runs the base-class validation, then checks ``n_components`` and
        ``random_state`` with the shared validation utilities.
        """
        IntervalBinningBase._validate_params(self)

        validate_bin_number_parameter(self.n_components, param_name="n_components")
        validate_random_state(self.random_state)

    def _calculate_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        guidance_data: np.ndarray[Any, Any] | None = None,
    ) -> tuple[list[float], list[float]]:
        """Calculate GMM-based bins for a single column.

        Resolves ``n_components`` (which may be a string expression) against
        the column length and delegates to ``_create_gmm_bins``.

        Args:
            x_col: Column data for which to compute bins.
            col_id: Column identifier, used in error messages.
            guidance_data: Not used for GMM binning (unsupervised).

        Returns:
            Tuple of (bin_edges, bin_representatives).

        Raises:
            ValueError: If there are fewer finite values than components.
                The validation helpers called here may raise as well.
        """
        validate_bin_number_for_calculation(self.n_components, param_name="n_components")

        resolved_n_components = resolve_n_bins_parameter(
            self.n_components, data_shape=(len(x_col), 1), param_name="n_components"
        )

        return self._create_gmm_bins(x_col, col_id, resolved_n_components)

    # pylint: disable=too-many-locals
    def _create_gmm_bins(
        self,
        x_col: np.ndarray[Any, Any],
        col_id: Any,
        n_components: int,
    ) -> tuple[list[float], list[float]]:
        """Create GMM-based bin edges and representatives for one column.

        Args:
            x_col: Column data that may contain NaN/inf values; these are
                filtered out before fitting.
            col_id: Column identifier, used in error messages.
            n_components: Number of mixture components to fit.

        Returns:
            Tuple of (bin_edges, bin_representatives). On success the edges
            are ``[min, midpoints between sorted means..., max]`` and the
            representatives are the sorted component means.

        Raises:
            ValueError: If fewer than ``n_components`` finite values remain.
                This check happens before fitting and never triggers the
                fallback.
            ConfigurationError: If fitting fails and ``allow_fallback`` is
                falsy.
        """
        # GMM cannot handle NaN/inf, so fit on the finite values only.
        x_col_clean = x_col[np.isfinite(x_col)]

        if len(x_col_clean) < n_components:
            raise ValueError(
                f"Column {col_id}: Insufficient finite values ({len(x_col_clean)}) "
                f"for {n_components} components. Need at least {n_components} values."
            )

        try:
            gmm = GaussianMixture(
                n_components=n_components,
                random_state=self.random_state,
                covariance_type="full",
            )
            # GaussianMixture expects a 2D (n_samples, n_features) array.
            gmm.fit(x_col_clean.reshape(-1, 1))

            sorted_means = np.sort(np.array(gmm.means_).flatten())

            min_val = float(np.min(x_col_clean))
            max_val = float(np.max(x_col_clean))

            # Means outside the observed range indicate a degenerate fit;
            # raising here routes into the fallback handling below.
            tolerance = 1e-10  # Small tolerance for floating point precision
            if np.any(sorted_means < min_val - tolerance) or np.any(
                sorted_means > max_val + tolerance
            ):
                raise ValueError(
                    f"GMM produced means outside data range: {sorted_means} not "
                    f"in [{min_val}, {max_val}]"
                )

            # Interior edges are midpoints between adjacent sorted means.
            midpoints = (sorted_means[:-1] + sorted_means[1:]) / 2
            edges = [min_val, *midpoints.tolist(), max_val]

            # Representatives are the component means themselves.
            reps = sorted_means.tolist()

            return edges, reps

        # NOTE(review): ConvergenceWarning is presumably issued via the
        # warnings machinery rather than raised, so it would only be caught
        # here when warnings are configured as errors - confirm intent.
        except (
            ValueError,
            RuntimeError,
            ConvergenceWarning,
        ) as e:
            if not self.allow_fallback:
                raise ConfigurationError(
                    f"GMM fitting failed: {str(e)}",
                    suggestions=[
                        "Try reducing n_components",
                        "Increase sample size",
                        "Check data distribution",
                        "Set allow_fallback=True to enable equal-width fallback",
                    ],
                ) from e

            # Use standardized equal-width fallback for the edges.
            # NOTE(review): the representatives are n_components evenly
            # spaced points spanning [min, max] inclusive, i.e. not the
            # centres of equal-width bins - confirm this is intended.
            fallback_edges = list(apply_equal_width_fallback(x_col_clean, n_components, "GMM"))
            fallback_reps = [
                float(val)
                for val in np.linspace(np.min(x_col_clean), np.max(x_col_clean), n_components)
            ]
            return fallback_edges, fallback_reps