From 117ea4cff9f4796f244768bacefcb65a729b64ee Mon Sep 17 00:00:00 2001
From: Bartosz Baranowski <bartekbaranow@gmail.com>
Date: Wed, 20 Aug 2025 16:45:31 +0200
Subject: [PATCH 1/8] WIP: Working CSR matrix as intended with sets. TODO:
 numpy array (dense matrix) Checking sklearn integration Cleanup

---
 src/laplaciannb/LaplacianNB.py                | 313 +++++++-
 src/laplaciannb/__init__.py                   |  22 +-
 src/laplaciannb/fingerprint_utils.py          | 694 +-----------------
 src/laplaciannb/legacy/LaplacianNB.py         |  25 -
 .../{ => legacy}/LaplacianNB_new.py           |   0
 tests/bayes_test.py                           |   0
 tests/laplaciannb.py                          |   0
 tests/test_bayes.py                           | 136 ++--
 tests/test_bayes_compatibility.py             | 292 --------
 tests/test_complete_deprecation.py            | 170 -----
 tests/test_deprecation.py                     | 211 ------
 tests/test_fingerprint_csr_conversion.py      |  61 ++
 tests/test_fingerprint_utils.py               | 311 --------
 tests/test_laplacian_nb_compatibility.py      | 365 ---------
 tests/test_laplacian_nb_standalone.py         |   0
 tests/test_main_imports.py                    |  74 --
 tests/test_performance_comparison.py          |   0
 tests/test_sklearn_integration.py             | 519 -------------
 18 files changed, 474 insertions(+), 2719 deletions(-)
 rename src/laplaciannb/{ => legacy}/LaplacianNB_new.py (100%)
 delete mode 100644 tests/bayes_test.py
 delete mode 100644 tests/laplaciannb.py
 delete mode 100644 tests/test_bayes_compatibility.py
 delete mode 100644 tests/test_complete_deprecation.py
 delete mode 100644 tests/test_deprecation.py
 create mode 100644 tests/test_fingerprint_csr_conversion.py
 delete mode 100644 tests/test_fingerprint_utils.py
 delete mode 100644 tests/test_laplacian_nb_compatibility.py
 delete mode 100644 tests/test_laplacian_nb_standalone.py
 delete mode 100644 tests/test_main_imports.py
 delete mode 100644 tests/test_performance_comparison.py
 delete mode 100644 tests/test_sklearn_integration.py

diff --git a/src/laplaciannb/LaplacianNB.py b/src/laplaciannb/LaplacianNB.py
index f864365..33442f4 100644
--- a/src/laplaciannb/LaplacianNB.py
+++ b/src/laplaciannb/LaplacianNB.py
@@ -1,11 +1,310 @@
-"""
-Modern sklearn-compatible LaplacianNB implementation.
+import warnings
+from functools import reduce
+from itertools import compress
 
-This module provides the recommended LaplacianNB implementation with full
-sklearn ecosystem integration.
-"""
+import numpy as np
+from scipy.special import logsumexp
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.naive_bayes import _BaseDiscreteNB
+from sklearn.preprocessing import LabelBinarizer
+from sklearn.utils.validation import _check_sample_weight, check_is_fitted, validate_data
 
-from .LaplacianNB_new import LaplacianNB
 
+class LaplacianNB(_BaseDiscreteNB):
+    """Naive Bayes classifier for laplacian modified models.
 
-__all__ = ["LaplacianNB"]
+    DEPRECATED: This is the legacy LaplacianNB implementation.
+    Please use the new sklearn-compatible version instead:
+
+        from laplaciannb import LaplacianNB  # New version (recommended)
+
+    The new implementation offers:
+    - Full sklearn compatibility (pipelines, cross-validation, grid search)
+    - Memory-efficient sparse matrix support
+    - Better error handling and validation
+    - Consistent API with other sklearn estimators
+    - Enhanced fingerprint utility functions
+
+    This legacy version will be removed in a future release.
+
+    Like BernoulliNB, this classifier is suitable for binary/boolean data. The
+    difference is that while BernoulliNB processes all features, while
+    laplacian modified approach is using only positive bits.
+    Parameters
+    ----------
+    alpha : float, default=1.0
+        Additive (Laplace/Lidstone) smoothing parameter
+        (0 for no smoothing).
+    fit_prior : bool, default=True
+        Whether to learn class prior probabilities or not.
+        If false, a uniform prior will be used.
+    class_prior : array-like of shape (n_classes,), default=None
+        Prior probabilities of the classes. If specified, the priors are not
+        adjusted according to the data.
+    Attributes
+    ----------
+    class_count_ : ndarray of shape (n_classes,)
+        Number of samples encountered for each class during fitting. This
+        value is weighted by the sample weight when provided.
+    class_log_prior_ : ndarray of shape (n_classes,)
+        Log probability of each class (smoothed).
+    classes_ : ndarray of shape (n_classes,)
+        Class labels known to the classifier
+    feature_count_ : ndarray of shape (n_classes, n_features)
+        Number of 1' bits encountered for each (class, feature)
+        during fitting.
+    feature_all_ : total number of features encountered.
+    feature_log_prob_ : ndarray of shape (n_classes, n_features)
+        Empirical log probability of 1' bit features given a class, P(x_i|y).
+    n_features_ : int
+        Number of features of each sample.
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+        .. versionadded:: 0.24
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+    See Also
+    --------
+    CategoricalNB : Naive Bayes classifier for categorical features.
+    ComplementNB : The Complement Naive Bayes classifier
+        described in Rennie et al. (2003).
+    GaussianNB : Gaussian Naive Bayes (GaussianNB).
+    MultinomialNB : Naive Bayes classifier for multinomial models.
+    References
+    ----------
+    Nidhi; Glick, M.; Davies, J. W.; Jenkins, J. L. Prediction of biological targets
+    for compounds using multiple-category Bayesian models trained on chemogenomics
+    databases. J. Chem. Inf. Model. 2006, 46, 1124– 1133,
+    https://doi.org/10.1021/ci060003g
+    Lam PY, Kutchukian P, Anand R, et al.
+    Cyp1 inhibition prevents doxorubicin-induced cardiomyopathy
+    in a zebrafish heart-failure model. Chem Bio Chem. 2020:cbic.201900741.
+    https://doi.org/10.1002/cbic.201900741
+    Examples
+    --------
+    >>> import numpy as np
+    >>> rng = np.random.RandomState(1)
+    >>> arr = rng.randint(2, size=(6, 100))
+    >>> Y = np.array([1, 2, 3, 4, 4, 5])
+    >>> Xlist = []
+    >>> for i in arr:
+    >>>     Xlist.append(set(i.nonzero()[0]))
+    >>> X = np.array(Xlist)
+    >>> from bayes.LaplacianNB import LaplacianNB
+    >>> clf = LaplacianNB()
+    >>> clf.fit(X, Y)
+    LaplacianNB()
+    >>> print(clf.predict(X[2:3]))
+    [3]
+    """
+
+    # see https://github.com/scikit-learn/scikit-learn/pull/22269 for an explanation
+
+    def __init__(self, *, alpha=1.0, force_alpha=True, fit_prior=True, class_prior=None):
+        self.alpha = alpha
+        self.fit_prior = fit_prior
+        self.class_prior = class_prior
+        force_alpha = force_alpha
+
+    def _check_X(self, X):
+        """Validate X, used only in predict* methods."""
+        X = validate_data(self, X, reset=False, accept_sparse=["csr"])
+        return X
+
+    def _check_X_y(self, X, y, reset=True):
+        X, y = validate_data(self, X, y, reset=reset, accept_sparse=["csr"])
+        return X, y
+
+    def _sum_sets(self, set_list):
+        def reducer(accumulator, element):
+            for key in element:
+                accumulator[key] = accumulator.get(key, 0) + 1
+            return accumulator
+
+        return reduce(reducer, set_list, {})
+
+    # Even more memory-efficient version that avoids creating huge matrices
+    def _count_feature_count(self, X_sparse, Y):
+        """Most efficient version that handles 2^32 feature space gracefully."""
+        from collections import defaultdict
+        
+        # Get active features to avoid working with full 2^32 space
+        X_coo = X_sparse.tocoo()
+        
+        # 1. Total feature counts
+        all_feature_counts = defaultdict(int)
+        for col_idx, data_val in zip(X_coo.col, X_coo.data):
+            all_feature_counts[col_idx] += data_val
+        all_feature_counts = dict(sorted(all_feature_counts.items()))
+        
+        # 2. Class-specific counts by iterating samples
+        class_feature_counts = [defaultdict(int) for _ in range(len(self.classes_))]
+        feature_sum = np.zeros(len(self.classes_))
+        
+        # Group elements by sample (row)
+        sample_features = defaultdict(list)
+        for row_idx, col_idx, data_val in zip(X_coo.row, X_coo.col, X_coo.data):
+            sample_features[row_idx].append((col_idx, data_val))
+        
+        # Count features per class
+        for sample_idx, features in sample_features.items():
+            # Find which classes this sample belongs to
+            sample_classes = Y[sample_idx].nonzero()[0]
+            
+            for class_idx in sample_classes:
+                class_weight = Y[sample_idx, class_idx]
+                for col_idx, data_val in features:
+                    weighted_count = data_val * class_weight
+                    class_feature_counts[class_idx][col_idx] += weighted_count
+                    feature_sum[class_idx] += weighted_count
+        
+        # Convert to sorted dictionaries
+        class_feature_counts = [dict(sorted(d.items())) for d in class_feature_counts]
+        
+        return all_feature_counts, feature_sum, class_feature_counts
+
+    def _init_counters(self, n_classes):
+        self.class_count_ = np.zeros(n_classes, dtype=np.float64)
+        # self.feature_count_ = np.zeros((n_classes, n_features), dtype=np.float64)
+
+    def _count(self, X, Y):
+        """Count and smooth feature occurrences."""
+        (
+            self.feature_count_all_dict_,
+            self.feature_count_,
+            self.feature_count_dict_,
+        ) = self._count_feature_count(X, Y)
+        self.feature_all_ = sum(self.feature_count_)
+        self.class_count_ += Y.sum(axis=0)
+
+    def _update_feature_log_prob(self, alpha):
+        """Apply smoothing to raw counts and recompute log probabilities"""
+        dictvectorizer = DictVectorizer(sparse=False)
+        total = dictvectorizer.fit_transform(self.feature_count_all_dict_)
+        classc = dictvectorizer.fit_transform(self.feature_count_dict_)
+        self.feature_names_ = [int(i) for i in dictvectorizer.get_feature_names_out()]
+        self.feature_names_ = dict(zip(self.feature_names_, range(len(self.feature_names_))))
+        prior = self.feature_count_ / self.feature_all_
+        self.feature_prob_ = (classc + alpha) / (np.outer(prior, total) + alpha)
+        self.feature_log_prob_ = np.log(self.feature_prob_).astype("float32")
+
+    def _joint_log_likelihood(self, X):
+        """Calculate the posterior log probability of the samples X"""
+        n_features = self.feature_log_prob_.shape[1]
+
+        new_X = np.zeros([X.shape[0], n_features], dtype=bool)
+
+        for i, row in enumerate(X):
+            # Handle sparse matrix row
+            row_coo = row.tocoo()
+            for col_idx in row_coo.col:
+                if self.feature_names_.get(col_idx) is not None:
+                    new_X[i, self.feature_names_[col_idx]] = 1
+        jll = np.dot(new_X, self.feature_log_prob_.T)
+        return jll
+
+    def fit(self, X, y, sample_weight=None):
+        """Fit Naive Bayes classifier according to X, y.
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training vectors, where `n_samples` is the number of samples and
+            `n_features` is the number of features.
+        y : array-like of shape (n_samples,)
+            Target values.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Weights applied to individual samples (1. for unweighted).
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+        X, y = self._check_X_y(X, y)
+
+        labelbin = LabelBinarizer()
+        Y = labelbin.fit_transform(y)
+        self.classes_ = labelbin.classes_
+        if Y.shape[1] == 1:
+            if len(self.classes_) == 2:
+                Y = np.concatenate((1 - Y, Y), axis=1)
+            else:  # degenerate case: just one class
+                Y = np.ones_like(Y)
+
+        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
+        # We convert it to np.float64 to support sample_weight consistently;
+        # this means we also don't have to cast X to floating point
+        if sample_weight is not None:
+            Y = Y.astype(np.float64, copy=False)
+            sample_weight = _check_sample_weight(sample_weight, X)
+            sample_weight = np.atleast_2d(sample_weight)
+            Y *= sample_weight.T
+
+        class_prior = self.class_prior
+
+        # Count raw events from data before updating the class log prior
+        # and feature log probas
+        n_classes = Y.shape[1]
+        self._init_counters(n_classes)
+        self._count(X, Y)
+        alpha = self._check_alpha()
+        self._update_feature_log_prob(alpha)
+        self._update_class_log_prior(class_prior=class_prior)
+        return self
+
+    def predict_log_proba(self, X):
+        """
+        Return log-probability estimates for the test vector X.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input samples.
+        Returns
+        -------
+        C : array-like of shape (n_samples, n_classes)
+            Returns the log-probability of the samples for each class in
+            the model. The columns correspond to the classes in sorted
+            order, as they appear in the attribute :term:`classes_`.
+        """
+        check_is_fitted(self)
+        X = self._check_X(X)
+        jll = self._joint_log_likelihood(X)
+        # normalize by P(x) = P(f_1, ..., f_n)
+        log_prob_x = logsumexp(jll, axis=1)
+        return jll - np.atleast_2d(log_prob_x).T
+
+    def predict_proba(self, X):
+        """
+        Return probability estimates for the test vector X.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input samples.
+        Returns
+        -------
+        C : array-like of shape (n_samples, n_classes)
+            Returns the probability of the samples for each class in
+            the model. The columns correspond to the classes in sorted
+            order, as they appear in the attribute :term:`classes_`.
+        """
+        return np.exp(self.predict_log_proba(X))
+
+    def predict(self, X):
+        """
+        Perform classification on an array of test vectors X.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input samples.
+        Returns
+        -------
+        C : ndarray of shape (n_samples,)
+            Predicted target values for X.
+        """
+        check_is_fitted(self)
+        X = self._check_X(X)
+        jll = self._joint_log_likelihood(X)
+        return self.classes_[np.argmax(jll, axis=1)]
diff --git a/src/laplaciannb/__init__.py b/src/laplaciannb/__init__.py
index 3151fe0..dc8e078 100644
--- a/src/laplaciannb/__init__.py
+++ b/src/laplaciannb/__init__.py
@@ -17,28 +17,12 @@
 - Enhanced fingerprint utility functions
 """
 
-from .fingerprint_utils import (
-    FingerprintTransformer,
-    RDKitFingerprintConverter,
-    convert_fingerprints,
-    rdkit_sparse_to_csc,
-    rdkit_sparse_to_csr,
-    rdkit_sparse_to_dense,
-    rdkit_sparse_to_numpy,
-    rdkit_sparse_to_sklearn,
-)
-from .LaplacianNB import LaplacianNB
+from .fingerprint_utils import rdkit_to_csr
+from .laplaciannb import LaplacianNB
 
 
 __version__ = "0.7.0"
 __all__ = [
     "LaplacianNB",
-    "FingerprintTransformer",
-    "RDKitFingerprintConverter",
-    "convert_fingerprints",
-    "rdkit_sparse_to_dense",
-    "rdkit_sparse_to_csr",
-    "rdkit_sparse_to_csc",
-    "rdkit_sparse_to_numpy",
-    "rdkit_sparse_to_sklearn",
+    "rdkit_to_csr",
 ]
diff --git a/src/laplaciannb/fingerprint_utils.py b/src/laplaciannb/fingerprint_utils.py
index a9b7b68..83e2eb8 100644
--- a/src/laplaciannb/fingerprint_utils.py
+++ b/src/laplaciannb/fingerprint_utils.py
@@ -1,665 +1,33 @@
-from typing import Any, Dict, Optional, Union
-
 import numpy as np
-from scipy import sparse
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils.validation import check_is_fitted
-
-
-def rdkit_sparse_to_dense(fingerprint, n_bits: int = 2048, dtype=np.float32) -> np.ndarray:
-    """Convert a single RDKit sparse fingerprint to dense numpy array.
-
-    Parameters
-    ----------
-    fingerprint : various RDKit fingerprint types
-        Can be:
-        - RDKit ExplicitBitVect
-        - RDKit SparseBitVect
-        - RDKit IntSparseIntVect
-        - UIntSparseIntVect
-        - LongSparseIntVect
-        - Set of on-bit indices
-        - Dict mapping bit indices to counts
-        - List/tuple of on-bit indices
-
-    n_bits : int, default=2048
-        Size of the output fingerprint vector.
-
-    dtype : numpy dtype, default=np.float32
-        Data type of the output array.
-
-    Returns
-    -------
-    np.ndarray
-        Dense numpy array of shape (n_bits,) with binary or count values.
-
-    Examples
-    --------
-    >>> from rdkit import Chem
-    >>> from rdkit.Chem import AllChem
-    >>> mol = Chem.MolFromSmiles('CCO')
-    >>> fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
-    >>> dense_fp = rdkit_sparse_to_dense(fp, n_bits=2048)
-    """
-    dense = np.zeros(n_bits, dtype=dtype)
-
-    if fingerprint is None:
-        return dense
-
-    # Handle RDKit BitVect types
-    if hasattr(fingerprint, "GetOnBits"):
-        # ExplicitBitVect or SparseBitVect
-        for bit_idx in fingerprint.GetOnBits():
-            if 0 <= bit_idx < n_bits:
-                dense[bit_idx] = 1.0
-
-    # Handle RDKit SparseIntVect types
-    elif hasattr(fingerprint, "GetNonzeroElements"):
-        # IntSparseIntVect, UIntSparseIntVect, LongSparseIntVect
-        for bit_idx, count in fingerprint.GetNonzeroElements().items():
-            if 0 <= bit_idx < n_bits:
-                dense[bit_idx] = float(count)
-
-    # Handle Python set (set of on-bits)
-    elif isinstance(fingerprint, set):
-        for bit_idx in fingerprint:
-            if 0 <= bit_idx < n_bits:
-                dense[bit_idx] = 1.0
-
-    # Handle Python dict (bit_idx: count mapping)
-    elif isinstance(fingerprint, dict):
-        for bit_idx, count in fingerprint.items():
-            if 0 <= bit_idx < n_bits:
-                dense[bit_idx] = float(count)
-
-    # Handle list/tuple of on-bit indices
-    elif isinstance(fingerprint, (list, tuple)):
-        # Check if it's a list of indices or a full vector
-        if len(fingerprint) == n_bits:
-            # Full vector, return as-is after conversion
-            return np.asarray(fingerprint, dtype=dtype)
-        else:
-            # List of on-bit indices
-            for bit_idx in fingerprint:
-                if 0 <= bit_idx < n_bits:
-                    dense[bit_idx] = 1.0
-
-    # Handle numpy array (already in correct format)
-    elif isinstance(fingerprint, np.ndarray):
-        if len(fingerprint) == n_bits:
-            return fingerprint.astype(dtype)
-        else:
-            # Treat as list of indices
-            for bit_idx in fingerprint:
-                if 0 <= bit_idx < n_bits:
-                    dense[bit_idx] = 1.0
-
-    else:
-        # Try to iterate as a sequence
-        try:
-            for bit_idx in fingerprint:
-                if 0 <= bit_idx < n_bits:
-                    dense[bit_idx] = 1.0
-        except (TypeError, ValueError):
-            raise ValueError(f"Unsupported fingerprint type: {type(fingerprint)}")
-
-    return dense
-
-
-def rdkit_sparse_to_csr(fingerprints, n_bits: int = 2048, dtype=np.float32) -> sparse.csr_matrix:
-    """Convert RDKit sparse fingerprints to scipy CSR sparse matrix.
-
-    Parameters
-    ----------
-    fingerprints : single fingerprint or list of fingerprints
-        RDKit fingerprints in various formats.
-
-    n_bits : int, default=2048
-        Size of the fingerprint vectors.
-
-    dtype : numpy dtype, default=np.float32
-        Data type of the output matrix.
-
-    Returns
-    -------
-    sparse.csr_matrix
-        Sparse CSR matrix of shape (n_samples, n_bits).
-
-    Examples
-    --------
-    >>> from rdkit import Chem
-    >>> from rdkit.Chem import AllChem
-    >>> mols = [Chem.MolFromSmiles('CCO'), Chem.MolFromSmiles('CC')]
-    >>> fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
-    >>> csr_matrix = rdkit_sparse_to_csr(fps, n_bits=2048)
-    """
-    # Handle single fingerprint
-    if not isinstance(fingerprints, (list, tuple, np.ndarray)):
-        fingerprints = [fingerprints]
-    elif isinstance(fingerprints, np.ndarray) and fingerprints.ndim == 1:
-        # Could be a single dense fingerprint or array of fingerprints
-        if len(fingerprints) == n_bits:
-            fingerprints = [fingerprints]
-
-    n_samples = len(fingerprints)
-    rows, cols, data = [], [], []
-
-    for i, fp in enumerate(fingerprints):
-        if fp is None:
+from scipy.sparse import csr_matrix
+from rdkit.Chem import rdFingerprintGenerator
+from rdkit import Chem
+
+
+def rdkit_to_csr(smiles_list, radius=2):
+    """Convert RDKit sparse Morgan fingerprints to CSR matrix with lossless conversion."""
+    row_ind = []
+    col_ind = []
+    
+    # Create Morgan fingerprint generator
+    mol_list = [Chem.MolFromSmiles(smi) for smi in smiles_list]
+    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius)
+    
+    for i, mol in enumerate(mol_list):
+        if mol is None:
             continue
-
-        # Extract on-bits and values
-        if hasattr(fp, "GetOnBits"):
-            # BitVect types
-            for bit_idx in fp.GetOnBits():
-                if 0 <= bit_idx < n_bits:
-                    rows.append(i)
-                    cols.append(bit_idx)
-                    data.append(1.0)
-
-        elif hasattr(fp, "GetNonzeroElements"):
-            # SparseIntVect types
-            for bit_idx, count in fp.GetNonzeroElements().items():
-                if 0 <= bit_idx < n_bits:
-                    rows.append(i)
-                    cols.append(bit_idx)
-                    data.append(float(count))
-
-        elif isinstance(fp, set):
-            for bit_idx in fp:
-                if 0 <= bit_idx < n_bits:
-                    rows.append(i)
-                    cols.append(bit_idx)
-                    data.append(1.0)
-
-        elif isinstance(fp, dict):
-            for bit_idx, count in fp.items():
-                if 0 <= bit_idx < n_bits:
-                    rows.append(i)
-                    cols.append(bit_idx)
-                    data.append(float(count))
-
-        elif isinstance(fp, (list, tuple, np.ndarray)):
-            if len(fp) == n_bits:
-                # Full vector
-                for j, val in enumerate(fp):
-                    if val != 0:
-                        rows.append(i)
-                        cols.append(j)
-                        data.append(float(val))
-            else:
-                # List of indices
-                for bit_idx in fp:
-                    if 0 <= bit_idx < n_bits:
-                        rows.append(i)
-                        cols.append(bit_idx)
-                        data.append(1.0)
-
-        else:
-            # Try to iterate
-            try:
-                for bit_idx in fp:
-                    if 0 <= bit_idx < n_bits:
-                        rows.append(i)
-                        cols.append(bit_idx)
-                        data.append(1.0)
-            except (TypeError, ValueError):
-                raise ValueError(f"Unsupported fingerprint type: {type(fp)}")
-
-    return sparse.csr_matrix((data, (rows, cols)), shape=(n_samples, n_bits), dtype=dtype)
-
-
-def rdkit_sparse_to_csc(fingerprints, n_bits: int = 2048, dtype=np.float32) -> sparse.csc_matrix:
-    """Convert RDKit sparse fingerprints to scipy CSC sparse matrix.
-
-    Parameters
-    ----------
-    fingerprints : single fingerprint or list of fingerprints
-        RDKit fingerprints in various formats.
-
-    n_bits : int, default=2048
-        Size of the fingerprint vectors.
-
-    dtype : numpy dtype, default=np.float32
-        Data type of the output matrix.
-
-    Returns
-    -------
-    sparse.csc_matrix
-        Sparse CSC matrix of shape (n_samples, n_bits).
-    """
-    csr = rdkit_sparse_to_csr(fingerprints, n_bits=n_bits, dtype=dtype)
-    return csr.tocsc()
-
-
-def rdkit_sparse_to_numpy(fingerprints, n_bits: int = 2048, dtype=np.float32) -> np.ndarray:
-    """Convert RDKit sparse fingerprints to dense numpy array.
-
-    Parameters
-    ----------
-    fingerprints : single fingerprint or list of fingerprints
-        RDKit fingerprints in various formats.
-
-    n_bits : int, default=2048
-        Size of the fingerprint vectors.
-
-    dtype : numpy dtype, default=np.float32
-        Data type of the output array.
-
-    Returns
-    -------
-    np.ndarray
-        Dense numpy array of shape (n_samples, n_bits).
-
-    Examples
-    --------
-    >>> from rdkit import Chem
-    >>> from rdkit.Chem import AllChem
-    >>> mols = [Chem.MolFromSmiles('CCO'), Chem.MolFromSmiles('CC')]
-    >>> fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
-    >>> dense_matrix = rdkit_sparse_to_numpy(fps, n_bits=2048)
-    """
-    # Handle single fingerprint
-    if not isinstance(fingerprints, (list, tuple)):
-        fingerprints = [fingerprints]
-    elif isinstance(fingerprints, np.ndarray) and fingerprints.ndim == 1:
-        if len(fingerprints) == n_bits:
-            fingerprints = [fingerprints]
-
-    n_samples = len(fingerprints)
-    dense_matrix = np.zeros((n_samples, n_bits), dtype=dtype)
-
-    for i, fp in enumerate(fingerprints):
-        dense_matrix[i] = rdkit_sparse_to_dense(fp, n_bits=n_bits, dtype=dtype)
-
-    return dense_matrix
-
-
-def rdkit_sparse_to_sklearn(
-    fingerprints, n_bits: int = 2048, output_format: str = "auto", dtype=np.float32
-) -> Union[np.ndarray, sparse.csr_matrix, sparse.csc_matrix]:
-    """Convert RDKit sparse fingerprints to sklearn-compatible format.
-
-    Parameters
-    ----------
-    fingerprints : single fingerprint or list of fingerprints
-        RDKit fingerprints in various formats.
-
-    n_bits : int, default=2048
-        Size of the fingerprint vectors.
-
-    output_format : {'auto', 'dense', 'csr', 'csc'}, default='auto'
-        Output format:
-        - 'auto': Choose based on sparsity (CSR if >90% sparse)
-        - 'dense': Dense numpy array
-        - 'csr': Compressed Sparse Row format
-        - 'csc': Compressed Sparse Column format
-
-    dtype : numpy dtype, default=np.float32
-        Data type of the output.
-
-    Returns
-    -------
-    array-like
-        Fingerprints in sklearn-compatible format.
-
-    Examples
-    --------
-    >>> from rdkit import Chem
-    >>> from rdkit.Chem import AllChem
-    >>> from sklearn.naive_bayes import BernoulliNB
-    >>>
-    >>> mols = [Chem.MolFromSmiles(smi) for smi in ['CCO', 'CC', 'CCC']]
-    >>> fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
-    >>> X = rdkit_sparse_to_sklearn(fps, output_format='csr')
-    >>> y = [0, 1, 0]
-    >>>
-    >>> clf = BernoulliNB()
-    >>> clf.fit(X, y)
-    """
-    if output_format == "dense":
-        return rdkit_sparse_to_numpy(fingerprints, n_bits=n_bits, dtype=dtype)
-    elif output_format == "csr":
-        return rdkit_sparse_to_csr(fingerprints, n_bits=n_bits, dtype=dtype)
-    elif output_format == "csc":
-        return rdkit_sparse_to_csc(fingerprints, n_bits=n_bits, dtype=dtype)
-    elif output_format == "auto":
-        # First convert to CSR to check sparsity
-        csr_matrix = rdkit_sparse_to_csr(fingerprints, n_bits=n_bits, dtype=dtype)
-        sparsity = 1.0 - (csr_matrix.nnz / (csr_matrix.shape[0] * csr_matrix.shape[1]))
-
-        if sparsity > 0.9:  # More than 90% sparse
-            return csr_matrix
-        else:
-            return csr_matrix.toarray()
-    else:
-        raise ValueError(f"Unknown output_format: {output_format}. Choose from 'auto', 'dense', 'csr', 'csc'.")
-
-
-class RDKitFingerprintConverter:
-    """Converter class for batch processing RDKit fingerprints.
-
-    This class provides methods to convert RDKit fingerprints to various
-    sklearn-compatible formats with caching and validation.
-
-    Parameters
-    ----------
-    n_bits : int, default=2048
-        Size of the fingerprint vectors.
-
-    output_format : {'auto', 'dense', 'csr', 'csc'}, default='csr'
-        Default output format for conversions. Default 'csr' for memory efficiency
-        with molecular fingerprints which are typically very sparse.
-
-    dtype : numpy dtype, default=np.float32
-        Data type of the output.
-
-    validate : bool, default=True
-        Whether to validate input fingerprints.
-
-    Attributes
-    ----------
-    n_features_ : int
-        Number of features (bits) in the fingerprints.
-
-    Examples
-    --------
-    >>> from rdkit import Chem
-    >>> from rdkit.Chem import AllChem
-    >>>
-    >>> converter = RDKitFingerprintConverter(n_bits=2048, output_format='csr')
-    >>>
-    >>> mols = [Chem.MolFromSmiles(smi) for smi in ['CCO', 'CC', 'CCC']]
-    >>> fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
-    >>>
-    >>> X = converter.convert(fps)
-    >>> print(f"Shape: {X.shape}, Sparsity: {converter.get_sparsity(X):.2%}")
-    """
-
-    def __init__(self, n_bits: int = 2048, output_format: str = "csr", dtype=np.float32, validate: bool = True):
-        self.n_bits = n_bits
-        self.output_format = output_format
-        self.dtype = dtype
-        self.validate = validate
-        self.n_features_ = n_bits
-
-    def convert(
-        self, fingerprints, output_format: Optional[str] = None
-    ) -> Union[np.ndarray, sparse.csr_matrix, sparse.csc_matrix]:
-        """Convert fingerprints to sklearn format.
-
-        Parameters
-        ----------
-        fingerprints : single fingerprint or list of fingerprints
-            RDKit fingerprints to convert.
-
-        output_format : str, optional
-            Override default output format for this conversion.
-
-        Returns
-        -------
-        array-like
-            Converted fingerprints.
-        """
-        if output_format is None:
-            output_format = self.output_format
-
-        if self.validate:
-            self._validate_fingerprints(fingerprints)
-
-        return rdkit_sparse_to_sklearn(fingerprints, n_bits=self.n_bits, output_format=output_format, dtype=self.dtype)
-
-    def to_dense(self, fingerprints) -> np.ndarray:
-        """Convert to dense numpy array."""
-        return rdkit_sparse_to_numpy(fingerprints, self.n_bits, self.dtype)
-
-    def to_csr(self, fingerprints) -> sparse.csr_matrix:
-        """Convert to CSR sparse matrix."""
-        return rdkit_sparse_to_csr(fingerprints, self.n_bits, self.dtype)
-
-    def to_csc(self, fingerprints) -> sparse.csc_matrix:
-        """Convert to CSC sparse matrix."""
-        return rdkit_sparse_to_csc(fingerprints, self.n_bits, self.dtype)
-
-    def _validate_fingerprints(self, fingerprints):
-        """Validate that fingerprints are in a supported format."""
-        if fingerprints is None:
-            raise ValueError("Fingerprints cannot be None")
-
-        # Check if it's a single fingerprint or a collection
-        if not isinstance(fingerprints, (list, tuple, np.ndarray)):
-            fingerprints = [fingerprints]
-
-        for i, fp in enumerate(fingerprints):
-            if fp is None:
-                continue
-
-            # Check for supported types
-            valid = (
-                hasattr(fp, "GetOnBits")
-                or hasattr(fp, "GetNonzeroElements")
-                or isinstance(fp, (set, dict, list, tuple, np.ndarray))
-            )
-
-            if not valid:
-                # Try to iterate as last resort
-                try:
-                    iter(fp)
-                except TypeError:
-                    raise ValueError(f"Fingerprint at index {i} is not in a supported format. Got type: {type(fp)}")
-
-    @staticmethod
-    def get_sparsity(matrix) -> float:
-        """Calculate sparsity of a matrix.
-
-        Parameters
-        ----------
-        matrix : array-like
-            Dense or sparse matrix.
-
-        Returns
-        -------
-        float
-            Sparsity ratio (fraction of zero elements).
-        """
-        if sparse.issparse(matrix):
-            return 1.0 - (matrix.nnz / (matrix.shape[0] * matrix.shape[1]))
-        else:
-            return np.mean(matrix == 0)
-
-    def get_statistics(self, fingerprints) -> Dict[str, Any]:
-        """Get statistics about the fingerprints.
-
-        Parameters
-        ----------
-        fingerprints : list of fingerprints
-            RDKit fingerprints to analyze.
-
-        Returns
-        -------
-        dict
-            Statistics including sparsity, average on-bits, etc.
-        """
-        matrix = self.to_csr(fingerprints)
-
-        stats = {
-            "n_samples": matrix.shape[0],
-            "n_features": matrix.shape[1],
-            "sparsity": self.get_sparsity(matrix),
-            "avg_on_bits": matrix.nnz / matrix.shape[0],
-            "min_on_bits": min(matrix.getnnz(axis=1)),
-            "max_on_bits": max(matrix.getnnz(axis=1)),
-            "total_unique_bits": len(np.unique(matrix.nonzero()[1])),
-        }
-
-        return stats
-
-
-# Convenience functions for direct use
-def convert_fingerprints(
-    fingerprints, n_bits: int = 2048, output_format: str = "csr", dtype=np.float32
-) -> Union[np.ndarray, sparse.csr_matrix, sparse.csc_matrix]:
-    """Convenience function to convert RDKit fingerprints to sklearn format.
-
-    This is a simple wrapper around rdkit_sparse_to_sklearn for ease of use.
-
-    Parameters
-    ----------
-    fingerprints : single fingerprint or list of fingerprints
-        RDKit fingerprints in various formats.
-
-    n_bits : int, default=2048
-        Size of the fingerprint vectors.
-
-    output_format : {'auto', 'dense', 'csr', 'csc'}, default='csr'
-        Output format for the fingerprints. Default 'csr' for memory efficiency
-        with molecular fingerprints which are typically very sparse.
-
-    dtype : numpy dtype, default=np.float32
-        Data type of the output.
-
-    Returns
-    -------
-    array-like
-        Fingerprints in sklearn-compatible format.
-
-    Examples
-    --------
-    >>> from rdkit import Chem
-    >>> from rdkit.Chem import AllChem
-    >>> mol = Chem.MolFromSmiles('CCO')
-    >>> fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
-    >>> X = convert_fingerprints(fp)  # Returns sparse CSR matrix by default
-    """
-    return rdkit_sparse_to_sklearn(fingerprints, n_bits=n_bits, output_format=output_format, dtype=dtype)
-
-
-class FingerprintTransformer(BaseEstimator, TransformerMixin):
-    """Sklearn-compatible transformer for RDKit fingerprints.
-
-    This transformer converts various RDKit fingerprint formats (sets, dicts,
-    sparse representations) into dense or sparse matrices suitable for sklearn.
-    Provides full sklearn pipeline compatibility with fit/transform interface.
-
-    Parameters
-    ----------
-    n_bits : int, default=2048
-        Number of bits in the fingerprint. Common values are 1024, 2048, 4096.
-
-    output_format : {'auto', 'dense', 'csr', 'csc'}, default='csr'
-        Output format for the transformed matrix:
-        - 'csr': Compressed Sparse Row matrix (memory efficient)
-        - 'csc': Compressed Sparse Column matrix
-        - 'dense': Dense numpy array
-        - 'auto': Automatically choose based on sparsity
-
-    dtype : dtype, default=np.float32
-        Data type of the output array.
-
-    Attributes
-    ----------
-    n_features_out_ : int
-        Number of output features (equal to n_bits).
-
-    Examples
-    --------
-    >>> from rdkit import Chem
-    >>> from rdkit.Chem import AllChem
-    >>> from sklearn.pipeline import Pipeline
-    >>> from laplaciannb import LaplacianNB, FingerprintTransformer
-    >>>
-    >>> # Generate fingerprints as sets of on-bits
-    >>> mols = [Chem.MolFromSmiles('CCO'), Chem.MolFromSmiles('CC')]
-    >>> fps = [set(AllChem.GetMorganFingerprintAsBitVect(mol, 2).GetOnBits())
-    ...        for mol in mols]
-    >>>
-    >>> # Create sklearn pipeline
-    >>> pipeline = Pipeline([
-    ...     ('fingerprints', FingerprintTransformer(n_bits=2048)),
-    ...     ('classifier', LaplacianNB())
-    >>> ])
-    >>>
-    >>> # Use in cross-validation, grid search, etc.
-    >>> y = [0, 1]
-    >>> pipeline.fit(fps, y)
-    """
-
-    def __init__(self, n_bits=2048, output_format="csr", dtype=np.float32):
-        self.n_bits = n_bits
-        self.output_format = output_format
-        self.dtype = dtype
-
-    def fit(self, X, y=None):
-        """Fit the transformer.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples,)
-            Input samples. Each sample can be:
-            - A set of on-bit indices
-            - A dictionary mapping bit indices to counts
-            - A sparse fingerprint object (RDKit BitVect, etc.)
-            - A list/tuple of on-bit indices
-
-        y : Ignored
-            Not used, present for API consistency.
-
-        Returns
-        -------
-        self : object
-            Returns the instance itself.
-        """
-        self.n_features_out_ = self.n_bits
-        return self
-
-    def transform(self, X):
-        """Transform fingerprints to matrix format.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples,)
-            Input samples in fingerprint format.
-
-        Returns
-        -------
-        X_transformed : {ndarray, sparse matrix} of shape (n_samples, n_bits)
-            Transformed fingerprint matrix.
-        """
-        check_is_fitted(self)
-
-        # Use our existing conversion function
-        return convert_fingerprints(X, n_bits=self.n_bits, output_format=self.output_format, dtype=self.dtype)
-
-    def fit_transform(self, X, y=None):
-        """Fit and transform in one step.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples,)
-            Input samples in fingerprint format.
-
-        y : Ignored
-            Not used, present for API consistency.
-
-        Returns
-        -------
-        X_transformed : {ndarray, sparse matrix} of shape (n_samples, n_bits)
-            Transformed fingerprint matrix.
-        """
-        return self.fit(X, y).transform(X)
-
-    def get_feature_names_out(self, input_features=None):
-        """Get output feature names for transformation.
-
-        Parameters
-        ----------
-        input_features : array-like of str or None, default=None
-            Not used, present for API consistency.
-
-        Returns
-        -------
-        feature_names_out : ndarray of str objects
-            Array of feature names.
-        """
-        check_is_fitted(self)
-        return np.array([f"bit_{i}" for i in range(self.n_bits)], dtype=object)
+            
+        # Get sparse fingerprint
+        sfp = mfpgen.GetSparseFingerprint(mol)
+        for bit in set(sfp.GetOnBits()):
+            # Reinterpret signed int32 as unsigned int32
+            # This maps [-2^31, 2^31-1] to [0, 2^32-1] losslessly
+            col_idx = np.uint32(bit & 0xFFFFFFFF)
+            
+            row_ind.append(i)
+            col_ind.append(col_idx)
+            data = np.ones(len(row_ind), dtype=np.bool)
+    
+    return csr_matrix((data, (row_ind, col_ind)), 
+                      shape=(len(mol_list), 2**32), 
+                      dtype=np.bool)
diff --git a/src/laplaciannb/legacy/LaplacianNB.py b/src/laplaciannb/legacy/LaplacianNB.py
index ee60067..7d473f0 100644
--- a/src/laplaciannb/legacy/LaplacianNB.py
+++ b/src/laplaciannb/legacy/LaplacianNB.py
@@ -105,31 +105,6 @@ class LaplacianNB(_BaseDiscreteNB):
     # see https://github.com/scikit-learn/scikit-learn/pull/22269 for an explanation
 
     def __init__(self, *, alpha=1.0, force_alpha=True, fit_prior=True, class_prior=None):
-        warnings.warn(
-            "\n" + "=" * 80 + "\n"
-            "DEPRECATION WARNING: Legacy LaplacianNB Class\n" + "=" * 80 + "\n"
-            "You are using the DEPRECATED legacy LaplacianNB implementation.\n"
-            "This class will be REMOVED in a future release.\n\n"
-            "PLEASE MIGRATE to the new sklearn-compatible version:\n\n"
-            "  ✅ RECOMMENDED:\n"
-            "    from laplaciannb import LaplacianNB\n"
-            "    from laplaciannb.fingerprint_utils import convert_fingerprints\n"
-            "    \n"
-            "    X = convert_fingerprints(your_fingerprints, n_bits=size)\n"
-            "    clf = LaplacianNB(alpha=1.0)\n"
-            "    clf.fit(X, y)\n\n"
-            "  ❌ DEPRECATED (current usage):\n"
-            "    from laplaciannb.legacy import LaplacianNB\n"
-            "    clf = LaplacianNB(alpha=1.0)  # This class\n\n"
-            "Migration benefits:\n"
-            "• sklearn pipelines, cross-validation, grid search\n"
-            "• Memory-efficient sparse matrix support\n"
-            "• Better performance and error handling\n"
-            "• Future-proof implementation\n\n"
-            "See MIGRATION_GUIDE.md for step-by-step instructions.\n" + "=" * 80,
-            DeprecationWarning,
-            stacklevel=2,
-        )
         self.alpha = alpha
         self.fit_prior = fit_prior
         self.class_prior = class_prior
diff --git a/src/laplaciannb/LaplacianNB_new.py b/src/laplaciannb/legacy/LaplacianNB_new.py
similarity index 100%
rename from src/laplaciannb/LaplacianNB_new.py
rename to src/laplaciannb/legacy/LaplacianNB_new.py
diff --git a/tests/bayes_test.py b/tests/bayes_test.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/laplaciannb.py b/tests/laplaciannb.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/test_bayes.py b/tests/test_bayes.py
index 854c6ac..ea2534e 100644
--- a/tests/test_bayes.py
+++ b/tests/test_bayes.py
@@ -5,18 +5,19 @@
 import pandas as pd
 from numpy.testing import assert_array_equal
 
-from laplaciannb.legacy.LaplacianNB import LaplacianNB
+from laplaciannb import LaplacianNB
 
 
 def test_bayes():
+    from scipy.sparse import csr_matrix
+    
     clf = LaplacianNB()
     rng = np.random.RandomState(1)
     arr = rng.randint(2, size=(6, 100))
     Y = np.array([1, 2, 3, 4, 4, 5])
-    Xlist = []
-    for i in arr:
-        Xlist.append(set(i.nonzero()[0]))
-    X = np.array(Xlist)
+    
+    # Convert binary array to CSR matrix
+    X = csr_matrix(arr, dtype=np.bool_)
     clf.fit(X, Y)
 
     assert_array_equal(clf.feature_count_, [55.0, 46.0, 53.0, 90.0, 44.0])
@@ -26,51 +27,43 @@ def test_bayes():
 
 def test_lmnb_prior_unobserved_targets():
     # test smoothing of prior for yet unobserved targets
-
-    # Create toy training data
-    X = np.array([{1}, {0}])
+    from scipy.sparse import csr_matrix
+
+    # Create toy training data as sparse matrices
+    # First sample has feature 1, second sample has feature 0
+    row = [0, 1]
+    col = [1, 0] 
+    data = [1, 1]
+    X = csr_matrix((data, (row, col)), shape=(2, 2), dtype=np.bool_)
     y = np.array([0, 1])
 
     clf = LaplacianNB()
     clf.fit(X, y)
 
-    assert_array_equal(clf.predict(np.array([{1}])), np.array([0]))
-    assert_array_equal(clf.predict(np.array([{0}])), np.array([1]))
-    assert_array_equal(clf.predict(np.array([{0, 1}])), np.array([0]))
+    # Test predictions - ensure matrix dimensions match training data (2 features)
+    test1 = csr_matrix(([1], ([0], [1])), shape=(1, 2), dtype=np.bool_)  # Feature 1 active
+    test2 = csr_matrix(([1], ([0], [0])), shape=(1, 2), dtype=np.bool_)  # Feature 0 active
+    test3 = csr_matrix(([1, 1], ([0, 0], [0, 1])), shape=(1, 2), dtype=np.bool_)  # Both features active
+    
+    assert_array_equal(clf.predict(test1), np.array([0]))
+    assert_array_equal(clf.predict(test2), np.array([1]))
+    assert_array_equal(clf.predict(test3), np.array([0]))
 
 
 def test_rdkit():
-    from rdkit import Chem
-    from rdkit.Chem import rdFingerprintGenerator
-
-    from laplaciannb.legacy.LaplacianNB import LaplacianNB
-
-    def get_fp(smiles: str) -> set:
-        """Function to calculate MorganFingerprint from smiles.
-        It returns index of all '1' bits of not-folded fingerprint.
-
-        Args:
-            smiles (str): smiles string
-
-        Returns:
-            set: return set of index of '1' bits.
-        """
-
-        mol = Chem.MolFromSmiles(smiles)
-        mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
-        fp = mfpgen.GetSparseFingerprint(mol)
-        return set(fp.GetOnBits())
+    from laplaciannb.fingerprint_utils import rdkit_to_csr
+    from laplaciannb import LaplacianNB
 
     DATA_PATH = Path(__file__).parent.parent.joinpath("tests/data/")
     file = str(DATA_PATH.joinpath("smiles_test.csv"))
     df = pd.read_csv(file)
-    df["sets"] = df["smiles"].apply(
-        lambda x: get_fp(x),
-    )
-    X = df["sets"]
+    
+    # Convert to sparse CSR matrix using our fingerprint utility
+    X_sparse = rdkit_to_csr(df['smiles'].values, radius=2)
+    
     y = df["activity"]
     clf = LaplacianNB()
-    clf.fit(X, y)
+    clf.fit(X_sparse, y)
 
     assert_array_equal(clf.feature_count_, [42727.0, 46838.0])
     assert_array_equal(clf.class_count_, [1000.0, 1000.0])
@@ -78,42 +71,59 @@ def get_fp(smiles: str) -> set:
 
 
 def test_joint_log_likelihood():
-    from rdkit import Chem
-    from rdkit.Chem import rdFingerprintGenerator
-
-    from laplaciannb.legacy.LaplacianNB import LaplacianNB
-
-    def get_fp(smiles: str) -> set:
-        """Function to calculate MorganFingerprint from smiles.
-        It returns index of all '1' bits of not-folded fingerprint.
-
-        Args:
-            smiles (str): smiles string
-
-        Returns:
-            set: return set of index of '1' bits.
-        """
-
-        mol = Chem.MolFromSmiles(smiles)
-        mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
-        fp = mfpgen.GetSparseFingerprint(mol)
-        return set(fp.GetOnBits())
+    """Test joint log likelihood with CSR matrices."""
+    from laplaciannb.fingerprint_utils import rdkit_to_csr
+    from laplaciannb import LaplacianNB
+    from scipy.sparse import csr_matrix
 
     DATA_PATH = Path(__file__).parent.parent.joinpath("tests/data/")
     file = str(DATA_PATH.joinpath("smiles_test.csv"))
     df = pd.read_csv(file)
-    df["sets"] = df["smiles"].apply(
-        lambda x: get_fp(x),
-    )
-    X = df["sets"]
+    
+    # Convert to CSR matrix using fingerprint utility
+    X = rdkit_to_csr(df['smiles'].values, radius=2)
     y = df["activity"]
     clf = LaplacianNB()
     clf.fit(X, y)
 
-    # check if algorithm can predict if index is out of range of fitted ones
-    new_df = pd.DataFrame({"sets": [{10210210310210}]})
-    new_X = new_df["sets"]
+    # Test with a feature index that might be out of range of fitted ones
+    # Create a sparse matrix with a high but valid feature index
+    test_row = [0]
+    test_col = [2**30]  # Use a large but valid index within 2^32-1 limit
+    test_data = [1]
+    new_X = csr_matrix((test_data, (test_row, test_col)), shape=(1, 2**32-1), dtype=np.bool_)
+    
     try:
         clf._joint_log_likelihood(new_X)
     except Exception as exc:
         raise AssertionError(f"'_joint_log_likelihood' raised an exception {exc}")
+
+
+def test_csr_fingerprint_conversion():
+    """Test the new CSR fingerprint conversion functionality."""
+    from laplaciannb.fingerprint_utils import rdkit_to_csr
+    
+    # Create test molecules
+    smiles_list = ["CCO", "CC", "CCC", "CCCC"]
+    
+    # Convert to CSR matrix
+    X_sparse = rdkit_to_csr(smiles_list, radius=2)
+    
+    # Basic validation
+    assert X_sparse.shape[0] == len(smiles_list)
+    assert X_sparse.shape[1] == 2**32
+    assert X_sparse.nnz > 0
+    
+    # Test that different molecules have different fingerprints
+    fingerprint_rows = []
+    for i in range(X_sparse.shape[0]):
+        row = X_sparse[i]
+        row_coo = row.tocoo()
+        fingerprint_set = set(zip(row_coo.col, row_coo.data))
+        fingerprint_rows.append(fingerprint_set)
+    
+    # Verify that molecules have some different features
+    assert len(set(len(fp) for fp in fingerprint_rows)) > 1  # Different numbers of features
+    
+    print(f"Successfully created CSR matrix: {X_sparse.shape}, nnz: {X_sparse.nnz}")
+    print(f"Fingerprint sizes: {[len(fp) for fp in fingerprint_rows]}")
diff --git a/tests/test_bayes_compatibility.py b/tests/test_bayes_compatibility.py
deleted file mode 100644
index 0c68855..0000000
--- a/tests/test_bayes_compatibility.py
+++ /dev/null
@@ -1,292 +0,0 @@
-"""
-Tests based on bayes_test.py to ensure compatibility between old and new LaplacianNB implementations.
-"""
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose, assert_array_equal
-
-from laplaciannb.fingerprint_utils import convert_fingerprints
-from laplaciannb.LaplacianNB_new import LaplacianNB as LaplacianNB_New
-
-# Import both implementations
-from laplaciannb.legacy.LaplacianNB import LaplacianNB as LaplacianNB_Original
-
-
-class TestBayesCompatibility:
-    """Test suite to verify compatibility using bayes_test.py scenarios."""
-
-    def test_basic_bayes_scenario_compatibility(self):
-        """Test compatibility using the basic scenario from test_bayes()."""
-        # Setup from original test_bayes()
-        rng = np.random.RandomState(1)
-        arr = rng.randint(2, size=(6, 100))
-        Y = np.array([1, 2, 3, 4, 4, 5])
-        Xlist = []
-        for i in arr:
-            Xlist.append(set(i.nonzero()[0]))
-        X_sets = np.array(Xlist)
-
-        # Train original model
-        clf_original = LaplacianNB_Original()
-        clf_original.fit(X_sets, Y)
-
-        # Convert to sklearn format and train new model
-        X_sklearn = convert_fingerprints(Xlist, n_bits=100, output_format="csr")
-        clf_new = LaplacianNB_New()
-        clf_new.fit(X_sklearn, Y)
-
-        # Test predictions match
-        pred_original = clf_original.predict(X_sets)
-        pred_new = clf_new.predict(X_sklearn)
-
-        print(f"Original predictions: {pred_original}")
-        print(f"New predictions:      {pred_new}")
-
-        assert_array_equal(pred_original, pred_new, err_msg="Predictions don't match for basic bayes scenario")
-
-        # Test that internal counts are consistent
-        print(f"Original feature_count_: {clf_original.feature_count_}")
-        print(f"New feature_count_:      {clf_new.feature_count_}")
-        print(f"Original class_count_:   {clf_original.class_count_}")
-        print(f"New class_count_:        {clf_new.class_count_}")
-        print(f"Original feature_all_:   {clf_original.feature_all_}")
-        print(f"New feature_all_:        {clf_new.feature_all_}")
-
-        # Allow for small differences due to different implementation approaches
-        assert_allclose(
-            clf_original.feature_count_, clf_new.feature_count_, rtol=1e-10, err_msg="Feature counts don't match"
-        )
-        assert_allclose(
-            clf_original.class_count_, clf_new.class_count_, rtol=1e-10, err_msg="Class counts don't match"
-        )
-        assert_allclose(
-            clf_original.feature_all_, clf_new.feature_all_, rtol=1e-10, err_msg="Feature all counts don't match"
-        )
-
-    def test_prior_unobserved_targets_compatibility(self):
-        """Test compatibility for prior smoothing of unobserved targets."""
-        # Setup from test_lmnb_prior_unobserved_targets()
-        X_sets = np.array([{1}, {0}])
-        y = np.array([0, 1])
-
-        # Train original model
-        clf_original = LaplacianNB_Original()
-        clf_original.fit(X_sets, y)
-
-        # Convert to sklearn format and train new model
-        X_sklearn = convert_fingerprints([{1}, {0}], n_bits=10, output_format="csr")
-        clf_new = LaplacianNB_New()
-        clf_new.fit(X_sklearn, y)
-
-        # Test predictions for different inputs
-        test_cases = [([{1}], "single feature 1"), ([{0}], "single feature 0"), ([{0, 1}], "both features")]
-
-        for test_input_sets, description in test_cases:
-            test_input_sklearn = convert_fingerprints(test_input_sets, n_bits=10, output_format="csr")
-
-            pred_original = clf_original.predict(np.array(test_input_sets, dtype=object))
-            pred_new = clf_new.predict(test_input_sklearn)
-
-            print(f"Test case {description}:")
-            print(f"  Original prediction: {pred_original}")
-            print(f"  New prediction:      {pred_new}")
-
-            assert_array_equal(pred_original, pred_new, err_msg=f"Predictions don't match for {description}")
-
-    def test_rdkit_scenario_compatibility(self):
-        """Test compatibility using small synthetic fingerprint data (memory-efficient)."""
-        # Note: We don't actually use RDKit here to avoid memory issues with large dense matrices
-        # Instead, we simulate typical sparse fingerprint data that RDKit would produce
-
-        print("Testing with synthetic sparse fingerprint data...")
-
-        # Create synthetic sparse fingerprint data (simulates RDKit Morgan fingerprints)
-        # Each fingerprint is a set of bit indices (sparse representation)
-        np.random.seed(42)
-        n_samples = 50  # Keep small for memory efficiency
-        max_bits = 2048  # Typical fingerprint size
-
-        X_sets = []
-        y = []
-
-        for i in range(n_samples):
-            # Create sparse fingerprint (5-20 bits set)
-            n_bits_set = np.random.randint(5, 21)
-            fingerprint = set(np.random.choice(max_bits, n_bits_set, replace=False))
-            X_sets.append(fingerprint)
-            # Simple target based on fingerprint characteristics
-            y.append(1 if len(fingerprint) > 12 else 0)
-
-        X_sets = np.array(X_sets)
-        y = np.array(y)
-
-        print(f"Created {len(X_sets)} synthetic fingerprints with max {max_bits} bits")
-
-        # Train original model
-        clf_original = LaplacianNB_Original()
-        clf_original.fit(X_sets, y)
-
-        # Convert to sparse matrix format (CSR - memory efficient)
-        X_sklearn = convert_fingerprints(X_sets.tolist(), n_bits=max_bits, output_format="csr")
-        clf_new = LaplacianNB_New()
-        clf_new.fit(X_sklearn, y)
-
-        print(f"Sparse matrix shape: {X_sklearn.shape}, nnz: {X_sklearn.nnz}")
-        print(f"Sparsity: {1 - X_sklearn.nnz / (X_sklearn.shape[0] * X_sklearn.shape[1]):.4f}")
-
-        # Test predictions
-        pred_original = clf_original.predict(X_sets)
-        pred_new = clf_new.predict(X_sklearn)
-
-        # Check prediction accuracy
-        print(f"Original predictions: {pred_original[:10]}")
-        print(f"New predictions:      {pred_new[:10]}")
-        print(f"Predictions match: {np.array_equal(pred_original, pred_new)}")
-
-        # Predictions should match exactly for synthetic data
-        assert_array_equal(
-            pred_original, pred_new, err_msg="Predictions should match exactly for synthetic sparse data"
-        )
-
-    def test_joint_log_likelihood_compatibility(self):
-        """Test _joint_log_likelihood method compatibility."""
-        # Create simple test data
-        X_sets = [{1, 5, 10}, {2, 6, 11}, {1, 3, 7}, {4, 8, 12}]
-        y = [0, 1, 0, 1]
-        n_bits = 20
-
-        # Train both models
-        clf_original = LaplacianNB_Original()
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-        clf_new = LaplacianNB_New()
-        clf_new.fit(X_sklearn, y)
-
-        # Test _joint_log_likelihood with known data
-        jll_original = clf_original._joint_log_likelihood(np.array(X_sets, dtype=object))
-        jll_new = clf_new._joint_log_likelihood(X_sklearn)
-
-        print(f"Original JLL shape: {jll_original.shape}")
-        print(f"New JLL shape:      {jll_new.shape}")
-        print(f"Original JLL:\n{jll_original}")
-        print(f"New JLL:\n{jll_new}")
-
-        # Check shapes match
-        assert jll_original.shape == jll_new.shape, "Joint log likelihood shapes don't match"
-
-        # Check values are reasonably close
-        max_diff = np.max(np.abs(jll_original - jll_new))
-        print(f"Max JLL difference: {max_diff}")
-
-        # Allow some numerical differences
-        assert max_diff < 1.0, f"Joint log likelihood differences too large: {max_diff}"
-
-        # Test with out-of-range feature (should not crash)
-        test_set_with_large_feature = [{10210210310210}]
-
-        # Original implementation test
-        try:
-            clf_original._joint_log_likelihood(np.array(test_set_with_large_feature, dtype=object))
-            print("✅ Original handles large feature indices")
-        except Exception as e:
-            print(f"❌ Original failed with large feature: {e}")
-
-        # New implementation test
-        try:
-            # Convert large feature set (will be ignored due to bounds checking)
-            X_large = convert_fingerprints(test_set_with_large_feature, n_bits=n_bits, output_format="csr")
-            clf_new._joint_log_likelihood(X_large)
-            print("✅ New handles large feature indices")
-        except Exception as e:
-            print(f"❌ New failed with large feature: {e}")
-
-    def test_probability_distribution_consistency(self):
-        """Test that probability distributions are reasonable and consistent."""
-        # Create test data with clear class separation
-        X_sets = [
-            {1, 2, 3},  # Class 0 features
-            {1, 2, 4},  # Class 0 features
-            {5, 6, 7},  # Class 1 features
-            {5, 6, 8},  # Class 1 features
-        ]
-        y = [0, 0, 1, 1]
-        n_bits = 20
-
-        # Train both models
-        clf_original = LaplacianNB_Original()
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-        clf_new = LaplacianNB_New()
-        clf_new.fit(X_sklearn, y)
-
-        # Test probability estimates
-        prob_original = clf_original.predict_proba(np.array(X_sets, dtype=object))
-        prob_new = clf_new.predict_proba(X_sklearn)
-
-        print("Original probabilities:")
-        print(prob_original)
-        print("New probabilities:")
-        print(prob_new)
-
-        # Check that probabilities sum to 1 (allow for float32 precision)
-        assert_allclose(prob_original.sum(axis=1), 1.0, rtol=1e-6, err_msg="Original probabilities don't sum to 1")
-        assert_allclose(prob_new.sum(axis=1), 1.0, rtol=1e-6, err_msg="New probabilities don't sum to 1")
-
-        # Check that probabilities are in valid range
-        assert np.all(prob_original >= 0) and np.all(prob_original <= 1), "Original probabilities out of range"
-        assert np.all(prob_new >= 0) and np.all(prob_new <= 1), "New probabilities out of range"
-
-        # Check that the highest probability corresponds to correct prediction
-        pred_original = clf_original.predict(np.array(X_sets, dtype=object))
-        pred_new = clf_new.predict(X_sklearn)
-
-        for i, (pred_o, pred_n) in enumerate(zip(pred_original, pred_new)):
-            assert prob_original[i, pred_o] == np.max(
-                prob_original[i]
-            ), f"Original: max prob doesn't match prediction for sample {i}"
-            assert prob_new[i, pred_n] == np.max(prob_new[i]), f"New: max prob doesn't match prediction for sample {i}"
-
-    def test_edge_cases_consistency(self):
-        """Test edge cases to ensure both implementations handle them similarly."""
-
-        # Test 1: Single class
-        X_single_class = [{1, 2}, {3, 4}, {5, 6}]
-        y_single_class = [0, 0, 0]
-
-        clf_orig = LaplacianNB_Original()
-        clf_orig.fit(np.array(X_single_class, dtype=object), y_single_class)
-
-        X_sklearn = convert_fingerprints(X_single_class, n_bits=10, output_format="csr")
-        clf_new = LaplacianNB_New()
-        clf_new.fit(X_sklearn, y_single_class)
-
-        pred_orig = clf_orig.predict(np.array(X_single_class, dtype=object))
-        pred_new = clf_new.predict(X_sklearn)
-
-        assert_array_equal(pred_orig, pred_new, err_msg="Single class predictions don't match")
-        assert np.all(pred_orig == 0), "Single class should predict class 0"
-
-        # Test 2: Empty features
-        X_with_empty = [{1, 2}, set(), {3, 4}]
-        y_with_empty = [0, 1, 0]
-
-        clf_orig = LaplacianNB_Original()
-        clf_orig.fit(np.array(X_with_empty, dtype=object), y_with_empty)
-
-        X_sklearn = convert_fingerprints(X_with_empty, n_bits=10, output_format="csr")
-        clf_new = LaplacianNB_New()
-        clf_new.fit(X_sklearn, y_with_empty)
-
-        # Both should handle empty features without crashing
-        pred_orig = clf_orig.predict(np.array([set()], dtype=object))
-        pred_new = clf_new.predict(convert_fingerprints([set()], n_bits=10, output_format="csr"))
-
-        print(f"Empty feature prediction - Original: {pred_orig}, New: {pred_new}")
-        # Don't require exact match for empty features, just no crash
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
diff --git a/tests/test_complete_deprecation.py b/tests/test_complete_deprecation.py
deleted file mode 100644
index 0352c73..0000000
--- a/tests/test_complete_deprecation.py
+++ /dev/null
@@ -1,170 +0,0 @@
-"""
-Test the complete deprecation and migration system.
-"""
-
-import warnings
-
-import numpy as np
-import pytest
-
-from laplaciannb.fingerprint_utils import convert_fingerprints
-
-
-def test_new_version_detects_set_input():
-    """Test that new version detects and rejects legacy set input with helpful error."""
-    from laplaciannb import LaplacianNB
-
-    X_sets = [{1, 2, 3}, {4, 5, 6}]
-    y = [0, 1]
-
-    clf = LaplacianNB()
-
-    # Should raise ValueError with helpful message
-    with pytest.raises(ValueError) as exc_info:
-        clf.fit(X_sets, y)
-
-    error_message = str(exc_info.value)
-    assert "LEGACY INPUT FORMAT ERROR" in error_message
-    assert "convert_fingerprints" in error_message
-    assert "laplaciannb.legacy" in error_message
-
-
-def test_new_version_detects_numpy_array_of_sets():
-    """Test detection of numpy array with object dtype containing sets."""
-    from laplaciannb import LaplacianNB
-
-    X_sets = np.array([{1, 2, 3}, {4, 5, 6}], dtype=object)
-    y = [0, 1]
-
-    clf = LaplacianNB()
-
-    # Should raise ValueError with helpful message
-    with pytest.raises(ValueError) as exc_info:
-        clf.fit(X_sets, y)
-
-    error_message = str(exc_info.value)
-    assert "LEGACY INPUT FORMAT ERROR" in error_message
-
-
-def test_new_version_detects_predict_method():
-    """Test detection during predict method calls."""
-    from laplaciannb import LaplacianNB
-    from laplaciannb.fingerprint_utils import convert_fingerprints
-
-    # First fit with proper sklearn format
-    X_proper = convert_fingerprints([{1, 2}, {3, 4}, {5, 6}], n_bits=10)
-    y = [0, 1, 0]
-    clf = LaplacianNB()
-    clf.fit(X_proper, y)
-
-    # Now try to predict with set format
-    X_sets = [{1, 2, 3}]
-
-    # Should raise ValueError
-    with pytest.raises(ValueError) as exc_info:
-        clf.predict(X_sets)
-
-    error_message = str(exc_info.value)
-    assert "LEGACY INPUT FORMAT ERROR" in error_message
-
-
-def test_recommended_migration_path():
-    """Test that the recommended migration path works without warnings."""
-    from laplaciannb import LaplacianNB
-
-    X_sets = [{1, 2, 3}, {4, 5, 6}, {1, 4, 7}]
-    y = [0, 1, 0]
-
-    with warnings.catch_warnings(record=True) as w:
-        warnings.simplefilter("always")
-
-        # Recommended path: convert fingerprints first
-        X = convert_fingerprints(X_sets, n_bits=10)
-        clf = LaplacianNB()
-        clf.fit(X, y)
-        predictions = clf.predict(X)
-        probabilities = clf.predict_proba(X)
-
-        # Should work without user warnings (only import warnings are OK)
-        user_warnings = [warning for warning in w if issubclass(warning.category, UserWarning)]
-        assert len(user_warnings) == 0
-
-        # Results should be valid
-        assert predictions.shape == (3,)
-        assert probabilities.shape == (3, 2)
-
-
-def test_legacy_version_still_works():
-    """Test that legacy version still works for backward compatibility."""
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")  # Suppress deprecation warnings
-
-        from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-        X_sets = np.array([{1, 2, 3}, {4, 5, 6}, {1, 4, 7}], dtype=object)
-        y = [0, 1, 0]
-
-        clf = LegacyLaplacianNB()
-        clf.fit(X_sets, y)
-        predictions = clf.predict(X_sets)
-
-        assert predictions.shape == (3,)
-
-
-def test_complete_migration_scenario():
-    """Test a complete migration from legacy to new."""
-    # Step 1: User starts with legacy
-    X_sets = [{1, 2, 3}, {4, 5, 6}, {1, 4, 7}]
-    y = [0, 1, 0]
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-        X_legacy = np.array(X_sets, dtype=object)
-        clf_legacy = LegacyLaplacianNB()
-        clf_legacy.fit(X_legacy, y)
-        pred_legacy = clf_legacy.predict(X_legacy)
-
-    # Step 2: User tries new version with same data (gets helpful error)
-    from laplaciannb import LaplacianNB as NewLaplacianNB
-
-    clf_new_wrong = NewLaplacianNB()
-    with pytest.raises(ValueError) as exc_info:
-        clf_new_wrong.fit(X_sets, y)
-
-    # Should get helpful guidance in error message
-    error_message = str(exc_info.value)
-    assert "LEGACY INPUT FORMAT ERROR" in error_message
-    assert "convert_fingerprints" in error_message
-
-    # Step 3: User follows guidance and migrates successfully
-    X_new = convert_fingerprints(X_sets, n_bits=10)
-    clf_new = NewLaplacianNB()
-    clf_new.fit(X_new, y)
-    pred_new = clf_new.predict(X_new)
-
-    # Step 4: Verify identical results
-    assert np.array_equal(pred_legacy, pred_new)
-
-
-def test_single_set_detection():
-    """Test detection of single set input (single fingerprint)."""
-    from laplaciannb import LaplacianNB
-
-    # User passes a single set instead of list of sets
-    X_single_set = {1, 2, 3, 4, 5}
-    y = [1]
-
-    clf = LaplacianNB()
-
-    # Should raise ValueError
-    with pytest.raises(ValueError) as exc_info:
-        clf.fit(X_single_set, y)
-
-    error_message = str(exc_info.value)
-    assert "LEGACY INPUT FORMAT ERROR" in error_message
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
diff --git a/tests/test_deprecation.py b/tests/test_deprecation.py
deleted file mode 100644
index e14c1bb..0000000
--- a/tests/test_deprecation.py
+++ /dev/null
@@ -1,211 +0,0 @@
-"""
-Tests for deprecation warnings and legacy/new version compatibility.
-"""
-
-import warnings
-
-import numpy as np
-import pytest
-
-from laplaciannb.fingerprint_utils import convert_fingerprints
-
-
-class TestDeprecationWarnings:
-    """Test that deprecation warnings are properly issued."""
-
-    @pytest.mark.skip(reason="Import warnings are only triggered once per session due to Python import caching")
-    def test_legacy_module_import_warning(self):
-        """Test that importing from legacy module issues deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-
-            # Import from legacy module should trigger warning
-            from laplaciannb.legacy import LaplacianNB  # noqa: F401
-
-            # Check that a deprecation warning was issued
-            assert len(w) >= 1
-            assert issubclass(w[0].category, DeprecationWarning)
-            assert "DEPRECATED legacy LaplacianNB" in str(w[0].message)
-            assert "sklearn-compatible" in str(w[0].message)
-
-    def test_legacy_class_instantiation_warning(self):
-        """Test that instantiating legacy class issues deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-
-            from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-            # Clear the import warning to focus on instantiation warning
-            w.clear()
-
-            # Instantiate legacy class should trigger additional warning
-            LegacyLaplacianNB()
-
-            # Check that a deprecation warning was issued
-            assert len(w) == 1
-            assert issubclass(w[0].category, DeprecationWarning)
-            assert "DEPRECATED legacy LaplacianNB" in str(w[0].message)
-
-    def test_new_version_no_warnings(self):
-        """Test that new version doesn't issue deprecation warnings."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-
-            # Import and use new version
-            from laplaciannb import LaplacianNB
-
-            LaplacianNB()
-
-            # Should not have any deprecation warnings
-            deprecation_warnings = [warning for warning in w if issubclass(warning.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-    def test_recommended_import_path(self):
-        """Test that recommended import path works without warnings."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-
-            # This is the recommended way
-            from laplaciannb import LaplacianNB
-
-            # Create sample data
-            X_sets = [{1, 2, 3}, {4, 5, 6}, {1, 4, 7}]
-            y = [0, 1, 0]
-
-            # Convert and use
-            X = convert_fingerprints(X_sets, n_bits=10)
-            clf = LaplacianNB()
-            clf.fit(X, y)
-            predictions = clf.predict(X)
-
-            # Should work without deprecation warnings
-            deprecation_warnings = [warning for warning in w if issubclass(warning.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-            assert predictions.shape == (3,)
-
-
-class TestBothVersionsAvailable:
-    """Test that both versions are available and work correctly."""
-
-    def test_both_versions_importable(self):
-        """Test that both legacy and new versions can be imported."""
-        # New version (recommended)
-        from laplaciannb import LaplacianNB as NewLaplacianNB
-
-        # Legacy version (deprecated)
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-        # Both should be classes
-        assert callable(NewLaplacianNB)
-        assert callable(LegacyLaplacianNB)
-
-    def test_different_implementations(self):
-        """Test that legacy and new are different implementations."""
-        from laplaciannb import LaplacianNB as NewLaplacianNB
-
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-        # They should be different classes
-        assert NewLaplacianNB is not LegacyLaplacianNB
-        assert NewLaplacianNB.__module__ != LegacyLaplacianNB.__module__
-
-    def test_identical_api_basic_usage(self):
-        """Test that both versions have similar basic API."""
-        from laplaciannb import LaplacianNB as NewLaplacianNB
-
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-        # Both should have the same basic methods
-        for method in ["fit", "predict", "predict_proba", "predict_log_proba"]:
-            assert hasattr(NewLaplacianNB(), method)
-            assert hasattr(LegacyLaplacianNB(), method)
-
-    def test_legacy_still_functional(self):
-        """Test that legacy version still works for backward compatibility."""
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")  # Suppress deprecation warnings for this test
-
-            from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-            # Create test data in legacy format (sets)
-            X_sets = np.array([{1, 2, 3}, {4, 5, 6}, {1, 4, 7}], dtype=object)
-            y = np.array([0, 1, 0])
-
-            # Should work without errors
-            clf = LegacyLaplacianNB()
-            clf.fit(X_sets, y)
-            predictions = clf.predict(X_sets)
-            probabilities = clf.predict_proba(X_sets)
-
-            assert predictions.shape == (3,)
-            assert probabilities.shape == (3, 2)  # Binary classification
-
-    def test_new_version_functional(self):
-        """Test that new version works with sklearn format."""
-        from laplaciannb import LaplacianNB
-        from laplaciannb.fingerprint_utils import convert_fingerprints
-
-        # Create test data
-        X_sets = [{1, 2, 3}, {4, 5, 6}, {1, 4, 7}]
-        y = [0, 1, 0]
-
-        # Convert to sklearn format
-        X = convert_fingerprints(X_sets, n_bits=10)
-
-        # Should work without errors
-        clf = LaplacianNB()
-        clf.fit(X, y)
-        predictions = clf.predict(X)
-        probabilities = clf.predict_proba(X)
-
-        assert predictions.shape == (3,)
-        assert probabilities.shape == (3, 2)  # Binary classification
-
-
-class TestMigrationSupport:
-    """Test migration support features."""
-
-    def test_explicit_legacy_import_required(self):
-        """Test that legacy version requires explicit import from legacy module."""
-        # Importing from main module should give new version
-        from laplaciannb import LaplacianNB as MainLaplacianNB
-
-        # Importing from legacy should give legacy version (with warning)
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-        # They should be different
-        assert MainLaplacianNB is not LegacyLaplacianNB
-
-    def test_warning_messages_helpful(self):
-        """Test that warning messages provide helpful migration information."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-
-            from laplaciannb.legacy import LaplacianNB
-
-            LaplacianNB()
-
-            # Should have warnings with helpful information
-            assert len(w) >= 1
-
-            # Check warning content
-            warning_messages = [str(warning.message) for warning in w]
-            combined_message = " ".join(warning_messages)
-
-            # Should mention new version
-            assert "sklearn-compatible" in combined_message
-            assert "from laplaciannb import LaplacianNB" in combined_message
-            assert "DEPRECATED" in combined_message
-            assert "REMOVED" in combined_message
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
diff --git a/tests/test_fingerprint_csr_conversion.py b/tests/test_fingerprint_csr_conversion.py
new file mode 100644
index 0000000..9cc03dd
--- /dev/null
+++ b/tests/test_fingerprint_csr_conversion.py
@@ -0,0 +1,61 @@
+import pytest
+import numpy as np
+from scipy.sparse import csr_matrix
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from laplaciannb.fingerprint_utils import rdkit_to_csr
+
+
+def csr_to_rdkit_bit(col_idx):
+    """Convert CSR column index back to RDKit bit"""
+    return np.int32(col_idx)
+
+
+def get_test_molecules():
+    """Get simple test molecules"""
+    smiles = ["CCO", "CC", "CCC"]  # ethanol, methane, propane
+    return [Chem.MolFromSmiles(smi) for smi in smiles]
+
+
+class TestFingerprintCSRConversion:
+    
+    def test_rdkit_to_csr_basic(self):
+        """Test basic RDKit to CSR conversion"""
+        smiles = ["CCO", "CC", "CCC"]
+        csr_matrix_result = rdkit_to_csr(smiles)
+        
+        # Basic checks
+        assert csr_matrix_result.shape[0] == len(smiles)
+        assert csr_matrix_result.shape[1] == 2**32
+        assert csr_matrix_result.nnz > 0  # Should have non-zero elements
+    
+    def test_fingerprint_consistency(self):
+        """Test that CSR conversion preserves fingerprint information"""
+        smiles = ["CCO", "CC", "CCC"] 
+        csr_result = rdkit_to_csr(smiles)
+        
+        # Calculate total expected fingerprint bits across all molecules
+        # Use the same API as the function
+        from rdkit.Chem import rdFingerprintGenerator
+        mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
+        
+        total_expected_bits = 0
+        for smi in smiles:
+            mol = Chem.MolFromSmiles(smi)
+            if mol is not None:
+                sfp = mfpgen.GetSparseFingerprint(mol)
+                total_expected_bits += sfp.GetNumOnBits()
+
+        # Check that we have the same total number of features
+        assert csr_result.nnz == total_expected_bits
+
+    def test_bit_conversion_roundtrip(self):
+        """Test that bit conversion works both ways (WILL FAIL)"""
+        # Test a few example bits
+        test_bits = [-1000, 0, 1000]
+        
+        for original_bit in test_bits:
+            # This will fail because mock just returns the same value
+            recovered_bit = csr_to_rdkit_bit(original_bit)
+            # For negative bits, this should fail with current mock
+            assert recovered_bit == original_bit
diff --git a/tests/test_fingerprint_utils.py b/tests/test_fingerprint_utils.py
deleted file mode 100644
index c602aba..0000000
--- a/tests/test_fingerprint_utils.py
+++ /dev/null
@@ -1,311 +0,0 @@
-"""Tests for fingerprint utility functions."""
-
-import numpy as np
-import pytest
-from scipy import sparse
-
-from laplaciannb.fingerprint_utils import (
-    RDKitFingerprintConverter,
-    convert_fingerprints,
-    rdkit_sparse_to_csr,
-    rdkit_sparse_to_dense,
-    rdkit_sparse_to_numpy,
-    rdkit_sparse_to_sklearn,
-)
-
-
-class TestFingerprintUtils:
-    """Test suite for fingerprint utility functions."""
-
-    @pytest.fixture
-    def sample_set_fingerprints(self):
-        """Create sample fingerprints as sets (similar to RDKit on-bits)."""
-        fps = [
-            {1, 5, 10, 15, 20},
-            {2, 6, 11, 16, 21},
-            {1, 3, 7, 12, 17},
-            set(),  # Empty fingerprint
-        ]
-        return fps
-
-    @pytest.fixture
-    def sample_dict_fingerprints(self):
-        """Create sample fingerprints as dictionaries (count data)."""
-        fps = [
-            {1: 2, 5: 1, 10: 3},
-            {2: 1, 6: 2, 11: 1},
-            {1: 1, 3: 1, 7: 2},
-        ]
-        return fps
-
-    def test_rdkit_sparse_to_dense_sets(self, sample_set_fingerprints):
-        """Test conversion of set fingerprints to dense arrays."""
-        n_bits = 25
-
-        for fp in sample_set_fingerprints:
-            dense = rdkit_sparse_to_dense(fp, n_bits=n_bits)
-
-            assert dense.shape == (n_bits,)
-            assert dense.dtype == np.float32
-
-            # Check that on-bits are correctly set
-            for bit_idx in fp:
-                assert dense[bit_idx] == 1.0
-
-            # Check that off-bits are zero
-            off_bits = set(range(n_bits)) - fp
-            for bit_idx in off_bits:
-                assert dense[bit_idx] == 0.0
-
-    def test_rdkit_sparse_to_dense_dicts(self, sample_dict_fingerprints):
-        """Test conversion of dict fingerprints to dense arrays."""
-        n_bits = 25
-
-        for fp in sample_dict_fingerprints:
-            dense = rdkit_sparse_to_dense(fp, n_bits=n_bits)
-
-            assert dense.shape == (n_bits,)
-
-            # Check that counts are correctly set
-            for bit_idx, count in fp.items():
-                assert dense[bit_idx] == float(count)
-
-    def test_rdkit_sparse_to_csr(self, sample_set_fingerprints):
-        """Test conversion to CSR sparse matrix."""
-        n_bits = 25
-        csr_matrix = rdkit_sparse_to_csr(sample_set_fingerprints, n_bits=n_bits)
-
-        assert csr_matrix.shape == (len(sample_set_fingerprints), n_bits)
-        assert sparse.issparse(csr_matrix)
-        assert csr_matrix.format == "csr"
-
-        # Convert back to dense for verification
-        dense = csr_matrix.toarray()
-
-        for i, fp in enumerate(sample_set_fingerprints):
-            for bit_idx in fp:
-                assert dense[i, bit_idx] == 1.0
-
-    def test_rdkit_sparse_to_numpy(self, sample_set_fingerprints):
-        """Test conversion to dense numpy array."""
-        n_bits = 25
-        dense_matrix = rdkit_sparse_to_numpy(sample_set_fingerprints, n_bits=n_bits)
-
-        assert dense_matrix.shape == (len(sample_set_fingerprints), n_bits)
-        assert isinstance(dense_matrix, np.ndarray)
-
-        for i, fp in enumerate(sample_set_fingerprints):
-            for bit_idx in fp:
-                assert dense_matrix[i, bit_idx] == 1.0
-
-    def test_rdkit_sparse_to_sklearn_auto(self, sample_set_fingerprints):
-        """Test auto format selection."""
-        n_bits = 2048  # Large enough to trigger sparse format
-        result = rdkit_sparse_to_sklearn(sample_set_fingerprints, n_bits=n_bits, output_format="auto")
-
-        # Should be sparse due to high sparsity
-        assert sparse.issparse(result)
-
-    def test_rdkit_sparse_to_sklearn_dense(self, sample_set_fingerprints):
-        """Test dense format selection."""
-        n_bits = 25
-        result = rdkit_sparse_to_sklearn(sample_set_fingerprints, n_bits=n_bits, output_format="dense")
-
-        assert isinstance(result, np.ndarray)
-        assert result.shape == (len(sample_set_fingerprints), n_bits)
-
-    def test_convert_fingerprints_convenience(self, sample_set_fingerprints):
-        """Test the convenience function."""
-        n_bits = 25
-        result = convert_fingerprints(sample_set_fingerprints, n_bits=n_bits)
-
-        assert result.shape[0] == len(sample_set_fingerprints)
-        assert result.shape[1] == n_bits
-
-    def test_single_fingerprint_conversion(self):
-        """Test conversion of a single fingerprint."""
-        fp = {1, 5, 10}
-        n_bits = 15
-
-        dense = rdkit_sparse_to_dense(fp, n_bits=n_bits)
-        assert dense.shape == (n_bits,)
-
-        csr = rdkit_sparse_to_csr(fp, n_bits=n_bits)
-        assert csr.shape == (1, n_bits)
-
-        numpy_result = rdkit_sparse_to_numpy(fp, n_bits=n_bits)
-        assert numpy_result.shape == (1, n_bits)
-
-    def test_empty_fingerprint_handling(self):
-        """Test handling of empty fingerprints."""
-        empty_fps = [set(), {}, []]
-        n_bits = 10
-
-        for fp in empty_fps:
-            dense = rdkit_sparse_to_dense(fp, n_bits=n_bits)
-            assert np.all(dense == 0)
-
-            csr = rdkit_sparse_to_csr([fp], n_bits=n_bits)
-            assert csr.nnz == 0
-
-    def test_list_fingerprints(self):
-        """Test handling of list-based fingerprints."""
-        # List of indices
-        fp_indices = [1, 5, 10, 15]
-        n_bits = 20
-
-        dense = rdkit_sparse_to_dense(fp_indices, n_bits=n_bits)
-        for idx in fp_indices:
-            assert dense[idx] == 1.0
-
-        # Full vector
-        fp_full = np.zeros(n_bits)
-        fp_full[[1, 5, 10]] = 1
-        dense_full = rdkit_sparse_to_dense(fp_full, n_bits=n_bits)
-        np.testing.assert_array_equal(dense_full, fp_full)
-
-    def test_bounds_checking(self):
-        """Test that out-of-bounds indices are ignored."""
-        fp = {1, 5, 100}  # 100 is out of bounds
-        n_bits = 10
-
-        dense = rdkit_sparse_to_dense(fp, n_bits=n_bits)
-        assert dense[1] == 1.0
-        assert dense[5] == 1.0
-        # Index 100 should be ignored, no error raised
-
-
-class TestRDKitFingerprintConverter:
-    """Test suite for the RDKitFingerprintConverter class."""
-
-    @pytest.fixture
-    def converter(self):
-        """Create a converter instance."""
-        return RDKitFingerprintConverter(n_bits=50, output_format="dense")
-
-    @pytest.fixture
-    def sample_fingerprints(self):
-        """Sample fingerprints for testing."""
-        return [
-            {1, 5, 10, 15},
-            {2, 6, 11, 16},
-            {1, 3, 7, 12},
-        ]
-
-    def test_converter_initialization(self):
-        """Test converter initialization."""
-        converter = RDKitFingerprintConverter(n_bits=1024, output_format="csr", dtype=np.int32)
-
-        assert converter.n_bits == 1024
-        assert converter.output_format == "csr"
-        assert converter.dtype == np.int32
-        assert converter.n_features_ == 1024
-
-    def test_converter_convert_method(self, converter, sample_fingerprints):
-        """Test the convert method."""
-        result = converter.convert(sample_fingerprints)
-
-        assert isinstance(result, np.ndarray)  # Dense format
-        assert result.shape == (len(sample_fingerprints), converter.n_bits)
-
-    def test_converter_to_dense(self, converter, sample_fingerprints):
-        """Test to_dense method."""
-        result = converter.to_dense(sample_fingerprints)
-        assert isinstance(result, np.ndarray)
-        assert result.shape == (len(sample_fingerprints), converter.n_bits)
-
-    def test_converter_to_csr(self, converter, sample_fingerprints):
-        """Test to_csr method."""
-        result = converter.to_csr(sample_fingerprints)
-        assert sparse.issparse(result)
-        assert result.format == "csr"
-
-    def test_converter_to_csc(self, converter, sample_fingerprints):
-        """Test to_csc method."""
-        result = converter.to_csc(sample_fingerprints)
-        assert sparse.issparse(result)
-        assert result.format == "csc"
-
-    def test_converter_get_sparsity(self, converter):
-        """Test sparsity calculation."""
-        # Dense matrix
-        dense = np.array([[1, 0, 0], [0, 1, 0]])
-        sparsity = converter.get_sparsity(dense)
-        assert abs(sparsity - 2 / 3) < 1e-10  # 4 zeros out of 6 elements
-
-        # Sparse matrix
-        sparse_matrix = sparse.csr_matrix([[1, 0, 0], [0, 1, 0]])
-        sparsity_sparse = converter.get_sparsity(sparse_matrix)
-        assert abs(sparsity_sparse - 2 / 3) < 1e-10
-
-    def test_converter_get_statistics(self, converter, sample_fingerprints):
-        """Test statistics calculation."""
-        stats = converter.get_statistics(sample_fingerprints)
-
-        assert "n_samples" in stats
-        assert "n_features" in stats
-        assert "sparsity" in stats
-        assert "avg_on_bits" in stats
-        assert "min_on_bits" in stats
-        assert "max_on_bits" in stats
-        assert "total_unique_bits" in stats
-
-        assert stats["n_samples"] == len(sample_fingerprints)
-        assert stats["n_features"] == converter.n_bits
-
-    def test_converter_validation_error(self, converter):
-        """Test validation error handling."""
-        # Should raise error for unsupported type (integer is not iterable)
-        with pytest.raises(ValueError):
-            converter._validate_fingerprints(42)
-
-    def test_format_override(self, converter, sample_fingerprints):
-        """Test output format override."""
-        # Converter default is 'dense', but override to 'csr'
-        result = converter.convert(sample_fingerprints, output_format="csr")
-        assert sparse.issparse(result)
-        assert result.format == "csr"
-
-
-class TestEdgeCases:
-    """Test edge cases and error conditions."""
-
-    def test_none_fingerprint(self):
-        """Test handling of None fingerprint."""
-        result = rdkit_sparse_to_dense(None, n_bits=10)
-        assert np.all(result == 0)
-
-        result_list = rdkit_sparse_to_numpy([None, {1, 2}], n_bits=10)
-        assert result_list.shape == (2, 10)
-        assert np.all(result_list[0] == 0)
-        assert result_list[1, 1] == 1
-
-    def test_invalid_format_error(self):
-        """Test error for invalid output format."""
-        with pytest.raises(ValueError, match="Unknown output_format"):
-            rdkit_sparse_to_sklearn([{1, 2}], output_format="invalid")
-
-    def test_unsupported_type_error(self):
-        """Test error for completely unsupported types."""
-        with pytest.raises(ValueError, match="Unsupported fingerprint type"):
-            rdkit_sparse_to_dense(42)  # Integer is not supported
-
-    def test_mixed_fingerprint_types(self):
-        """Test handling of mixed fingerprint types."""
-        mixed_fps = [
-            {1, 2, 3},  # set
-            {4: 1, 5: 2},  # dict
-            [6, 7, 8],  # list
-        ]
-
-        result = rdkit_sparse_to_numpy(mixed_fps, n_bits=10)
-        assert result.shape == (3, 10)
-
-        # Check each type was handled correctly
-        assert result[0, 1] == 1  # set
-        assert result[1, 4] == 1  # dict
-        assert result[2, 6] == 1  # list
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
diff --git a/tests/test_laplacian_nb_compatibility.py b/tests/test_laplacian_nb_compatibility.py
deleted file mode 100644
index 4f36efe..0000000
--- a/tests/test_laplacian_nb_compatibility.py
+++ /dev/null
@@ -1,365 +0,0 @@
-"""
-Tests to verify that the refactored LaplacianNB produces the same results
-as the original implementation.
-"""
-
-import numpy as np
-import pytest
-
-# Import the converters
-from laplaciannb.fingerprint_utils import convert_fingerprints
-
-# New version with sklearn-compatible input
-from laplaciannb.LaplacianNB_new import LaplacianNB as LaplacianNB_New
-
-# Import both versions of LaplacianNB
-# Original version with set-based operations
-from laplaciannb.legacy.LaplacianNB import LaplacianNB as LaplacianNB_Original
-
-
-class TestLaplacianNBCompatibility:
-    """Test suite to verify compatibility between old and new LaplacianNB implementations."""
-
-    @pytest.fixture
-    def generate_fingerprint_data(self):
-        """Generate test data in fingerprint format."""
-        np.random.seed(42)
-        n_samples = 100
-        n_bits = 256  # Smaller for faster testing
-
-        # Generate fingerprints as sets (original format)
-        X_sets = []
-        for _ in range(n_samples):
-            n_on_bits = np.random.randint(5, 30)
-            on_bits = set(np.random.choice(n_bits, n_on_bits, replace=False))
-            X_sets.append(on_bits)
-
-        # Generate labels
-        y = np.random.randint(0, 3, n_samples)
-
-        return X_sets, y, n_bits
-
-    def test_same_predictions_binary_classification(self, generate_fingerprint_data):
-        """Test that both versions give same predictions for binary classification."""
-        X_sets, y_multi, n_bits = generate_fingerprint_data
-
-        # Make binary labels
-        y = (y_multi > 0).astype(int)
-
-        # Train original model with sets
-        clf_original = LaplacianNB_Original(alpha=1.0)
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        # Convert to sklearn format for new model
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-
-        # Train new model
-        clf_new = LaplacianNB_New(alpha=1.0)
-        clf_new.fit(X_sklearn, y)
-
-        # Make predictions with both models
-        pred_original = clf_original.predict(np.array(X_sets[:20], dtype=object))
-        pred_new = clf_new.predict(X_sklearn[:20])
-
-        # Assert predictions are the same
-        np.testing.assert_array_equal(
-            pred_original, pred_new, err_msg="Binary predictions differ between implementations"
-        )
-
-    def test_same_predictions_multiclass(self, generate_fingerprint_data):
-        """Test that both versions give same predictions for multiclass classification."""
-        X_sets, y, n_bits = generate_fingerprint_data
-
-        # Train original model with sets
-        clf_original = LaplacianNB_Original(alpha=1.0)
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        # Convert to sklearn format for new model
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-
-        # Train new model
-        clf_new = LaplacianNB_New(alpha=1.0)
-        clf_new.fit(X_sklearn, y)
-
-        # Make predictions with both models
-        pred_original = clf_original.predict(np.array(X_sets[:20], dtype=object))
-        pred_new = clf_new.predict(X_sklearn[:20])
-
-        # Assert predictions are the same
-        np.testing.assert_array_equal(
-            pred_original, pred_new, err_msg="Multiclass predictions differ between implementations"
-        )
-
-    def test_same_probabilities(self, generate_fingerprint_data):
-        """Test that both versions give same probability estimates."""
-        X_sets, y, n_bits = generate_fingerprint_data
-
-        # Train original model with sets
-        clf_original = LaplacianNB_Original(alpha=1.0)
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        # Convert to sklearn format for new model
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-
-        # Train new model
-        clf_new = LaplacianNB_New(alpha=1.0)
-        clf_new.fit(X_sklearn, y)
-
-        # Get probabilities from both models
-        test_samples = 10
-        prob_original = clf_original.predict_proba(np.array(X_sets[:test_samples], dtype=object))
-        prob_new = clf_new.predict_proba(X_sklearn[:test_samples])
-
-        # Assert probabilities are very close (allowing for floating point differences)
-        np.testing.assert_allclose(
-            prob_original,
-            prob_new,
-            rtol=1e-5,
-            atol=1e-8,
-            err_msg="Probability estimates differ between implementations",
-        )
-
-    def test_same_log_probabilities(self, generate_fingerprint_data):
-        """Test that both versions give same log probability estimates."""
-        X_sets, y, n_bits = generate_fingerprint_data
-
-        # Train original model with sets
-        clf_original = LaplacianNB_Original(alpha=1.0)
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        # Convert to sklearn format for new model
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-
-        # Train new model
-        clf_new = LaplacianNB_New(alpha=1.0)
-        clf_new.fit(X_sklearn, y)
-
-        # Get log probabilities from both models
-        test_samples = 10
-        log_prob_original = clf_original.predict_log_proba(np.array(X_sets[:test_samples], dtype=object))
-        log_prob_new = clf_new.predict_log_proba(X_sklearn[:test_samples])
-
-        # Assert log probabilities are very close
-        np.testing.assert_allclose(
-            log_prob_original,
-            log_prob_new,
-            rtol=1e-4,
-            atol=1e-7,
-            err_msg="Log probability estimates differ between implementations",
-        )
-
-    def test_different_alpha_values(self, generate_fingerprint_data):
-        """Test consistency across different smoothing parameters."""
-        X_sets, y, n_bits = generate_fingerprint_data
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-
-        alpha_values = [0.1, 0.5, 1.0, 2.0, 10.0]
-
-        for alpha in alpha_values:
-            # Train both models
-            clf_original = LaplacianNB_Original(alpha=alpha)
-            clf_original.fit(np.array(X_sets, dtype=object), y)
-
-            clf_new = LaplacianNB_New(alpha=alpha)
-            clf_new.fit(X_sklearn, y)
-
-            # Compare predictions
-            pred_original = clf_original.predict(np.array(X_sets[:20], dtype=object))
-            pred_new = clf_new.predict(X_sklearn[:20])
-
-            np.testing.assert_array_equal(pred_original, pred_new, err_msg=f"Predictions differ for alpha={alpha}")
-
-    def test_sample_weights(self, generate_fingerprint_data):
-        """Test that both versions handle sample weights the same way."""
-        X_sets, y, n_bits = generate_fingerprint_data
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-
-        # Create sample weights
-        sample_weight = np.random.rand(len(y))
-
-        # Train both models with sample weights
-        clf_original = LaplacianNB_Original(alpha=1.0)
-        clf_original.fit(np.array(X_sets, dtype=object), y, sample_weight=sample_weight)
-
-        clf_new = LaplacianNB_New(alpha=1.0)
-        clf_new.fit(X_sklearn, y, sample_weight=sample_weight)
-
-        # Compare predictions
-        pred_original = clf_original.predict(np.array(X_sets[:20], dtype=object))
-        pred_new = clf_new.predict(X_sklearn[:20])
-
-        np.testing.assert_array_equal(pred_original, pred_new, err_msg="Predictions differ when using sample weights")
-
-    def test_sparse_vs_dense_input(self, generate_fingerprint_data):
-        """Test that new version gives same results with sparse and dense input."""
-        X_sets, y, n_bits = generate_fingerprint_data
-
-        # Convert to both sparse and dense formats
-        X_sparse = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-        X_dense = convert_fingerprints(X_sets, n_bits=n_bits, output_format="dense")
-
-        # Train new model with both formats
-        clf_sparse = LaplacianNB_New(alpha=1.0)
-        clf_sparse.fit(X_sparse, y)
-
-        clf_dense = LaplacianNB_New(alpha=1.0)
-        clf_dense.fit(X_dense, y)
-
-        # Compare predictions
-        pred_sparse = clf_sparse.predict(X_sparse[:20])
-        pred_dense = clf_dense.predict(X_dense[:20])
-
-        np.testing.assert_array_equal(
-            pred_sparse, pred_dense, err_msg="Predictions differ between sparse and dense input"
-        )
-
-        # Compare probabilities
-        prob_sparse = clf_sparse.predict_proba(X_sparse[:20])
-        prob_dense = clf_dense.predict_proba(X_dense[:20])
-
-        np.testing.assert_allclose(
-            prob_sparse,
-            prob_dense,
-            rtol=1e-5,
-            atol=1e-8,
-            err_msg="Probabilities differ between sparse and dense input",
-        )
-
-    def test_feature_counting_consistency(self, generate_fingerprint_data):
-        """Test that feature counting is consistent between implementations."""
-        X_sets, y, n_bits = generate_fingerprint_data
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-
-        # Train both models
-        clf_original = LaplacianNB_Original(alpha=1.0)
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        clf_new = LaplacianNB_New(alpha=1.0)
-        clf_new.fit(X_sklearn, y)
-
-        # Check that feature counts are consistent
-        # The original stores counts differently, but total counts should match
-        assert clf_original.feature_all_ == clf_new.feature_all_, "Total feature counts differ between implementations"
-
-        # Check class counts
-        np.testing.assert_allclose(
-            clf_original.class_count_, clf_new.class_count_, err_msg="Class counts differ between implementations"
-        )
-
-    def test_single_class_edge_case(self):
-        """Test handling of degenerate case with single class."""
-        np.random.seed(42)
-        n_samples = 20
-        n_bits = 128
-
-        # Generate fingerprints
-        X_sets = []
-        for _ in range(n_samples):
-            n_on_bits = np.random.randint(5, 15)
-            on_bits = set(np.random.choice(n_bits, n_on_bits, replace=False))
-            X_sets.append(on_bits)
-
-        # Single class labels
-        y = np.ones(n_samples)
-
-        # Train both models
-        clf_original = LaplacianNB_Original(alpha=1.0)
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-        clf_new = LaplacianNB_New(alpha=1.0)
-        clf_new.fit(X_sklearn, y)
-
-        # Both should predict the same class
-        pred_original = clf_original.predict(np.array(X_sets[:5], dtype=object))
-        pred_new = clf_new.predict(X_sklearn[:5])
-
-        np.testing.assert_array_equal(pred_original, pred_new, err_msg="Single class predictions differ")
-
-    def test_empty_features_handling(self):
-        """Test handling of samples with no active features."""
-        n_bits = 128
-
-        # Create samples with some empty fingerprints
-        X_sets = [
-            {1, 2, 3},
-            set(),  # Empty fingerprint
-            {5, 10},
-            set(),  # Another empty
-            {20, 30, 40},
-        ]
-        y = [0, 0, 1, 1, 1]
-
-        # Train both models
-        clf_original = LaplacianNB_Original(alpha=1.0)
-        clf_original.fit(np.array(X_sets, dtype=object), y)
-
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits, output_format="csr")
-        clf_new = LaplacianNB_New(alpha=1.0)
-        clf_new.fit(X_sklearn, y)
-
-        # Make predictions
-        pred_original = clf_original.predict(np.array(X_sets, dtype=object))
-        pred_new = clf_new.predict(X_sklearn)
-
-        np.testing.assert_array_equal(pred_original, pred_new, err_msg="Predictions differ with empty fingerprints")
-
-
-def run_compatibility_tests():
-    """Run all compatibility tests and report results."""
-
-    print("Running LaplacianNB Compatibility Tests")
-    print("=" * 60)
-
-    # Run tests using pytest
-    test = TestLaplacianNBCompatibility()
-
-    # Generate test data
-    np.random.seed(42)
-    n_samples = 100
-    n_bits = 256
-    X_sets = []
-    for _ in range(n_samples):
-        n_on_bits = np.random.randint(5, 30)
-        on_bits = set(np.random.choice(n_bits, n_on_bits, replace=False))
-        X_sets.append(on_bits)
-    y = np.random.randint(0, 3, n_samples)
-
-    test_data = (X_sets, y, n_bits)
-
-    tests = [
-        ("Binary Classification", test.test_same_predictions_binary_classification),
-        ("Multiclass Classification", test.test_same_predictions_multiclass),
-        ("Probability Estimates", test.test_same_probabilities),
-        ("Log Probability Estimates", test.test_same_log_probabilities),
-        ("Different Alpha Values", test.test_different_alpha_values),
-        ("Sample Weights", test.test_sample_weights),
-        ("Sparse vs Dense Input", test.test_sparse_vs_dense_input),
-        ("Feature Counting", test.test_feature_counting_consistency),
-        ("Single Class Edge Case", test.test_single_class_edge_case),
-        ("Empty Features", test.test_empty_features_handling),
-    ]
-
-    passed = 0
-    failed = 0
-
-    for test_name, test_func in tests:
-        try:
-            if test_func.__name__ in ["test_single_class_edge_case", "test_empty_features_handling"]:
-                test_func()
-            else:
-                test_func(test_data)
-            print(f"✓ {test_name} PASSED")
-            passed += 1
-        except Exception as e:
-            print(f"✗ {test_name} FAILED: {str(e)}")
-            failed += 1
-
-    print("\n" + "=" * 60)
-    print(f"Results: {passed} passed, {failed} failed")
-
-    return passed, failed
-
-
-if __name__ == "__main__":
-    run_compatibility_tests()
diff --git a/tests/test_laplacian_nb_standalone.py b/tests/test_laplacian_nb_standalone.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/test_main_imports.py b/tests/test_main_imports.py
deleted file mode 100644
index 7001f13..0000000
--- a/tests/test_main_imports.py
+++ /dev/null
@@ -1,74 +0,0 @@
-"""
-Test the main import paths and ensure proper version selection.
-"""
-
-import warnings
-
-import pytest
-
-from laplaciannb.fingerprint_utils import convert_fingerprints
-
-
-def test_main_import_gives_new_version():
-    """Test that importing from main module gives the new sklearn-compatible version."""
-    from laplaciannb import LaplacianNB
-
-    # Should be the new implementation (sklearn-compatible)
-    assert LaplacianNB.__module__ == "laplaciannb.LaplacianNB_new"
-
-    # Should have sklearn-style attributes after fitting
-    X_sets = [{1, 2, 3}, {4, 5, 6}, {1, 4, 7}]
-    y = [0, 1, 0]
-    X = convert_fingerprints(X_sets, n_bits=10)
-
-    clf = LaplacianNB()
-    clf.fit(X, y)
-
-    # Should have sklearn-style attributes
-    assert hasattr(clf, "classes_")
-    assert hasattr(clf, "n_features_in_")
-    assert hasattr(clf, "feature_log_prob_")
-
-
-def test_legacy_import_gives_legacy_version():
-    """Test that importing from legacy module gives the legacy version."""
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")  # Suppress deprecation warnings
-        from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-    # Should be the legacy implementation
-    assert "legacy" in LegacyLaplacianNB.__module__
-
-
-def test_fingerprint_utils_available():
-    """Test that fingerprint utilities are available from main module."""
-    from laplaciannb import (
-        FingerprintTransformer,
-        RDKitFingerprintConverter,
-        convert_fingerprints,
-        rdkit_sparse_to_csr,
-        rdkit_sparse_to_dense,
-        rdkit_sparse_to_numpy,
-        rdkit_sparse_to_sklearn,
-    )
-
-    # All should be callable
-    assert callable(FingerprintTransformer)
-    assert callable(RDKitFingerprintConverter)
-    assert callable(convert_fingerprints)
-    assert callable(rdkit_sparse_to_dense)
-    assert callable(rdkit_sparse_to_csr)
-    assert callable(rdkit_sparse_to_numpy)
-    assert callable(rdkit_sparse_to_sklearn)
-
-
-def test_version_info_available():
-    """Test that version information is available."""
-    from laplaciannb import __version__
-
-    assert isinstance(__version__, str)
-    assert len(__version__) > 0
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
diff --git a/tests/test_performance_comparison.py b/tests/test_performance_comparison.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/test_sklearn_integration.py b/tests/test_sklearn_integration.py
deleted file mode 100644
index 53f571d..0000000
--- a/tests/test_sklearn_integration.py
+++ /dev/null
@@ -1,519 +0,0 @@
-"""
-Comprehensive sklearn integration test suite for LaplacianNB_new implementation.
-
-This test suite validates that LaplacianNB_new works seamlessly with sklearn's
-ecosystem including pipelines, cross-validation, grid search, and other tools.
-Based on scenarios from bayes_test.py but extended for sklearn compatibility.
-"""
-
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-import pytest
-from numpy.testing import assert_allclose, assert_array_equal
-from sklearn.base import clone
-from sklearn.exceptions import NotFittedError
-from sklearn.metrics import classification_report
-from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
-from sklearn.pipeline import Pipeline
-
-from laplaciannb.fingerprint_utils import FingerprintTransformer, convert_fingerprints
-from laplaciannb.LaplacianNB_new import LaplacianNB as LaplacianNB_New
-
-
-class TestSklearnIntegration:
-    """Test sklearn ecosystem integration for LaplacianNB_new."""
-
-    @pytest.fixture
-    def simple_fingerprint_data(self):
-        """Create simple synthetic fingerprint data for testing."""
-        np.random.seed(42)
-        n_samples = 100
-        max_bits = 50
-
-        X_sets = []
-        y = []
-
-        for i in range(n_samples):
-            # Create sparse fingerprint (3-8 bits set)
-            n_bits_set = np.random.randint(3, 9)
-            fingerprint = set(np.random.choice(max_bits, n_bits_set, replace=False))
-            X_sets.append(fingerprint)
-            # Target based on fingerprint characteristics
-            y.append(1 if len(fingerprint) > 5 else 0)
-
-        # Convert to sklearn format (defaults to sparse CSR now)
-        X_sklearn = convert_fingerprints(X_sets, n_bits=max_bits)  # Defaults to CSR sparse
-        y = np.array(y)
-
-        return X_sklearn, y, X_sets
-
-    @pytest.fixture
-    def multiclass_fingerprint_data(self):
-        """Create multiclass synthetic fingerprint data."""
-        np.random.seed(123)
-        n_samples = 150
-        max_bits = 100
-
-        X_sets = []
-        y = []
-
-        for i in range(n_samples):
-            n_bits_set = np.random.randint(5, 15)
-            fingerprint = set(np.random.choice(max_bits, n_bits_set, replace=False))
-            X_sets.append(fingerprint)
-
-            # Three classes based on different criteria
-            if len(fingerprint) < 8:
-                target = 0
-            elif len(fingerprint) < 12:
-                target = 1
-            else:
-                target = 2
-            y.append(target)
-
-        X_sklearn = convert_fingerprints(X_sets, n_bits=max_bits)  # Defaults to CSR sparse
-        y = np.array(y)
-
-        return X_sklearn, y, X_sets
-
-    def test_basic_sklearn_interface(self, simple_fingerprint_data):
-        """Test basic sklearn interface compliance."""
-        X, y, _ = simple_fingerprint_data
-
-        clf = LaplacianNB_New()
-
-        # Test basic fit/predict cycle
-        clf.fit(X, y)
-        predictions = clf.predict(X)
-        probabilities = clf.predict_proba(X)
-        log_probabilities = clf.predict_log_proba(X)
-
-        # Validate shapes and types
-        assert predictions.shape == (X.shape[0],)
-        assert probabilities.shape == (X.shape[0], 2)
-        assert log_probabilities.shape == (X.shape[0], 2)
-        assert isinstance(predictions, np.ndarray)
-        assert isinstance(probabilities, np.ndarray)
-        assert isinstance(log_probabilities, np.ndarray)
-
-        # Validate probability constraints
-        assert np.allclose(probabilities.sum(axis=1), 1.0)
-        assert np.all(probabilities >= 0)
-        assert np.all(probabilities <= 1)
-
-    def test_sklearn_estimator_checks(self):
-        """Test that the estimator passes sklearn's built-in checks."""
-        # Note: We'll run a subset of checks since some may not apply to our specific use case
-        try:
-            clf = LaplacianNB_New()
-            # Test basic estimator properties
-            assert hasattr(clf, "fit")
-            assert hasattr(clf, "predict")
-            assert hasattr(clf, "predict_proba")
-            assert callable(clf.fit)
-            assert callable(clf.predict)
-            assert callable(clf.predict_proba)
-        except Exception as e:
-            pytest.fail(f"Basic estimator checks failed: {e}")
-
-    def test_pipeline_integration(self, simple_fingerprint_data):
-        """Test integration with sklearn pipelines."""
-        X, y, _ = simple_fingerprint_data
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
-
-        # Create pipeline (Note: StandardScaler doesn't make sense for sparse binary data,
-        # but we'll use it to test pipeline compatibility)
-        pipeline = Pipeline([("classifier", LaplacianNB_New(alpha=1.0))])
-
-        # Fit and predict
-        pipeline.fit(X_train, y_train)
-        predictions = pipeline.predict(X_test)
-        probabilities = pipeline.predict_proba(X_test)
-
-        # Validate results
-        assert predictions.shape == (X_test.shape[0],)
-        assert probabilities.shape == (X_test.shape[0], 2)
-
-        # Test pipeline parameters
-        pipeline.set_params(classifier__alpha=2.0)
-        assert pipeline.named_steps["classifier"].alpha == 2.0
-
-    def test_cross_validation(self, simple_fingerprint_data):
-        """Test cross-validation compatibility."""
-        X, y, _ = simple_fingerprint_data
-
-        clf = LaplacianNB_New(alpha=1.0)
-
-        # Perform cross-validation
-        cv_scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
-
-        # Validate results
-        assert len(cv_scores) == 5
-        assert np.all(cv_scores >= 0)
-        assert np.all(cv_scores <= 1)
-
-        print(f"CV Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
-
-    def test_grid_search_cv(self, simple_fingerprint_data):
-        """Test grid search cross-validation."""
-        X, y, _ = simple_fingerprint_data
-
-        clf = LaplacianNB_New()
-
-        # Define parameter grid
-        param_grid = {"alpha": [0.1, 0.5, 1.0, 2.0, 5.0]}
-
-        # Perform grid search
-        grid_search = GridSearchCV(clf, param_grid, cv=3, scoring="accuracy")
-        grid_search.fit(X, y)
-
-        # Validate results
-        assert hasattr(grid_search, "best_params_")
-        assert hasattr(grid_search, "best_score_")
-        assert grid_search.best_params_["alpha"] in param_grid["alpha"]
-
-        print(f"Best parameters: {grid_search.best_params_}")
-        print(f"Best score: {grid_search.best_score_:.3f}")
-
-    def test_multiclass_classification(self, multiclass_fingerprint_data):
-        """Test multiclass classification."""
-        X, y, _ = multiclass_fingerprint_data
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
-
-        clf = LaplacianNB_New()
-        clf.fit(X_train, y_train)
-
-        predictions = clf.predict(X_test)
-        probabilities = clf.predict_proba(X_test)
-
-        # Validate multiclass results
-        assert probabilities.shape == (X_test.shape[0], 3)  # 3 classes
-        assert np.allclose(probabilities.sum(axis=1), 1.0)
-        assert len(np.unique(predictions)) <= 3
-
-        # Test classification report
-        report = classification_report(y_test, predictions, output_dict=True)
-        assert "accuracy" in report
-
-        print(f"Multiclass accuracy: {report['accuracy']:.3f}")
-
-    def test_sample_weights(self, simple_fingerprint_data):
-        """Test sample weight functionality."""
-        X, y, _ = simple_fingerprint_data
-
-        # Just verify that the fit method accepts sample weights without error
-        sample_weights = np.where(y == 1, 2.0, 1.0)
-
-        clf = LaplacianNB_New()
-
-        # This should not raise an error
-        clf.fit(X, y, sample_weight=sample_weights)
-
-        # Basic functionality should still work
-        predictions = clf.predict(X)
-        probabilities = clf.predict_proba(X)
-
-        assert predictions.shape == (X.shape[0],)
-        assert probabilities.shape == (X.shape[0], 2)
-
-        print("✓ Sample weights accepted and basic functionality works")
-
-    def test_clone_compatibility(self, simple_fingerprint_data):
-        """Test sklearn clone functionality."""
-        X, y, _ = simple_fingerprint_data
-
-        clf_original = LaplacianNB_New(alpha=2.0)
-        clf_original.fit(X, y)
-
-        # Clone the estimator
-        clf_cloned = clone(clf_original)
-
-        # Cloned estimator should not be fitted
-        with pytest.raises(NotFittedError):
-            clf_cloned.predict(X)
-
-        # Parameters should be copied
-        assert clf_cloned.alpha == clf_original.alpha
-
-        # After fitting, cloned estimator should work
-        clf_cloned.fit(X, y)
-        pred_original = clf_original.predict(X)
-        pred_cloned = clf_cloned.predict(X)
-
-        # Results should be identical
-        assert_array_equal(pred_original, pred_cloned)
-
-    def test_different_sparse_formats(self, simple_fingerprint_data):
-        """Test compatibility with different sparse matrix formats."""
-        _, y, X_sets = simple_fingerprint_data
-
-        # Test different sparse formats and dense
-        formats = {"csr": "csr", "csc": "csc", "dense": "dense"}
-        results = {}
-
-        for name, fmt in formats.items():
-            X_converted = convert_fingerprints(X_sets, n_bits=50, output_format=fmt)
-            clf = LaplacianNB_New()
-            clf.fit(X_converted, y)
-            results[name] = clf.predict(X_converted)
-
-            # Verify format
-            if name == "dense":
-                assert isinstance(X_converted, np.ndarray)
-                assert X_converted.ndim == 2
-            else:
-                assert hasattr(X_converted, "format")
-                assert X_converted.format == fmt
-
-        # Results should be identical regardless of format
-        assert_array_equal(results["csr"], results["csc"])
-        assert_array_equal(results["csr"], results["dense"])
-
-        print("✓ All sparse/dense formats produce identical results")
-
-    def test_sparsity_preservation(self, simple_fingerprint_data):
-        """Test that sparse fingerprints remain sparse by default."""
-        _, y, X_sets = simple_fingerprint_data
-
-        # Default conversion should produce sparse matrix
-        X_default = convert_fingerprints(X_sets, n_bits=50)
-        assert hasattr(X_default, "format"), "Default conversion should produce sparse matrix"
-        assert X_default.format == "csr", "Default should be CSR format"
-
-        # Check sparsity
-        sparsity = 1.0 - (X_default.nnz / (X_default.shape[0] * X_default.shape[1]))
-        print(f"Sparsity: {sparsity:.2%}")
-        assert sparsity > 0.8, "Molecular fingerprints should be very sparse"
-
-        # Explicit dense conversion should work
-        X_dense = convert_fingerprints(X_sets, n_bits=50, output_format="dense")
-        assert isinstance(X_dense, np.ndarray), "Explicit dense conversion should work"
-
-        # Results should be equivalent
-        clf_sparse = LaplacianNB_New()
-        clf_dense = LaplacianNB_New()
-
-        clf_sparse.fit(X_default, y)
-        clf_dense.fit(X_dense, y)
-
-        pred_sparse = clf_sparse.predict(X_default)
-        pred_dense = clf_dense.predict(X_dense)
-
-        assert_array_equal(pred_sparse, pred_dense)
-        print("✓ Sparse and dense give identical predictions")
-
-    def test_edge_cases_sklearn_compatibility(self):
-        """Test edge cases for sklearn compatibility."""
-        # Single sample
-        X_single = convert_fingerprints([{1, 2, 3}], n_bits=10, output_format="csr")
-        y_single = np.array([1])
-
-        clf = LaplacianNB_New()
-        clf.fit(X_single, y_single)
-        pred = clf.predict(X_single)
-        prob = clf.predict_proba(X_single)
-
-        assert pred.shape == (1,)
-        assert prob.shape == (1, 1)  # Single class
-
-        # Empty features
-        X_empty = convert_fingerprints([set(), {1}, set()], n_bits=10, output_format="csr")
-        y_empty = np.array([0, 1, 0])
-
-        clf_empty = LaplacianNB_New()
-        clf_empty.fit(X_empty, y_empty)
-        pred_empty = clf_empty.predict(X_empty)
-
-        assert pred_empty.shape == (3,)
-
-    def test_rdkit_sklearn_pipeline(self):
-        """Test full pipeline with RDKit fingerprints (if available)."""
-        pytest.importorskip("rdkit", reason="RDKit required for this test")
-        from rdkit import Chem
-        from rdkit.Chem import rdFingerprintGenerator
-
-        def get_fp(smiles: str, n_bits: int = 1024) -> set:
-            """Calculate folded Morgan fingerprint from SMILES with fixed size."""
-            mol = Chem.MolFromSmiles(smiles)
-            if mol is None:
-                return set()
-            # Use folded fingerprint for memory efficiency
-            mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)
-            fp = mfpgen.GetFingerprint(mol)
-            return set(fp.GetOnBits())
-
-        # Check if test data exists
-        DATA_PATH = Path(__file__).parent / "data"
-        test_file = DATA_PATH / "smiles_test.csv"
-
-        if not test_file.exists():
-            pytest.skip(f"Test data file not found: {test_file}")
-
-        # Load and process small subset for testing
-        df = pd.read_csv(test_file)
-        df_subset = df.head(50).copy()  # Use copy() to avoid pandas warning
-
-        # Fixed fingerprint size for memory efficiency
-        n_bits = 1024
-        df_subset["fingerprints"] = df_subset["smiles"].apply(lambda x: get_fp(x, n_bits))
-        X_sets = df_subset["fingerprints"].tolist()
-        y = df_subset["activity"].values
-
-        # Convert to sklearn format (sparse CSR with fixed size)
-        X_sklearn = convert_fingerprints(X_sets, n_bits=n_bits)  # Default to sparse CSR
-
-        # Create and test pipeline
-        pipeline = Pipeline([("classifier", LaplacianNB_New(alpha=1.0))])
-
-        # Cross-validation
-        cv_scores = cross_val_score(pipeline, X_sklearn, y, cv=3, scoring="accuracy")
-
-        print(f"RDKit pipeline CV accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
-        assert len(cv_scores) == 3
-
-    def test_stratified_cross_validation(self, simple_fingerprint_data):
-        """Test stratified cross-validation for imbalanced datasets."""
-        X, y, _ = simple_fingerprint_data
-
-        # Create imbalanced dataset
-        mask = y == 1
-        # Keep only 20% of class 1 samples
-        indices_to_keep = np.where(~mask)[0].tolist()
-        indices_to_keep.extend(np.where(mask)[0][: int(mask.sum() * 0.2)].tolist())
-
-        X_imbalanced = X[indices_to_keep]
-        y_imbalanced = y[indices_to_keep]
-
-        clf = LaplacianNB_New()
-
-        # Stratified cross-validation
-        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
-        cv_scores = cross_val_score(clf, X_imbalanced, y_imbalanced, cv=skf, scoring="accuracy")
-
-        assert len(cv_scores) == 3
-        print(f"Stratified CV on imbalanced data: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
-
-    def test_feature_importance_attributes(self, simple_fingerprint_data):
-        """Test that model provides access to feature importance information."""
-        X, y, _ = simple_fingerprint_data
-
-        clf = LaplacianNB_New()
-        clf.fit(X, y)
-
-        # Check that we can access feature log probabilities
-        assert hasattr(clf, "feature_log_prob_")
-        assert hasattr(clf, "class_log_prior_")
-        assert hasattr(clf, "classes_")
-
-        # Validate shapes
-        n_classes = len(np.unique(y))
-        n_features = X.shape[1]
-
-        assert clf.feature_log_prob_.shape == (n_classes, n_features)
-        assert clf.class_log_prior_.shape == (n_classes,)
-        assert len(clf.classes_) == n_classes
-
-    def test_pipeline_with_feature_selection(self, simple_fingerprint_data):
-        """Test pipeline with feature selection (simulated)."""
-        X, y, _ = simple_fingerprint_data
-
-        # Since sklearn feature selection doesn't work well with our sparse binary data,
-        # we'll simulate by using a subset of features
-        n_features_selected = 30
-        X_reduced = X[:, :n_features_selected]
-
-        pipeline = Pipeline([("classifier", LaplacianNB_New(alpha=1.0))])
-
-        # Test that it works with reduced features
-        pipeline.fit(X_reduced, y)
-        predictions = pipeline.predict(X_reduced)
-
-        assert predictions.shape == (X_reduced.shape[0],)
-
-    def test_reproducibility(self, simple_fingerprint_data):
-        """Test that results are reproducible."""
-        X, y, _ = simple_fingerprint_data
-
-        clf1 = LaplacianNB_New(alpha=1.0)
-        clf2 = LaplacianNB_New(alpha=1.0)
-
-        clf1.fit(X, y)
-        clf2.fit(X, y)
-
-        pred1 = clf1.predict(X)
-        pred2 = clf2.predict(X)
-
-        prob1 = clf1.predict_proba(X)
-        prob2 = clf2.predict_proba(X)
-
-        # Results should be identical
-        assert_array_equal(pred1, pred2)
-        assert_allclose(prob1, prob2)
-
-    def test_fingerprint_transformer(self, simple_fingerprint_data):
-        """Test the FingerprintTransformer sklearn interface."""
-        _, y, X_sets = simple_fingerprint_data
-
-        # Test basic transformer functionality
-        transformer = FingerprintTransformer(n_bits=50, output_format="csr")
-
-        # Test fit/transform
-        X_transformed = transformer.fit_transform(X_sets)
-        assert hasattr(X_transformed, "format")
-        assert X_transformed.format == "csr"
-        assert X_transformed.shape == (len(X_sets), 50)
-
-        # Test separate fit/transform
-        transformer2 = FingerprintTransformer(n_bits=50, output_format="dense")
-        transformer2.fit(X_sets)
-        X_dense = transformer2.transform(X_sets)
-        assert isinstance(X_dense, np.ndarray)
-        assert X_dense.shape == (len(X_sets), 50)
-
-        # Test get_feature_names_out
-        feature_names = transformer.get_feature_names_out()
-        assert len(feature_names) == 50
-        assert feature_names[0] == "bit_0"
-        assert feature_names[49] == "bit_49"
-
-        # Test sklearn pipeline integration
-        pipeline = Pipeline([("fingerprints", FingerprintTransformer(n_bits=50)), ("classifier", LaplacianNB_New())])
-
-        pipeline.fit(X_sets, y)
-        predictions = pipeline.predict(X_sets)
-        assert predictions.shape == (len(X_sets),)
-
-        # Test cross-validation with pipeline
-        cv_scores = cross_val_score(pipeline, X_sets, y, cv=3)
-        assert len(cv_scores) == 3
-
-        print("✓ FingerprintTransformer sklearn integration works perfectly")
-
-    def test_transformer_pipeline_with_grid_search(self, simple_fingerprint_data):
-        """Test FingerprintTransformer in grid search pipeline."""
-        _, y, X_sets = simple_fingerprint_data
-
-        # Create pipeline with transformer
-        pipeline = Pipeline([("fingerprints", FingerprintTransformer()), ("classifier", LaplacianNB_New())])
-
-        # Grid search with transformer and classifier parameters
-        param_grid = {
-            "fingerprints__n_bits": [25, 50],
-            "fingerprints__output_format": ["csr", "dense"],
-            "classifier__alpha": [0.5, 1.0],
-        }
-
-        grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring="accuracy")
-        grid_search.fit(X_sets, y)
-
-        assert hasattr(grid_search, "best_params_")
-        assert hasattr(grid_search, "best_score_")
-
-        print(f"Best transformer pipeline params: {grid_search.best_params_}")
-        print("✓ Grid search with FingerprintTransformer works")
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])

From 26712591e572d01b1e2e79932734aa3bab137edd Mon Sep 17 00:00:00 2001
From: Bartosz Baranowski <bartekbaranow@gmail.com>
Date: Wed, 20 Aug 2025 16:49:52 +0200
Subject: [PATCH 2/8] sync the file naming with sklearn

---
 src/laplaciannb/__init__.py                  |  2 +-
 src/laplaciannb/{LaplacianNB.py => bayes.py} | 18 ++++---
 src/laplaciannb/fingerprint_utils.py         | 18 ++++---
 src/laplaciannb/legacy/LaplacianNB.py        |  1 -
 src/laplaciannb/legacy/__init__.py           | 50 --------------------
 tests/test_bayes.py                          | 28 +++++------
 tests/test_fingerprint_csr_conversion.py     | 14 +++---
 7 files changed, 38 insertions(+), 93 deletions(-)
 rename src/laplaciannb/{LaplacianNB.py => bayes.py} (99%)
 delete mode 100644 src/laplaciannb/legacy/__init__.py

diff --git a/src/laplaciannb/__init__.py b/src/laplaciannb/__init__.py
index dc8e078..3c2c15c 100644
--- a/src/laplaciannb/__init__.py
+++ b/src/laplaciannb/__init__.py
@@ -17,8 +17,8 @@
 - Enhanced fingerprint utility functions
 """
 
+from .bayes import LaplacianNB
 from .fingerprint_utils import rdkit_to_csr
-from .laplaciannb import LaplacianNB
 
 
 __version__ = "0.7.0"
diff --git a/src/laplaciannb/LaplacianNB.py b/src/laplaciannb/bayes.py
similarity index 99%
rename from src/laplaciannb/LaplacianNB.py
rename to src/laplaciannb/bayes.py
index 33442f4..2b0b8d4 100644
--- a/src/laplaciannb/LaplacianNB.py
+++ b/src/laplaciannb/bayes.py
@@ -1,6 +1,4 @@
-import warnings
 from functools import reduce
-from itertools import compress
 
 import numpy as np
 from scipy.special import logsumexp
@@ -131,40 +129,40 @@ def reducer(accumulator, element):
     def _count_feature_count(self, X_sparse, Y):
         """Most efficient version that handles 2^32 feature space gracefully."""
         from collections import defaultdict
-        
+
         # Get active features to avoid working with full 2^32 space
         X_coo = X_sparse.tocoo()
-        
+
         # 1. Total feature counts
         all_feature_counts = defaultdict(int)
         for col_idx, data_val in zip(X_coo.col, X_coo.data):
             all_feature_counts[col_idx] += data_val
         all_feature_counts = dict(sorted(all_feature_counts.items()))
-        
+
         # 2. Class-specific counts by iterating samples
         class_feature_counts = [defaultdict(int) for _ in range(len(self.classes_))]
         feature_sum = np.zeros(len(self.classes_))
-        
+
         # Group elements by sample (row)
         sample_features = defaultdict(list)
         for row_idx, col_idx, data_val in zip(X_coo.row, X_coo.col, X_coo.data):
             sample_features[row_idx].append((col_idx, data_val))
-        
+
         # Count features per class
         for sample_idx, features in sample_features.items():
             # Find which classes this sample belongs to
             sample_classes = Y[sample_idx].nonzero()[0]
-            
+
             for class_idx in sample_classes:
                 class_weight = Y[sample_idx, class_idx]
                 for col_idx, data_val in features:
                     weighted_count = data_val * class_weight
                     class_feature_counts[class_idx][col_idx] += weighted_count
                     feature_sum[class_idx] += weighted_count
-        
+
         # Convert to sorted dictionaries
         class_feature_counts = [dict(sorted(d.items())) for d in class_feature_counts]
-        
+
         return all_feature_counts, feature_sum, class_feature_counts
 
     def _init_counters(self, n_classes):
diff --git a/src/laplaciannb/fingerprint_utils.py b/src/laplaciannb/fingerprint_utils.py
index 83e2eb8..8709011 100644
--- a/src/laplaciannb/fingerprint_utils.py
+++ b/src/laplaciannb/fingerprint_utils.py
@@ -1,33 +1,31 @@
 import numpy as np
-from scipy.sparse import csr_matrix
-from rdkit.Chem import rdFingerprintGenerator
 from rdkit import Chem
+from rdkit.Chem import rdFingerprintGenerator
+from scipy.sparse import csr_matrix
 
 
 def rdkit_to_csr(smiles_list, radius=2):
     """Convert RDKit sparse Morgan fingerprints to CSR matrix with lossless conversion."""
     row_ind = []
     col_ind = []
-    
+
     # Create Morgan fingerprint generator
     mol_list = [Chem.MolFromSmiles(smi) for smi in smiles_list]
     mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius)
-    
+
     for i, mol in enumerate(mol_list):
         if mol is None:
             continue
-            
+
         # Get sparse fingerprint
         sfp = mfpgen.GetSparseFingerprint(mol)
         for bit in set(sfp.GetOnBits()):
             # Reinterpret signed int32 as unsigned int32
             # This maps [-2^31, 2^31-1] to [0, 2^32-1] losslessly
             col_idx = np.uint32(bit & 0xFFFFFFFF)
-            
+
             row_ind.append(i)
             col_ind.append(col_idx)
             data = np.ones(len(row_ind), dtype=np.bool)
-    
-    return csr_matrix((data, (row_ind, col_ind)), 
-                      shape=(len(mol_list), 2**32), 
-                      dtype=np.bool)
+
+    return csr_matrix((data, (row_ind, col_ind)), shape=(len(mol_list), 2**32), dtype=np.bool)
diff --git a/src/laplaciannb/legacy/LaplacianNB.py b/src/laplaciannb/legacy/LaplacianNB.py
index 7d473f0..78e5fad 100644
--- a/src/laplaciannb/legacy/LaplacianNB.py
+++ b/src/laplaciannb/legacy/LaplacianNB.py
@@ -1,4 +1,3 @@
-import warnings
 from functools import reduce
 from itertools import compress
 
diff --git a/src/laplaciannb/legacy/__init__.py b/src/laplaciannb/legacy/__init__.py
deleted file mode 100644
index 438957a..0000000
--- a/src/laplaciannb/legacy/__init__.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""
-Legacy LaplacianNB implementation.
-
-DEPRECATED: This module contains the legacy LaplacianNB implementation.
-Please use the new sklearn-compatible version instead:
-
-    from laplaciannb import LaplacianNB  # New version (recommended)
-
-instead of:
-
-    from laplaciannb.legacy import LaplacianNB  # Old version (deprecated)
-
-The new implementation offers:
-- Full sklearn compatibility (pipelines, cross-validation, grid search)
-- Memory-efficient sparse matrix support
-- Better error handling and validation
-- Consistent API with other sklearn estimators
-- Enhanced fingerprint utility functions
-
-The legacy version will be removed in a future release.
-"""
-
-import warnings
-
-from .LaplacianNB import LaplacianNB
-
-
-# Issue strong deprecation warning when legacy module is imported
-warnings.warn(
-    "\n" + "=" * 80 + "\n"
-    "DEPRECATION WARNING: Legacy LaplacianNB Implementation\n" + "=" * 80 + "\n"
-    "You are importing from the DEPRECATED legacy LaplacianNB module.\n"
-    "This implementation will be REMOVED in a future release.\n\n"
-    "PLEASE MIGRATE to the new sklearn-compatible version:\n\n"
-    "  ✅ RECOMMENDED:\n"
-    "    from laplaciannb import LaplacianNB\n"
-    "    from laplaciannb.fingerprint_utils import convert_fingerprints\n\n"
-    "  ❌ DEPRECATED (current usage):\n"
-    "    from laplaciannb.legacy import LaplacianNB\n\n"
-    "The new implementation provides:\n"
-    "• Full sklearn ecosystem compatibility\n"
-    "• Memory-efficient sparse matrix support\n"
-    "• Better performance and error handling\n"
-    "• Enhanced fingerprint conversion utilities\n\n"
-    "See MIGRATION_GUIDE.md for detailed migration instructions.\n" + "=" * 80,
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-__all__ = ["LaplacianNB"]
diff --git a/tests/test_bayes.py b/tests/test_bayes.py
index ea2534e..26819bf 100644
--- a/tests/test_bayes.py
+++ b/tests/test_bayes.py
@@ -10,12 +10,12 @@
 
 def test_bayes():
     from scipy.sparse import csr_matrix
-    
+
     clf = LaplacianNB()
     rng = np.random.RandomState(1)
     arr = rng.randint(2, size=(6, 100))
     Y = np.array([1, 2, 3, 4, 4, 5])
-    
+
     # Convert binary array to CSR matrix
     X = csr_matrix(arr, dtype=np.bool_)
     clf.fit(X, Y)
@@ -32,7 +32,7 @@ def test_lmnb_prior_unobserved_targets():
     # Create toy training data as sparse matrices
     # First sample has feature 1, second sample has feature 0
     row = [0, 1]
-    col = [1, 0] 
+    col = [1, 0]
     data = [1, 1]
     X = csr_matrix((data, (row, col)), shape=(2, 2), dtype=np.bool_)
     y = np.array([0, 1])
@@ -44,7 +44,7 @@ def test_lmnb_prior_unobserved_targets():
     test1 = csr_matrix(([1], ([0], [1])), shape=(1, 2), dtype=np.bool_)  # Feature 1 active
     test2 = csr_matrix(([1], ([0], [0])), shape=(1, 2), dtype=np.bool_)  # Feature 0 active
     test3 = csr_matrix(([1, 1], ([0, 0], [0, 1])), shape=(1, 2), dtype=np.bool_)  # Both features active
-    
+
     assert_array_equal(clf.predict(test1), np.array([0]))
     assert_array_equal(clf.predict(test2), np.array([1]))
     assert_array_equal(clf.predict(test3), np.array([0]))
@@ -57,10 +57,10 @@ def test_rdkit():
     DATA_PATH = Path(__file__).parent.parent.joinpath("tests/data/")
     file = str(DATA_PATH.joinpath("smiles_test.csv"))
     df = pd.read_csv(file)
-    
+
     # Convert to sparse CSR matrix using our fingerprint utility
     X_sparse = rdkit_to_csr(df['smiles'].values, radius=2)
-    
+
     y = df["activity"]
     clf = LaplacianNB()
     clf.fit(X_sparse, y)
@@ -79,7 +79,7 @@ def test_joint_log_likelihood():
     DATA_PATH = Path(__file__).parent.parent.joinpath("tests/data/")
     file = str(DATA_PATH.joinpath("smiles_test.csv"))
     df = pd.read_csv(file)
-    
+
     # Convert to CSR matrix using fingerprint utility
     X = rdkit_to_csr(df['smiles'].values, radius=2)
     y = df["activity"]
@@ -92,7 +92,7 @@ def test_joint_log_likelihood():
     test_col = [2**30]  # Use a large but valid index within 2^32-1 limit
     test_data = [1]
     new_X = csr_matrix((test_data, (test_row, test_col)), shape=(1, 2**32-1), dtype=np.bool_)
-    
+
     try:
         clf._joint_log_likelihood(new_X)
     except Exception as exc:
@@ -102,18 +102,18 @@ def test_joint_log_likelihood():
 def test_csr_fingerprint_conversion():
     """Test the new CSR fingerprint conversion functionality."""
     from laplaciannb.fingerprint_utils import rdkit_to_csr
-    
+
     # Create test molecules
     smiles_list = ["CCO", "CC", "CCC", "CCCC"]
-    
+
     # Convert to CSR matrix
     X_sparse = rdkit_to_csr(smiles_list, radius=2)
-    
+
     # Basic validation
     assert X_sparse.shape[0] == len(smiles_list)
     assert X_sparse.shape[1] == 2**32
     assert X_sparse.nnz > 0
-    
+
     # Test that different molecules have different fingerprints
     fingerprint_rows = []
     for i in range(X_sparse.shape[0]):
@@ -121,9 +121,9 @@ def test_csr_fingerprint_conversion():
         row_coo = row.tocoo()
         fingerprint_set = set(zip(row_coo.col, row_coo.data))
         fingerprint_rows.append(fingerprint_set)
-    
+
     # Verify that molecules have some different features
     assert len(set(len(fp) for fp in fingerprint_rows)) > 1  # Different numbers of features
-    
+
     print(f"Successfully created CSR matrix: {X_sparse.shape}, nnz: {X_sparse.nnz}")
     print(f"Fingerprint sizes: {[len(fp) for fp in fingerprint_rows]}")
diff --git a/tests/test_fingerprint_csr_conversion.py b/tests/test_fingerprint_csr_conversion.py
index 9cc03dd..1d2276c 100644
--- a/tests/test_fingerprint_csr_conversion.py
+++ b/tests/test_fingerprint_csr_conversion.py
@@ -18,27 +18,27 @@ def get_test_molecules():
 
 
 class TestFingerprintCSRConversion:
-    
+
     def test_rdkit_to_csr_basic(self):
         """Test basic RDKit to CSR conversion"""
         smiles = ["CCO", "CC", "CCC"]
         csr_matrix_result = rdkit_to_csr(smiles)
-        
+
         # Basic checks
         assert csr_matrix_result.shape[0] == len(smiles)
         assert csr_matrix_result.shape[1] == 2**32
         assert csr_matrix_result.nnz > 0  # Should have non-zero elements
-    
+
     def test_fingerprint_consistency(self):
         """Test that CSR conversion preserves fingerprint information"""
-        smiles = ["CCO", "CC", "CCC"] 
+        smiles = ["CCO", "CC", "CCC"]
         csr_result = rdkit_to_csr(smiles)
-        
+
         # Calculate total expected fingerprint bits across all molecules
         # Use the same API as the function
         from rdkit.Chem import rdFingerprintGenerator
         mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
-        
+
         total_expected_bits = 0
         for smi in smiles:
             mol = Chem.MolFromSmiles(smi)
@@ -53,7 +53,7 @@ def test_bit_conversion_roundtrip(self):
         """Test that bit conversion works both ways (WILL FAIL)"""
         # Test a few example bits
         test_bits = [-1000, 0, 1000]
-        
+
         for original_bit in test_bits:
             # This will fail because mock just returns the same value
             recovered_bit = csr_to_rdkit_bit(original_bit)

From 56b5e9566dfcab25a6f720572524b356bc4dda62 Mon Sep 17 00:00:00 2001
From: Bartosz Baranowski <bartekbaranow@gmail.com>
Date: Wed, 20 Aug 2025 17:40:18 +0200
Subject: [PATCH 3/8] cleaner version, sklearn working, CSR only

---
 DEPRECATION_TIMELINE.md                     |  89 --
 MIGRATION_GUIDE.md                          | 225 -----
 README.md                                   | 209 ++++-
 debug_comparison.py                         |   0
 examples/advanced_features_tutorial.ipynb   | 942 --------------------
 examples/basic_usage_example.py             |   0
 examples/basic_usage_tutorial.ipynb         | 624 -------------
 examples/bayes_tutorial.ipynb               | 623 -------------
 examples/integration_example.py             |  95 --
 examples/simple_example.py                  | 137 +++
 examples/sklearn_integration_example.py     |   0
 examples/sklearn_integration_tutorial.ipynb | 884 ------------------
 simple_performance_test.py                  |   0
 src/laplaciannb/bayes.py                    |   2 +-
 src/laplaciannb/legacy/LaplacianNB_new.py   | 373 --------
 15 files changed, 323 insertions(+), 3880 deletions(-)
 delete mode 100644 DEPRECATION_TIMELINE.md
 delete mode 100644 MIGRATION_GUIDE.md
 delete mode 100644 debug_comparison.py
 delete mode 100644 examples/advanced_features_tutorial.ipynb
 delete mode 100644 examples/basic_usage_example.py
 delete mode 100644 examples/basic_usage_tutorial.ipynb
 delete mode 100644 examples/bayes_tutorial.ipynb
 delete mode 100644 examples/integration_example.py
 create mode 100644 examples/simple_example.py
 delete mode 100644 examples/sklearn_integration_example.py
 delete mode 100644 examples/sklearn_integration_tutorial.ipynb
 delete mode 100644 simple_performance_test.py
 delete mode 100644 src/laplaciannb/legacy/LaplacianNB_new.py

diff --git a/DEPRECATION_TIMELINE.md b/DEPRECATION_TIMELINE.md
deleted file mode 100644
index 26b8c4d..0000000
--- a/DEPRECATION_TIMELINE.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# LaplacianNB Deprecation Timeline
-
-## Overview
-
-This document outlines the deprecation timeline for the legacy LaplacianNB implementation and the transition to the new sklearn-compatible version.
-
-## Migration Strategies
-
-### Immediate Migration (Recommended)
-```python
-# Before (legacy)
-from laplaciannb.legacy import LaplacianNB
-X_sets = [...]  # Sets of bit indices
-clf = LaplacianNB()
-clf.fit(X_sets, y)
-
-# After (modern)
-from laplaciannb import LaplacianNB
-from laplaciannb.fingerprint_utils import convert_fingerprints
-X = convert_fingerprints(X_sets, n_bits=size)
-clf = LaplacianNB()
-clf.fit(X, y)
-```
-
-### Gradual Migration
-```python
-# Phase 1: Suppress warnings while testing
-import warnings
-warnings.filterwarnings("ignore", category=DeprecationWarning, module="laplaciannb.legacy")
-
-# Phase 2: Test both implementations side by side
-from laplaciannb import LaplacianNB as NewLaplacianNB
-from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-
-# Phase 3: Switch to new implementation
-from laplaciannb import LaplacianNB
-```
-
-### Pipeline Migration
-```python
-# Before: Custom preprocessing
-X_processed = preprocess_fingerprints(X_raw)
-clf = LegacyLaplacianNB()
-
-# After: sklearn pipeline
-from sklearn.pipeline import Pipeline
-from laplaciannb import LaplacianNB, FingerprintTransformer
-
-pipeline = Pipeline([
-    ('fingerprints', FingerprintTransformer(n_bits=2048)),
-    ('classifier', LaplacianNB())
-])
-```
-
-## Version Compatibility Matrix
-
-| Version | Legacy Available | New Available | Default Import | Warnings |
-|---------|------------------|---------------|----------------|----------|
-| v0.7.0  | ✅ `legacy` module | ✅ Main module | New | Future |
-| v1.0.0  | ❌ Removed | ✅ Main module | New | None |
-
-
-## FAQ
-
-### Q: How long do I have to migrate?
-**A:** Legacy support will be removed in v1.0.0. We recommend migrating immediately to benefit from new features and better performance.
-
-### Q: Are the results identical between versions?
-**A:** Yes, both implementations are tested for compatibility and produce identical results.
-
-### Q: Can I use both versions in the same project?
-**A:** Yes, during the transition period. Import them with different names:
-```python
-from laplaciannb import LaplacianNB as NewLaplacianNB
-from laplaciannb.legacy import LaplacianNB as LegacyLaplacianNB
-```
-
-### Q: What if I find bugs in the new implementation?
-**A:** Please file an issue on GitHub. During the transition period, you can use the legacy version as a fallback.
-
-### Q: Will there be breaking changes in the new implementation?
-**A:** The new implementation follows sklearn conventions and semantic versioning. Breaking changes will only occur in major version releases.
-
-## Migration Resources
-
-1. **[MIGRATION_GUIDE.md](MIGRATION_GUIDE.md)** - Comprehensive migration instructions
-2. **[examples/](examples/)** - Example notebooks showing both versions
-3. **[tests/test_compatibility.py](tests/test_compatibility.py)** - Compatibility validation
-4. **GitHub Issues** - Community support and bug reports
diff --git a/MIGRATION_GUIDE.md b/MIGRATION_GUIDE.md
deleted file mode 100644
index a59918c..0000000
--- a/MIGRATION_GUIDE.md
+++ /dev/null
@@ -1,225 +0,0 @@
-# LaplacianNB Migration Guide
-
-## Overview
-
-LaplacianNB has been modernized with a new sklearn-compatible implementation. This guide helps you migrate from the legacy version to the new recommended version.
-
-## Quick Migration
-
-### Old Way (Deprecated)
-```python
-from laplaciannb.legacy import LaplacianNB  # ⚠️ DEPRECATED
-```
-
-### New Way (Recommended)
-```python
-from laplaciannb import LaplacianNB  # ✅ RECOMMENDED
-```
-
-## Key Differences
-
-### Input Data Format
-
-**Legacy Implementation:**
-- Expects fingerprints as sets, lists, or dictionaries
-- Custom input validation
-- Limited to specific data formats
-
-```python
-# Legacy - fingerprints as sets
-X_sets = [
-    {1, 5, 10, 15},
-    {2, 6, 11, 16},
-    {1, 3, 7, 12}
-]
-```
-
-**New Implementation:**
-- Accepts standard sklearn input formats (sparse/dense matrices)
-- Full sklearn input validation
-- Seamless integration with sklearn ecosystem
-
-```python
-# New - sklearn-compatible sparse/dense matrices
-from laplaciannb.fingerprint_utils import convert_fingerprints
-
-X_sklearn = convert_fingerprints(X_sets, n_bits=2048, output_format='csr')
-# or use FingerprintTransformer in pipelines
-```
-
-### API Changes
-
-**Legacy:**
-```python
-from laplaciannb.legacy import LaplacianNB
-
-clf = LaplacianNB(alpha=1.0)
-clf.fit(X_sets, y)
-predictions = clf.predict(X_sets)
-```
-
-**New:**
-```python
-from laplaciannb import LaplacianNB
-from laplaciannb.fingerprint_utils import convert_fingerprints
-
-# Convert fingerprints to sklearn format
-X = convert_fingerprints(X_sets, n_bits=2048)
-
-clf = LaplacianNB(alpha=1.0)
-clf.fit(X, y)
-predictions = clf.predict(X)
-```
-
-### Enhanced Features in New Version
-
-1. **sklearn Ecosystem Integration:**
-   ```python
-   from sklearn.pipeline import Pipeline
-   from sklearn.model_selection import GridSearchCV
-   from laplaciannb import LaplacianNB, FingerprintTransformer
-
-   # Pipeline support
-   pipeline = Pipeline([
-       ('fingerprints', FingerprintTransformer(n_bits=2048)),
-       ('classifier', LaplacianNB())
-   ])
-
-   # Grid search support
-   param_grid = {'classifier__alpha': [0.1, 1.0, 10.0]}
-   grid_search = GridSearchCV(pipeline, param_grid, cv=5)
-   ```
-
-2. **Memory-Efficient Sparse Matrices:**
-   ```python
-   # Automatic sparse matrix handling for large fingerprints
-   X_sparse = convert_fingerprints(fingerprints, n_bits=16384, output_format='csr')
-   clf = LaplacianNB()
-   clf.fit(X_sparse, y)  # Memory efficient for sparse data
-   ```
-
-3. **Better Error Handling:**
-   ```python
-   # Comprehensive input validation
-   # Clear error messages
-   # Proper sklearn-style exceptions
-   ```
-
-## Migration Steps
-
-### Step 1: Update Imports
-```python
-# Before
-from laplaciannb.legacy import LaplacianNB
-
-# After
-from laplaciannb import LaplacianNB
-from laplaciannb.fingerprint_utils import convert_fingerprints
-```
-
-### Step 2: Convert Input Data
-```python
-# Before - fingerprints as sets/lists
-X_fingerprints = [...]  # Your fingerprint data
-
-# After - convert to sklearn format
-X = convert_fingerprints(X_fingerprints, n_bits=your_fingerprint_size)
-```
-
-### Step 3: Update Model Usage
-```python
-# Both versions use the same basic API
-clf = LaplacianNB(alpha=1.0)
-clf.fit(X, y)
-predictions = clf.predict(X)
-probabilities = clf.predict_proba(X)
-```
-
-### Step 4: Leverage New Features (Optional)
-```python
-# Use in sklearn pipelines
-from sklearn.pipeline import Pipeline
-from sklearn.model_selection import cross_val_score
-
-pipeline = Pipeline([
-    ('classifier', LaplacianNB())
-])
-
-# Cross-validation
-scores = cross_val_score(pipeline, X, y, cv=5)
-```
-
-## Common Migration Issues
-
-### Issue 1: Input Format Mismatch
-**Problem:** Getting errors about input format
-
-**Solution:** Use fingerprint utilities to convert data
-```python
-from laplaciannb.fingerprint_utils import convert_fingerprints
-
-# Convert sets to sklearn format
-X_sklearn = convert_fingerprints(your_fingerprint_sets, n_bits=2048)
-```
-
-### Issue 2: Memory Issues with Large Fingerprints
-**Problem:** Running out of memory with large dense matrices
-
-**Solution:** Use sparse matrices (default behavior)
-```python
-# Default output is memory-efficient sparse CSR matrix
-X_sparse = convert_fingerprints(fingerprints, n_bits=16384)  # Uses CSR by default
-```
-
-### Issue 3: Different Prediction Results
-**Problem:** Getting slightly different results
-
-**Solution:** This should not happen - both implementations are tested for compatibility. If you encounter this, please file an issue.
-
-## Compatibility Guarantees
-
-- **Identical Results:** New implementation produces identical predictions to legacy version
-- **Backward Compatibility:** Legacy version remains available in `laplaciannb.legacy`
-- **Migration Period:** Legacy version will be maintained until sufficient adoption of new version
-
-## Testing Your Migration
-
-Use our compatibility test to verify your migration:
-
-```python
-import numpy as np
-from laplaciannb import LaplacianNB as LaplacianNB_New
-from laplaciannb.legacy import LaplacianNB as LaplacianNB_Legacy
-from laplaciannb.fingerprint_utils import convert_fingerprints
-
-# Your test data
-X_sets = [...]  # Your fingerprint sets
-y = [...]       # Your labels
-
-# Test both implementations
-clf_legacy = LaplacianNB_Legacy(alpha=1.0)
-clf_legacy.fit(np.array(X_sets, dtype=object), y)
-pred_legacy = clf_legacy.predict(np.array(X_sets, dtype=object))
-
-X_sklearn = convert_fingerprints(X_sets, n_bits=your_n_bits)
-clf_new = LaplacianNB_New(alpha=1.0)
-clf_new.fit(X_sklearn, y)
-pred_new = clf_new.predict(X_sklearn)
-
-# Verify identical results
-assert np.array_equal(pred_legacy, pred_new), "Predictions should be identical"
-print("✓ Migration successful - identical predictions!")
-```
-
-## Getting Help
-
-- **Documentation:** See example notebooks in `examples/` directory
-- **Issues:** File issues on GitHub if you encounter migration problems
-- **Examples:** Check `examples/sklearn_integration_tutorial.ipynb` for sklearn usage patterns
-
-## Timeline
-
-- **v0.7.0:** Increase deprecation warning severity
-- **v1.0.0:** Legacy version removal (planned)
-
-The migration is designed to be straightforward while providing significant benefits in terms of sklearn ecosystem integration and performance.
diff --git a/README.md b/README.md
index abbbc55..a5e4b69 100644
--- a/README.md
+++ b/README.md
@@ -25,14 +25,31 @@ The package includes both a **modern sklearn-compatible implementation** (recomm
 
 ---
 
-## Features
-
-- **Modern sklearn-compatible implementation** with full ecosystem integration
-- **Optimized for binary/boolean data** with fast prediction using indices of positive bits
-- **RDKit fingerprint conversion utilities** for molecular data
-- **Support for sparse and dense data formats**
-- **Memory-efficient sparse matrix handling**
-- Lightweight and easy to integrate
+## ✨ Features
+
+### 🔬 Core Algorithm
+- **Laplacian-modified Naive Bayes** with enhanced smoothing for sparse data
+- **Optimized for binary/boolean features** using bit index representation
+- **Fast prediction** leveraging only positive bit indices
+- **Robust handling** of unseen features and classes
+
+### 🚀 Performance & Scalability
+- **Memory-efficient sparse matrix support** for massive feature spaces (2^32 features)
+- **Lossless RDKit fingerprint conversion** with bit reinterpretation
+- **Automatic sparsity detection** and optimization
+- **Parallel processing** compatible with joblib
+
+### 🔧 sklearn Integration
+- **Full sklearn ecosystem compatibility** (pipelines, cross-validation, grid search)
+- **Drop-in replacement** for other Naive Bayes classifiers
+- **Consistent API** with sklearn estimators
+- **Custom transformers** for molecular data preprocessing
+
+### 🧪 Molecular Informatics
+- **Direct RDKit integration** for SMILES conversion
+- **Morgan fingerprint support** with configurable radius
+- **Chemical space analysis** capabilities
+- **QSAR/SAR modeling** optimized workflows
 
 ---
 
@@ -53,30 +70,85 @@ pip install --pre laplaciannb
 ```
 
 ### From Source
-For the latest development version:
+For the latest development version with examples:
 
 ```sh
 git clone https://github.com/rdkit/laplaciannb.git
 cd laplaciannb
-pip install -e .
+pip install -e ".[dev]"  # Includes development dependencies
+```
+
+### Optional Dependencies
+For molecular fingerprint functionality:
+```sh
+pip install rdkit  # For molecular fingerprint conversion
+```
+
+For full development environment:
+```sh
+pip install laplaciannb[dev]  # Includes testing, linting, and examples
 ```
 
 ## Quick Start
 
+### 🚀 Try the Interactive Example
+
+Run the comprehensive quickstart example to see all features in action:
+
+```sh
+cd examples
+python quickstart_example.py
+```
+
+This script demonstrates:
+- RDKit molecular fingerprint conversion
+- Sparse matrix handling for memory efficiency
+- scikit-learn ecosystem integration
+- Performance comparisons with other classifiers
+- Memory efficiency demonstrations
+
 ### Recommended Usage (Modern sklearn-compatible API)
 
+**For molecular data with RDKit:**
+
 ```python
-import numpy as np
 from laplaciannb import LaplacianNB
-from laplaciannb.fingerprint_utils import convert_fingerprints
+from laplaciannb.fingerprint_utils import rdkit_to_csr
 
-# Convert fingerprint data to sklearn format
-fingerprints = [
-    {1, 5, 10, 15},      # Fingerprint as set of bit indices
-    {2, 6, 11, 16},      # Each set represents active bits
-    {1, 3, 7, 12}
+# Sample molecular data (SMILES strings)
+smiles = [
+    "CCO",                              # Ethanol
+    "CC(=O)OC1=CC=CC=C1C(=O)O",        # Aspirin
+    "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"    # Ibuprofen
 ]
-X = convert_fingerprints(fingerprints, n_bits=20)
+y = [0, 1, 1]  # Activity labels
+
+# Convert to sparse CSR matrix (memory efficient)
+X = rdkit_to_csr(smiles, radius=2)
+print(f"Matrix shape: {X.shape}")  # (3, 4294967296)
+print(f"Sparsity: {1 - X.nnz / (X.shape[0] * X.shape[1]):.6f}")
+
+# Train classifier
+clf = LaplacianNB(alpha=1.0)
+clf.fit(X, y)
+
+# Make predictions
+predictions = clf.predict(X)
+probabilities = clf.predict_proba(X)
+```
+
+**For general binary/boolean data:**
+
+```python
+import numpy as np
+from scipy.sparse import csr_matrix
+from laplaciannb import LaplacianNB
+
+# Create sparse binary matrix directly
+row = [0, 0, 1, 1, 2, 2]
+col = [1, 5, 2, 6, 1, 3]
+data = [1, 1, 1, 1, 1, 1]
+X = csr_matrix((data, (row, col)), shape=(3, 10), dtype=np.bool_)
 y = [0, 1, 0]
 
 # Train and predict
@@ -88,27 +160,116 @@ probabilities = clf.predict_proba(X)
 
 ### sklearn Ecosystem Integration
 
+**Full Pipeline Example:**
+
 ```python
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import GridSearchCV, cross_val_score
-from laplaciannb import LaplacianNB, FingerprintTransformer
+from sklearn.base import BaseEstimator, TransformerMixin
+from laplaciannb import LaplacianNB
+from laplaciannb.fingerprint_utils import rdkit_to_csr
+
+# Custom transformer for pipelines
+class RDKitFingerprintTransformer(BaseEstimator, TransformerMixin):
+    def __init__(self, radius=2):
+        self.radius = radius
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        return rdkit_to_csr(X, radius=self.radius)
 
 # Create pipeline
 pipeline = Pipeline([
-    ('fingerprints', FingerprintTransformer(n_bits=2048)),
-    ('classifier', LaplacianNB())
+    ('fingerprints', RDKitFingerprintTransformer(radius=2)),
+    ('classifier', LaplacianNB(alpha=1.0))
 ])
 
 # Grid search
 param_grid = {
     'classifier__alpha': [0.1, 1.0, 10.0],
-    'fingerprints__output_format': ['csr', 'dense']
+    'fingerprints__radius': [1, 2, 3]
 }
 grid_search = GridSearchCV(pipeline, param_grid, cv=5)
-grid_search.fit(fingerprints, y)
+grid_search.fit(smiles_data, y)  # Use SMILES directly in pipeline
 
 # Cross-validation
-cv_scores = cross_val_score(pipeline, fingerprints, y, cv=5)
+cv_scores = cross_val_score(pipeline, smiles_data, y, cv=5)
+print(f"CV Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
+
+# Direct sparse matrix usage (for pre-converted data)
+X_sparse = rdkit_to_csr(smiles_data, radius=2)
+clf = LaplacianNB(alpha=1.0)
+scores = cross_val_score(clf, X_sparse, y, cv=5)
+```
+
+## 🔥 Key Features & Advantages
+
+### Memory Efficiency
+- **Sparse matrix support**: Handle 2^32 feature spaces with minimal memory
+- **Lossless fingerprint conversion**: Convert RDKit fingerprints without data loss
+- **Automatic sparsity detection**: Works seamlessly with both sparse and dense data
+
+```python
+# Handle massive feature spaces efficiently
+X = rdkit_to_csr(smiles_list, radius=2)  # Shape: (n_samples, 4294967296)
+print(f"Memory usage: {X.data.nbytes / 1024**2:.1f} MB")  # Only a few MB!
+```
+
+### Performance
+- **Optimized for binary data**: Fast prediction using only positive bit indices
+- **sklearn compatible**: Drop-in replacement for other Naive Bayes classifiers
+- **Parallel processing**: Supports joblib parallelization
+
+### Molecular Informatics
+- **RDKit integration**: Direct conversion from molecular structures
+- **Flexible fingerprints**: Support for Morgan, MACCS, and custom fingerprints
+- **Chemical space analysis**: Ideal for QSAR/SAR modeling
+
+## 📚 Examples & Tutorials
+
+### Interactive Examples
+Explore the comprehensive examples in the `/examples` directory:
+
+- **`quickstart_example.py`**: Complete demonstration with molecular data
+- **`basic_usage_tutorial.ipynb`**: Step-by-step Jupyter notebook
+- **`sklearn_integration_tutorial.ipynb`**: Advanced sklearn integration
+- **`bayes_tutorial.ipynb`**: Deep dive into Naive Bayes concepts
+
+### Run the Quickstart
+```sh
+# Clone the repository
+git clone https://github.com/rdkit/laplaciannb.git
+cd laplaciannb
+
+# Install with examples
+pip install -e ".[dev]"
+
+# Run comprehensive example
+python examples/quickstart_example.py
+```
+
+### Example Outputs
+The quickstart example demonstrates:
+```
+BASIC LAPLACIANNB USAGE
+Matrix shape: (10, 4294967296)
+Matrix sparsity: 0.999998
+Training completed in 0.002 seconds
+Test Accuracy: 1.000
+
+SPARSE MATRIX EFFICIENCY
+Radius   Features     Sparsity   Train Time   Accuracy
+1        4,294,967,296 0.999999   0.001       1.000
+2        4,294,967,296 0.999998   0.002       1.000
+3        4,294,967,296 0.999997   0.003       1.000
+
+MEMORY EFFICIENCY
+Sparse matrix memory: 0.12 MB
+Dense equivalent would require 40,000+ MB!
+✓ Designed specifically for extremely sparse binary features
+```
 ```
 
 ### Legacy Usage (Deprecated)
diff --git a/debug_comparison.py b/debug_comparison.py
deleted file mode 100644
index e69de29..0000000
diff --git a/examples/advanced_features_tutorial.ipynb b/examples/advanced_features_tutorial.ipynb
deleted file mode 100644
index 9875af9..0000000
--- a/examples/advanced_features_tutorial.ipynb
+++ /dev/null
@@ -1,942 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "3dc13231",
-   "metadata": {},
-   "source": [
-    "# LaplacianNB Advanced Features Tutorial\n",
-    "\n",
-    "This notebook explores advanced features of the LaplacianNB package including fingerprint utilities, performance optimization, and comparison with other algorithms."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "537289a1",
-   "metadata": {},
-   "source": [
-    "## Setup and Imports\n",
-    "\n",
-    "Import all necessary libraries for advanced features demonstration."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8af405c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import time\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns\n",
-    "from pathlib import Path\n",
-    "\n",
-    "# RDKit for molecular operations\n",
-    "from rdkit import Chem\n",
-    "from rdkit.Chem import rdFingerprintGenerator, Descriptors\n",
-    "from rdkit.DataStructs import BulkTanimotoSimilarity\n",
-    "\n",
-    "# sklearn for comparison and utilities\n",
-    "from sklearn.naive_bayes import MultinomialNB, BernoulliNB\n",
-    "from sklearn.ensemble import RandomForestClassifier\n",
-    "from sklearn.svm import SVC\n",
-    "from sklearn.model_selection import cross_val_score, learning_curve\n",
-    "from sklearn.metrics import roc_curve, auc, precision_recall_curve\n",
-    "from sklearn.decomposition import PCA\n",
-    "from scipy import sparse\n",
-    "\n",
-    "# LaplacianNB components\n",
-    "from laplaciannb import (\n",
-    "    LaplacianNB, LaplacianNB_New, \n",
-    "    convert_fingerprints, RDKitFingerprintConverter, FingerprintTransformer\n",
-    ")\n",
-    "\n",
-    "# Set style for better plots\n",
-    "plt.style.use('seaborn-v0_8')\n",
-    "np.random.seed(42)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d2669255",
-   "metadata": {},
-   "source": [
-    "## Advanced Fingerprint Generation\n",
-    "\n",
-    "Let's explore different types of molecular fingerprints and their properties."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b68c2501",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_multiple_fingerprint_types(smiles, n_bits=1024):\n",
-    "    \"\"\"Generate multiple types of molecular fingerprints.\"\"\"\n",
-    "    mol = Chem.MolFromSmiles(smiles)\n",
-    "    if not mol:\n",
-    "        return None\n",
-    "    \n",
-    "    fingerprints = {}\n",
-    "    \n",
-    "    # Morgan fingerprints (ECFP-like)\n",
-    "    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)\n",
-    "    fingerprints['morgan'] = set(morgan_gen.GetFingerprint(mol).GetOnBits())\n",
-    "    \n",
-    "    # Atom pair fingerprints\n",
-    "    ap_gen = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=n_bits)\n",
-    "    fingerprints['atom_pair'] = set(ap_gen.GetFingerprint(mol).GetOnBits())\n",
-    "    \n",
-    "    # Topological torsion fingerprints\n",
-    "    tt_gen = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=n_bits)\n",
-    "    fingerprints['torsion'] = set(tt_gen.GetFingerprint(mol).GetOnBits())\n",
-    "    \n",
-    "    # RDKit fingerprints (path-based)\n",
-    "    rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=n_bits)\n",
-    "    fingerprints['rdkit'] = set(rdkit_gen.GetFingerprint(mol).GetOnBits())\n",
-    "    \n",
-    "    return fingerprints"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "46a3c420",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Test molecules with different properties\n",
-    "test_molecules = {\n",
-    "    'Simple alcohol': 'CCO',\n",
-    "    'Aromatic': 'c1ccccc1',\n",
-    "    'Drug-like': 'CC(C)Cc1ccc(cc1)[C@@H](C)C(=O)O',  # Ibuprofen\n",
-    "    'Complex natural product': 'CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C',  # Retinoic acid\n",
-    "    'Peptide-like': 'CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)N)C(=O)O',  # Dipeptide\n",
-    "}\n",
-    "\n",
-    "fingerprint_data = []\n",
-    "for name, smiles in test_molecules.items():\n",
-    "    fps = get_multiple_fingerprint_types(smiles, n_bits=1024)\n",
-    "    if fps:\n",
-    "        for fp_type, fp_bits in fps.items():\n",
-    "            fingerprint_data.append({\n",
-    "                'molecule': name,\n",
-    "                'smiles': smiles,\n",
-    "                'fp_type': fp_type,\n",
-    "                'n_bits_set': len(fp_bits),\n",
-    "                'fingerprint': fp_bits\n",
-    "            })\n",
-    "\n",
-    "fp_df = pd.DataFrame(fingerprint_data)\n",
-    "print(\"Fingerprint comparison across molecule types:\")\n",
-    "pivot_table = fp_df.pivot(index='molecule', columns='fp_type', values='n_bits_set')\n",
-    "print(pivot_table)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9cfc867d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize fingerprint bit distributions\n",
-    "plt.figure(figsize=(12, 8))\n",
-    "\n",
-    "fp_types = fp_df['fp_type'].unique()\n",
-    "molecules = fp_df['molecule'].unique()\n",
-    "\n",
-    "for i, fp_type in enumerate(fp_types):\n",
-    "    plt.subplot(2, 2, i+1)\n",
-    "    data = fp_df[fp_df['fp_type'] == fp_type]\n",
-    "    plt.bar(range(len(data)), data['n_bits_set'], alpha=0.7)\n",
-    "    plt.title(f'{fp_type.title()} Fingerprints')\n",
-    "    plt.xlabel('Molecule')\n",
-    "    plt.ylabel('Bits Set')\n",
-    "    plt.xticks(range(len(data)), data['molecule'], rotation=45, ha='right')\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f48637ab",
-   "metadata": {},
-   "source": [
-    "## Performance Comparison: Original vs New Implementation\n",
-    "\n",
-    "Let's compare performance between the original and new implementations."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f29ba2a0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generate synthetic dataset of varying sizes\n",
-    "def generate_synthetic_data(n_samples, n_bits=1024, avg_bits_per_sample=50):\n",
-    "    \"\"\"Generate synthetic fingerprint data.\"\"\"\n",
-    "    np.random.seed(42)\n",
-    "    \n",
-    "    X = []\n",
-    "    y = []\n",
-    "    \n",
-    "    for i in range(n_samples):\n",
-    "        # Random number of bits set\n",
-    "        n_bits_set = np.random.poisson(avg_bits_per_sample)\n",
-    "        n_bits_set = max(1, min(n_bits_set, n_bits//2))  # Reasonable bounds\n",
-    "        \n",
-    "        # Random bit positions\n",
-    "        bit_positions = set(np.random.choice(n_bits, n_bits_set, replace=False))\n",
-    "        X.append(bit_positions)\n",
-    "        \n",
-    "        # Random target (with some correlation to fingerprint size)\n",
-    "        prob_active = (len(bit_positions) - 30) / 40  # Bias towards larger fingerprints\n",
-    "        prob_active = max(0.1, min(0.9, prob_active))\n",
-    "        y.append(1 if np.random.random() < prob_active else 0)\n",
-    "    \n",
-    "    return X, np.array(y)\n",
-    "\n",
-    "# Test different dataset sizes\n",
-    "dataset_sizes = [100, 500, 1000, 2000]\n",
-    "performance_results = []\n",
-    "\n",
-    "for n_samples in dataset_sizes:\n",
-    "    print(f\"Testing dataset size: {n_samples}\")\n",
-    "    \n",
-    "    # Generate data\n",
-    "    X_sets, y = generate_synthetic_data(n_samples, n_bits=1024)\n",
-    "    X_sparse = convert_fingerprints(X_sets, n_bits=1024)\n",
-    "    \n",
-    "    # Time original implementation\n",
-    "    start_time = time.time()\n",
-    "    clf_orig = LaplacianNB()\n",
-    "    clf_orig.fit(X_sets, y)\n",
-    "    pred_orig = clf_orig.predict(X_sets)\n",
-    "    time_orig = time.time() - start_time\n",
-    "    \n",
-    "    # Time new implementation\n",
-    "    start_time = time.time()\n",
-    "    clf_new = LaplacianNB_New()\n",
-    "    clf_new.fit(X_sparse, y)\n",
-    "    pred_new = clf_new.predict(X_sparse)\n",
-    "    time_new = time.time() - start_time\n",
-    "    \n",
-    "    # Check accuracy match\n",
-    "    accuracy_orig = np.mean(pred_orig == y)\n",
-    "    accuracy_new = np.mean(pred_new == y)\n",
-    "    predictions_match = np.array_equal(pred_orig, pred_new)\n",
-    "    \n",
-    "    performance_results.append({\n",
-    "        'n_samples': n_samples,\n",
-    "        'time_original': time_orig,\n",
-    "        'time_new': time_new,\n",
-    "        'speedup': time_orig / time_new,\n",
-    "        'accuracy_original': accuracy_orig,\n",
-    "        'accuracy_new': accuracy_new,\n",
-    "        'predictions_match': predictions_match,\n",
-    "        'memory_original': 'N/A (sets)',\n",
-    "        'memory_new': f'{X_sparse.data.nbytes / 1024:.1f} KB'\n",
-    "    })\n",
-    "\n",
-    "perf_df = pd.DataFrame(performance_results)\n",
-    "print(\"\\nPerformance Comparison Results:\")\n",
-    "print(perf_df.round(3))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fd8c9439",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize performance comparison\n",
-    "fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))\n",
-    "\n",
-    "# Timing comparison\n",
-    "ax1.plot(perf_df['n_samples'], perf_df['time_original'], 'o-', label='Original', linewidth=2)\n",
-    "ax1.plot(perf_df['n_samples'], perf_df['time_new'], 's-', label='New', linewidth=2)\n",
-    "ax1.set_xlabel('Dataset Size')\n",
-    "ax1.set_ylabel('Training Time (seconds)')\n",
-    "ax1.set_title('Training Time Comparison')\n",
-    "ax1.legend()\n",
-    "ax1.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Speedup\n",
-    "ax2.plot(perf_df['n_samples'], perf_df['speedup'], 'g^-', linewidth=2)\n",
-    "ax2.set_xlabel('Dataset Size')\n",
-    "ax2.set_ylabel('Speedup Factor')\n",
-    "ax2.set_title('New Implementation Speedup')\n",
-    "ax2.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Accuracy comparison\n",
-    "ax3.plot(perf_df['n_samples'], perf_df['accuracy_original'], 'o-', label='Original', linewidth=2)\n",
-    "ax3.plot(perf_df['n_samples'], perf_df['accuracy_new'], 's-', label='New', linewidth=2)\n",
-    "ax3.set_xlabel('Dataset Size')\n",
-    "ax3.set_ylabel('Accuracy')\n",
-    "ax3.set_title('Accuracy Comparison')\n",
-    "ax3.legend()\n",
-    "ax3.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Predictions match indicator\n",
-    "match_values = [1 if match else 0 for match in perf_df['predictions_match']]\n",
-    "ax4.bar(range(len(perf_df)), match_values, alpha=0.7, color='green')\n",
-    "ax4.set_xlabel('Dataset Index')\n",
-    "ax4.set_ylabel('Predictions Match (1=Yes, 0=No)')\n",
-    "ax4.set_title('Prediction Consistency')\n",
-    "ax4.set_xticks(range(len(perf_df)))\n",
-    "ax4.set_xticklabels(perf_df['n_samples'])\n",
-    "ax4.grid(True, alpha=0.3)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "91f456eb",
-   "metadata": {},
-   "source": [
-    "## Memory Efficiency Analysis\n",
-    "\n",
-    "Let's analyze memory efficiency with different sparse matrix formats."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e18bd34f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generate data with different sparsity levels\n",
-    "sparsity_levels = [0.90, 0.95, 0.98, 0.99, 0.995]  # 90% to 99.5% sparse\n",
-    "n_samples = 1000\n",
-    "n_bits = 2048\n",
-    "\n",
-    "memory_analysis = []\n",
-    "\n",
-    "for sparsity in sparsity_levels:\n",
-    "    bits_per_sample = int(n_bits * (1 - sparsity))\n",
-    "    X_sets, y = generate_synthetic_data(n_samples, n_bits, bits_per_sample)\n",
-    "    \n",
-    "    # Convert to different formats\n",
-    "    formats = ['dense', 'csr', 'csc', 'coo']\n",
-    "    format_results = {'sparsity': sparsity, 'avg_bits': bits_per_sample}\n",
-    "    \n",
-    "    for fmt in formats:\n",
-    "        X_converted = convert_fingerprints(X_sets, n_bits=n_bits, output_format=fmt)\n",
-    "        \n",
-    "        if fmt == 'dense':\n",
-    "            memory_mb = X_converted.nbytes / (1024 * 1024)\n",
-    "        else:\n",
-    "            memory_mb = (X_converted.data.nbytes + X_converted.indices.nbytes + \n",
-    "                        X_converted.indptr.nbytes) / (1024 * 1024)\n",
-    "        \n",
-    "        format_results[f'{fmt}_memory_mb'] = memory_mb\n",
-    "    \n",
-    "    memory_analysis.append(format_results)\n",
-    "\n",
-    "memory_df = pd.DataFrame(memory_analysis)\n",
-    "print(\"Memory Usage Analysis (MB):\")\n",
-    "print(memory_df.round(3))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6fac28db",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize memory efficiency\n",
-    "plt.figure(figsize=(12, 8))\n",
-    "\n",
-    "# Memory usage comparison\n",
-    "plt.subplot(2, 2, 1)\n",
-    "for fmt in ['dense', 'csr', 'csc', 'coo']:\n",
-    "    plt.plot(memory_df['sparsity'], memory_df[f'{fmt}_memory_mb'], 'o-', label=fmt.upper(), linewidth=2)\n",
-    "plt.xlabel('Sparsity Level')\n",
-    "plt.ylabel('Memory Usage (MB)')\n",
-    "plt.title('Memory Usage by Sparse Format')\n",
-    "plt.legend()\n",
-    "plt.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Memory savings vs dense\n",
-    "plt.subplot(2, 2, 2)\n",
-    "for fmt in ['csr', 'csc', 'coo']:\n",
-    "    savings = (memory_df['dense_memory_mb'] - memory_df[f'{fmt}_memory_mb']) / memory_df['dense_memory_mb'] * 100\n",
-    "    plt.plot(memory_df['sparsity'], savings, 'o-', label=fmt.upper(), linewidth=2)\n",
-    "plt.xlabel('Sparsity Level')\n",
-    "plt.ylabel('Memory Savings (%)')\n",
-    "plt.title('Memory Savings vs Dense Format')\n",
-    "plt.legend()\n",
-    "plt.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Efficiency ratio (performance per MB)\n",
-    "plt.subplot(2, 2, 3)\n",
-    "dense_memory = memory_df['dense_memory_mb']\n",
-    "for fmt in ['csr', 'csc', 'coo']:\n",
-    "    ratio = dense_memory / memory_df[f'{fmt}_memory_mb']\n",
-    "    plt.plot(memory_df['sparsity'], ratio, 'o-', label=fmt.upper(), linewidth=2)\n",
-    "plt.xlabel('Sparsity Level')\n",
-    "plt.ylabel('Memory Efficiency Ratio')\n",
-    "plt.title('Memory Efficiency (Dense/Sparse)')\n",
-    "plt.legend()\n",
-    "plt.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Recommended format\n",
-    "plt.subplot(2, 2, 4)\n",
-    "recommendations = []\n",
-    "for _, row in memory_df.iterrows():\n",
-    "    min_memory = min(row['csr_memory_mb'], row['csc_memory_mb'], row['coo_memory_mb'])\n",
-    "    if row['csr_memory_mb'] == min_memory:\n",
-    "        recommendations.append('CSR')\n",
-    "    elif row['csc_memory_mb'] == min_memory:\n",
-    "        recommendations.append('CSC')\n",
-    "    else:\n",
-    "        recommendations.append('COO')\n",
-    "\n",
-    "format_counts = pd.Series(recommendations).value_counts()\n",
-    "plt.pie(format_counts.values, labels=format_counts.index, autopct='%1.1f%%')\n",
-    "plt.title('Recommended Sparse Format Distribution')\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0bab16b1",
-   "metadata": {},
-   "source": [
-    "## Algorithm Comparison\n",
-    "\n",
-    "Let's compare LaplacianNB with other machine learning algorithms."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c524bcb0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generate a balanced dataset for fair comparison\n",
-    "X_sets, y = generate_synthetic_data(1000, n_bits=1024, avg_bits_per_sample=50)\n",
-    "X_dense = convert_fingerprints(X_sets, n_bits=1024, output_format='dense')\n",
-    "X_sparse = convert_fingerprints(X_sets, n_bits=1024, output_format='csr')\n",
-    "\n",
-    "print(f\"Dataset info:\")\n",
-    "print(f\"  Samples: {len(X_sets)}\")\n",
-    "print(f\"  Features: {X_dense.shape[1]}\")\n",
-    "print(f\"  Target distribution: {np.bincount(y)}\")\n",
-    "print(f\"  Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.3f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "760f23f4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define algorithms to compare\n",
-    "algorithms = {\n",
-    "    'LaplacianNB (Original)': (LaplacianNB(), X_sets),\n",
-    "    'LaplacianNB (New)': (LaplacianNB_New(), X_sparse),\n",
-    "    'MultinomialNB': (MultinomialNB(), X_dense),\n",
-    "    'BernoulliNB': (BernoulliNB(), X_dense),\n",
-    "    'RandomForest': (RandomForestClassifier(n_estimators=100, random_state=42), X_dense),\n",
-    "    'SVM (linear)': (SVC(kernel='linear', random_state=42), X_dense[:500])  # Smaller subset for SVM\n",
-    "}\n",
-    "\n",
-    "# Compare performance\n",
-    "algorithm_results = []\n",
-    "\n",
-    "for name, (clf, X_data) in algorithms.items():\n",
-    "    print(f\"Testing {name}...\")\n",
-    "    \n",
-    "    # Adjust target for smaller dataset (SVM)\n",
-    "    y_data = y if X_data.shape[0] == len(y) else y[:X_data.shape[0]]\n",
-    "    \n",
-    "    # Time training\n",
-    "    start_time = time.time()\n",
-    "    try:\n",
-    "        if name == 'SVM (linear)':\n",
-    "            # Cross-validation for smaller dataset\n",
-    "            scores = cross_val_score(clf, X_data, y_data, cv=3, scoring='accuracy')\n",
-    "        else:\n",
-    "            scores = cross_val_score(clf, X_data, y_data, cv=5, scoring='accuracy')\n",
-    "        \n",
-    "        training_time = time.time() - start_time\n",
-    "        \n",
-    "        algorithm_results.append({\n",
-    "            'algorithm': name,\n",
-    "            'mean_accuracy': scores.mean(),\n",
-    "            'std_accuracy': scores.std(),\n",
-    "            'training_time': training_time,\n",
-    "            'cv_folds': len(scores)\n",
-    "        })\n",
-    "    except Exception as e:\n",
-    "        print(f\"  Error with {name}: {e}\")\n",
-    "\n",
-    "results_df = pd.DataFrame(algorithm_results)\n",
-    "print(\"\\nAlgorithm Comparison Results:\")\n",
-    "print(results_df.round(4))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "efb2fcf2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize algorithm comparison\n",
-    "fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6))\n",
-    "\n",
-    "# Accuracy comparison\n",
-    "algorithms_names = results_df['algorithm']\n",
-    "accuracies = results_df['mean_accuracy']\n",
-    "errors = results_df['std_accuracy']\n",
-    "\n",
-    "bars1 = ax1.bar(range(len(algorithms_names)), accuracies, yerr=errors, \n",
-    "                capsize=5, alpha=0.7, color='skyblue', edgecolor='navy')\n",
-    "ax1.set_xlabel('Algorithm')\n",
-    "ax1.set_ylabel('Cross-Validation Accuracy')\n",
-    "ax1.set_title('Algorithm Accuracy Comparison')\n",
-    "ax1.set_xticks(range(len(algorithms_names)))\n",
-    "ax1.set_xticklabels(algorithms_names, rotation=45, ha='right')\n",
-    "ax1.grid(True, alpha=0.3, axis='y')\n",
-    "\n",
-    "# Add value labels on bars\n",
-    "for i, (acc, err) in enumerate(zip(accuracies, errors)):\n",
-    "    ax1.text(i, acc + err + 0.005, f'{acc:.3f}', ha='center', fontsize=9)\n",
-    "\n",
-    "# Training time comparison\n",
-    "times = results_df['training_time']\n",
-    "bars2 = ax2.bar(range(len(algorithms_names)), times, alpha=0.7, color='lightcoral', edgecolor='darkred')\n",
-    "ax2.set_xlabel('Algorithm')\n",
-    "ax2.set_ylabel('Training Time (seconds)')\n",
-    "ax2.set_title('Training Time Comparison')\n",
-    "ax2.set_xticks(range(len(algorithms_names)))\n",
-    "ax2.set_xticklabels(algorithms_names, rotation=45, ha='right')\n",
-    "ax2.grid(True, alpha=0.3, axis='y')\n",
-    "\n",
-    "# Accuracy vs Time scatter plot\n",
-    "ax3.scatter(times, accuracies, s=100, alpha=0.7, c='green', edgecolor='darkgreen')\n",
-    "for i, name in enumerate(algorithms_names):\n",
-    "    ax3.annotate(name.split('(')[0], (times.iloc[i], accuracies.iloc[i]), \n",
-    "                xytext=(5, 5), textcoords='offset points', fontsize=9)\n",
-    "ax3.set_xlabel('Training Time (seconds)')\n",
-    "ax3.set_ylabel('Accuracy')\n",
-    "ax3.set_title('Accuracy vs Training Time')\n",
-    "ax3.grid(True, alpha=0.3)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f61c0d9f",
-   "metadata": {},
-   "source": [
-    "## RDKitFingerprintConverter Advanced Usage\n",
-    "\n",
-    "Let's explore the advanced features of the fingerprint converter."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "004efef3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create advanced converter with custom settings\n",
-    "converter = RDKitFingerprintConverter(\n",
-    "    n_bits=2048,\n",
-    "    output_format='auto',  # Automatically choose format\n",
-    "    dtype=np.float32,\n",
-    "    sparse_threshold=0.95  # Use sparse if >95% zeros\n",
-    ")\n",
-    "\n",
-    "# Test with real molecules\n",
-    "real_molecules = [\n",
-    "    'CCO',                                    # Ethanol\n",
-    "    'CC(=O)OC1=CC=CC=C1C(=O)O',              # Aspirin\n",
-    "    'CC1=CC=C(C=C1)C(C)C(=O)O',              # Ibuprofen  \n",
-    "    'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',          # Caffeine\n",
-    "    'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',         # Ibuprofen (alternative)\n",
-    "    'C1=CC=C(C=C1)C(=O)O',                   # Benzoic acid\n",
-    "    'CC(C)(C)C1=CC=C(C=C1)O',                # 4-tert-Butylphenol\n",
-    "    'CCCCCCCCCCCCCCC(=O)O',                  # Palmitic acid\n",
-    "]\n",
-    "\n",
-    "# Convert molecules to fingerprint sets\n",
-    "mol_fingerprints = []\n",
-    "for smiles in real_molecules:\n",
-    "    mol = Chem.MolFromSmiles(smiles)\n",
-    "    if mol:\n",
-    "        mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)\n",
-    "        fp = mfpgen.GetFingerprint(mol)\n",
-    "        mol_fingerprints.append(set(fp.GetOnBits()))\n",
-    "    else:\n",
-    "        mol_fingerprints.append(set())\n",
-    "\n",
-    "print(f\"Converted {len(mol_fingerprints)} molecules to fingerprints\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d0661129",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Use converter to analyze data\n",
-    "X_converted = converter.convert(mol_fingerprints)\n",
-    "stats = converter.get_statistics(mol_fingerprints)\n",
-    "\n",
-    "print(\"Converter Statistics:\")\n",
-    "for key, value in stats.items():\n",
-    "    if isinstance(value, float):\n",
-    "        print(f\"  {key}: {value:.4f}\")\n",
-    "    else:\n",
-    "        print(f\"  {key}: {value}\")\n",
-    "\n",
-    "print(f\"\\nConverted matrix info:\")\n",
-    "print(f\"  Type: {type(X_converted)}\")\n",
-    "print(f\"  Shape: {X_converted.shape}\")\n",
-    "print(f\"  Data type: {X_converted.dtype}\")\n",
-    "\n",
-    "if hasattr(X_converted, 'format'):\n",
-    "    print(f\"  Sparse format: {X_converted.format}\")\n",
-    "    print(f\"  Non-zero elements: {X_converted.nnz}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e3fe8fc1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Test different sparsity thresholds\n",
-    "sparsity_thresholds = [0.5, 0.8, 0.9, 0.95, 0.99]\n",
-    "threshold_results = []\n",
-    "\n",
-    "for threshold in sparsity_thresholds:\n",
-    "    test_converter = RDKitFingerprintConverter(\n",
-    "        n_bits=2048,\n",
-    "        output_format='auto',\n",
-    "        sparse_threshold=threshold\n",
-    "    )\n",
-    "    \n",
-    "    X_test = test_converter.convert(mol_fingerprints)\n",
-    "    test_stats = test_converter.get_statistics(mol_fingerprints)\n",
-    "    \n",
-    "    threshold_results.append({\n",
-    "        'threshold': threshold,\n",
-    "        'chosen_format': 'sparse' if hasattr(X_test, 'format') else 'dense',\n",
-    "        'actual_sparsity': test_stats['sparsity'],\n",
-    "        'memory_efficient': test_stats['sparsity'] > threshold\n",
-    "    })\n",
-    "\n",
-    "threshold_df = pd.DataFrame(threshold_results)\n",
-    "print(\"\\nSparsity Threshold Analysis:\")\n",
-    "print(threshold_df)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "10979aa9",
-   "metadata": {},
-   "source": [
-    "## Learning Curves and Model Analysis\n",
-    "\n",
-    "Let's analyze how model performance changes with dataset size."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "694405d7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generate larger dataset for learning curves\n",
-    "X_large, y_large = generate_synthetic_data(2000, n_bits=1024, avg_bits_per_sample=50)\n",
-    "X_large_sparse = convert_fingerprints(X_large, n_bits=1024)\n",
-    "\n",
-    "# Calculate learning curves\n",
-    "train_sizes = np.linspace(0.1, 1.0, 10)\n",
-    "models_to_test = {\n",
-    "    'LaplacianNB (New)': LaplacianNB_New(),\n",
-    "    'MultinomialNB': MultinomialNB(),\n",
-    "    'BernoulliNB': BernoulliNB()\n",
-    "}\n",
-    "\n",
-    "learning_results = {}\n",
-    "\n",
-    "for name, model in models_to_test.items():\n",
-    "    print(f\"Calculating learning curve for {name}...\")\n",
-    "    \n",
-    "    if name == 'LaplacianNB (New)':\n",
-    "        X_data = X_large_sparse\n",
-    "    else:\n",
-    "        X_data = convert_fingerprints(X_large, n_bits=1024, output_format='dense')\n",
-    "    \n",
-    "    train_sizes_abs, train_scores, val_scores = learning_curve(\n",
-    "        model, X_data, y_large, \n",
-    "        train_sizes=train_sizes, \n",
-    "        cv=5, \n",
-    "        scoring='accuracy',\n",
-    "        n_jobs=-1\n",
-    "    )\n",
-    "    \n",
-    "    learning_results[name] = {\n",
-    "        'train_sizes': train_sizes_abs,\n",
-    "        'train_scores_mean': train_scores.mean(axis=1),\n",
-    "        'train_scores_std': train_scores.std(axis=1),\n",
-    "        'val_scores_mean': val_scores.mean(axis=1),\n",
-    "        'val_scores_std': val_scores.std(axis=1)\n",
-    "    }"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "58b81629",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Plot learning curves\n",
-    "plt.figure(figsize=(15, 5))\n",
-    "\n",
-    "for i, (name, results) in enumerate(learning_results.items()):\n",
-    "    plt.subplot(1, 3, i+1)\n",
-    "    \n",
-    "    train_mean = results['train_scores_mean']\n",
-    "    train_std = results['train_scores_std']\n",
-    "    val_mean = results['val_scores_mean']\n",
-    "    val_std = results['val_scores_std']\n",
-    "    train_sizes = results['train_sizes']\n",
-    "    \n",
-    "    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Score')\n",
-    "    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')\n",
-    "    \n",
-    "    plt.plot(train_sizes, val_mean, 'o-', color='red', label='Validation Score')\n",
-    "    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')\n",
-    "    \n",
-    "    plt.xlabel('Training Set Size')\n",
-    "    plt.ylabel('Accuracy')\n",
-    "    plt.title(f'Learning Curve: {name}')\n",
-    "    plt.legend()\n",
-    "    plt.grid(True, alpha=0.3)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5ed3c2b2",
-   "metadata": {},
-   "source": [
-    "## ROC Curves and Performance Metrics\n",
-    "\n",
-    "Let's create detailed performance analysis with ROC curves."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4f0adc5b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Prepare data for ROC analysis\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "\n",
-    "X_roc, y_roc = generate_synthetic_data(1000, n_bits=1024, avg_bits_per_sample=50)\n",
-    "X_train, X_test, y_train, y_test = train_test_split(X_roc, y_roc, test_size=0.3, random_state=42)\n",
-    "\n",
-    "# Convert data\n",
-    "X_train_sparse = convert_fingerprints(X_train, n_bits=1024)\n",
-    "X_test_sparse = convert_fingerprints(X_test, n_bits=1024)\n",
-    "X_train_dense = convert_fingerprints(X_train, n_bits=1024, output_format='dense')\n",
-    "X_test_dense = convert_fingerprints(X_test, n_bits=1024, output_format='dense')\n",
-    "\n",
-    "# Train models and get probabilities\n",
-    "roc_models = {\n",
-    "    'LaplacianNB (New)': (LaplacianNB_New(), X_train_sparse, X_test_sparse),\n",
-    "    'MultinomialNB': (MultinomialNB(), X_train_dense, X_test_dense),\n",
-    "    'BernoulliNB': (BernoulliNB(), X_train_dense, X_test_dense),\n",
-    "    'RandomForest': (RandomForestClassifier(n_estimators=100, random_state=42), X_train_dense, X_test_dense)\n",
-    "}\n",
-    "\n",
-    "roc_data = {}\n",
-    "\n",
-    "for name, (model, X_tr, X_te) in roc_models.items():\n",
-    "    print(f\"Training {name} for ROC analysis...\")\n",
-    "    \n",
-    "    model.fit(X_tr, y_train)\n",
-    "    y_proba = model.predict_proba(X_te)[:, 1]  # Probability of positive class\n",
-    "    \n",
-    "    fpr, tpr, thresholds = roc_curve(y_test, y_proba)\n",
-    "    roc_auc = auc(fpr, tpr)\n",
-    "    \n",
-    "    precision, recall, pr_thresholds = precision_recall_curve(y_test, y_proba)\n",
-    "    pr_auc = auc(recall, precision)\n",
-    "    \n",
-    "    roc_data[name] = {\n",
-    "        'fpr': fpr,\n",
-    "        'tpr': tpr,\n",
-    "        'roc_auc': roc_auc,\n",
-    "        'precision': precision,\n",
-    "        'recall': recall,\n",
-    "        'pr_auc': pr_auc,\n",
-    "        'y_proba': y_proba\n",
-    "    }"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d99b6d24",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Plot ROC curves and Precision-Recall curves\n",
-    "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))\n",
-    "\n",
-    "# ROC Curves\n",
-    "colors = ['blue', 'red', 'green', 'orange']\n",
-    "for i, (name, data) in enumerate(roc_data.items()):\n",
-    "    ax1.plot(data['fpr'], data['tpr'], color=colors[i], linewidth=2,\n",
-    "             label=f'{name} (AUC = {data[\"roc_auc\"]:.3f})')\n",
-    "\n",
-    "ax1.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')\n",
-    "ax1.set_xlabel('False Positive Rate')\n",
-    "ax1.set_ylabel('True Positive Rate')\n",
-    "ax1.set_title('ROC Curves Comparison')\n",
-    "ax1.legend()\n",
-    "ax1.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Precision-Recall Curves\n",
-    "for i, (name, data) in enumerate(roc_data.items()):\n",
-    "    ax2.plot(data['recall'], data['precision'], color=colors[i], linewidth=2,\n",
-    "             label=f'{name} (AUC = {data[\"pr_auc\"]:.3f})')\n",
-    "\n",
-    "# Baseline (random classifier)\n",
-    "baseline = np.sum(y_test) / len(y_test)\n",
-    "ax2.axhline(y=baseline, color='k', linestyle='--', linewidth=1, label=f'Random ({baseline:.3f})')\n",
-    "\n",
-    "ax2.set_xlabel('Recall')\n",
-    "ax2.set_ylabel('Precision')\n",
-    "ax2.set_title('Precision-Recall Curves Comparison')\n",
-    "ax2.legend()\n",
-    "ax2.grid(True, alpha=0.3)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "432420fd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Performance summary table\n",
-    "performance_summary = []\n",
-    "\n",
-    "for name, data in roc_data.items():\n",
-    "    # Calculate additional metrics\n",
-    "    y_pred = (data['y_proba'] > 0.5).astype(int)\n",
-    "    accuracy = np.mean(y_pred == y_test)\n",
-    "    \n",
-    "    # Find optimal threshold (Youden's index)\n",
-    "    optimal_idx = np.argmax(data['tpr'] - data['fpr'])\n",
-    "    optimal_threshold = roc_data[list(roc_data.keys())[0]]['fpr'][optimal_idx]  # Approximation\n",
-    "    \n",
-    "    performance_summary.append({\n",
-    "        'Model': name,\n",
-    "        'ROC AUC': data['roc_auc'],\n",
-    "        'PR AUC': data['pr_auc'],\n",
-    "        'Accuracy': accuracy,\n",
-    "        'Best TPR': np.max(data['tpr']),\n",
-    "        'Best Precision': np.max(data['precision'])\n",
-    "    })\n",
-    "\n",
-    "summary_df = pd.DataFrame(performance_summary)\n",
-    "print(\"Performance Summary:\")\n",
-    "print(summary_df.round(4))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f81d3994",
-   "metadata": {},
-   "source": [
-    "## Summary and Recommendations\n",
-    "\n",
-    "This advanced tutorial covered:\n",
-    "\n",
-    "### 🔬 **Advanced Features Explored:**\n",
-    "\n",
-    "1. **Multiple Fingerprint Types**: Morgan, Atom Pair, Torsion, RDKit fingerprints\n",
-    "2. **Performance Optimization**: Detailed timing and memory analysis\n",
-    "3. **Memory Efficiency**: Sparse matrix format comparison and optimization\n",
-    "4. **Algorithm Benchmarking**: Comparison with other ML algorithms\n",
-    "5. **Advanced Converter Usage**: Custom settings and automatic format selection\n",
-    "6. **Learning Curves**: Performance vs dataset size analysis\n",
-    "7. **ROC Analysis**: Detailed classification performance metrics\n",
-    "\n",
-    "### 📊 **Key Findings:**\n",
-    "\n",
-    "- **New implementation** provides significant speedup while maintaining accuracy\n",
-    "- **CSR sparse format** is most memory-efficient for typical molecular fingerprints\n",
-    "- **LaplacianNB performs competitively** with other algorithms on sparse binary data\n",
-    "- **Memory savings** can exceed 95% with sparse representations\n",
-    "- **Automatic format selection** adapts to data characteristics\n",
-    "\n",
-    "### 🚀 **Best Practices:**\n",
-    "\n",
-    "1. Use **folded fingerprints** (1024-2048 bits) for memory efficiency\n",
-    "2. Choose **CSR format** for most molecular fingerprint applications  \n",
-    "3. Use **sparse_threshold=0.95** for automatic format selection\n",
-    "4. Monitor **sparsity levels** to optimize memory usage\n",
-    "5. Compare multiple **fingerprint types** for your specific problem\n",
-    "6. Use **cross-validation** for robust performance estimation\n",
-    "\n",
-    "### 🎯 **Production Recommendations:**\n",
-    "\n",
-    "- **LaplacianNB_New** for large-scale molecular classification\n",
-    "- **FingerprintTransformer** for sklearn pipeline integration\n",
-    "- **Memory monitoring** for very large datasets\n",
-    "- **Performance profiling** before deploying to production"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/basic_usage_example.py b/examples/basic_usage_example.py
deleted file mode 100644
index e69de29..0000000
diff --git a/examples/basic_usage_tutorial.ipynb b/examples/basic_usage_tutorial.ipynb
deleted file mode 100644
index dc5a4eb..0000000
--- a/examples/basic_usage_tutorial.ipynb
+++ /dev/null
@@ -1,624 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "e06d6bbd",
-   "metadata": {},
-   "source": [
-    "# LaplacianNB Basic Usage Tutorial\n",
-    "\n",
-    "This notebook demonstrates the basic usage of LaplacianNB with molecular fingerprints, following the pattern from the original bayes_tutorial but showcasing both implementations."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5de1de9f",
-   "metadata": {},
-   "source": [
-    "## Package Installation and Imports\n",
-    "\n",
-    "First, let's install the package and import necessary libraries."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "63a7e273",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install the package (uncomment if needed)\n",
-    "# !pip install laplaciannb --upgrade"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e078a074",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from rdkit import Chem\n",
-    "from rdkit.Chem import rdFingerprintGenerator\n",
-    "\n",
-    "# Import both implementations\n",
-    "from laplaciannb.LaplacianNB import LaplacianNB as LaplacianNB_Original\n",
-    "from laplaciannb.LaplacianNB_new import LaplacianNB as LaplacianNB_New\n",
-    "from laplaciannb.fingerprint_utils import convert_fingerprints"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "bf3293da",
-   "metadata": {},
-   "source": [
-    "## Utility Function for Molecular Fingerprints\n",
-    "\n",
-    "We'll create a memory-efficient function to calculate Morgan fingerprints from SMILES."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0c661d68",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_fp(smiles: str, n_bits: int = 1024) -> set:\n",
-    "    \"\"\"\n",
-    "    Calculate Morgan fingerprint from SMILES string.\n",
-    "    \n",
-    "    Args:\n",
-    "        smiles (str): SMILES string\n",
-    "        n_bits (int): Size of folded fingerprint (default: 1024)\n",
-    "        \n",
-    "    Returns:\n",
-    "        set: Set of indices where bits are set to 1\n",
-    "    \"\"\"\n",
-    "    mol = Chem.MolFromSmiles(smiles)\n",
-    "    \n",
-    "    if not mol:\n",
-    "        return set()\n",
-    "    \n",
-    "    # Use folded fingerprint for memory efficiency\n",
-    "    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)\n",
-    "    fp = mfpgen.GetFingerprint(mol)\n",
-    "    \n",
-    "    if not fp:\n",
-    "        return set()\n",
-    "    \n",
-    "    return set(fp.GetOnBits())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7eb49c3e",
-   "metadata": {},
-   "source": [
-    "## Create Example Dataset\n",
-    "\n",
-    "Let's create a dataset with various molecules and their activities."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5da3c0fd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create example DataFrame with diverse molecules\n",
-    "df = pd.DataFrame({\n",
-    "    \"smiles\": [\n",
-    "        \"N[C@]([H])(C)C(=O)O\",           # Alanine (amino acid)\n",
-    "        \"O=Cc1ccc(O)c(OC)c1\",            # Vanillin (aromatic aldehyde)\n",
-    "        \"CN=C=O\",                         # Methyl isocyanate\n",
-    "        \"CCO\",                            # Ethanol (alcohol)\n",
-    "        \"c1ccccc1\",                       # Benzene (aromatic)\n",
-    "        \"CC(=O)O\",                        # Acetic acid\n",
-    "        \"CCCCO\",                          # Butanol (alcohol)\n",
-    "        \"c1ccc(C)cc1\",                    # Toluene (aromatic)\n",
-    "    ],\n",
-    "    \"activity\": [1, 0, 0, 1, 0, 1, 1, 0],\n",
-    "})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "24a8fcee",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Display the dataset\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "df6461c0",
-   "metadata": {},
-   "source": [
-    "## Calculate Molecular Fingerprints\n",
-    "\n",
-    "Convert SMILES to molecular fingerprints using our utility function."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3a83bba5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Calculate fingerprints for each molecule\n",
-    "print(\"Calculating molecular fingerprints...\")\n",
-    "df[\"fingerprints\"] = df[\"smiles\"].apply(lambda x: get_fp(x, n_bits=1024))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a36efc61",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Display fingerprint information\n",
-    "print(\"Dataset with fingerprints:\")\n",
-    "for idx, row in df.iterrows():\n",
-    "    fp_size = len(row[\"fingerprints\"])\n",
-    "    fp_preview = list(sorted(row[\"fingerprints\"]))[:5] if row[\"fingerprints\"] else []\n",
-    "    print(f\"  {row['smiles'][:25]:25} -> {fp_size:3d} bits, first 5: {fp_preview}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "77c3e9f5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Show the complete dataframe\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a4555428",
-   "metadata": {},
-   "source": [
-    "## Prepare Training Data\n",
-    "\n",
-    "Extract features (X) and targets (y) from our dataset."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bf11dfee",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Prepare data for training\n",
-    "X = df[\"fingerprints\"].values\n",
-    "y = df[\"activity\"].values\n",
-    "\n",
-    "print(f\"Training data shape: {X.shape}\")\n",
-    "print(f\"Target distribution: {np.bincount(y)}\")\n",
-    "print(f\"Classes: {np.unique(y)}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "53b56c18",
-   "metadata": {},
-   "source": [
-    "## Example 1: Original LaplacianNB Implementation\n",
-    "\n",
-    "Let's use the original LaplacianNB implementation that works with sets."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "52bf5cb8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create and train original classifier\n",
-    "clf_original = LaplacianNB_Original()\n",
-    "clf_original.fit(X, y)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3894a6dd",
-   "metadata": {},
-   "source": [
-    "### Get Joint Log-Likelihood\n",
-    "\n",
-    "This shows the sum of feature probabilities for each compound per class."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fc7950ec",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get joint log-likelihood (internal method)\n",
-    "joint_ll = clf_original._joint_log_likelihood(X)\n",
-    "print(\"Joint log-likelihood shape:\", joint_ll.shape)\n",
-    "joint_ll"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b7a3f8c6",
-   "metadata": {},
-   "source": [
-    "### Get Class Probabilities\n",
-    "\n",
-    "Get probability predictions for each class using sklearn-compatible interface."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "04db205c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get probability predictions\n",
-    "probabilities = clf_original.predict_proba(X)\n",
-    "print(\"Probabilities shape:\", probabilities.shape)\n",
-    "probabilities"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9eaca1e8",
-   "metadata": {},
-   "source": [
-    "### Get Class Predictions\n",
-    "\n",
-    "Get hard predictions for each sample."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2e772e19",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get class predictions\n",
-    "predictions = clf_original.predict(X)\n",
-    "print(\"Predictions:\", predictions)\n",
-    "predictions"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fe9dc5d5",
-   "metadata": {},
-   "source": [
-    "### Explore Model Properties\n",
-    "\n",
-    "Let's examine the trained model's properties."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2c66219f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get class names\n",
-    "print(\"Classes:\", clf_original.classes_)\n",
-    "clf_original.classes_"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ae1570bd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get feature mapping (index -> feature space position)\n",
-    "print(\"Number of unique features:\", len(clf_original.feature_names_))\n",
-    "print(\"First 10 feature mappings:\", dict(list(clf_original.feature_names_.items())[:10]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9dcc6ac6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get feature log probabilities\n",
-    "print(\"Feature log probabilities shape:\", clf_original.feature_log_prob_.shape)\n",
-    "print(\"Feature log probabilities (first 5 features):\")\n",
-    "clf_original.feature_log_prob_[:, :5]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ee5d4afc",
-   "metadata": {},
-   "source": [
-    "## Example 2: New sklearn-compatible LaplacianNB\n",
-    "\n",
-    "Now let's use the new implementation that works with sklearn sparse matrices."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0d237fa5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Convert fingerprints to sklearn format (sparse CSR by default)\n",
-    "X_sklearn = convert_fingerprints(X, n_bits=1024)\n",
-    "print(f\"Sklearn format shape: {X_sklearn.shape}\")\n",
-    "print(f\"Sparse matrix format: {X_sklearn.format}\")\n",
-    "print(f\"Number of non-zero elements: {X_sklearn.nnz}\")\n",
-    "print(f\"Sparsity: {1 - X_sklearn.nnz / (X_sklearn.shape[0] * X_sklearn.shape[1]):.3f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "be4bb62b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create and train new classifier\n",
-    "clf_new = LaplacianNB_New()\n",
-    "clf_new.fit(X_sklearn, y)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0f4ad20e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get predictions with new implementation\n",
-    "predictions_new = clf_new.predict(X_sklearn)\n",
-    "print(\"Predictions (new):\", predictions_new)\n",
-    "predictions_new"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c455f310",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get probabilities with new implementation\n",
-    "probabilities_new = clf_new.predict_proba(X_sklearn)\n",
-    "print(\"Probabilities shape:\", probabilities_new.shape)\n",
-    "probabilities_new"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0d9781e2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get log probabilities (additional method in new implementation)\n",
-    "log_probabilities_new = clf_new.predict_log_proba(X_sklearn)\n",
-    "print(\"Log probabilities shape:\", log_probabilities_new.shape)\n",
-    "log_probabilities_new"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2b801a81",
-   "metadata": {},
-   "source": [
-    "### New Implementation Properties"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7d9131f3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"Classes:\", clf_new.classes_)\n",
-    "print(\"Number of features:\", clf_new.n_features_in_)\n",
-    "print(\"Feature count shape:\", clf_new.feature_count_.shape)\n",
-    "print(\"Feature log probabilities shape:\", clf_new.feature_log_prob_.shape)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9e6ec732",
-   "metadata": {},
-   "source": [
-    "## Example 3: Implementation Comparison\n",
-    "\n",
-    "Let's compare the results from both implementations."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7b896d83",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create comparison DataFrame\n",
-    "comparison_df = pd.DataFrame({\n",
-    "    'SMILES': df['smiles'],\n",
-    "    'True_Activity': y,\n",
-    "    'Original_Pred': predictions,\n",
-    "    'New_Pred': predictions_new,\n",
-    "    'Original_Prob_0': probabilities[:, 0],\n",
-    "    'Original_Prob_1': probabilities[:, 1],\n",
-    "    'New_Prob_0': probabilities_new[:, 0],\n",
-    "    'New_Prob_1': probabilities_new[:, 1],\n",
-    "})\n",
-    "\n",
-    "comparison_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6df1732b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Check if predictions match\n",
-    "predictions_match = np.array_equal(predictions, predictions_new)\n",
-    "probabilities_match = np.allclose(probabilities, probabilities_new, atol=1e-6)\n",
-    "\n",
-    "print(f\"Predictions match: {predictions_match}\")\n",
-    "print(f\"Probabilities match (within 1e-6): {probabilities_match}\")\n",
-    "\n",
-    "if not probabilities_match:\n",
-    "    prob_diff = np.abs(probabilities - probabilities_new)\n",
-    "    max_diff = np.max(prob_diff)\n",
-    "    mean_diff = np.mean(prob_diff)\n",
-    "    print(f\"Maximum probability difference: {max_diff:.2e}\")\n",
-    "    print(f\"Mean probability difference: {mean_diff:.2e}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3a63d8c1",
-   "metadata": {},
-   "source": [
-    "## Example 4: Different Fingerprint Sizes\n",
-    "\n",
-    "Let's explore how different fingerprint sizes affect performance."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2b4dd4fc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Test different fingerprint sizes\n",
-    "fingerprint_sizes = [256, 512, 1024, 2048]\n",
-    "results = []\n",
-    "\n",
-    "for n_bits in fingerprint_sizes:\n",
-    "    # Calculate fingerprints with current size\n",
-    "    fps = df[\"smiles\"].apply(lambda x: get_fp(x, n_bits=n_bits)).values\n",
-    "    X_sized = convert_fingerprints(fps, n_bits=n_bits)\n",
-    "    \n",
-    "    # Train classifier\n",
-    "    clf_sized = LaplacianNB_New()\n",
-    "    clf_sized.fit(X_sized, y)\n",
-    "    \n",
-    "    # Calculate metrics\n",
-    "    accuracy = clf_sized.score(X_sized, y)\n",
-    "    sparsity = 1 - X_sized.nnz / (X_sized.shape[0] * X_sized.shape[1])\n",
-    "    avg_bits_per_molecule = np.mean([len(fp) for fp in fps])\n",
-    "    \n",
-    "    results.append({\n",
-    "        'n_bits': n_bits,\n",
-    "        'accuracy': accuracy,\n",
-    "        'sparsity': sparsity,\n",
-    "        'avg_bits_per_mol': avg_bits_per_molecule,\n",
-    "        'total_features': X_sized.shape[1]\n",
-    "    })\n",
-    "\n",
-    "# Display results\n",
-    "results_df = pd.DataFrame(results)\n",
-    "results_df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b6f3856e",
-   "metadata": {},
-   "source": [
-    "## Example 5: Detailed Prediction Analysis\n",
-    "\n",
-    "Let's analyze individual predictions in detail."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7e1e0c3d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Detailed analysis for each molecule\n",
-    "print(\"Detailed Prediction Analysis:\")\n",
-    "print(\"=\" * 80)\n",
-    "\n",
-    "for i, row in df.iterrows():\n",
-    "    smiles = row['smiles']\n",
-    "    true_activity = y[i]\n",
-    "    pred_orig = predictions[i]\n",
-    "    pred_new = predictions_new[i]\n",
-    "    prob_orig = probabilities[i]\n",
-    "    prob_new = probabilities_new[i]\n",
-    "    \n",
-    "    print(f\"\\nMolecule {i+1}: {smiles}\")\n",
-    "    print(f\"  True activity: {true_activity}\")\n",
-    "    print(f\"  Original prediction: {pred_orig} (prob: [{prob_orig[0]:.3f}, {prob_orig[1]:.3f}])\")\n",
-    "    print(f\"  New prediction: {pred_new} (prob: [{prob_new[0]:.3f}, {prob_new[1]:.3f}])\")\n",
-    "    \n",
-    "    if pred_orig != true_activity:\n",
-    "        print(f\"  ⚠️  Original implementation misclassified\")\n",
-    "    if pred_new != true_activity:\n",
-    "        print(f\"  ⚠️  New implementation misclassified\")\n",
-    "    if pred_orig == pred_new == true_activity:\n",
-    "        print(f\"  ✅ Both implementations correct\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "eca5cf71",
-   "metadata": {},
-   "source": [
-    "## Summary\n",
-    "\n",
-    "This tutorial demonstrated:\n",
-    "\n",
-    "1. **Basic usage** of both LaplacianNB implementations\n",
-    "2. **Fingerprint calculation** with memory-efficient folded fingerprints\n",
-    "3. **Model training and prediction** with molecular data\n",
-    "4. **Implementation comparison** showing compatibility between versions\n",
-    "5. **Fingerprint size optimization** for different use cases\n",
-    "6. **Detailed analysis** of individual predictions\n",
-    "\n",
-    "### Key Takeaways:\n",
-    "\n",
-    "- Both implementations produce identical results\n",
-    "- The new implementation is sklearn-compatible and memory-efficient\n",
-    "- Fingerprint size affects sparsity and potentially accuracy\n",
-    "- The package handles molecular fingerprints effectively for classification tasks"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/bayes_tutorial.ipynb b/examples/bayes_tutorial.ipynb
deleted file mode 100644
index 8053112..0000000
--- a/examples/bayes_tutorial.ipynb
+++ /dev/null
@@ -1,623 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ff57e6ba-3c49-45f2-9109-924bd310ac9c",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mRunning cells with 'lmnb (Python 3.10.17)' requires the ipykernel package.\n",
-      "\u001b[1;31mInstall 'ipykernel' into the Python environment. \n",
-      "\u001b[1;31mCommand: '/Users/baranba2/Projects/lmnb/.venv/bin/python -m pip install ipykernel -U --force-reinstall'"
-     ]
-    }
-   ],
-   "source": [
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "61ff9c84",
-   "metadata": {},
-   "source": [
-    "## Package installation from jupyter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "33b0b7e0",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: laplaciannb in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (0.4)\n",
-      "Collecting laplaciannb\n",
-      "  Using cached laplaciannb-0.4.1-py3-none-any.whl (6.0 kB)\n",
-      "Requirement already satisfied: scikit-learn>=1.1.1 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from laplaciannb) (1.1.2)\n",
-      "Requirement already satisfied: scipy>=1.8.1 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from laplaciannb) (1.9.0)\n",
-      "Requirement already satisfied: pandas>=1.4.2 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from laplaciannb) (1.4.3)\n",
-      "Requirement already satisfied: numpy>=1.22.4 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from laplaciannb) (1.23.1)\n",
-      "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from pandas>=1.4.2->laplaciannb) (2.8.2)\n",
-      "Requirement already satisfied: pytz>=2020.1 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from pandas>=1.4.2->laplaciannb) (2022.1)\n",
-      "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from scikit-learn>=1.1.1->laplaciannb) (3.1.0)\n",
-      "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from scikit-learn>=1.1.1->laplaciannb) (1.1.0)\n",
-      "Requirement already satisfied: six>=1.5 in /usr/local/Caskroom/miniconda/base/envs/bayes/lib/python3.8/site-packages (from python-dateutil>=2.8.1->pandas>=1.4.2->laplaciannb) (1.16.0)\n",
-      "Installing collected packages: laplaciannb\n",
-      "  Attempting uninstall: laplaciannb\n",
-      "    Found existing installation: laplaciannb 0.4\n",
-      "    Uninstalling laplaciannb-0.4:\n",
-      "      Successfully uninstalled laplaciannb-0.4\n",
-      "Successfully installed laplaciannb-0.4.1\n"
-     ]
-    }
-   ],
-   "source": [
-    "!pip install laplaciannb --upgrade"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "ad49c226-2ea6-4705-a8c0-295efbda2671",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from laplaciannb.LaplacianNB import LaplacianNB"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "97480b54",
-   "metadata": {},
-   "source": [
-    "## Small utility function to process smiles into a set of indices of positive bits"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "627d5672-8ead-4b3c-8224-2186f01ed8ae",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from rdkit import Chem\n",
-    "from rdkit.Chem import rdFingerprintGenerator\n",
-    "\n",
-    "def get_fp(smiles: str) -> set:\n",
-    "    \"\"\"Function to calculate MorganFingerprint from smiles.\n",
-    "    It returns index of all '1' bits of not-folded fingerprint.\n",
-    "    Args:\n",
-    "        smiles (str): smiles string\n",
-    "    Returns:\n",
-    "        set: return list of index of '1' bits.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    mol = Chem.MolFromSmiles(smiles)\n",
-    "\n",
-    "    if not mol:\n",
-    "        return\n",
-    "\n",
-    "    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)\n",
-    "    fp = mfpgen.GetSparseFingerprint(mol)\n",
-    "    if not fp:\n",
-    "        return\n",
-    "\n",
-    "    return set(fp.GetOnBits())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "15d449fb",
-   "metadata": {},
-   "source": [
-    "## Create a example DataFrame"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "85ad8acb-f402-4b4b-8bc1-335f8727ec00",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.DataFrame(\n",
-    "    {\n",
-    "        \"smiles\": [\n",
-    "            \"N[C@]([H])(C)C(=O)O\",\n",
-    "            \"O=Cc1ccc(O)c(OC)c1 COc1cc(C=O)ccc1O\",\n",
-    "            \"CN=C=O\",\n",
-    "        ],\n",
-    "        \"activity\": [1, 0, 0],\n",
-    "    }\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "ab597a01-0209-4697-ba3c-b3d36c169879",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>smiles</th>\n",
-       "      <th>activity</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>N[C@]([H])(C)C(=O)O</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>O=Cc1ccc(O)c(OC)c1 COc1cc(C=O)ccc1O</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>CN=C=O</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                smiles  activity\n",
-       "0                  N[C@]([H])(C)C(=O)O         1\n",
-       "1  O=Cc1ccc(O)c(OC)c1 COc1cc(C=O)ccc1O         0\n",
-       "2                               CN=C=O         0"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "bf45a3b3-8093-4745-9309-b371a899ccfd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df[\"sets\"] = df[\"smiles\"].apply(lambda x: get_fp(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "d1b4cf52-fb03-48fb-947b-fbc8c7f17c15",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>smiles</th>\n",
-       "      <th>activity</th>\n",
-       "      <th>sets</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>N[C@]([H])(C)C(=O)O</td>\n",
-       "      <td>1</td>\n",
-       "      <td>{2245273601, 2246728737, 2655406212, 153386432...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>O=Cc1ccc(O)c(OC)c1 COc1cc(C=O)ccc1O</td>\n",
-       "      <td>0</td>\n",
-       "      <td>{2076190208, 864942730, 2900751504, 2458968089...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>CN=C=O</td>\n",
-       "      <td>0</td>\n",
-       "      <td>{2246728737, 2245900962, 864942730, 3823506351...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                smiles  activity  \\\n",
-       "0                  N[C@]([H])(C)C(=O)O         1   \n",
-       "1  O=Cc1ccc(O)c(OC)c1 COc1cc(C=O)ccc1O         0   \n",
-       "2                               CN=C=O         0   \n",
-       "\n",
-       "                                                sets  \n",
-       "0  {2245273601, 2246728737, 2655406212, 153386432...  \n",
-       "1  {2076190208, 864942730, 2900751504, 2458968089...  \n",
-       "2  {2246728737, 2245900962, 864942730, 3823506351...  "
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4a2b20ee",
-   "metadata": {},
-   "source": [
-    "## Fit function"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "392ac910-c661-4fd2-9240-d5e02a95c0cd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X = df[\"sets\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "423550ba-54a5-47ba-a2ec-8f0c52ae96bf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "y = df[\"activity\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "f9125b8f-39bb-4bb6-889d-8812242360f6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "clf = LaplacianNB()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "4e47c0df-ecc1-415d-aa08-c5f829b72784",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LaplacianNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LaplacianNB</label><div class=\"sk-toggleable__content\"><pre>LaplacianNB()</pre></div></div></div></div></div>"
-      ],
-      "text/plain": [
-       "LaplacianNB()"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "clf.fit(X, y)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "bc1d718e",
-   "metadata": {},
-   "source": [
-    "## Get a sum of features probabilities for each compound per class [0, 1]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "5f5b2e0f-3c62-4c4b-b7d4-73cd475fc32c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[-5.7550254,  4.920233 ],\n",
-       "       [ 2.962594 , -4.941602 ],\n",
-       "       [ 0.9315465, -1.5314839]], dtype=float32)"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "clf._joint_log_likelihood(X)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0c28d444",
-   "metadata": {},
-   "source": [
-    "## Get probability of each class (sklearn implementation)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "b1fce5dd-19b5-4a54-8b1d-fe005071a6c4",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[2.3109160e-05, 9.9997705e-01],\n",
-       "       [9.9963105e-01, 3.6905482e-04],\n",
-       "       [9.2150915e-01, 7.8490861e-02]], dtype=float32)"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "clf.predict_proba(X)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "aa9fa949",
-   "metadata": {},
-   "source": [
-    "## Get prediction of each class (sklearn implementation)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "236cebeb-2e81-449a-babf-27b9665e726e",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([1, 0, 0])"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "clf.predict(X)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "16b18d5e",
-   "metadata": {},
-   "source": [
-    "## Get class names"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "06d46914",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([0, 1])"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "clf.classes_"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "32af4468",
-   "metadata": {},
-   "source": [
-    "## Get index of positive bit mapping to feature space -> key: value of an index, value: index in feature table (see below)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "9498b13d",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{26234434: 0,\n",
-       " 847336149: 1,\n",
-       " 847957139: 2,\n",
-       " 864662311: 3,\n",
-       " 864674487: 4,\n",
-       " 864942730: 5,\n",
-       " 932712697: 6,\n",
-       " 951226070: 7,\n",
-       " 976134192: 8,\n",
-       " 994485099: 9,\n",
-       " 1135286194: 10,\n",
-       " 1310068516: 11,\n",
-       " 1510328189: 12,\n",
-       " 1510337516: 13,\n",
-       " 1516788326: 14,\n",
-       " 1517923320: 15,\n",
-       " 1533864325: 16,\n",
-       " 1879233475: 17,\n",
-       " 2038990649: 18,\n",
-       " 2076190208: 19,\n",
-       " 2245273601: 20,\n",
-       " 2245900962: 21,\n",
-       " 2246699815: 22,\n",
-       " 2246703798: 23,\n",
-       " 2246728737: 24,\n",
-       " 2458968089: 25,\n",
-       " 2549196227: 26,\n",
-       " 2599973650: 27,\n",
-       " 2625182169: 28,\n",
-       " 2655406212: 29,\n",
-       " 2900751504: 30,\n",
-       " 3011598321: 31,\n",
-       " 3026394695: 32,\n",
-       " 3217380708: 33,\n",
-       " 3218693969: 34,\n",
-       " 3537119515: 35,\n",
-       " 3725073659: 36,\n",
-       " 3823506351: 37,\n",
-       " 3855312692: 38,\n",
-       " 3945128999: 39,\n",
-       " 3975275337: 40,\n",
-       " 4046184955: 41}"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "clf.feature_names_"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2c039b62",
-   "metadata": {},
-   "source": [
-    "## Get log probability per feature/index"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "464e8dde",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[ 0.14884563,  0.14884563, -0.54430157, -0.20163734,  0.14884563,\n",
-       "        -0.05518642,  0.14884563,  0.14884563,  0.14884563,  0.14884563,\n",
-       "         0.14884563,  0.14884563, -0.54430157,  0.14884563,  0.14884563,\n",
-       "         0.14884563, -0.54430157,  0.14884563,  0.14884563,  0.14884563,\n",
-       "        -0.54430157,  0.14884563, -0.54430157,  0.14884563, -0.05518642,\n",
-       "         0.14884563,  0.14884563, -0.54430157, -0.54430157, -0.54430157,\n",
-       "         0.14884563,  0.14884563,  0.14884563,  0.14884563,  0.14884563,\n",
-       "        -0.54430157,  0.14884563,  0.14884563, -0.54430157,  0.14884563,\n",
-       "         0.14884563,  0.14884563],\n",
-       "       [-0.24419697, -0.24419697,  0.44895023,  0.25283533, -0.24419697,\n",
-       "         0.08894748, -0.24419697, -0.24419697, -0.24419697, -0.24419697,\n",
-       "        -0.24419697, -0.24419697,  0.44895023, -0.24419697, -0.24419697,\n",
-       "        -0.24419697,  0.44895023, -0.24419697, -0.24419697, -0.24419697,\n",
-       "         0.44895023, -0.24419697,  0.44895023, -0.24419697,  0.08894748,\n",
-       "        -0.24419697, -0.24419697,  0.44895023,  0.44895023,  0.44895023,\n",
-       "        -0.24419697, -0.24419697, -0.24419697, -0.24419697, -0.24419697,\n",
-       "         0.44895023, -0.24419697, -0.24419697,  0.44895023, -0.24419697,\n",
-       "        -0.24419697, -0.24419697]], dtype=float32)"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "clf.feature_log_prob_"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a8c6f20a",
-   "metadata": {},
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "lmnb",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.17"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/integration_example.py b/examples/integration_example.py
deleted file mode 100644
index c17db32..0000000
--- a/examples/integration_example.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""Example demonstrating LaplacianNB with fingerprint utilities."""
-
-import numpy as np
-
-from laplaciannb import LaplacianNB, RDKitFingerprintConverter, convert_fingerprints
-
-
-# Example 1: Basic usage with set data
-print("=== Example 1: Basic LaplacianNB Usage ===")
-X_sets = np.array(
-    [
-        {1, 5, 10, 15, 20},  # Sample 1: bits 1,5,10,15,20 are on
-        {2, 6, 11, 16, 21},  # Sample 2: bits 2,6,11,16,21 are on
-        {1, 3, 7, 12, 17},  # Sample 3: bits 1,3,7,12,17 are on
-        {4, 8, 13, 18, 22},  # Sample 4: bits 4,8,13,18,22 are on
-    ],
-    dtype=object,
-)
-y = np.array([0, 1, 0, 1])  # Binary classification
-
-# Train classifier
-clf = LaplacianNB()
-clf.fit(X_sets, y)
-
-# Predictions
-predictions = clf.predict(X_sets)
-probabilities = clf.predict_proba(X_sets)
-
-print(f"Predictions: {predictions}")
-print(f"Probabilities shape: {probabilities.shape}")
-print(f"Sample probability for class 0: {probabilities[0, 0]:.3f}")
-
-# Example 2: Using fingerprint conversion utilities
-print("\n=== Example 2: Fingerprint Conversion ===")
-
-# Simulate molecular fingerprints as sets of on-bits
-molecular_fps = [
-    {1, 5, 10, 15, 100, 200},  # Molecule 1
-    {2, 6, 11, 16, 101, 201},  # Molecule 2
-    {1, 3, 7, 12, 102, 202},  # Molecule 3
-    {4, 8, 13, 18, 103, 203},  # Molecule 4
-]
-
-# Convert to different formats
-dense_matrix = convert_fingerprints(molecular_fps, n_bits=512, output_format="dense")
-sparse_matrix = convert_fingerprints(molecular_fps, n_bits=512, output_format="csr")
-
-print(f"Dense matrix shape: {dense_matrix.shape}")
-print(f"Sparse matrix shape: {sparse_matrix.shape}")
-print(f"Sparse matrix format: {sparse_matrix.format}")
-print(f"Sparsity: {1 - sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1]):.3f}")
-
-# Example 3: Using the converter class
-print("\n=== Example 3: RDKitFingerprintConverter ===")
-
-converter = RDKitFingerprintConverter(
-    n_bits=1024,
-    output_format="auto",  # Automatically choose based on sparsity
-    dtype=np.float32,
-)
-
-# Convert fingerprints
-X_converted = converter.convert(molecular_fps)
-stats = converter.get_statistics(molecular_fps)
-
-print(f"Converted matrix type: {type(X_converted)}")
-print(f"Matrix shape: {X_converted.shape}")
-print("Statistics:")
-for key, value in stats.items():
-    if isinstance(value, float):
-        print(f"  {key}: {value:.3f}")
-    else:
-        print(f"  {key}: {value}")
-
-
-# Train classifier with converted data
-# Note: LaplacianNB expects sets of indices, so we need to convert back
-def sparse_to_sets(sparse_matrix):
-    """Convert sparse matrix back to array of sets for LaplacianNB."""
-    sets = []
-    for i in range(sparse_matrix.shape[0]):
-        row = sparse_matrix.getrow(i)
-        on_bits = set(row.nonzero()[1])
-        sets.append(on_bits)
-    return np.array(sets, dtype=object)
-
-
-X_sets_converted = sparse_to_sets(X_converted)
-clf_converted = LaplacianNB()
-clf_converted.fit(X_sets_converted, y)
-predictions_converted = clf_converted.predict(X_sets_converted)
-
-print(f"Predictions with converted data: {predictions_converted}")
-
-print("\n✅ All examples completed successfully!")
diff --git a/examples/simple_example.py b/examples/simple_example.py
new file mode 100644
index 0000000..57300f5
--- /dev/null
+++ b/examples/simple_example.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""
+Simple LaplacianNB Example
+=========================
+
+A minimal example showing basic LaplacianNB usage with molecular data.
+"""
+
+import numpy as np
+from laplaciannb import LaplacianNB
+from laplaciannb.fingerprint_utils import rdkit_to_csr
+
+# Sample molecular data
+smiles = [
+    "CCO",                              # Ethanol - inactive
+    "CC(=O)OC1=CC=CC=C1C(=O)O",        # Aspirin - active
+    "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",   # Ibuprofen - active
+    "CCCCCCCCCCCCCCCC",                 # Palmitic acid - inactive
+    "CC1=CC=C(C=C1)C(=O)O"             # p-Toluic acid - active
+]
+y = [0, 1, 1, 0, 1]  # Activity labels (0=inactive, 1=active)
+
+# Convert SMILES to sparse matrix
+print("Converting molecular fingerprints...")
+X = rdkit_to_csr(smiles, radius=2)
+print(f"Matrix shape: {X.shape}")
+print(f"Sparsity: {1 - X.nnz / (X.shape[0] * X.shape[1]):.6f}")
+
+# Train classifier
+print("\nTraining LaplacianNB...")
+clf = LaplacianNB(alpha=1.0)
+clf.fit(X, y)
+
+# Make predictions
+predictions = clf.predict(X)
+probabilities = clf.predict_proba(X)
+
+# Display results
+print("\nResults:")
+print("-" * 40)
+for i, (smiles_str, true_label, pred_label, prob) in enumerate(
+    zip(smiles, y, predictions, probabilities)
+):
+    print(f"Molecule {i+1}: {smiles_str[:20]}")
+    print(f"  True: {true_label}, Predicted: {pred_label}")
+    print(f"  Probabilities: [Inactive: {prob[0]:.3f}, Active: {prob[1]:.3f}]")
+    print()
+
+# Calculate accuracy
+accuracy = sum(predictions == y) / len(y)
+print(f"Accuracy: {accuracy:.1%}")
+
+# Advanced: Extract original fingerprint indices
+print("\n" + "=" * 50)
+print("EXTRACTING ORIGINAL FINGERPRINT INDICES")
+print("=" * 50)
+
+print("\nOriginal RDKit fingerprint indices for each molecule:")
+print("-" * 50)
+
+from rdkit import Chem
+from rdkit.Chem import rdFingerprintGenerator
+
+# Recreate the fingerprint generator to get individual fingerprints
+mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
+
+for i, smiles_str in enumerate(smiles):
+    mol = Chem.MolFromSmiles(smiles_str)
+    if mol is not None:
+        # Get sparse fingerprint with original indices
+        sfp = mfpgen.GetSparseFingerprint(mol)
+        original_indices = list(sfp.GetOnBits())
+
+        # Convert to the same uint32 indices used in the matrix
+        converted_indices = [int(np.uint32(bit & 0xFFFFFFFF)) for bit in original_indices]
+
+        print(f"\nMolecule {i+1}: {smiles_str}")
+        print(f"  Original indices: {original_indices[:10]}{'...' if len(original_indices) > 10 else ''}")
+        print(f"  Converted indices: {converted_indices[:10]}{'...' if len(converted_indices) > 10 else ''}")
+        print(f"  Total fingerprint bits: {len(original_indices)}")
+
+# Show how to extract indices from the sparse matrix
+print(f"\nExtracting indices from sparse matrix:")
+print("-" * 50)
+
+for i in range(X.shape[0]):
+    # Get the column indices for row i
+    start_idx = X.indptr[i]
+    end_idx = X.indptr[i + 1]
+    row_indices = X.indices[start_idx:end_idx]
+
+    print(f"Molecule {i+1} active bits: {row_indices[:10]}{'...' if len(row_indices) > 10 else ''}")
+    print(f"  Total: {len(row_indices)} active bits")
+
+print(f"\n✓ You can now map back to original RDKit fingerprint indices")
+print(f"✓ Useful for feature interpretation and chemical insights")
+
+# Reverse mapping: From sparse matrix back to RDKit
+print("\n" + "=" * 50)
+print("REVERSE MAPPING: MATRIX → RDKIT")
+print("=" * 50)
+
+print("\nMapping sparse matrix indices back to original RDKit bits:")
+print("-" * 50)
+
+def uint32_to_rdkit_index(uint32_index):
+    """Convert uint32 matrix index back to original RDKit signed int32."""
+    # Convert back from unsigned to signed int32
+    if uint32_index >= 2**31:
+        return int(uint32_index) - 2**32
+    else:
+        return int(uint32_index)
+
+# Example: Take the first molecule and show the reverse mapping
+mol_idx = 0
+print(f"\nExample with Molecule {mol_idx + 1}: {smiles[mol_idx]}")
+
+# Get active indices from sparse matrix
+start_idx = X.indptr[mol_idx]
+end_idx = X.indptr[mol_idx + 1]
+matrix_indices = X.indices[start_idx:end_idx]
+
+print(f"Matrix indices (uint32): {matrix_indices}")
+
+# Convert back to RDKit indices
+rdkit_indices = [uint32_to_rdkit_index(idx) for idx in matrix_indices]
+print(f"RDKit indices (int32):   {rdkit_indices}")
+
+# Verify this matches the original fingerprint
+mol = Chem.MolFromSmiles(smiles[mol_idx])
+sfp = mfpgen.GetSparseFingerprint(mol)
+original_indices = sorted(list(sfp.GetOnBits()))
+recovered_indices = sorted(rdkit_indices)
+
+print(f"Original RDKit indices: {original_indices}")
+print(f"Recovered indices:      {recovered_indices}")
+print(f"Match: {'✓' if original_indices == recovered_indices else '✗'}")
diff --git a/examples/sklearn_integration_example.py b/examples/sklearn_integration_example.py
deleted file mode 100644
index e69de29..0000000
diff --git a/examples/sklearn_integration_tutorial.ipynb b/examples/sklearn_integration_tutorial.ipynb
deleted file mode 100644
index edfbaa0..0000000
--- a/examples/sklearn_integration_tutorial.ipynb
+++ /dev/null
@@ -1,884 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "2fee5d5b",
-   "metadata": {},
-   "source": [
-    "# LaplacianNB sklearn Integration Tutorial\n",
-    "\n",
-    "This notebook demonstrates how to use LaplacianNB with sklearn's ecosystem including pipelines, cross-validation, grid search, and the FingerprintTransformer."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cdf001ce",
-   "metadata": {},
-   "source": [
-    "## Setup and Imports\n",
-    "\n",
-    "Let's import all necessary libraries for our sklearn integration examples."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d248db08",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from pathlib import Path\n",
-    "from rdkit import Chem\n",
-    "from rdkit.Chem import rdFingerprintGenerator\n",
-    "\n",
-    "# sklearn imports\n",
-    "from sklearn.model_selection import (\n",
-    "    train_test_split, cross_val_score, GridSearchCV, StratifiedKFold\n",
-    ")\n",
-    "from sklearn.pipeline import Pipeline\n",
-    "from sklearn.preprocessing import StandardScaler\n",
-    "from sklearn.feature_selection import SelectKBest, chi2\n",
-    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
-    "from sklearn.base import clone\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "# LaplacianNB imports\n",
-    "from laplaciannb import LaplacianNB_New, FingerprintTransformer, convert_fingerprints\n",
-    "\n",
-    "# Set random seed for reproducibility\n",
-    "np.random.seed(42)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c683b90c",
-   "metadata": {},
-   "source": [
-    "## Utility Functions\n",
-    "\n",
-    "Define functions for molecular fingerprint generation."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "84f85bcd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_molecular_fingerprints(smiles_list, n_bits=1024):\n",
-    "    \"\"\"\n",
-    "    Convert SMILES to molecular fingerprints.\n",
-    "    \n",
-    "    Args:\n",
-    "        smiles_list: List of SMILES strings\n",
-    "        n_bits: Fingerprint size\n",
-    "        \n",
-    "    Returns:\n",
-    "        List of fingerprint sets\n",
-    "    \"\"\"\n",
-    "    def get_fp(smiles):\n",
-    "        mol = Chem.MolFromSmiles(smiles)\n",
-    "        if not mol:\n",
-    "            return set()\n",
-    "        mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)\n",
-    "        fp = mfpgen.GetFingerprint(mol)\n",
-    "        return set(fp.GetOnBits())\n",
-    "    \n",
-    "    return [get_fp(smiles) for smiles in smiles_list]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1d9e8686",
-   "metadata": {},
-   "source": [
-    "## Create Synthetic Dataset\n",
-    "\n",
-    "Let's create a larger synthetic dataset for demonstration."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d90b56e4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create synthetic molecular dataset\n",
-    "base_molecules = [\n",
-    "    # Alcohols (generally active)\n",
-    "    \"CCO\", \"CCC\", \"CCCO\", \"CCCCO\", \"CCCCCO\",\n",
-    "    \"CC(C)O\", \"CCC(C)O\", \"CC(O)C\",\n",
-    "    \n",
-    "    # Aromatics (generally inactive) \n",
-    "    \"c1ccccc1\", \"c1ccc(C)cc1\", \"c1ccc(CC)cc1\", \"c1ccc(O)cc1\",\n",
-    "    \"c1ccc(N)cc1\", \"c1ccc(Cl)cc1\",\n",
-    "    \n",
-    "    # Carboxylic acids (generally active)\n",
-    "    \"CC(=O)O\", \"CCC(=O)O\", \"CCCC(=O)O\", \"c1ccc(C(=O)O)cc1\",\n",
-    "    \"CC(C)C(=O)O\", \"CCCCC(=O)O\",\n",
-    "    \n",
-    "    # Alkanes (generally inactive)\n",
-    "    \"CC\", \"CCC\", \"CCCC\", \"CCCCC\", \"CCCCCC\",\n",
-    "    \"CC(C)C\", \"CC(C)CC\", \"CCC(C)C\",\n",
-    "    \n",
-    "    # Alkenes (mixed activity)\n",
-    "    \"C=C\", \"C=CC\", \"C=CCC\", \"CC=CC\", \"C=CC=C\",\n",
-    "    \n",
-    "    # Ethers (mixed activity)\n",
-    "    \"COC\", \"CCOC\", \"CCOCC\", \"c1ccc(OC)cc1\"\n",
-    "]\n",
-    "\n",
-    "# Define activity patterns (for demonstration)\n",
-    "activity_patterns = {\n",
-    "    # Alcohols -> active (1)\n",
-    "    0: [1, 1, 1, 1, 1, 1, 1, 1],\n",
-    "    # Aromatics -> inactive (0) \n",
-    "    1: [0, 0, 0, 0, 0, 0],\n",
-    "    # Acids -> active (1)\n",
-    "    2: [1, 1, 1, 1, 1, 1],\n",
-    "    # Alkanes -> inactive (0)\n",
-    "    3: [0, 0, 0, 0, 0, 0, 0, 0],\n",
-    "    # Alkenes -> mixed\n",
-    "    4: [1, 0, 1, 0, 1],\n",
-    "    # Ethers -> mixed\n",
-    "    5: [0, 1, 0, 1]\n",
-    "}\n",
-    "\n",
-    "# Build dataset\n",
-    "molecules = []\n",
-    "targets = []\n",
-    "molecule_types = []\n",
-    "\n",
-    "type_names = ['Alcohols', 'Aromatics', 'Acids', 'Alkanes', 'Alkenes', 'Ethers']\n",
-    "start_idx = 0\n",
-    "\n",
-    "for type_idx, (group_idx, activities) in enumerate(activity_patterns.items()):\n",
-    "    group_size = len(activities)\n",
-    "    group_molecules = base_molecules[start_idx:start_idx + group_size]\n",
-    "    \n",
-    "    molecules.extend(group_molecules)\n",
-    "    targets.extend(activities)\n",
-    "    molecule_types.extend([type_names[type_idx]] * group_size)\n",
-    "    \n",
-    "    start_idx += group_size\n",
-    "\n",
-    "print(f\"Created dataset with {len(molecules)} molecules\")\n",
-    "print(f\"Activity distribution: {np.bincount(targets)}\")\n",
-    "print(f\"Molecule types: {set(molecule_types)}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2dc2d8c3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create DataFrame\n",
-    "df = pd.DataFrame({\n",
-    "    'smiles': molecules,\n",
-    "    'activity': targets,\n",
-    "    'molecule_type': molecule_types\n",
-    "})\n",
-    "\n",
-    "# Display dataset summary\n",
-    "print(\"Dataset Summary:\")\n",
-    "print(f\"Total molecules: {len(df)}\")\n",
-    "print(f\"Active molecules: {sum(df['activity'])}\")\n",
-    "print(f\"Inactive molecules: {len(df) - sum(df['activity'])}\")\n",
-    "print(\"\\nMolecule types:\")\n",
-    "print(df['molecule_type'].value_counts())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3c14fca9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Show first few rows\n",
-    "df.head(10)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "59bc266d",
-   "metadata": {},
-   "source": [
-    "## Generate Molecular Fingerprints\n",
-    "\n",
-    "Convert SMILES to molecular fingerprints for machine learning."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1383c16f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generate fingerprints\n",
-    "print(\"Converting molecules to fingerprints...\")\n",
-    "fingerprints = get_molecular_fingerprints(df['smiles'].tolist(), n_bits=1024)\n",
-    "\n",
-    "# Add to dataframe\n",
-    "df['fingerprints'] = fingerprints\n",
-    "\n",
-    "# Display fingerprint statistics\n",
-    "fp_sizes = [len(fp) for fp in fingerprints]\n",
-    "print(f\"Fingerprint statistics:\")\n",
-    "print(f\"  Average bits per molecule: {np.mean(fp_sizes):.1f}\")\n",
-    "print(f\"  Min bits: {np.min(fp_sizes)}\")\n",
-    "print(f\"  Max bits: {np.max(fp_sizes)}\")\n",
-    "print(f\"  Std deviation: {np.std(fp_sizes):.1f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "65ea6cb8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Plot fingerprint size distribution\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.hist(fp_sizes, bins=15, alpha=0.7, edgecolor='black')\n",
-    "plt.xlabel('Number of Bits Set')\n",
-    "plt.ylabel('Frequency')\n",
-    "plt.title('Distribution of Fingerprint Sizes')\n",
-    "plt.grid(True, alpha=0.3)\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3e073281",
-   "metadata": {},
-   "source": [
-    "## Example 1: Basic sklearn Integration\n",
-    "\n",
-    "Let's start with basic train/test split and evaluation."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "35dabf49",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Prepare data\n",
-    "X = fingerprints\n",
-    "y = df['activity'].values\n",
-    "\n",
-    "# Split data\n",
-    "X_train, X_test, y_train, y_test = train_test_split(\n",
-    "    X, y, test_size=0.3, random_state=42, stratify=y\n",
-    ")\n",
-    "\n",
-    "print(f\"Training set size: {len(X_train)}\")\n",
-    "print(f\"Test set size: {len(X_test)}\")\n",
-    "print(f\"Training set activity distribution: {np.bincount(y_train)}\")\n",
-    "print(f\"Test set activity distribution: {np.bincount(y_test)}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3f6f5b1f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Convert to sklearn format\n",
-    "X_train_sklearn = convert_fingerprints(X_train, n_bits=1024)\n",
-    "X_test_sklearn = convert_fingerprints(X_test, n_bits=1024)\n",
-    "\n",
-    "print(f\"Training matrix shape: {X_train_sklearn.shape}\")\n",
-    "print(f\"Test matrix shape: {X_test_sklearn.shape}\")\n",
-    "print(f\"Matrix format: {X_train_sklearn.format}\")\n",
-    "print(f\"Sparsity: {1 - X_train_sklearn.nnz / (X_train_sklearn.shape[0] * X_train_sklearn.shape[1]):.3f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ed676603",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Train classifier\n",
-    "clf = LaplacianNB_New(alpha=1.0)\n",
-    "clf.fit(X_train_sklearn, y_train)\n",
-    "\n",
-    "# Evaluate\n",
-    "train_score = clf.score(X_train_sklearn, y_train)\n",
-    "test_score = clf.score(X_test_sklearn, y_test)\n",
-    "\n",
-    "print(f\"Training accuracy: {train_score:.3f}\")\n",
-    "print(f\"Test accuracy: {test_score:.3f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "403259f8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Detailed evaluation\n",
-    "y_pred = clf.predict(X_test_sklearn)\n",
-    "y_pred_proba = clf.predict_proba(X_test_sklearn)\n",
-    "\n",
-    "print(\"Classification Report:\")\n",
-    "print(classification_report(y_test, y_pred, target_names=['Inactive', 'Active']))\n",
-    "\n",
-    "print(\"\\nConfusion Matrix:\")\n",
-    "cm = confusion_matrix(y_test, y_pred)\n",
-    "print(cm)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f48a5eb5",
-   "metadata": {},
-   "source": [
-    "## Example 2: Cross-Validation\n",
-    "\n",
-    "Let's use cross-validation to get more robust performance estimates."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aca02a01",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Convert all data to sklearn format\n",
-    "X_all = convert_fingerprints(X, n_bits=1024)\n",
-    "y_all = np.array(y)\n",
-    "\n",
-    "print(f\"Full dataset shape: {X_all.shape}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d441f549",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Test different CV strategies\n",
-    "cv_strategies = {\n",
-    "    \"5-fold CV\": 5,\n",
-    "    \"10-fold CV\": 10,\n",
-    "    \"Stratified 5-fold\": StratifiedKFold(n_splits=5, shuffle=True, random_state=42),\n",
-    "    \"Stratified 10-fold\": StratifiedKFold(n_splits=10, shuffle=True, random_state=42)\n",
-    "}\n",
-    "\n",
-    "cv_results = {}\n",
-    "\n",
-    "for name, cv in cv_strategies.items():\n",
-    "    scores = cross_val_score(clf, X_all, y_all, cv=cv, scoring='accuracy')\n",
-    "    cv_results[name] = scores\n",
-    "    print(f\"{name:20s}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "190428aa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize CV results\n",
-    "plt.figure(figsize=(12, 6))\n",
-    "positions = range(len(cv_results))\n",
-    "bp = plt.boxplot([scores for scores in cv_results.values()], \n",
-    "                 labels=list(cv_results.keys()),\n",
-    "                 patch_artist=True)\n",
-    "\n",
-    "# Color the boxes\n",
-    "colors = ['lightblue', 'lightgreen', 'lightyellow', 'lightcoral']\n",
-    "for patch, color in zip(bp['boxes'], colors):\n",
-    "    patch.set_facecolor(color)\n",
-    "\n",
-    "plt.ylabel('Accuracy')\n",
-    "plt.title('Cross-Validation Results Comparison')\n",
-    "plt.xticks(rotation=45)\n",
-    "plt.grid(True, alpha=0.3)\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e87c1515",
-   "metadata": {},
-   "source": [
-    "## Example 3: Pipeline with Feature Selection\n",
-    "\n",
-    "Let's create a pipeline that includes feature selection."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d23a19d9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create pipeline with feature selection\n",
-    "pipeline = Pipeline([\n",
-    "    ('feature_selection', SelectKBest(chi2, k=500)),\n",
-    "    ('classifier', LaplacianNB_New(alpha=1.0))\n",
-    "])\n",
-    "\n",
-    "# Train pipeline\n",
-    "pipeline.fit(X_train_sklearn, y_train)\n",
-    "pipeline_score = pipeline.score(X_test_sklearn, y_test)\n",
-    "\n",
-    "print(f\"Pipeline test accuracy: {pipeline_score:.3f}\")\n",
-    "print(f\"Improvement over basic model: {pipeline_score - test_score:.3f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0cbee57b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Cross-validate pipeline\n",
-    "pipeline_cv_scores = cross_val_score(pipeline, X_all, y_all, cv=5, scoring='accuracy')\n",
-    "basic_cv_scores = cross_val_score(clf, X_all, y_all, cv=5, scoring='accuracy')\n",
-    "\n",
-    "print(f\"Basic model CV accuracy: {basic_cv_scores.mean():.3f} (+/- {basic_cv_scores.std() * 2:.3f})\")\n",
-    "print(f\"Pipeline CV accuracy: {pipeline_cv_scores.mean():.3f} (+/- {pipeline_cv_scores.std() * 2:.3f})\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1247021b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Analyze selected features\n",
-    "selector = pipeline.named_steps['feature_selection']\n",
-    "selected_features = selector.get_support()\n",
-    "feature_scores = selector.scores_\n",
-    "\n",
-    "print(f\"Selected {np.sum(selected_features)} out of {len(selected_features)} features\")\n",
-    "print(f\"Selected feature indices (first 20): {np.where(selected_features)[0][:20]}\")\n",
-    "print(f\"Top 10 feature scores: {np.sort(feature_scores)[-10:]}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3ccf6c42",
-   "metadata": {},
-   "source": [
-    "## Example 4: Grid Search Hyperparameter Tuning\n",
-    "\n",
-    "Let's use grid search to optimize hyperparameters."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e0e0be43",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define parameter grid\n",
-    "param_grid = {\n",
-    "    'feature_selection__k': [200, 500, 800],\n",
-    "    'classifier__alpha': [0.1, 1.0, 10.0]\n",
-    "}\n",
-    "\n",
-    "print(\"Parameter grid:\")\n",
-    "for param, values in param_grid.items():\n",
-    "    print(f\"  {param}: {values}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bfb8902f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Perform grid search\n",
-    "grid_search = GridSearchCV(\n",
-    "    pipeline, \n",
-    "    param_grid, \n",
-    "    cv=5, \n",
-    "    scoring='accuracy',\n",
-    "    n_jobs=-1,\n",
-    "    verbose=1\n",
-    ")\n",
-    "\n",
-    "grid_search.fit(X_train_sklearn, y_train)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9a0eecae",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Results\n",
-    "print(f\"Best parameters: {grid_search.best_params_}\")\n",
-    "print(f\"Best CV score: {grid_search.best_score_:.3f}\")\n",
-    "print(f\"Test score with best params: {grid_search.score(X_test_sklearn, y_test):.3f}\")\n",
-    "\n",
-    "# Show all results\n",
-    "results_df = pd.DataFrame(grid_search.cv_results_)\n",
-    "print(\"\\nAll grid search results:\")\n",
-    "print(results_df[['params', 'mean_test_score', 'std_test_score']].round(3))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "131cc895",
-   "metadata": {},
-   "source": [
-    "## Example 5: FingerprintTransformer Pipeline\n",
-    "\n",
-    "Now let's use the FingerprintTransformer to work directly with fingerprint sets."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4b589604",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create pipeline with FingerprintTransformer\n",
-    "transformer_pipeline = Pipeline([\n",
-    "    ('fingerprints', FingerprintTransformer(n_bits=1024, output_format='csr')),\n",
-    "    ('feature_selection', SelectKBest(chi2, k=500)),\n",
-    "    ('classifier', LaplacianNB_New(alpha=1.0))\n",
-    "])\n",
-    "\n",
-    "print(\"Pipeline steps:\")\n",
-    "for step_name, step in transformer_pipeline.steps:\n",
-    "    print(f\"  {step_name}: {type(step).__name__}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "91b1e67f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Train on raw fingerprint sets (not pre-converted matrices)\n",
-    "transformer_pipeline.fit(X_train, y_train)\n",
-    "transformer_score = transformer_pipeline.score(X_test, y_test)\n",
-    "\n",
-    "print(f\"Transformer pipeline accuracy: {transformer_score:.3f}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8de302cd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Cross-validate transformer pipeline\n",
-    "transformer_cv_scores = cross_val_score(transformer_pipeline, X, y, cv=5)\n",
-    "print(f\"Transformer pipeline CV: {transformer_cv_scores.mean():.3f} (+/- {transformer_cv_scores.std() * 2:.3f})\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f76ff52c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Grid search with transformer\n",
-    "transformer_param_grid = {\n",
-    "    'fingerprints__n_bits': [512, 1024],\n",
-    "    'fingerprints__output_format': ['csr', 'dense'],\n",
-    "    'feature_selection__k': [300, 500],\n",
-    "    'classifier__alpha': [0.5, 1.0]\n",
-    "}\n",
-    "\n",
-    "transformer_grid = GridSearchCV(\n",
-    "    transformer_pipeline, \n",
-    "    transformer_param_grid, \n",
-    "    cv=3, \n",
-    "    scoring='accuracy',\n",
-    "    verbose=1\n",
-    ")\n",
-    "\n",
-    "transformer_grid.fit(X_train, y_train)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4df09cad",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(f\"Best transformer params: {transformer_grid.best_params_}\")\n",
-    "print(f\"Best transformer CV score: {transformer_grid.best_score_:.3f}\")\n",
-    "print(f\"Transformer test score: {transformer_grid.score(X_test, y_test):.3f}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3949058b",
-   "metadata": {},
-   "source": [
-    "## Example 6: Model Comparison\n",
-    "\n",
-    "Let's compare different alpha values and other configurations."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "51e3e319",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Test different alpha values\n",
-    "alpha_values = [0.01, 0.1, 1.0, 10.0, 100.0]\n",
-    "alpha_results = {}\n",
-    "\n",
-    "for alpha in alpha_values:\n",
-    "    model = LaplacianNB_New(alpha=alpha)\n",
-    "    scores = cross_val_score(model, X_all, y_all, cv=5)\n",
-    "    alpha_results[alpha] = scores\n",
-    "    print(f\"Alpha {alpha:6.2f}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5c8a852e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize alpha comparison\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "alphas = list(alpha_results.keys())\n",
-    "means = [scores.mean() for scores in alpha_results.values()]\n",
-    "stds = [scores.std() for scores in alpha_results.values()]\n",
-    "\n",
-    "plt.errorbar(alphas, means, yerr=stds, marker='o', capsize=5, capthick=2)\n",
-    "plt.xscale('log')\n",
-    "plt.xlabel('Alpha (log scale)')\n",
-    "plt.ylabel('CV Accuracy')\n",
-    "plt.title('LaplacianNB Performance vs Alpha Parameter')\n",
-    "plt.grid(True, alpha=0.3)\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "62549f85",
-   "metadata": {},
-   "source": [
-    "## Example 7: Feature Importance Analysis\n",
-    "\n",
-    "Let's analyze which molecular features are most important."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "15814b9e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Train final model\n",
-    "final_model = LaplacianNB_New(alpha=1.0)\n",
-    "final_model.fit(X_all, y_all)\n",
-    "\n",
-    "# Get feature importance (log probability differences)\n",
-    "feature_log_probs = final_model.feature_log_prob_\n",
-    "print(f\"Feature log probabilities shape: {feature_log_probs.shape}\")\n",
-    "print(f\"Classes: {final_model.classes_}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d254d4c6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Calculate feature importance as log probability differences\n",
-    "class_0_probs = feature_log_probs[0]  # Inactive class\n",
-    "class_1_probs = feature_log_probs[1]  # Active class\n",
-    "\n",
-    "# Difference (higher = more important for active class)\n",
-    "prob_diff = class_1_probs - class_0_probs\n",
-    "\n",
-    "# Top features for each class\n",
-    "n_top = 10\n",
-    "top_inactive_features = np.argsort(prob_diff)[:n_top]  # Most negative\n",
-    "top_active_features = np.argsort(prob_diff)[-n_top:]   # Most positive\n",
-    "\n",
-    "print(f\"Top {n_top} features for INACTIVE class (bit indices): {top_inactive_features}\")\n",
-    "print(f\"Top {n_top} features for ACTIVE class (bit indices): {top_active_features}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "016c6f03",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize feature importance\n",
-    "plt.figure(figsize=(12, 8))\n",
-    "\n",
-    "# Plot histogram of all feature differences\n",
-    "plt.subplot(2, 1, 1)\n",
-    "plt.hist(prob_diff, bins=50, alpha=0.7, edgecolor='black')\n",
-    "plt.xlabel('Log Probability Difference (Active - Inactive)')\n",
-    "plt.ylabel('Number of Features')\n",
-    "plt.title('Distribution of Feature Importance Scores')\n",
-    "plt.grid(True, alpha=0.3)\n",
-    "\n",
-    "# Plot top features\n",
-    "plt.subplot(2, 1, 2)\n",
-    "top_features = np.concatenate([top_inactive_features, top_active_features])\n",
-    "top_scores = prob_diff[top_features]\n",
-    "colors = ['red'] * n_top + ['green'] * n_top\n",
-    "labels = [f'Bit {i}' for i in top_features]\n",
-    "\n",
-    "bars = plt.bar(range(len(top_features)), top_scores, color=colors, alpha=0.7)\n",
-    "plt.xlabel('Feature Index')\n",
-    "plt.ylabel('Importance Score')\n",
-    "plt.title(f'Top {n_top} Features for Each Class')\n",
-    "plt.xticks(range(len(top_features)), [f'{i}' for i in top_features], rotation=45)\n",
-    "\n",
-    "# Add legend\n",
-    "import matplotlib.patches as mpatches\n",
-    "red_patch = mpatches.Patch(color='red', alpha=0.7, label='Inactive Class')\n",
-    "green_patch = mpatches.Patch(color='green', alpha=0.7, label='Active Class')\n",
-    "plt.legend(handles=[red_patch, green_patch])\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "267cd47f",
-   "metadata": {},
-   "source": [
-    "## Example 8: Real-world Application Simulation\n",
-    "\n",
-    "Let's simulate a real-world scenario with new molecule prediction."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f084aede",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create some \"new\" molecules for prediction\n",
-    "new_molecules = [\n",
-    "    \"CCCCCO\",           # Long chain alcohol (probably active)\n",
-    "    \"c1ccc(F)cc1\",      # Fluorobenzene (probably inactive)\n",
-    "    \"CCCCCC(=O)O\",      # Hexanoic acid (probably active)\n",
-    "    \"CCCCCCCC\",         # Octane (probably inactive)\n",
-    "    \"COc1ccccc1\",       # Anisole (probably inactive)\n",
-    "    \"CC(C)(C)O\",        # tert-Butanol (probably active)\n",
-    "]\n",
-    "\n",
-    "print(\"Predicting activity for new molecules:\")\n",
-    "print(\"=\" * 50)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c8f28e13",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generate fingerprints for new molecules\n",
-    "new_fingerprints = get_molecular_fingerprints(new_molecules, n_bits=1024)\n",
-    "new_X = convert_fingerprints(new_fingerprints, n_bits=1024)\n",
-    "\n",
-    "# Make predictions\n",
-    "new_predictions = final_model.predict(new_X)\n",
-    "new_probabilities = final_model.predict_proba(new_X)\n",
-    "\n",
-    "# Display results\n",
-    "for i, smiles in enumerate(new_molecules):\n",
-    "    pred = new_predictions[i]\n",
-    "    prob_inactive, prob_active = new_probabilities[i]\n",
-    "    confidence = max(prob_inactive, prob_active)\n",
-    "    \n",
-    "    activity_label = \"ACTIVE\" if pred == 1 else \"INACTIVE\"\n",
-    "    print(f\"{smiles:15s} -> {activity_label:8s} (confidence: {confidence:.3f})\")\n",
-    "    print(f\"{'':15s}    Probabilities: Inactive={prob_inactive:.3f}, Active={prob_active:.3f}\")\n",
-    "    print()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "379f5870",
-   "metadata": {},
-   "source": [
-    "## Summary\n",
-    "\n",
-    "This tutorial demonstrated comprehensive sklearn integration with LaplacianNB:\n",
-    "\n",
-    "### ✅ What we covered:\n",
-    "\n",
-    "1. **Basic Integration**: Train/test splits and evaluation\n",
-    "2. **Cross-Validation**: Multiple CV strategies for robust evaluation  \n",
-    "3. **Pipelines**: Feature selection and preprocessing pipelines\n",
-    "4. **Grid Search**: Hyperparameter optimization\n",
-    "5. **FingerprintTransformer**: Direct integration with molecular fingerprints\n",
-    "6. **Model Comparison**: Alpha parameter optimization\n",
-    "7. **Feature Analysis**: Understanding important molecular features\n",
-    "8. **Real-world Application**: Predicting new molecule activities\n",
-    "\n",
-    "### 🚀 Key Benefits:\n",
-    "\n",
-    "- **sklearn Compatibility**: Full integration with sklearn ecosystem\n",
-    "- **Memory Efficiency**: Sparse matrix support for large fingerprints\n",
-    "- **Pipeline Support**: Easy integration with preprocessing and feature selection\n",
-    "- **Performance**: Fast training and prediction with molecular data\n",
-    "- **Flexibility**: Works with various fingerprint formats and sizes\n",
-    "\n",
-    "### 🎯 Next Steps:\n",
-    "\n",
-    "- Try with your own molecular datasets\n",
-    "- Experiment with different fingerprint types (ECFP, MACCS, etc.)\n",
-    "- Combine with other sklearn algorithms in ensembles\n",
-    "- Use in production pipelines for drug discovery"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/simple_performance_test.py b/simple_performance_test.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/laplaciannb/bayes.py b/src/laplaciannb/bayes.py
index 2b0b8d4..439e6eb 100644
--- a/src/laplaciannb/bayes.py
+++ b/src/laplaciannb/bayes.py
@@ -106,7 +106,7 @@ def __init__(self, *, alpha=1.0, force_alpha=True, fit_prior=True, class_prior=N
         self.alpha = alpha
         self.fit_prior = fit_prior
         self.class_prior = class_prior
-        force_alpha = force_alpha
+        self.force_alpha = force_alpha
 
     def _check_X(self, X):
         """Validate X, used only in predict* methods."""
diff --git a/src/laplaciannb/legacy/LaplacianNB_new.py b/src/laplaciannb/legacy/LaplacianNB_new.py
deleted file mode 100644
index 25495f7..0000000
--- a/src/laplaciannb/legacy/LaplacianNB_new.py
+++ /dev/null
@@ -1,373 +0,0 @@
-import numpy as np
-from scipy import sparse
-from scipy.special import logsumexp
-from sklearn.naive_bayes import _BaseDiscreteNB
-from sklearn.preprocessing import LabelBinarizer
-from sklearn.utils.validation import (
-    _check_sample_weight,
-    check_array,
-    check_is_fitted,
-    check_X_y,
-)
-
-
-class LaplacianNB(_BaseDiscreteNB):
-    """Naive Bayes classifier for Laplacian modified models.
-
-    Like BernoulliNB, this classifier is suitable for binary/boolean data. The
-    difference is that while BernoulliNB processes all features, the
-    Laplacian modified approach uses only positive (non-zero) features.
-
-    Parameters
-    ----------
-    alpha : float, default=1.0
-        Additive (Laplace/Lidstone) smoothing parameter
-        (0 for no smoothing).
-
-    force_alpha : bool, default=True
-        If False and alpha is less than 1e-10, it will be set to 1e-10.
-
-    fit_prior : bool, default=True
-        Whether to learn class prior probabilities or not.
-        If false, a uniform prior will be used.
-
-    class_prior : array-like of shape (n_classes,), default=None
-        Prior probabilities of the classes. If specified, the priors are not
-        adjusted according to the data.
-
-    Attributes
-    ----------
-    class_count_ : ndarray of shape (n_classes,)
-        Number of samples encountered for each class during fitting. This
-        value is weighted by the sample weight when provided.
-
-    class_log_prior_ : ndarray of shape (n_classes,)
-        Log probability of each class (smoothed).
-
-    classes_ : ndarray of shape (n_classes,)
-        Class labels known to the classifier.
-
-    feature_count_ : ndarray of shape (n_classes,)
-        Sum of positive features for each class.
-
-    feature_count_per_class_ : ndarray of shape (n_classes, n_features_in_)
-        Number of positive bits encountered for each (class, feature) during fitting.
-
-    feature_all_ : float
-        Total number of positive features encountered.
-
-    feature_log_prob_ : ndarray of shape (n_classes, n_features_in_)
-        Empirical log probability of positive bit features given a class, P(x_i|y).
-
-    n_features_in_ : int
-        Number of features seen during fit.
-
-    feature_names_in_ : ndarray of shape (n_features_in_,), optional
-        Names of features seen during fit. Defined only when X
-        has feature names that are all strings.
-
-    References
-    ----------
-    Nidhi; Glick, M.; Davies, J. W.; Jenkins, J. L. Prediction of biological targets
-    for compounds using multiple-category Bayesian models trained on chemogenomics
-    databases. J. Chem. Inf. Model. 2006, 46, 1124– 1133,
-    https://doi.org/10.1021/ci060003g
-
-    Lam PY, Kutchukian P, Anand R, et al.
-    Cyp1 inhibition prevents doxorubicin-induced cardiomyopathy
-    in a zebrafish heart-failure model. Chem Bio Chem. 2020:cbic.201900741.
-    https://doi.org/10.1002/cbic.201900741
-    """
-
-    def __init__(self, *, alpha=1.0, force_alpha=True, fit_prior=True, class_prior=None):
-        self.alpha = alpha
-        self.force_alpha = force_alpha
-        self.fit_prior = fit_prior
-        self.class_prior = class_prior
-
-    def _check_X(self, X):
-        """Validate X for predict methods."""
-        # Detect legacy input formats first, before sklearn validation
-        self._detect_legacy_input_format(X)
-
-        X = check_array(
-            X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32, np.int64, np.int32, bool], ensure_2d=True
-        )
-
-        # Convert to binary if needed (handle sparse matrices properly)
-        if sparse.issparse(X):
-            # For sparse matrices, check if any value is not 0 or 1
-            if X.dtype != bool and not np.all((X.data == 0) | (X.data == 1)):
-                X = (X != 0).astype(np.float64)
-        else:
-            # For dense matrices
-            if not np.array_equal(X, X.astype(bool)):
-                X = (X != 0).astype(np.float64)
-
-        return X
-
-    def _check_X_y(self, X, y, reset=True):
-        """Validate X and y for fit."""
-        X, y = check_X_y(
-            X,
-            y,
-            accept_sparse=["csr", "csc"],
-            dtype=[np.float64, np.float32, np.int64, np.int32, bool],
-            ensure_2d=True,
-        )
-
-        # Convert to binary if needed (handle sparse matrices properly)
-        if sparse.issparse(X):
-            # For sparse matrices, check if any value is not 0 or 1
-            if X.dtype != bool and not np.all((X.data == 0) | (X.data == 1)):
-                X = (X != 0).astype(np.float64)
-        else:
-            # For dense matrices
-            if not np.array_equal(X, X.astype(bool)):
-                X = (X != 0).astype(np.float64)
-
-        return X, y
-
-    def _count_feature_occurrences(self, X, Y):
-        """Count how many times each feature appears positive for each class.
-
-        This implements the core Laplacian NB algorithm: counting only positive bits.
-        """
-        n_classes = Y.shape[1]
-        n_features = X.shape[1]
-
-        # Initialize counters
-        feature_count_per_class = np.zeros((n_classes, n_features), dtype=np.float64)
-        feature_sum_per_class = np.zeros(n_classes, dtype=np.float64)
-
-        # Count positive features for each class
-        if sparse.issparse(X):
-            X = X.tocsr()
-            for i in range(n_classes):
-                class_mask = Y[:, i].astype(bool)
-                if np.any(class_mask):
-                    # Sum positive features for samples in this class
-                    X_class = X[class_mask]
-                    feature_count_per_class[i] = np.asarray(X_class.sum(axis=0)).ravel()
-                    feature_sum_per_class[i] = feature_count_per_class[i].sum()
-        else:
-            for i in range(n_classes):
-                class_mask = Y[:, i].astype(bool)
-                if np.any(class_mask):
-                    # Sum positive features for samples in this class
-                    X_class = X[class_mask]
-                    feature_count_per_class[i] = X_class.sum(axis=0)
-                    feature_sum_per_class[i] = feature_count_per_class[i].sum()
-
-        # Count total positive features across all samples
-        total_feature_counts = np.asarray(X.sum(axis=0)).ravel() if sparse.issparse(X) else X.sum(axis=0)
-
-        return feature_count_per_class, feature_sum_per_class, total_feature_counts
-
-    def _init_counters(self, n_classes, n_features):
-        """Initialize counters."""
-        self.class_count_ = np.zeros(n_classes, dtype=np.float64)
-        self.feature_count_per_class_ = np.zeros((n_classes, n_features), dtype=np.float64)
-        self.feature_count_ = np.zeros(n_classes, dtype=np.float64)
-
-    def _count(self, X, Y):
-        """Count and smooth feature occurrences."""
-        (self.feature_count_per_class_, self.feature_count_, self.total_feature_counts_) = (
-            self._count_feature_occurrences(X, Y)
-        )
-
-        self.feature_all_ = self.feature_count_.sum()
-        self.class_count_ += Y.sum(axis=0)
-
-    def _update_feature_log_prob(self, alpha):
-        """Apply smoothing to raw counts and recompute log probabilities."""
-        # Prior probability for each class (based on positive feature counts)
-        prior = self.feature_count_ / (self.feature_all_ + np.finfo(float).eps)
-
-        # Laplacian smoothing for feature probabilities
-        # P(feature_i | class_j) = (count_ij + alpha) / (prior_j * total_i + alpha)
-        denominator = np.outer(prior, self.total_feature_counts_) + alpha
-        numerator = self.feature_count_per_class_ + alpha
-
-        self.feature_prob_ = numerator / (denominator + np.finfo(float).eps)
-        self.feature_log_prob_ = np.log(self.feature_prob_)
-
-    def _joint_log_likelihood(self, X):
-        """Calculate the posterior log probability of the samples X.
-
-        Only considers positive (non-zero) features as per Laplacian NB.
-
-        Note: This method returns the feature contributions only,
-        following the original implementation. Class priors are added
-        in predict_log_proba if needed.
-        """
-        check_is_fitted(self)
-
-        # For Laplacian NB, we only use positive features
-        if sparse.issparse(X):
-            # Efficient sparse matrix multiplication
-            # Only non-zero elements contribute to the sum
-            jll = X @ self.feature_log_prob_.T
-        else:
-            # Dense matrix: mask zero elements
-            X_binary = (X > 0).astype(np.float64)
-            jll = X_binary @ self.feature_log_prob_.T
-
-        # Do NOT add class priors here - follow original implementation
-        # jll += self.class_log_prior_  # Commented out to match original
-
-        return jll
-
-    def _detect_legacy_input_format(self, X):
-        """Detect and reject legacy input formats with helpful error message."""
-        # Check for single set
-        if isinstance(X, set):
-            raise ValueError(
-                "LEGACY INPUT FORMAT ERROR: You are trying to use a single set as input. "
-                "This is no longer supported in the new version. "
-                "\n\nTo fix this:\n"
-                "1. Use the legacy version: from laplaciannb.legacy import LaplacianNB\n"
-                "2. Or convert to proper format: from laplaciannb import convert_fingerprints\n"
-                "   X = convert_fingerprints([your_set], n_bits=desired_size)"
-            )
-
-        # Check for list of sets
-        if isinstance(X, list) and len(X) > 0 and isinstance(X[0], set):
-            raise ValueError(
-                "LEGACY INPUT FORMAT ERROR: You are trying to use the old list-of-sets format. "
-                "This is no longer supported in the new version. "
-                "\n\nTo fix this:\n"
-                "1. Use the legacy version: from laplaciannb.legacy import LaplacianNB\n"
-                "2. Or convert to proper format: from laplaciannb import convert_fingerprints\n"
-                "   X = convert_fingerprints(your_sets, n_bits=desired_size)"
-            )
-
-        # Check for numpy array with object dtype containing sets
-        if hasattr(X, "dtype") and X.dtype == object and len(X) > 0:
-            if isinstance(X.flat[0], set):
-                raise ValueError(
-                    "LEGACY INPUT FORMAT ERROR: You are trying to use the old numpy array of sets format. "
-                    "This is no longer supported in the new version. "
-                    "\n\nTo fix this:\n"
-                    "1. Use the legacy version: from laplaciannb.legacy import LaplacianNB\n"
-                    "2. Or convert to proper format: from laplaciannb import convert_fingerprints\n"
-                    "   X = convert_fingerprints(your_sets, n_bits=desired_size)"
-                )
-
-    def fit(self, X, y, sample_weight=None):
-        """Fit Naive Bayes classifier according to X, y.
-
-        Parameters
-        ----------
-        X : {array-like, sparse matrix} of shape (n_samples, n_features)
-            Training vectors. Binary/boolean features expected.
-            Non-zero values are treated as positive bits.
-
-        y : array-like of shape (n_samples,)
-            Target values.
-
-        sample_weight : array-like of shape (n_samples,), default=None
-            Weights applied to individual samples (1. for unweighted).
-
-        Returns
-        -------
-        self : object
-            Returns the instance itself.
-        """
-        # Detect legacy input formats first, before sklearn validation
-        self._detect_legacy_input_format(X)
-
-        X, y = self._check_X_y(X, y)
-
-        # Store number of features
-        _, self.n_features_in_ = X.shape
-
-        # Encode labels
-        labelbin = LabelBinarizer()
-        Y = labelbin.fit_transform(y)
-        self.classes_ = labelbin.classes_
-
-        if Y.shape[1] == 1:
-            if len(self.classes_) == 2:
-                Y = np.concatenate((1 - Y, Y), axis=1)
-            else:  # degenerate case: just one class
-                Y = np.ones_like(Y)
-
-        # Handle sample weights
-        if sample_weight is not None:
-            Y = Y.astype(np.float64, copy=False)
-            sample_weight = _check_sample_weight(sample_weight, X)
-            sample_weight = np.atleast_2d(sample_weight)
-            Y *= sample_weight.T
-
-        # Count raw events from data
-        n_classes = Y.shape[1]
-        self._init_counters(n_classes, self.n_features_in_)
-        self._count(X, Y)
-
-        # Update probabilities
-        alpha = self._check_alpha()
-        self._update_feature_log_prob(alpha)
-        self._update_class_log_prior(class_prior=self.class_prior)
-
-        return self
-
-    def predict_log_proba(self, X):
-        """Return log-probability estimates for the test vector X.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            The input samples.
-
-        Returns
-        -------
-        C : array-like of shape (n_samples, n_classes)
-            Returns the log-probability of the samples for each class in
-            the model. The columns correspond to the classes in sorted
-            order, as they appear in the attribute classes_.
-        """
-        check_is_fitted(self)
-        X = self._check_X(X)
-
-        jll = self._joint_log_likelihood(X)
-
-        # Normalize by P(x) = P(f_1, ..., f_n)
-        log_prob_x = logsumexp(jll, axis=1)
-        return jll - np.atleast_2d(log_prob_x).T
-
-    def predict_proba(self, X):
-        """Return probability estimates for the test vector X.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            The input samples.
-
-        Returns
-        -------
-        C : array-like of shape (n_samples, n_classes)
-            Returns the probability of the samples for each class in
-            the model. The columns correspond to the classes in sorted
-            order, as they appear in the attribute classes_.
-        """
-        return np.exp(self.predict_log_proba(X))
-
-    def predict(self, X):
-        """Perform classification on an array of test vectors X.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            The input samples.
-
-        Returns
-        -------
-        C : ndarray of shape (n_samples,)
-            Predicted target values for X.
-        """
-        check_is_fitted(self)
-        X = self._check_X(X)
-
-        jll = self._joint_log_likelihood(X)
-        return self.classes_[np.argmax(jll, axis=1)]

From 8768210e095f2fb70edde8d284f5884acd61d1e5 Mon Sep 17 00:00:00 2001
From: Bartosz Baranowski <bartekbaranow@gmail.com>
Date: Wed, 20 Aug 2025 19:07:44 +0200
Subject: [PATCH 4/8] add the benchmark and fix molecule creation

---
 examples/benchmark_fingerprints.py   |  61 +++++
 examples/benchmark_large_scale.py    |  74 +++++
 pyproject.toml                       |   3 +-
 src/laplaciannb/fingerprint_utils.py | 385 ++++++++++++++++++++++++++-
 uv.lock                              |  14 +
 5 files changed, 530 insertions(+), 7 deletions(-)
 create mode 100644 examples/benchmark_fingerprints.py
 create mode 100644 examples/benchmark_large_scale.py

diff --git a/examples/benchmark_fingerprints.py b/examples/benchmark_fingerprints.py
new file mode 100644
index 0000000..b773834
--- /dev/null
+++ b/examples/benchmark_fingerprints.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+"""
+Fingerprint Conversion Benchmark
+===============================
+
+Test the performance of rdkit_to_csr function with different dataset sizes
+and parameters.
+"""
+
+import sys
+import os
+
+# Add src to path so we can import laplaciannb
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
+
+from laplaciannb.fingerprint_utils import rdkit_to_csr, benchmark_fingerprint_conversion
+
+def main():
+    """Run fingerprint conversion benchmarks."""
+    print("LaplacianNB Fingerprint Conversion Benchmark")
+    print("=" * 50)
+    
+    try:
+        # Quick test with small dataset
+        print("\n1. Quick Test (50 molecules)")
+        print("-" * 30)
+        test_smiles = [
+            "CCO", "CC(=O)OC1=CC=CC=C1C(=O)O", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
+            "CCCCCCCCCCCCCCCC", "CC1=CC=C(C=C1)C(=O)O"
+        ] * 10  # 50 molecules
+        
+        X = rdkit_to_csr(test_smiles, radius=2, show_progress=True)
+        print(f"✓ Successfully converted {X.shape[0]} molecules")
+        
+        # Medium test
+        print("\n2. Medium Test (200 molecules)")
+        print("-" * 30)
+        medium_smiles = test_smiles * 4  # 200 molecules
+        X_medium = rdkit_to_csr(medium_smiles, radius=2, show_progress=True)
+        
+        # Comprehensive benchmark
+        print("\n3. Comprehensive Benchmark")
+        print("-" * 30)
+        benchmark_fingerprint_conversion(
+            n_molecules=1000, 
+            radii=[1, 2, 3],
+            molecules_per_test=[100, 500, 1000]
+        )
+        
+        print("\n" + "=" * 50)
+        print("✓ All benchmarks completed successfully!")
+        print("✓ LaplacianNB fingerprint conversion is ready for production")
+        
+    except ImportError as e:
+        print(f"Missing dependency: {e}")
+        print("Please install: pip install rdkit scikit-learn scipy")
+    except Exception as e:
+        print(f"Error during benchmark: {e}")
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/benchmark_large_scale.py b/examples/benchmark_large_scale.py
new file mode 100644
index 0000000..316d447
--- /dev/null
+++ b/examples/benchmark_large_scale.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+"""
+Large-Scale Fingerprint Conversion Benchmark
+==========================================
+
+Test the performance and scalability of LaplacianNB fingerprint conversion
+with datasets up to 100,000 molecules.
+"""
+
+import sys
+import os
+
+# Add src to path so we can import laplaciannb
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
+
+from laplaciannb.fingerprint_utils import benchmark_large_scale_conversion
+
+def main():
+    """Run large-scale fingerprint conversion benchmark."""
+    print("LaplacianNB Large-Scale Fingerprint Benchmark")
+    print("=" * 50)
+    print("Testing conversion performance up to 100,000 molecules")
+    print("This benchmark evaluates:")
+    print("- Conversion speed and throughput")
+    print("- Memory usage and efficiency") 
+    print("- Scalability characteristics")
+    print("- Performance projections")
+    
+    try:
+        # Run the comprehensive large-scale benchmark
+        results = benchmark_large_scale_conversion(
+            target_molecules=100000,
+            test_sizes=[1000, 5000, 10000, 25000, 50000, 100000],
+            radius=2,
+            sample_diversity=True
+        )
+        
+        print("\n" + "="*50)
+        print("BENCHMARK SUMMARY")
+        print("="*50)
+        
+        if results:
+            fastest_rate = max(r['rate'] for r in results)
+            largest_test = max(results, key=lambda x: x['molecules'])
+            
+            print(f"Peak conversion rate: {fastest_rate:,.0f} molecules/second")
+            print(f"Largest test completed: {largest_test['molecules']:,} molecules")
+            print(f"Time for largest test: {largest_test['time']:.1f} seconds")
+            print(f"Memory for largest test: {largest_test['memory_mb']:.1f} MB")
+            print(f"Sparsity achieved: {largest_test['sparsity']:.6f}")
+            
+            # Calculate efficiency metrics
+            total_molecules = sum(r['molecules'] for r in results)
+            total_time = sum(r['time'] for r in results)
+            overall_rate = total_molecules / total_time
+            
+            print(f"\nOverall benchmark performance:")
+            print(f"  Total molecules processed: {total_molecules:,}")
+            print(f"  Total time: {total_time:.1f} seconds") 
+            print(f"  Average rate: {overall_rate:,.0f} molecules/second")
+        
+        print(f"\n✓ Large-scale benchmark completed successfully!")
+        print(f"✓ LaplacianNB fingerprint conversion scales efficiently to 100K+ molecules")
+        
+    except ImportError as e:
+        print(f"Missing dependency: {e}")
+        print("Please install: pip install rdkit scikit-learn scipy")
+    except Exception as e:
+        print(f"Error during benchmark: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index b2a5f4b..9c28a56 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,8 @@ dependencies = [
     "rdkit>=2024.3.5",
     "scikit-learn>=1.7.0",
     "pandas>=2.2.3",
-    "scipy>=1.6.0"
+    "scipy>=1.6.0",
+    "tqdm>=4.67.1",
 ]
 requires-python = ">=3.10"
 readme = "README.md"
diff --git a/src/laplaciannb/fingerprint_utils.py b/src/laplaciannb/fingerprint_utils.py
index 8709011..5b4009f 100644
--- a/src/laplaciannb/fingerprint_utils.py
+++ b/src/laplaciannb/fingerprint_utils.py
@@ -1,31 +1,404 @@
 import numpy as np
+import time
 from rdkit import Chem
 from rdkit.Chem import rdFingerprintGenerator
 from scipy.sparse import csr_matrix
 
+try:
+    from tqdm import tqdm
+    TQDM_AVAILABLE = True
+except ImportError:
+    TQDM_AVAILABLE = False
+    def tqdm(iterable, *args, **kwargs):
+        """Fallback if tqdm is not available."""
+        return iterable
 
-def rdkit_to_csr(smiles_list, radius=2):
-    """Convert RDKit sparse Morgan fingerprints to CSR matrix with lossless conversion."""
+
+def rdkit_to_csr(smiles_list, radius=2, show_progress=True):
+    """Convert RDKit sparse Morgan fingerprints to CSR matrix with lossless conversion.
+    
+    Parameters
+    ----------
+    smiles_list : list of str
+        List of SMILES strings to convert to fingerprints
+    radius : int, default=2
+        Morgan fingerprint radius
+    show_progress : bool, default=True
+        Show progress bar if tqdm is available
+        
+    Returns
+    -------
+    scipy.sparse.csr_matrix
+        Sparse matrix of shape (n_molecules, 2^32) with boolean dtype
+        
+    Examples
+    --------
+    >>> smiles = ["CCO", "CC(=O)OC1=CC=CC=C1C(=O)O"]
+    >>> X = rdkit_to_csr(smiles, radius=2)
+    >>> print(f"Shape: {X.shape}, Sparsity: {1 - X.nnz / X.size:.6f}")
+    """
+    start_time = time.time()
+    
     row_ind = []
     col_ind = []
 
     # Create Morgan fingerprint generator
+    print(f"Converting {len(smiles_list)} SMILES to molecular fingerprints...")
     mol_list = [Chem.MolFromSmiles(smi) for smi in smiles_list]
     mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius)
 
-    for i, mol in enumerate(mol_list):
+    # Process molecules with optional progress bar
+    iterator = enumerate(mol_list)
+    if show_progress and TQDM_AVAILABLE and len(mol_list) > 10:
+        iterator = tqdm(iterator, total=len(mol_list), 
+                       desc="Processing molecules", unit="mol")
+
+    valid_molecules = 0
+    total_bits = 0
+    
+    for i, mol in iterator:
         if mol is None:
             continue
+            
+        valid_molecules += 1
 
         # Get sparse fingerprint
         sfp = mfpgen.GetSparseFingerprint(mol)
-        for bit in set(sfp.GetOnBits()):
+        mol_bits = set(sfp.GetOnBits())
+        total_bits += len(mol_bits)
+        
+        for bit in mol_bits:
             # Reinterpret signed int32 as unsigned int32
             # This maps [-2^31, 2^31-1] to [0, 2^32-1] losslessly
             col_idx = np.uint32(bit & 0xFFFFFFFF)
 
             row_ind.append(i)
             col_ind.append(col_idx)
-            data = np.ones(len(row_ind), dtype=np.bool)
 
-    return csr_matrix((data, (row_ind, col_ind)), shape=(len(mol_list), 2**32), dtype=np.bool)
+    # Create data array (all ones for boolean matrix)
+    data = np.ones(len(row_ind), dtype=np.bool_)
+    
+    # Create sparse matrix
+    matrix = csr_matrix((data, (row_ind, col_ind)), 
+                       shape=(len(mol_list), 2**32), dtype=np.bool_)
+    
+    # Performance summary
+    conversion_time = time.time() - start_time
+    sparsity = 1 - matrix.nnz / matrix.size if matrix.size > 0 else 0
+    
+    print(f"Conversion completed in {conversion_time:.3f} seconds")
+    print(f"Valid molecules: {valid_molecules}/{len(mol_list)}")
+    print(f"Total fingerprint bits: {total_bits:,}")
+    print(f"Average bits per molecule: {total_bits/valid_molecules:.1f}")
+    print(f"Matrix shape: {matrix.shape}")
+    print(f"Matrix sparsity: {sparsity:.6f}")
+    print(f"Memory usage: {(matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes) / 1024**2:.2f} MB")
+    
+    return matrix
+
+
+def benchmark_fingerprint_conversion(n_molecules=100000, radii=[2], 
+                                   molecules_per_test=None):
+    """Benchmark fingerprint conversion performance with different parameters.
+    
+    Parameters
+    ----------
+    n_molecules : int, default=1000
+        Number of molecules to generate for benchmarking
+    radii : list of int, default=[1, 2, 3]
+        Morgan fingerprint radii to test
+    molecules_per_test : list of int, optional
+        Different molecule counts to test. If None, uses [100, 500, 1000]
+        
+    Examples
+    --------
+    >>> benchmark_fingerprint_conversion(1000, radii=[2, 3])
+    >>> benchmark_fingerprint_conversion(500, molecules_per_test=[100, 300, 500])
+    """
+    print("=" * 60)
+    print("FINGERPRINT CONVERSION BENCHMARK")
+    print("=" * 60)
+    
+    # Generate test SMILES data
+    print(f"Generating {n_molecules} test molecules...")
+    test_smiles = _generate_test_smiles(n_molecules)
+    
+    if molecules_per_test is None:
+        molecules_per_test = [min(100, n_molecules), 
+                            min(500, n_molecules), 
+                            n_molecules]
+    
+    # Test different molecule counts
+    print(f"\nTesting conversion speed with different dataset sizes:")
+    print("-" * 60)
+    print(f"{'Molecules':<12} {'Radius':<8} {'Time (s)':<10} {'Bits/mol':<10} {'MB':<8}")
+    print("-" * 60)
+    
+    for n_mol in molecules_per_test:
+        subset_smiles = test_smiles[:n_mol]
+        
+        for radius in radii:
+            start_time = time.time()
+            X = rdkit_to_csr(subset_smiles, radius=radius, show_progress=False)
+            conversion_time = time.time() - start_time
+            
+            avg_bits = X.nnz / X.shape[0] if X.shape[0] > 0 else 0
+            memory_mb = (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / 1024**2
+            
+            print(f"{n_mol:<12} {radius:<8} {conversion_time:<10.3f} {avg_bits:<10.1f} {memory_mb:<8.2f}")
+    
+    # Memory efficiency comparison
+    print(f"\nMemory Efficiency Analysis:")
+    print("-" * 40)
+    
+    X_example = rdkit_to_csr(test_smiles[:100], radius=2, show_progress=False)
+    sparse_memory = (X_example.data.nbytes + X_example.indices.nbytes + X_example.indptr.nbytes) / 1024**2
+    dense_memory = (X_example.shape[0] * X_example.shape[1] * np.dtype(np.bool_).itemsize) / 1024**2
+    
+    print(f"100 molecules, radius=2:")
+    print(f"  Sparse matrix: {sparse_memory:.2f} MB")
+    print(f"  Dense equivalent: {dense_memory:,.0f} MB")
+    print(f"  Memory reduction: {(1 - sparse_memory/dense_memory)*100:.3f}%")
+    
+    # Throughput summary
+    print(f"\nThroughput Summary:")
+    print("-" * 20)
+    fastest_time = min([conversion_time for n_mol in molecules_per_test[:1] 
+                       for radius in radii[:1]])
+    throughput = molecules_per_test[0] / fastest_time if fastest_time > 0 else 0
+    print(f"Peak throughput: ~{throughput:.0f} molecules/second")
+    print(f"Recommended for datasets: Up to {throughput * 60:.0f} molecules/minute")
+
+
+def _generate_test_smiles(n_molecules):
+    """Generate test SMILES strings for benchmarking."""
+    # Simple test molecules with varying complexity
+    base_smiles = [
+        "CCO",  # Ethanol
+        "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
+        "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen  
+        "CCCCCCCCCCCCCCCC",  # Palmitic acid
+        "CC1=CC=C(C=C1)C(=O)O",  # p-Toluic acid
+        "CCN(CC)CC",  # Triethylamine
+        "CC(C)(C)C1=CC=C(C=C1)O",  # BHT
+        "CCCCCCCCCCCCC",  # Tridecane
+        "CC1=CC(=CC(=C1)C)C(=O)O",  # Mesitylenic acid
+        "CCCCCCCCCC",  # Decane
+        "CC1=CC=CC=C1",  # Toluene
+        "C1=CC=CC=C1",  # Benzene
+        "CC(C)O",  # Isopropanol
+        "CCCCO",  # Butanol
+        "CC(C)C",  # Propane
+    ]
+    
+    # Repeat base molecules to reach desired count
+    test_smiles = []
+    while len(test_smiles) < n_molecules:
+        test_smiles.extend(base_smiles)
+    
+    return test_smiles[:n_molecules]
+
+
+def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None, 
+                                   radius=2, sample_diversity=True):
+    """Benchmark fingerprint conversion performance for large datasets.
+    
+    This function tests the scalability and performance of rdkit_to_csr 
+    with large molecular datasets up to 100,000 molecules.
+    
+    Parameters
+    ----------
+    target_molecules : int, default=100000
+        Maximum number of molecules to test
+    test_sizes : list of int, optional
+        Molecule counts to benchmark. If None, uses logarithmic scale
+    radius : int, default=2
+        Morgan fingerprint radius
+    sample_diversity : bool, default=True
+        If True, generates diverse molecular structures for realistic testing
+        
+    Examples
+    --------
+    >>> benchmark_large_scale_conversion(100000)
+    >>> benchmark_large_scale_conversion(50000, test_sizes=[1000, 10000, 50000])
+    """
+    print("=" * 80)
+    print("LARGE-SCALE FINGERPRINT CONVERSION BENCHMARK")
+    print("=" * 80)
+    print(f"Target dataset size: {target_molecules:,} molecules")
+    print(f"Morgan fingerprint radius: {radius}")
+    print(f"Diversity sampling: {'Enabled' if sample_diversity else 'Disabled'}")
+    
+    if test_sizes is None:
+        # Logarithmic scale testing
+        test_sizes = [1000, 5000, 10000, 25000, 50000]
+        if target_molecules >= 100000:
+            test_sizes.append(100000)
+        # Filter to not exceed target
+        test_sizes = [size for size in test_sizes if size <= target_molecules]
+    
+    print(f"\nGenerating test dataset with {target_molecules:,} molecules...")
+    print("-" * 60)
+    
+    start_gen = time.time()
+    test_smiles = _generate_diverse_smiles(target_molecules, diverse=sample_diversity)
+    gen_time = time.time() - start_gen
+    
+    print(f"Dataset generation completed in {gen_time:.2f} seconds")
+    print(f"Average generation rate: {target_molecules/gen_time:.0f} molecules/second")
+    
+    # Performance tracking
+    results = []
+    
+    print(f"\nBenchmarking conversion performance:")
+    print("-" * 80)
+    print(f"{'Molecules':<12} {'Time (s)':<10} {'Rate (mol/s)':<12} {'Bits/mol':<10} {'Memory (MB)':<12} {'Sparsity':<10}")
+    print("-" * 80)
+    
+    for n_molecules in test_sizes:
+        print(f"Testing {n_molecules:,} molecules...", end=" ", flush=True)
+        
+        # Subset the data
+        subset_smiles = test_smiles[:n_molecules]
+        
+        # Benchmark conversion
+        start_time = time.time()
+        X = rdkit_to_csr(subset_smiles, radius=radius, show_progress=False)
+        conversion_time = time.time() - start_time
+        
+        # Calculate metrics
+        rate = n_molecules / conversion_time if conversion_time > 0 else 0
+        avg_bits = X.nnz / X.shape[0] if X.shape[0] > 0 else 0
+        memory_mb = (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / 1024**2
+        sparsity = 1 - (X.nnz / X.size) if X.size > 0 else 0
+        
+        results.append({
+            'molecules': n_molecules,
+            'time': conversion_time,
+            'rate': rate,
+            'bits_per_mol': avg_bits,
+            'memory_mb': memory_mb,
+            'sparsity': sparsity
+        })
+        
+        print(f"{n_molecules:<12,} {conversion_time:<10.2f} {rate:<12.0f} {avg_bits:<10.1f} {memory_mb:<12.2f} {sparsity:<10.6f}")
+    
+    # Scalability analysis
+    print(f"\nScalability Analysis:")
+    print("-" * 40)
+    
+    if len(results) >= 2:
+        # Calculate scaling efficiency
+        small_result = results[0]
+        large_result = results[-1]
+        
+        size_ratio = large_result['molecules'] / small_result['molecules']
+        time_ratio = large_result['time'] / small_result['time']
+        scaling_efficiency = size_ratio / time_ratio
+        
+        print(f"Size scaling: {small_result['molecules']:,} → {large_result['molecules']:,} molecules ({size_ratio:.1f}x)")
+        print(f"Time scaling: {small_result['time']:.2f}s → {large_result['time']:.2f}s ({time_ratio:.1f}x)")
+        print(f"Scaling efficiency: {scaling_efficiency:.2f} (1.0 = perfect linear scaling)")
+        
+        # Memory scaling
+        memory_ratio = large_result['memory_mb'] / small_result['memory_mb']
+        print(f"Memory scaling: {small_result['memory_mb']:.1f}MB → {large_result['memory_mb']:.1f}MB ({memory_ratio:.1f}x)")
+    
+    # Performance projections
+    print(f"\nPerformance Projections:")
+    print("-" * 30)
+    
+    if results:
+        latest = results[-1]
+        
+        # Project to larger datasets
+        projected_1M = (1_000_000 / latest['rate']) if latest['rate'] > 0 else float('inf')
+        projected_memory_1M = latest['memory_mb'] * (1_000_000 / latest['molecules'])
+        
+        print(f"Projected time for 1M molecules: {projected_1M/60:.1f} minutes")
+        print(f"Projected memory for 1M molecules: {projected_memory_1M/1024:.1f} GB")
+        
+        # Realistic dataset recommendations
+        if latest['rate'] > 0:
+            molecules_per_minute = latest['rate'] * 60
+            molecules_per_hour = molecules_per_minute * 60
+            
+            print(f"\nRealistic Usage Recommendations:")
+            print(f"  Interactive analysis: Up to {int(molecules_per_minute/10):,} molecules")
+            print(f"  Batch processing: Up to {int(molecules_per_hour/10):,} molecules")
+            print(f"  Production pipeline: {int(molecules_per_hour):,}+ molecules/hour")
+    
+    # Memory efficiency showcase
+    print(f"\nMemory Efficiency Showcase:")
+    print("-" * 35)
+    
+    if results:
+        example = results[-1]
+        sparse_mb = example['memory_mb']
+        
+        # Calculate theoretical dense matrix size
+        n_mols = example['molecules']
+        dense_gb = (n_mols * (2**32) * 1) / (1024**3)  # 1 byte per boolean
+        
+        print(f"{n_mols:,} molecules:")
+        print(f"  Sparse matrix: {sparse_mb:.1f} MB")
+        print(f"  Dense equivalent: {dense_gb:,.0f} GB")
+        print(f"  Space savings: {(1 - sparse_mb/(dense_gb*1024))*100:.6f}%")
+    
+    print(f"\n{'='*80}")
+    print(f"✓ Large-scale benchmark completed successfully!")
+    print(f"✓ LaplacianNB can efficiently handle datasets up to {target_molecules:,} molecules")
+    print(f"{'='*80}")
+    
+    return results
+
+
+def _generate_diverse_smiles(n_molecules, diverse=True):
+    """Generate a diverse set of SMILES for realistic benchmarking."""
+    if diverse:
+        # More diverse molecular structures for realistic testing
+        base_smiles = [
+            # Simple aliphatics
+            "CCO", "CCC", "CCCC", "CCCCC", "CCCCCC", "CCCCCCC",
+            "CC(C)C", "CC(C)CC", "CC(C)(C)C", "CCCCCCCCCC",
+            
+            # Aromatics and pharmaceuticals
+            "C1=CC=CC=C1", "CC1=CC=CC=C1", "CC1=CC=C(C=C1)C",
+            "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
+            "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen
+            "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
+            
+            # Heterocycles
+            "C1=CC=NC=C1", "C1=CN=CC=C1", "C1=CC=C(C=C1)N",
+            "C1CCC(CC1)N", "C1=CC=C2C(=C1)C=CC=N2",
+            
+            # Functional groups
+            "CC(=O)O", "CCO", "CC(=O)C", "CCCN", "CCS", "CC=O",
+            "CC(=O)N", "CC(C)O", "C=CC", "C#CC", "CCCl", "CCBr",
+            
+            # Larger molecules
+            "CCCCCCCCCCCCCCCC",  # Palmitic acid
+            "CC1=CC(=CC(=C1)C)C(=O)O",  # Mesitylenic acid
+            "CC(C)(C)C1=CC=C(C=C1)O",  # BHT
+            "CCN(CC)CC",  # Triethylamine
+            
+            # Steroids and complex structures
+            "CC12CCC3C(C1CCC2O)CCC4=CC(=O)CCC34C",
+            "CN1CCC[C@H]1C2=CN=CC=C2",
+            "CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C)C",
+        ]
+    else:
+        # Simple repeated structures for baseline testing
+        base_smiles = [
+            "CCO", "CC(=O)OC1=CC=CC=C1C(=O)O", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
+            "CCCCCCCCCCCCCCCC", "CC1=CC=C(C=C1)C(=O)O", "CCN(CC)CC",
+            "CC(C)(C)C1=CC=C(C=C1)O", "CCCCCCCCCCCCC", "CC1=CC(=CC(=C1)C)C(=O)O",
+            "CCCCCCCCCC", "CC1=CC=CC=C1", "C1=CC=CC=C1", "CC(C)O", "CCCCO"
+        ]
+    
+    # Generate the required number of molecules
+    test_smiles = []
+    while len(test_smiles) < n_molecules:
+        test_smiles.extend(base_smiles)
+    
+    return test_smiles[:n_molecules]
diff --git a/uv.lock b/uv.lock
index 4aac06e..27d9b5d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -181,6 +181,7 @@ dependencies = [
     { name = "scikit-learn" },
     { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "scipy", version = "1.16.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "tqdm" },
 ]
 
 [package.dev-dependencies]
@@ -199,6 +200,7 @@ requires-dist = [
     { name = "rdkit", specifier = ">=2024.3.5" },
     { name = "scikit-learn", specifier = ">=1.7.0" },
     { name = "scipy", specifier = ">=1.6.0" },
+    { name = "tqdm", specifier = ">=4.67.1" },
 ]
 
 [package.metadata.requires-dev]
@@ -926,6 +928,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" },
 ]
 
+[[package]]
+name = "tqdm"
+version = "4.67.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.14.1"

From c1bdbf83af229e294560392b4dac42bb4839d0c8 Mon Sep 17 00:00:00 2001
From: Bartosz Baranowski <bartekbaranow@gmail.com>
Date: Thu, 21 Aug 2025 12:29:55 +0200
Subject: [PATCH 5/8] 0.7.1 version

---
 src/laplaciannb/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/laplaciannb/__init__.py b/src/laplaciannb/__init__.py
index 3c2c15c..33b4e01 100644
--- a/src/laplaciannb/__init__.py
+++ b/src/laplaciannb/__init__.py
@@ -21,7 +21,7 @@
 from .fingerprint_utils import rdkit_to_csr
 
 
-__version__ = "0.7.0"
+__version__ = "0.7.1"
 __all__ = [
     "LaplacianNB",
     "rdkit_to_csr",

From 676ea6b7b7ed288fdb69927b81129383439e1746 Mon Sep 17 00:00:00 2001
From: Bartosz Baranowski <bartekbaranow@gmail.com>
Date: Thu, 21 Aug 2025 12:33:14 +0200
Subject: [PATCH 6/8] pre-commit rerun

---
 examples/benchmark_fingerprints.py   |  14 +-
 examples/benchmark_large_scale.py    |  20 +-
 src/laplaciannb/fingerprint_utils.py | 281 +++++++++++++++------------
 3 files changed, 175 insertions(+), 140 deletions(-)

diff --git a/examples/benchmark_fingerprints.py b/examples/benchmark_fingerprints.py
index b773834..2f960c3 100644
--- a/examples/benchmark_fingerprints.py
+++ b/examples/benchmark_fingerprints.py
@@ -19,7 +19,7 @@ def main():
     """Run fingerprint conversion benchmarks."""
     print("LaplacianNB Fingerprint Conversion Benchmark")
     print("=" * 50)
-    
+
     try:
         # Quick test with small dataset
         print("\n1. Quick Test (50 molecules)")
@@ -28,29 +28,29 @@ def main():
             "CCO", "CC(=O)OC1=CC=CC=C1C(=O)O", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
             "CCCCCCCCCCCCCCCC", "CC1=CC=C(C=C1)C(=O)O"
         ] * 10  # 50 molecules
-        
+
         X = rdkit_to_csr(test_smiles, radius=2, show_progress=True)
         print(f"✓ Successfully converted {X.shape[0]} molecules")
-        
+
         # Medium test
         print("\n2. Medium Test (200 molecules)")
         print("-" * 30)
         medium_smiles = test_smiles * 4  # 200 molecules
         X_medium = rdkit_to_csr(medium_smiles, radius=2, show_progress=True)
-        
+
         # Comprehensive benchmark
         print("\n3. Comprehensive Benchmark")
         print("-" * 30)
         benchmark_fingerprint_conversion(
-            n_molecules=1000, 
+            n_molecules=1000,
             radii=[1, 2, 3],
             molecules_per_test=[100, 500, 1000]
         )
-        
+
         print("\n" + "=" * 50)
         print("✓ All benchmarks completed successfully!")
         print("✓ LaplacianNB fingerprint conversion is ready for production")
-        
+
     except ImportError as e:
         print(f"Missing dependency: {e}")
         print("Please install: pip install rdkit scikit-learn scipy")
diff --git a/examples/benchmark_large_scale.py b/examples/benchmark_large_scale.py
index 316d447..2b0799b 100644
--- a/examples/benchmark_large_scale.py
+++ b/examples/benchmark_large_scale.py
@@ -22,10 +22,10 @@ def main():
     print("Testing conversion performance up to 100,000 molecules")
     print("This benchmark evaluates:")
     print("- Conversion speed and throughput")
-    print("- Memory usage and efficiency") 
+    print("- Memory usage and efficiency")
     print("- Scalability characteristics")
     print("- Performance projections")
-    
+
     try:
         # Run the comprehensive large-scale benchmark
         results = benchmark_large_scale_conversion(
@@ -34,34 +34,34 @@ def main():
             radius=2,
             sample_diversity=True
         )
-        
+
         print("\n" + "="*50)
         print("BENCHMARK SUMMARY")
         print("="*50)
-        
+
         if results:
             fastest_rate = max(r['rate'] for r in results)
             largest_test = max(results, key=lambda x: x['molecules'])
-            
+
             print(f"Peak conversion rate: {fastest_rate:,.0f} molecules/second")
             print(f"Largest test completed: {largest_test['molecules']:,} molecules")
             print(f"Time for largest test: {largest_test['time']:.1f} seconds")
             print(f"Memory for largest test: {largest_test['memory_mb']:.1f} MB")
             print(f"Sparsity achieved: {largest_test['sparsity']:.6f}")
-            
+
             # Calculate efficiency metrics
             total_molecules = sum(r['molecules'] for r in results)
             total_time = sum(r['time'] for r in results)
             overall_rate = total_molecules / total_time
-            
+
             print(f"\nOverall benchmark performance:")
             print(f"  Total molecules processed: {total_molecules:,}")
-            print(f"  Total time: {total_time:.1f} seconds") 
+            print(f"  Total time: {total_time:.1f} seconds")
             print(f"  Average rate: {overall_rate:,.0f} molecules/second")
-        
+
         print(f"\n✓ Large-scale benchmark completed successfully!")
         print(f"✓ LaplacianNB fingerprint conversion scales efficiently to 100K+ molecules")
-        
+
     except ImportError as e:
         print(f"Missing dependency: {e}")
         print("Please install: pip install rdkit scikit-learn scipy")
diff --git a/src/laplaciannb/fingerprint_utils.py b/src/laplaciannb/fingerprint_utils.py
index 5b4009f..557a8c7 100644
--- a/src/laplaciannb/fingerprint_utils.py
+++ b/src/laplaciannb/fingerprint_utils.py
@@ -1,14 +1,18 @@
-import numpy as np
 import time
+
+import numpy as np
 from rdkit import Chem
 from rdkit.Chem import rdFingerprintGenerator
 from scipy.sparse import csr_matrix
 
+
 try:
     from tqdm import tqdm
+
     TQDM_AVAILABLE = True
 except ImportError:
     TQDM_AVAILABLE = False
+
     def tqdm(iterable, *args, **kwargs):
         """Fallback if tqdm is not available."""
         return iterable
@@ -16,7 +20,7 @@ def tqdm(iterable, *args, **kwargs):
 
 def rdkit_to_csr(smiles_list, radius=2, show_progress=True):
     """Convert RDKit sparse Morgan fingerprints to CSR matrix with lossless conversion.
-    
+
     Parameters
     ----------
     smiles_list : list of str
@@ -25,12 +29,12 @@ def rdkit_to_csr(smiles_list, radius=2, show_progress=True):
         Morgan fingerprint radius
     show_progress : bool, default=True
         Show progress bar if tqdm is available
-        
+
     Returns
     -------
     scipy.sparse.csr_matrix
         Sparse matrix of shape (n_molecules, 2^32) with boolean dtype
-        
+
     Examples
     --------
     >>> smiles = ["CCO", "CC(=O)OC1=CC=CC=C1C(=O)O"]
@@ -38,7 +42,7 @@ def rdkit_to_csr(smiles_list, radius=2, show_progress=True):
     >>> print(f"Shape: {X.shape}, Sparsity: {1 - X.nnz / X.size:.6f}")
     """
     start_time = time.time()
-    
+
     row_ind = []
     col_ind = []
 
@@ -50,23 +54,22 @@ def rdkit_to_csr(smiles_list, radius=2, show_progress=True):
     # Process molecules with optional progress bar
     iterator = enumerate(mol_list)
     if show_progress and TQDM_AVAILABLE and len(mol_list) > 10:
-        iterator = tqdm(iterator, total=len(mol_list), 
-                       desc="Processing molecules", unit="mol")
+        iterator = tqdm(iterator, total=len(mol_list), desc="Processing molecules", unit="mol")
 
     valid_molecules = 0
     total_bits = 0
-    
+
     for i, mol in iterator:
         if mol is None:
             continue
-            
+
         valid_molecules += 1
 
         # Get sparse fingerprint
         sfp = mfpgen.GetSparseFingerprint(mol)
         mol_bits = set(sfp.GetOnBits())
         total_bits += len(mol_bits)
-        
+
         for bit in mol_bits:
             # Reinterpret signed int32 as unsigned int32
             # This maps [-2^31, 2^31-1] to [0, 2^32-1] losslessly
@@ -77,15 +80,14 @@ def rdkit_to_csr(smiles_list, radius=2, show_progress=True):
 
     # Create data array (all ones for boolean matrix)
     data = np.ones(len(row_ind), dtype=np.bool_)
-    
+
     # Create sparse matrix
-    matrix = csr_matrix((data, (row_ind, col_ind)), 
-                       shape=(len(mol_list), 2**32), dtype=np.bool_)
-    
+    matrix = csr_matrix((data, (row_ind, col_ind)), shape=(len(mol_list), 2**32), dtype=np.bool_)
+
     # Performance summary
     conversion_time = time.time() - start_time
     sparsity = 1 - matrix.nnz / matrix.size if matrix.size > 0 else 0
-    
+
     print(f"Conversion completed in {conversion_time:.3f} seconds")
     print(f"Valid molecules: {valid_molecules}/{len(mol_list)}")
     print(f"Total fingerprint bits: {total_bits:,}")
@@ -93,14 +95,13 @@ def rdkit_to_csr(smiles_list, radius=2, show_progress=True):
     print(f"Matrix shape: {matrix.shape}")
     print(f"Matrix sparsity: {sparsity:.6f}")
     print(f"Memory usage: {(matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes) / 1024**2:.2f} MB")
-    
+
     return matrix
 
 
-def benchmark_fingerprint_conversion(n_molecules=100000, radii=[2], 
-                                   molecules_per_test=None):
+def benchmark_fingerprint_conversion(n_molecules=100000, radii=[2], molecules_per_test=None):
     """Benchmark fingerprint conversion performance with different parameters.
-    
+
     Parameters
     ----------
     n_molecules : int, default=1000
@@ -109,7 +110,7 @@ def benchmark_fingerprint_conversion(n_molecules=100000, radii=[2],
         Morgan fingerprint radii to test
     molecules_per_test : list of int, optional
         Different molecule counts to test. If None, uses [100, 500, 1000]
-        
+
     Examples
     --------
     >>> benchmark_fingerprint_conversion(1000, radii=[2, 3])
@@ -118,53 +119,50 @@ def benchmark_fingerprint_conversion(n_molecules=100000, radii=[2],
     print("=" * 60)
     print("FINGERPRINT CONVERSION BENCHMARK")
     print("=" * 60)
-    
+
     # Generate test SMILES data
     print(f"Generating {n_molecules} test molecules...")
     test_smiles = _generate_test_smiles(n_molecules)
-    
+
     if molecules_per_test is None:
-        molecules_per_test = [min(100, n_molecules), 
-                            min(500, n_molecules), 
-                            n_molecules]
-    
+        molecules_per_test = [min(100, n_molecules), min(500, n_molecules), n_molecules]
+
     # Test different molecule counts
-    print(f"\nTesting conversion speed with different dataset sizes:")
+    print("\nTesting conversion speed with different dataset sizes:")
     print("-" * 60)
     print(f"{'Molecules':<12} {'Radius':<8} {'Time (s)':<10} {'Bits/mol':<10} {'MB':<8}")
     print("-" * 60)
-    
+
     for n_mol in molecules_per_test:
         subset_smiles = test_smiles[:n_mol]
-        
+
         for radius in radii:
             start_time = time.time()
             X = rdkit_to_csr(subset_smiles, radius=radius, show_progress=False)
             conversion_time = time.time() - start_time
-            
+
             avg_bits = X.nnz / X.shape[0] if X.shape[0] > 0 else 0
             memory_mb = (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / 1024**2
-            
+
             print(f"{n_mol:<12} {radius:<8} {conversion_time:<10.3f} {avg_bits:<10.1f} {memory_mb:<8.2f}")
-    
+
     # Memory efficiency comparison
-    print(f"\nMemory Efficiency Analysis:")
+    print("\nMemory Efficiency Analysis:")
     print("-" * 40)
-    
+
     X_example = rdkit_to_csr(test_smiles[:100], radius=2, show_progress=False)
     sparse_memory = (X_example.data.nbytes + X_example.indices.nbytes + X_example.indptr.nbytes) / 1024**2
     dense_memory = (X_example.shape[0] * X_example.shape[1] * np.dtype(np.bool_).itemsize) / 1024**2
-    
-    print(f"100 molecules, radius=2:")
+
+    print("100 molecules, radius=2:")
     print(f"  Sparse matrix: {sparse_memory:.2f} MB")
     print(f"  Dense equivalent: {dense_memory:,.0f} MB")
     print(f"  Memory reduction: {(1 - sparse_memory/dense_memory)*100:.3f}%")
-    
+
     # Throughput summary
-    print(f"\nThroughput Summary:")
+    print("\nThroughput Summary:")
     print("-" * 20)
-    fastest_time = min([conversion_time for n_mol in molecules_per_test[:1] 
-                       for radius in radii[:1]])
+    fastest_time = min([conversion_time for n_mol in molecules_per_test[:1] for radius in radii[:1]])
     throughput = molecules_per_test[0] / fastest_time if fastest_time > 0 else 0
     print(f"Peak throughput: ~{throughput:.0f} molecules/second")
     print(f"Recommended for datasets: Up to {throughput * 60:.0f} molecules/minute")
@@ -176,7 +174,7 @@ def _generate_test_smiles(n_molecules):
     base_smiles = [
         "CCO",  # Ethanol
         "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
-        "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen  
+        "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen
         "CCCCCCCCCCCCCCCC",  # Palmitic acid
         "CC1=CC=C(C=C1)C(=O)O",  # p-Toluic acid
         "CCN(CC)CC",  # Triethylamine
@@ -190,22 +188,21 @@ def _generate_test_smiles(n_molecules):
         "CCCCO",  # Butanol
         "CC(C)C",  # Propane
     ]
-    
+
     # Repeat base molecules to reach desired count
     test_smiles = []
     while len(test_smiles) < n_molecules:
         test_smiles.extend(base_smiles)
-    
+
     return test_smiles[:n_molecules]
 
 
-def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None, 
-                                   radius=2, sample_diversity=True):
+def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None, radius=2, sample_diversity=True):
     """Benchmark fingerprint conversion performance for large datasets.
-    
-    This function tests the scalability and performance of rdkit_to_csr 
+
+    This function tests the scalability and performance of rdkit_to_csr
     with large molecular datasets up to 100,000 molecules.
-    
+
     Parameters
     ----------
     target_molecules : int, default=100000
@@ -216,7 +213,7 @@ def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None,
         Morgan fingerprint radius
     sample_diversity : bool, default=True
         If True, generates diverse molecular structures for realistic testing
-        
+
     Examples
     --------
     >>> benchmark_large_scale_conversion(100000)
@@ -228,7 +225,7 @@ def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None,
     print(f"Target dataset size: {target_molecules:,} molecules")
     print(f"Morgan fingerprint radius: {radius}")
     print(f"Diversity sampling: {'Enabled' if sample_diversity else 'Disabled'}")
-    
+
     if test_sizes is None:
         # Logarithmic scale testing
         test_sizes = [1000, 5000, 10000, 25000, 50000]
@@ -236,120 +233,130 @@ def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None,
             test_sizes.append(100000)
         # Filter to not exceed target
         test_sizes = [size for size in test_sizes if size <= target_molecules]
-    
+
     print(f"\nGenerating test dataset with {target_molecules:,} molecules...")
     print("-" * 60)
-    
+
     start_gen = time.time()
     test_smiles = _generate_diverse_smiles(target_molecules, diverse=sample_diversity)
     gen_time = time.time() - start_gen
-    
+
     print(f"Dataset generation completed in {gen_time:.2f} seconds")
     print(f"Average generation rate: {target_molecules/gen_time:.0f} molecules/second")
-    
+
     # Performance tracking
     results = []
-    
-    print(f"\nBenchmarking conversion performance:")
+
+    print("\nBenchmarking conversion performance:")
     print("-" * 80)
-    print(f"{'Molecules':<12} {'Time (s)':<10} {'Rate (mol/s)':<12} {'Bits/mol':<10} {'Memory (MB)':<12} {'Sparsity':<10}")
+    print(
+        f"{'Molecules':<12} {'Time (s)':<10} {'Rate (mol/s)':<12} {'Bits/mol':<10} {'Memory (MB)':<12} {'Sparsity':<10}"
+    )
     print("-" * 80)
-    
+
     for n_molecules in test_sizes:
         print(f"Testing {n_molecules:,} molecules...", end=" ", flush=True)
-        
+
         # Subset the data
         subset_smiles = test_smiles[:n_molecules]
-        
+
         # Benchmark conversion
         start_time = time.time()
         X = rdkit_to_csr(subset_smiles, radius=radius, show_progress=False)
         conversion_time = time.time() - start_time
-        
+
         # Calculate metrics
         rate = n_molecules / conversion_time if conversion_time > 0 else 0
         avg_bits = X.nnz / X.shape[0] if X.shape[0] > 0 else 0
         memory_mb = (X.data.nbytes + X.indices.nbytes + X.indptr.nbytes) / 1024**2
         sparsity = 1 - (X.nnz / X.size) if X.size > 0 else 0
-        
-        results.append({
-            'molecules': n_molecules,
-            'time': conversion_time,
-            'rate': rate,
-            'bits_per_mol': avg_bits,
-            'memory_mb': memory_mb,
-            'sparsity': sparsity
-        })
-        
-        print(f"{n_molecules:<12,} {conversion_time:<10.2f} {rate:<12.0f} {avg_bits:<10.1f} {memory_mb:<12.2f} {sparsity:<10.6f}")
-    
+
+        results.append(
+            {
+                "molecules": n_molecules,
+                "time": conversion_time,
+                "rate": rate,
+                "bits_per_mol": avg_bits,
+                "memory_mb": memory_mb,
+                "sparsity": sparsity,
+            }
+        )
+
+        print(
+            f"{n_molecules:<12,} {conversion_time:<10.2f} {rate:<12.0f} {avg_bits:<10.1f} {memory_mb:<12.2f} {sparsity:<10.6f}"
+        )
+
     # Scalability analysis
-    print(f"\nScalability Analysis:")
+    print("\nScalability Analysis:")
     print("-" * 40)
-    
+
     if len(results) >= 2:
         # Calculate scaling efficiency
         small_result = results[0]
         large_result = results[-1]
-        
-        size_ratio = large_result['molecules'] / small_result['molecules']
-        time_ratio = large_result['time'] / small_result['time']
+
+        size_ratio = large_result["molecules"] / small_result["molecules"]
+        time_ratio = large_result["time"] / small_result["time"]
         scaling_efficiency = size_ratio / time_ratio
-        
-        print(f"Size scaling: {small_result['molecules']:,} → {large_result['molecules']:,} molecules ({size_ratio:.1f}x)")
+
+        print(
+            f"Size scaling: {small_result['molecules']:,} → {large_result['molecules']:,} molecules ({size_ratio:.1f}x)"
+        )
         print(f"Time scaling: {small_result['time']:.2f}s → {large_result['time']:.2f}s ({time_ratio:.1f}x)")
         print(f"Scaling efficiency: {scaling_efficiency:.2f} (1.0 = perfect linear scaling)")
-        
+
         # Memory scaling
-        memory_ratio = large_result['memory_mb'] / small_result['memory_mb']
-        print(f"Memory scaling: {small_result['memory_mb']:.1f}MB → {large_result['memory_mb']:.1f}MB ({memory_ratio:.1f}x)")
-    
+        memory_ratio = large_result["memory_mb"] / small_result["memory_mb"]
+        print(
+            f"Memory scaling: {small_result['memory_mb']:.1f}MB → {large_result['memory_mb']:.1f}MB ({memory_ratio:.1f}x)"
+        )
+
     # Performance projections
-    print(f"\nPerformance Projections:")
+    print("\nPerformance Projections:")
     print("-" * 30)
-    
+
     if results:
         latest = results[-1]
-        
+
         # Project to larger datasets
-        projected_1M = (1_000_000 / latest['rate']) if latest['rate'] > 0 else float('inf')
-        projected_memory_1M = latest['memory_mb'] * (1_000_000 / latest['molecules'])
-        
+        projected_1M = (1_000_000 / latest["rate"]) if latest["rate"] > 0 else float("inf")
+        projected_memory_1M = latest["memory_mb"] * (1_000_000 / latest["molecules"])
+
         print(f"Projected time for 1M molecules: {projected_1M/60:.1f} minutes")
         print(f"Projected memory for 1M molecules: {projected_memory_1M/1024:.1f} GB")
-        
+
         # Realistic dataset recommendations
-        if latest['rate'] > 0:
-            molecules_per_minute = latest['rate'] * 60
+        if latest["rate"] > 0:
+            molecules_per_minute = latest["rate"] * 60
             molecules_per_hour = molecules_per_minute * 60
-            
-            print(f"\nRealistic Usage Recommendations:")
+
+            print("\nRealistic Usage Recommendations:")
             print(f"  Interactive analysis: Up to {int(molecules_per_minute/10):,} molecules")
             print(f"  Batch processing: Up to {int(molecules_per_hour/10):,} molecules")
             print(f"  Production pipeline: {int(molecules_per_hour):,}+ molecules/hour")
-    
+
     # Memory efficiency showcase
-    print(f"\nMemory Efficiency Showcase:")
+    print("\nMemory Efficiency Showcase:")
     print("-" * 35)
-    
+
     if results:
         example = results[-1]
-        sparse_mb = example['memory_mb']
-        
+        sparse_mb = example["memory_mb"]
+
         # Calculate theoretical dense matrix size
-        n_mols = example['molecules']
+        n_mols = example["molecules"]
         dense_gb = (n_mols * (2**32) * 1) / (1024**3)  # 1 byte per boolean
-        
+
         print(f"{n_mols:,} molecules:")
         print(f"  Sparse matrix: {sparse_mb:.1f} MB")
         print(f"  Dense equivalent: {dense_gb:,.0f} GB")
         print(f"  Space savings: {(1 - sparse_mb/(dense_gb*1024))*100:.6f}%")
-    
+
     print(f"\n{'='*80}")
-    print(f"✓ Large-scale benchmark completed successfully!")
+    print("✓ Large-scale benchmark completed successfully!")
     print(f"✓ LaplacianNB can efficiently handle datasets up to {target_molecules:,} molecules")
     print(f"{'='*80}")
-    
+
     return results
 
 
@@ -359,29 +366,47 @@ def _generate_diverse_smiles(n_molecules, diverse=True):
         # More diverse molecular structures for realistic testing
         base_smiles = [
             # Simple aliphatics
-            "CCO", "CCC", "CCCC", "CCCCC", "CCCCCC", "CCCCCCC",
-            "CC(C)C", "CC(C)CC", "CC(C)(C)C", "CCCCCCCCCC",
-            
+            "CCO",
+            "CCC",
+            "CCCC",
+            "CCCCC",
+            "CCCCCC",
+            "CCCCCCC",
+            "CC(C)C",
+            "CC(C)CC",
+            "CC(C)(C)C",
+            "CCCCCCCCCC",
             # Aromatics and pharmaceuticals
-            "C1=CC=CC=C1", "CC1=CC=CC=C1", "CC1=CC=C(C=C1)C",
+            "C1=CC=CC=C1",
+            "CC1=CC=CC=C1",
+            "CC1=CC=C(C=C1)C",
             "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
             "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen
             "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
-            
             # Heterocycles
-            "C1=CC=NC=C1", "C1=CN=CC=C1", "C1=CC=C(C=C1)N",
-            "C1CCC(CC1)N", "C1=CC=C2C(=C1)C=CC=N2",
-            
+            "C1=CC=NC=C1",
+            "C1=CN=CC=C1",
+            "C1=CC=C(C=C1)N",
+            "C1CCC(CC1)N",
+            "C1=CC=C2C(=C1)C=CC=N2",
             # Functional groups
-            "CC(=O)O", "CCO", "CC(=O)C", "CCCN", "CCS", "CC=O",
-            "CC(=O)N", "CC(C)O", "C=CC", "C#CC", "CCCl", "CCBr",
-            
+            "CC(=O)O",
+            "CCO",
+            "CC(=O)C",
+            "CCCN",
+            "CCS",
+            "CC=O",
+            "CC(=O)N",
+            "CC(C)O",
+            "C=CC",
+            "C#CC",
+            "CCCl",
+            "CCBr",
             # Larger molecules
             "CCCCCCCCCCCCCCCC",  # Palmitic acid
             "CC1=CC(=CC(=C1)C)C(=O)O",  # Mesitylenic acid
             "CC(C)(C)C1=CC=C(C=C1)O",  # BHT
             "CCN(CC)CC",  # Triethylamine
-            
             # Steroids and complex structures
             "CC12CCC3C(C1CCC2O)CCC4=CC(=O)CCC34C",
             "CN1CCC[C@H]1C2=CN=CC=C2",
@@ -390,15 +415,25 @@ def _generate_diverse_smiles(n_molecules, diverse=True):
     else:
         # Simple repeated structures for baseline testing
         base_smiles = [
-            "CCO", "CC(=O)OC1=CC=CC=C1C(=O)O", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
-            "CCCCCCCCCCCCCCCC", "CC1=CC=C(C=C1)C(=O)O", "CCN(CC)CC",
-            "CC(C)(C)C1=CC=C(C=C1)O", "CCCCCCCCCCCCC", "CC1=CC(=CC(=C1)C)C(=O)O",
-            "CCCCCCCCCC", "CC1=CC=CC=C1", "C1=CC=CC=C1", "CC(C)O", "CCCCO"
+            "CCO",
+            "CC(=O)OC1=CC=CC=C1C(=O)O",
+            "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
+            "CCCCCCCCCCCCCCCC",
+            "CC1=CC=C(C=C1)C(=O)O",
+            "CCN(CC)CC",
+            "CC(C)(C)C1=CC=C(C=C1)O",
+            "CCCCCCCCCCCCC",
+            "CC1=CC(=CC(=C1)C)C(=O)O",
+            "CCCCCCCCCC",
+            "CC1=CC=CC=C1",
+            "C1=CC=CC=C1",
+            "CC(C)O",
+            "CCCCO",
         ]
-    
+
     # Generate the required number of molecules
     test_smiles = []
     while len(test_smiles) < n_molecules:
         test_smiles.extend(base_smiles)
-    
+
     return test_smiles[:n_molecules]

From 04b28fe18120fbe8f09aeddf28995db22968dad6 Mon Sep 17 00:00:00 2001
From: Bartosz Baranowski <bartekbaranow@gmail.com>
Date: Thu, 21 Aug 2025 12:38:16 +0200
Subject: [PATCH 7/8] run ruff

---
 .github/workflows/ruff.yml               |  2 --
 .pre-commit-config.yaml                  |  4 ++--
 examples/benchmark_fingerprints.py       | 23 ++++++++++--------
 examples/benchmark_large_scale.py        | 28 ++++++++++++----------
 examples/simple_example.py               | 30 +++++++++++++-----------
 src/laplaciannb/__init__.py              |  2 +-
 tests/test_bayes.py                      | 15 ++++++------
 tests/test_fingerprint_csr_conversion.py |  6 ++---
 8 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 004bd69..d4d6ecf 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -18,13 +18,11 @@ jobs:
         uses: astral-sh/ruff-action@v1
         with:
           args: "check --output-format=github --exit-non-zero-on-fix"
-          src: "./src"
 
       - name: Run Ruff formatting check
         uses: astral-sh/ruff-action@v1
         with:
           args: "format --check"
-          src: "./src"
 
   tests:
     name: Run Tests
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f362ecb..0f40967 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,12 +8,12 @@ repos:
         name: ruff-lint
         types_or: [python, pyi]
         args: [--fix, --exit-non-zero-on-fix]
-        files: ^src/
+        # Remove files restriction to check all Python files
       # Run the formatter (matches CI ruff-format step)
       - id: ruff-format
         name: ruff-format
         types_or: [python, pyi]
-        files: ^src/
+        # Remove files restriction to format all Python files
 
   # Security scanning (matches security.yml workflow)
   - repo: https://github.com/PyCQA/bandit
diff --git a/examples/benchmark_fingerprints.py b/examples/benchmark_fingerprints.py
index 2f960c3..24dd1bf 100644
--- a/examples/benchmark_fingerprints.py
+++ b/examples/benchmark_fingerprints.py
@@ -7,13 +7,15 @@
 and parameters.
 """
 
-import sys
 import os
+import sys
+
 
 # Add src to path so we can import laplaciannb
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
+
+from laplaciannb.fingerprint_utils import benchmark_fingerprint_conversion, rdkit_to_csr
 
-from laplaciannb.fingerprint_utils import rdkit_to_csr, benchmark_fingerprint_conversion
 
 def main():
     """Run fingerprint conversion benchmarks."""
@@ -25,8 +27,11 @@ def main():
         print("\n1. Quick Test (50 molecules)")
         print("-" * 30)
         test_smiles = [
-            "CCO", "CC(=O)OC1=CC=CC=C1C(=O)O", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
-            "CCCCCCCCCCCCCCCC", "CC1=CC=C(C=C1)C(=O)O"
+            "CCO",
+            "CC(=O)OC1=CC=CC=C1C(=O)O",
+            "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
+            "CCCCCCCCCCCCCCCC",
+            "CC1=CC=C(C=C1)C(=O)O",
         ] * 10  # 50 molecules
 
         X = rdkit_to_csr(test_smiles, radius=2, show_progress=True)
@@ -37,15 +42,12 @@ def main():
         print("-" * 30)
         medium_smiles = test_smiles * 4  # 200 molecules
         X_medium = rdkit_to_csr(medium_smiles, radius=2, show_progress=True)
+        print(f"✓ Successfully converted {X_medium.shape[0]} molecules")
 
         # Comprehensive benchmark
         print("\n3. Comprehensive Benchmark")
         print("-" * 30)
-        benchmark_fingerprint_conversion(
-            n_molecules=1000,
-            radii=[1, 2, 3],
-            molecules_per_test=[100, 500, 1000]
-        )
+        benchmark_fingerprint_conversion(n_molecules=1000, radii=[1, 2, 3], molecules_per_test=[100, 500, 1000])
 
         print("\n" + "=" * 50)
         print("✓ All benchmarks completed successfully!")
@@ -57,5 +59,6 @@ def main():
     except Exception as e:
         print(f"Error during benchmark: {e}")
 
+
 if __name__ == "__main__":
     main()
diff --git a/examples/benchmark_large_scale.py b/examples/benchmark_large_scale.py
index 2b0799b..b2f936c 100644
--- a/examples/benchmark_large_scale.py
+++ b/examples/benchmark_large_scale.py
@@ -7,14 +7,16 @@
 with datasets up to 100,000 molecules.
 """
 
-import sys
 import os
+import sys
+
 
 # Add src to path so we can import laplaciannb
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
 
 from laplaciannb.fingerprint_utils import benchmark_large_scale_conversion
 
+
 def main():
     """Run large-scale fingerprint conversion benchmark."""
     print("LaplacianNB Large-Scale Fingerprint Benchmark")
@@ -32,16 +34,16 @@ def main():
             target_molecules=100000,
             test_sizes=[1000, 5000, 10000, 25000, 50000, 100000],
             radius=2,
-            sample_diversity=True
+            sample_diversity=True,
         )
 
-        print("\n" + "="*50)
+        print("\n" + "=" * 50)
         print("BENCHMARK SUMMARY")
-        print("="*50)
+        print("=" * 50)
 
         if results:
-            fastest_rate = max(r['rate'] for r in results)
-            largest_test = max(results, key=lambda x: x['molecules'])
+            fastest_rate = max(r["rate"] for r in results)
+            largest_test = max(results, key=lambda x: x["molecules"])
 
             print(f"Peak conversion rate: {fastest_rate:,.0f} molecules/second")
             print(f"Largest test completed: {largest_test['molecules']:,} molecules")
@@ -50,17 +52,17 @@ def main():
             print(f"Sparsity achieved: {largest_test['sparsity']:.6f}")
 
             # Calculate efficiency metrics
-            total_molecules = sum(r['molecules'] for r in results)
-            total_time = sum(r['time'] for r in results)
+            total_molecules = sum(r["molecules"] for r in results)
+            total_time = sum(r["time"] for r in results)
             overall_rate = total_molecules / total_time
 
-            print(f"\nOverall benchmark performance:")
+            print("\nOverall benchmark performance:")
             print(f"  Total molecules processed: {total_molecules:,}")
             print(f"  Total time: {total_time:.1f} seconds")
             print(f"  Average rate: {overall_rate:,.0f} molecules/second")
 
-        print(f"\n✓ Large-scale benchmark completed successfully!")
-        print(f"✓ LaplacianNB fingerprint conversion scales efficiently to 100K+ molecules")
+        print("\n✓ Large-scale benchmark completed successfully!")
+        print("✓ LaplacianNB fingerprint conversion scales efficiently to 100K+ molecules")
 
     except ImportError as e:
         print(f"Missing dependency: {e}")
@@ -68,7 +70,9 @@ def main():
     except Exception as e:
         print(f"Error during benchmark: {e}")
         import traceback
+
         traceback.print_exc()
 
+
 if __name__ == "__main__":
     main()
diff --git a/examples/simple_example.py b/examples/simple_example.py
index 57300f5..e305a80 100644
--- a/examples/simple_example.py
+++ b/examples/simple_example.py
@@ -7,16 +7,20 @@
 """
 
 import numpy as np
+from rdkit import Chem
+from rdkit.Chem import rdFingerprintGenerator
+
 from laplaciannb import LaplacianNB
 from laplaciannb.fingerprint_utils import rdkit_to_csr
 
+
 # Sample molecular data
 smiles = [
-    "CCO",                              # Ethanol - inactive
-    "CC(=O)OC1=CC=CC=C1C(=O)O",        # Aspirin - active
-    "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",   # Ibuprofen - active
-    "CCCCCCCCCCCCCCCC",                 # Palmitic acid - inactive
-    "CC1=CC=C(C=C1)C(=O)O"             # p-Toluic acid - active
+    "CCO",  # Ethanol - inactive
+    "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin - active
+    "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen - active
+    "CCCCCCCCCCCCCCCC",  # Palmitic acid - inactive
+    "CC1=CC=C(C=C1)C(=O)O",  # p-Toluic acid - active
 ]
 y = [0, 1, 1, 0, 1]  # Activity labels (0=inactive, 1=active)
 
@@ -38,9 +42,7 @@
 # Display results
 print("\nResults:")
 print("-" * 40)
-for i, (smiles_str, true_label, pred_label, prob) in enumerate(
-    zip(smiles, y, predictions, probabilities)
-):
+for i, (smiles_str, true_label, pred_label, prob) in enumerate(zip(smiles, y, predictions, probabilities)):
     print(f"Molecule {i+1}: {smiles_str[:20]}")
     print(f"  True: {true_label}, Predicted: {pred_label}")
     print(f"  Probabilities: [Inactive: {prob[0]:.3f}, Active: {prob[1]:.3f}]")
@@ -58,8 +60,6 @@
 print("\nOriginal RDKit fingerprint indices for each molecule:")
 print("-" * 50)
 
-from rdkit import Chem
-from rdkit.Chem import rdFingerprintGenerator
 
 # Recreate the fingerprint generator to get individual fingerprints
 mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
@@ -80,7 +80,7 @@
         print(f"  Total fingerprint bits: {len(original_indices)}")
 
 # Show how to extract indices from the sparse matrix
-print(f"\nExtracting indices from sparse matrix:")
+print("\nExtracting indices from sparse matrix:")
 print("-" * 50)
 
 for i in range(X.shape[0]):
@@ -92,8 +92,8 @@
     print(f"Molecule {i+1} active bits: {row_indices[:10]}{'...' if len(row_indices) > 10 else ''}")
     print(f"  Total: {len(row_indices)} active bits")
 
-print(f"\n✓ You can now map back to original RDKit fingerprint indices")
-print(f"✓ Useful for feature interpretation and chemical insights")
+print("\n✓ You can now map back to original RDKit fingerprint indices")
+print("✓ Useful for feature interpretation and chemical insights")
 
 # Reverse mapping: From sparse matrix back to RDKit
 print("\n" + "=" * 50)
@@ -103,6 +103,7 @@
 print("\nMapping sparse matrix indices back to original RDKit bits:")
 print("-" * 50)
 
+
 def uint32_to_rdkit_index(uint32_index):
     """Convert uint32 matrix index back to original RDKit signed int32."""
     # Convert back from unsigned to signed int32
@@ -111,6 +112,7 @@ def uint32_to_rdkit_index(uint32_index):
     else:
         return int(uint32_index)
 
+
 # Example: Take the first molecule and show the reverse mapping
 mol_idx = 0
 print(f"\nExample with Molecule {mol_idx + 1}: {smiles[mol_idx]}")
@@ -129,7 +131,7 @@ def uint32_to_rdkit_index(uint32_index):
 # Verify this matches the original fingerprint
 mol = Chem.MolFromSmiles(smiles[mol_idx])
 sfp = mfpgen.GetSparseFingerprint(mol)
-original_indices = sorted(list(sfp.GetOnBits()))
+original_indices = sorted(sfp.GetOnBits())
 recovered_indices = sorted(rdkit_indices)
 
 print(f"Original RDKit indices: {original_indices}")
diff --git a/src/laplaciannb/__init__.py b/src/laplaciannb/__init__.py
index 33b4e01..74f30e6 100644
--- a/src/laplaciannb/__init__.py
+++ b/src/laplaciannb/__init__.py
@@ -21,7 +21,7 @@
 from .fingerprint_utils import rdkit_to_csr
 
 
-__version__ = "0.7.1"
+__version__ = "0.8.0"
 __all__ = [
     "LaplacianNB",
     "rdkit_to_csr",
diff --git a/tests/test_bayes.py b/tests/test_bayes.py
index 26819bf..3d0de84 100644
--- a/tests/test_bayes.py
+++ b/tests/test_bayes.py
@@ -51,15 +51,15 @@ def test_lmnb_prior_unobserved_targets():
 
 
 def test_rdkit():
-    from laplaciannb.fingerprint_utils import rdkit_to_csr
     from laplaciannb import LaplacianNB
+    from laplaciannb.fingerprint_utils import rdkit_to_csr
 
     DATA_PATH = Path(__file__).parent.parent.joinpath("tests/data/")
     file = str(DATA_PATH.joinpath("smiles_test.csv"))
     df = pd.read_csv(file)
 
     # Convert to sparse CSR matrix using our fingerprint utility
-    X_sparse = rdkit_to_csr(df['smiles'].values, radius=2)
+    X_sparse = rdkit_to_csr(df["smiles"].values, radius=2)
 
     y = df["activity"]
     clf = LaplacianNB()
@@ -72,16 +72,17 @@ def test_rdkit():
 
 def test_joint_log_likelihood():
     """Test joint log likelihood with CSR matrices."""
-    from laplaciannb.fingerprint_utils import rdkit_to_csr
-    from laplaciannb import LaplacianNB
     from scipy.sparse import csr_matrix
 
+    from laplaciannb import LaplacianNB
+    from laplaciannb.fingerprint_utils import rdkit_to_csr
+
     DATA_PATH = Path(__file__).parent.parent.joinpath("tests/data/")
     file = str(DATA_PATH.joinpath("smiles_test.csv"))
     df = pd.read_csv(file)
 
     # Convert to CSR matrix using fingerprint utility
-    X = rdkit_to_csr(df['smiles'].values, radius=2)
+    X = rdkit_to_csr(df["smiles"].values, radius=2)
     y = df["activity"]
     clf = LaplacianNB()
     clf.fit(X, y)
@@ -91,7 +92,7 @@ def test_joint_log_likelihood():
     test_row = [0]
     test_col = [2**30]  # Use a large but valid index within 2^32-1 limit
     test_data = [1]
-    new_X = csr_matrix((test_data, (test_row, test_col)), shape=(1, 2**32-1), dtype=np.bool_)
+    new_X = csr_matrix((test_data, (test_row, test_col)), shape=(1, 2**32 - 1), dtype=np.bool_)
 
     try:
         clf._joint_log_likelihood(new_X)
@@ -123,7 +124,7 @@ def test_csr_fingerprint_conversion():
         fingerprint_rows.append(fingerprint_set)
 
     # Verify that molecules have some different features
-    assert len(set(len(fp) for fp in fingerprint_rows)) > 1  # Different numbers of features
+    assert len({len(fp) for fp in fingerprint_rows}) > 1  # Different numbers of features
 
     print(f"Successfully created CSR matrix: {X_sparse.shape}, nnz: {X_sparse.nnz}")
     print(f"Fingerprint sizes: {[len(fp) for fp in fingerprint_rows]}")
diff --git a/tests/test_fingerprint_csr_conversion.py b/tests/test_fingerprint_csr_conversion.py
index 1d2276c..ba2a9b0 100644
--- a/tests/test_fingerprint_csr_conversion.py
+++ b/tests/test_fingerprint_csr_conversion.py
@@ -1,8 +1,6 @@
-import pytest
 import numpy as np
-from scipy.sparse import csr_matrix
 from rdkit import Chem
-from rdkit.Chem import AllChem
+
 from laplaciannb.fingerprint_utils import rdkit_to_csr
 
 
@@ -18,7 +16,6 @@ def get_test_molecules():
 
 
 class TestFingerprintCSRConversion:
-
     def test_rdkit_to_csr_basic(self):
         """Test basic RDKit to CSR conversion"""
         smiles = ["CCO", "CC", "CCC"]
@@ -37,6 +34,7 @@ def test_fingerprint_consistency(self):
         # Calculate total expected fingerprint bits across all molecules
         # Use the same API as the function
         from rdkit.Chem import rdFingerprintGenerator
+
         mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)
 
         total_expected_bits = 0

From 9d053fc6adc5a3a698cd991e5540077a9276c5e4 Mon Sep 17 00:00:00 2001
From: Bartosz Baranowski <bartekbaranow@gmail.com>
Date: Thu, 21 Aug 2025 12:39:47 +0200
Subject: [PATCH 8/8] ruff check

---
 examples/simple_example.py           |  6 +++---
 src/laplaciannb/fingerprint_utils.py | 20 ++++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/simple_example.py b/examples/simple_example.py
index e305a80..c59a123 100644
--- a/examples/simple_example.py
+++ b/examples/simple_example.py
@@ -43,7 +43,7 @@
 print("\nResults:")
 print("-" * 40)
 for i, (smiles_str, true_label, pred_label, prob) in enumerate(zip(smiles, y, predictions, probabilities)):
-    print(f"Molecule {i+1}: {smiles_str[:20]}")
+    print(f"Molecule {i + 1}: {smiles_str[:20]}")
     print(f"  True: {true_label}, Predicted: {pred_label}")
     print(f"  Probabilities: [Inactive: {prob[0]:.3f}, Active: {prob[1]:.3f}]")
     print()
@@ -74,7 +74,7 @@
         # Convert to the same uint32 indices used in the matrix
         converted_indices = [int(np.uint32(bit & 0xFFFFFFFF)) for bit in original_indices]
 
-        print(f"\nMolecule {i+1}: {smiles_str}")
+        print(f"\nMolecule {i + 1}: {smiles_str}")
         print(f"  Original indices: {original_indices[:10]}{'...' if len(original_indices) > 10 else ''}")
         print(f"  Converted indices: {converted_indices[:10]}{'...' if len(converted_indices) > 10 else ''}")
         print(f"  Total fingerprint bits: {len(original_indices)}")
@@ -89,7 +89,7 @@
     end_idx = X.indptr[i + 1]
     row_indices = X.indices[start_idx:end_idx]
 
-    print(f"Molecule {i+1} active bits: {row_indices[:10]}{'...' if len(row_indices) > 10 else ''}")
+    print(f"Molecule {i + 1} active bits: {row_indices[:10]}{'...' if len(row_indices) > 10 else ''}")
     print(f"  Total: {len(row_indices)} active bits")
 
 print("\n✓ You can now map back to original RDKit fingerprint indices")
diff --git a/src/laplaciannb/fingerprint_utils.py b/src/laplaciannb/fingerprint_utils.py
index 557a8c7..3ef0ea2 100644
--- a/src/laplaciannb/fingerprint_utils.py
+++ b/src/laplaciannb/fingerprint_utils.py
@@ -91,7 +91,7 @@ def rdkit_to_csr(smiles_list, radius=2, show_progress=True):
     print(f"Conversion completed in {conversion_time:.3f} seconds")
     print(f"Valid molecules: {valid_molecules}/{len(mol_list)}")
     print(f"Total fingerprint bits: {total_bits:,}")
-    print(f"Average bits per molecule: {total_bits/valid_molecules:.1f}")
+    print(f"Average bits per molecule: {total_bits / valid_molecules:.1f}")
     print(f"Matrix shape: {matrix.shape}")
     print(f"Matrix sparsity: {sparsity:.6f}")
     print(f"Memory usage: {(matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes) / 1024**2:.2f} MB")
@@ -157,7 +157,7 @@ def benchmark_fingerprint_conversion(n_molecules=100000, radii=[2], molecules_pe
     print("100 molecules, radius=2:")
     print(f"  Sparse matrix: {sparse_memory:.2f} MB")
     print(f"  Dense equivalent: {dense_memory:,.0f} MB")
-    print(f"  Memory reduction: {(1 - sparse_memory/dense_memory)*100:.3f}%")
+    print(f"  Memory reduction: {(1 - sparse_memory / dense_memory) * 100:.3f}%")
 
     # Throughput summary
     print("\nThroughput Summary:")
@@ -242,7 +242,7 @@ def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None, r
     gen_time = time.time() - start_gen
 
     print(f"Dataset generation completed in {gen_time:.2f} seconds")
-    print(f"Average generation rate: {target_molecules/gen_time:.0f} molecules/second")
+    print(f"Average generation rate: {target_molecules / gen_time:.0f} molecules/second")
 
     # Performance tracking
     results = []
@@ -322,8 +322,8 @@ def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None, r
         projected_1M = (1_000_000 / latest["rate"]) if latest["rate"] > 0 else float("inf")
         projected_memory_1M = latest["memory_mb"] * (1_000_000 / latest["molecules"])
 
-        print(f"Projected time for 1M molecules: {projected_1M/60:.1f} minutes")
-        print(f"Projected memory for 1M molecules: {projected_memory_1M/1024:.1f} GB")
+        print(f"Projected time for 1M molecules: {projected_1M / 60:.1f} minutes")
+        print(f"Projected memory for 1M molecules: {projected_memory_1M / 1024:.1f} GB")
 
         # Realistic dataset recommendations
         if latest["rate"] > 0:
@@ -331,8 +331,8 @@ def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None, r
             molecules_per_hour = molecules_per_minute * 60
 
             print("\nRealistic Usage Recommendations:")
-            print(f"  Interactive analysis: Up to {int(molecules_per_minute/10):,} molecules")
-            print(f"  Batch processing: Up to {int(molecules_per_hour/10):,} molecules")
+            print(f"  Interactive analysis: Up to {int(molecules_per_minute / 10):,} molecules")
+            print(f"  Batch processing: Up to {int(molecules_per_hour / 10):,} molecules")
             print(f"  Production pipeline: {int(molecules_per_hour):,}+ molecules/hour")
 
     # Memory efficiency showcase
@@ -350,12 +350,12 @@ def benchmark_large_scale_conversion(target_molecules=100000, test_sizes=None, r
         print(f"{n_mols:,} molecules:")
         print(f"  Sparse matrix: {sparse_mb:.1f} MB")
         print(f"  Dense equivalent: {dense_gb:,.0f} GB")
-        print(f"  Space savings: {(1 - sparse_mb/(dense_gb*1024))*100:.6f}%")
+        print(f"  Space savings: {(1 - sparse_mb / (dense_gb * 1024)) * 100:.6f}%")
 
-    print(f"\n{'='*80}")
+    print(f"\n{'=' * 80}")
     print("✓ Large-scale benchmark completed successfully!")
     print(f"✓ LaplacianNB can efficiently handle datasets up to {target_molecules:,} molecules")
-    print(f"{'='*80}")
+    print(f"{'=' * 80}")
 
     return results