I recently released genbooster, a Python package (usable from R) containing Gradient Boosting and Bootstrap aggregating implementations that use a Rust backend. Each base learner in the ensembles uses randomized features as a form of feature engineering. The package was downloaded 3000 times in 5 days, so I guess it's somewhat useful.
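
To give an idea of what "randomized features" means here, below is a rough Python sketch of the general recipe. This is an illustration only, not genbooster's exact internals: each base learner is fit on the original inputs augmented with a random nonlinear transformation of them.

import numpy as np

# Illustration only: NOT genbooster's exact internals.
# Each base learner sees a random nonlinear transformation of the inputs,
# which acts as a form of automatic feature engineering.
rng = np.random.default_rng(42)
X = rng.normal(size=(100, 5))   # toy input: 100 rows, 5 features
W = rng.normal(size=(5, 10))    # random projection matrix (hypothetical)
H = np.maximum(X @ W, 0.0)      # randomized hidden features (ReLU activation)
X_aug = np.hstack([X, H])       # augmented inputs passed to each base learner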

The previous version of my generic Gradient Boosting algorithm was implemented in the Python package mlsauce (see #172, #169, #166, #165), but that package can be difficult to install on some systems. If you're using Windows, for example, you may want to use the Windows Subsystem for Linux (WSL). Installing mlsauce directly from PyPI is also nearly impossible; it needs to be installed from GitHub instead.

This post is a quick overview of genbooster, in Python and R. It was also an occasion to "learn"/try the Rust programming language, and I'm happy with the result: a stable package that's easy to install. However, I wasn't blown away by the speed (hey Rust folks ;)), which is roughly equivalent to Cython's (that is, C under the hood).

Figure: Bootstrap aggregating (source: Wikipedia)

Python version

(Jupyter notebook: 2025_10_22_genbooster_randombag_rust_python)
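
For reference, here is a minimal Python sketch of the regression workflow, using the same module paths and base_estimator argument as the R session below. The scikit-learn diabetes dataset is a stand-in here, not necessarily the dataset used in the notebook.

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeRegressor
from genbooster.genboosterregressor import BoosterRegressor

# Toy regression data (stand-in for the notebook's dataset)
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

# Gradient boosting with an ExtraTree base learner, as in the R code below
regr = BoosterRegressor(base_estimator=ExtraTreeRegressor())
regr.fit(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_test, regr.predict(X_test)))
print(f"BoosterRegressor RMSE: {rmse:.2f}")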

R version

!mkdir -p ~/.virtualenvs
!python3 -m venv ~/.virtualenvs/r-reticulate
# 'source .../activate' would not persist across separate shell calls,
# so call the virtualenv's pip directly
!~/.virtualenvs/r-reticulate/bin/pip install numpy pandas matplotlib scikit-learn tqdm genbooster
utils::install.packages("reticulate")
library(reticulate)

# Point reticulate at the virtual environment created above
use_virtualenv("r-reticulate", required = TRUE)

# Import required Python libraries
np <- import("numpy")
pd <- import("pandas")
plt <- import("matplotlib.pyplot")
sklearn <- import("sklearn")
tqdm <- import("tqdm")
time <- import("time")

# Import the genbooster estimators (convert = FALSE keeps them as Python objects)
BoosterRegressor <- import("genbooster.genboosterregressor", convert = FALSE)$BoosterRegressor
BoosterClassifier <- import("genbooster.genboosterclassifier", convert = FALSE)$BoosterClassifier
RandomBagRegressor <- import("genbooster.randombagregressor", convert = FALSE)$RandomBagRegressor
RandomBagClassifier <- import("genbooster.randombagclassifier", convert = FALSE)$RandomBagClassifier

# Import the base learner and helpers from scikit-learn
ExtraTreeRegressor <- import("sklearn.tree")$ExtraTreeRegressor
mean_squared_error <- import("sklearn.metrics")$mean_squared_error
train_test_split <- import("sklearn.model_selection")$train_test_split


# Load Boston dataset
url <- "https://raw.githubusercontent.com/Techtonique/datasets/refs/heads/main/tabular/regression/boston_dataset2.csv"
df <- read.csv(url)

# Split dataset into features and target
y <- df[["target"]]
X <- df[, setdiff(colnames(df), c("target", "training_index"))]

# Split into training and testing sets
set.seed(123)
index_train <- sample.int(nrow(df), floor(nrow(df) * 0.8), replace = FALSE)
X_train <- X[index_train, ]
X_test <- X[-index_train,]
y_train <- y[index_train]
y_test <- y[-index_train]

# BoosterRegressor on Boston dataset
regr <- BoosterRegressor(base_estimator = ExtraTreeRegressor())
start <- time$time()
regr$fit(X_train, y_train)
end <- time$time()
cat(sprintf("Time taken: %.2f seconds\n", end - start))
rmse <- np$sqrt(mean_squared_error(y_test, regr$predict(X_test)))
cat(sprintf("BoosterRegressor RMSE: %.2f\n", rmse))

# RandomBagRegressor on Boston dataset
regr <- RandomBagRegressor(base_estimator = ExtraTreeRegressor())
start <- time$time()
regr$fit(X_train, y_train)
end <- time$time()
cat(sprintf("Time taken: %.2f seconds\n", end - start))
rmse <- np$sqrt(mean_squared_error(y_test, regr$predict(X_test)))
cat(sprintf("RandomBagRegressor RMSE: %.2f\n", rmse))


# Iris dataset: features and 0-based class labels for classification
X <- as.matrix(iris[, 1:4])
y <- as.numeric(iris[, 5]) - 1
# Split into training and testing sets
set.seed(123)
index_train <- sample.int(nrow(iris), nrow(iris) * 0.8, replace = FALSE)
X_train <- X[index_train, ]
X_test <- X[-index_train,]
y_train <- y[index_train]
y_test <- y[-index_train]

# BoosterClassifier on iris dataset
clf <- BoosterClassifier(base_estimator = ExtraTreeRegressor())
start <- time$time()
clf$fit(X_train, y_train)
end <- time$time()
cat(sprintf("Time taken: %.2f seconds\n", end - start))
accuracy <- mean(y_test == as.numeric(clf$predict(X_test)))
cat(sprintf("BoosterClassifier accuracy: %.2f\n", accuracy))

# RandomBagClassifier on iris dataset
clf <- RandomBagClassifier(base_estimator = ExtraTreeRegressor())
start <- time$time()
clf$fit(X_train, y_train)
end <- time$time()
cat(sprintf("Time taken: %.2f seconds\n", end - start))
accuracy <- mean(y_test == as.numeric(clf$predict(X_test)))
cat(sprintf("RandomBagClassifier accuracy: %.2f\n", accuracy))
Output:

Time taken: 0.39 seconds
BoosterRegressor RMSE: 3.49
Time taken: 0.44 seconds
RandomBagRegressor RMSE: 4.06
Time taken: 0.28 seconds
BoosterClassifier accuracy: 0.97
Time taken: 0.37 seconds
RandomBagClassifier accuracy: 0.97
