Today, try the Techtonique web app, a tool designed to help you make informed, data-driven decisions using Mathematics, Statistics, Machine Learning, and Data Visualization. Here is a tutorial with audio, video, code, and slides: https://moudiki2.gumroad.com/l/nrhgb. 100 API requests are now (and forever) offered to every user, regardless of the pricing tier.
This week’s post is about mlsauce (again), and LSBoost in particular. No new working paper yet (still working on it), but:
- An updated R version, working at least on Linux and macOS (Windows users: if it doesn't work on your machine, try the Windows Subsystem for Linux, WSL)
- An updated documentation page
- My first StackOverflow question ever (still unanswered)
The examples below probably include some kind of leakage (great if you can spot it), but take them as an illustration.
0 - Import packages
Installing mlsauce from GitHub remains the preferred way to obtain it.
#!pip install numpy matplotlib scikit-learn
!pip install git+https://github.com/Techtonique/mlsauce.git --verbose
# Importing necessary libraries
import mlsauce as ms
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA # Non-linear dimensionality reduction through the use of kernels
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
1 - Data preprocessing
# Load breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target
print(X.shape)
print(y.shape)
(569, 30)
(569,)
1 - 1 Kernel PCA features
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Perform Kernel PCA to extract 2 'good' features
# (easier to visualize)
kpca = KernelPCA(n_components=2)
X_kpca = kpca.fit_transform(X_scaled)
# Splitting the dataset into training and testing sets
X_train_kpca, X_test_kpca, y_train, y_test = train_test_split(X_kpca, y, test_size=0.2,
                                                              random_state=32)
# Plotting the two principal components
plt.figure(figsize=(8, 6))
plt.scatter(X_test_kpca[:, 0], X_test_kpca[:, 1], c=y_test, cmap='viridis')
plt.xlabel('Kernel Principal Component 1')
plt.ylabel('Kernel Principal Component 2')
plt.title('Kernel PCA of Breast Cancer Dataset')
plt.colorbar(label='Malignant (0) / Benign (1)')
plt.show()
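About the leakage hinted at in the introduction: here, the scaler and KernelPCA are fitted on the full dataset before the train/test split. For reference, a leakage-free variant (a minimal sketch with a placeholder LogisticRegression classifier, not what this post does) would fit all preprocessing inside a scikit-learn Pipeline, on the training split only:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=32)
# All preprocessing is fitted on the training split only,
# then applied unchanged to the test split
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("kpca", KernelPCA(n_components=2)),
    ("clf", LogisticRegression()),  # placeholder classifier
])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))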
1 - 2 ‘Important’ features
# Training a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X, y)
# Feature importances
importances = rf_classifier.feature_importances_
print(importances)
indices = np.argsort(importances)[::-1]
print(indices)
# Select top 2 features
top_two_indices = indices[:2]
print(data.feature_names[top_two_indices])
X_rf = X[:,top_two_indices]
# Splitting the dataset into training and testing sets
X_train_rf, X_test_rf, y_train, y_test = train_test_split(X_rf, y, test_size=0.2,
                                                          random_state=32)
# Plotting the two most 'important' features
plt.figure(figsize=(8, 6))
plt.scatter(X_test_rf[:, 0], X_test_rf[:, 1], c=y_test, cmap='viridis')
plt.xlabel("Most 'important' feature 1")
plt.ylabel("Most 'important' feature 2")
plt.title('Response for Breast Cancer Dataset')
plt.colorbar(label='Malignant (0) / Benign (1)')
plt.show()
[0.03484323 0.01522515 0.06799034 0.06046164 0.00795845 0.01159704
0.06691736 0.10704566 0.00342279 0.00261508 0.0142637 0.00374427
0.01008506 0.02955283 0.00472157 0.00561183 0.00581969 0.00375975
0.00354597 0.00594233 0.08284828 0.01748526 0.0808497 0.13935694
0.01223202 0.01986386 0.03733871 0.13222509 0.00817908 0.00449731]
[23 27 7 20 22 2 6 3 26 0 13 25 21 1 10 24 5 12 28 4 19 16 15 14
29 17 11 18 8 9]
['worst area' 'worst concave points']
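The same caveat applies in this section: the Random Forest ranking the features sees the full dataset, including future test points. A leakage-free sketch (again with a placeholder downstream classifier) would learn the selection on the training split only, e.g. with SelectFromModel:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=32)
# keep exactly the top 2 features, ranked on the training split only
top2 = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                       max_features=2, threshold=-np.inf)
pipe = Pipeline([("select", top2),
                 ("scaler", StandardScaler()),
                 ("clf", LogisticRegression())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))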
2 - Adjust LSBoostClassifier
LSBoostClassifier's hyperparameters are tuned below with GPopt, a package for Bayesian optimization using Gaussian process surrogates.
!pip install GPopt
import GPopt as gp
import mlsauce as ms
from sklearn.model_selection import cross_val_score
opt_objects_lsboost = []
def lsboost_cv(X_train, y_train,
               n_estimators=100,
               learning_rate=0.1,
               n_hidden_features=5,
               reg_lambda=0.1,
               dropout=0,
               tolerance=1e-4,
               n_clusters=2,
               seed=123,
               solver="ridge"):
    estimator = ms.LSBoostClassifier(n_estimators=int(n_estimators),
                                     learning_rate=learning_rate,
                                     n_hidden_features=int(n_hidden_features),
                                     reg_lambda=reg_lambda,
                                     dropout=dropout,
                                     tolerance=tolerance,
                                     n_clusters=int(n_clusters),
                                     seed=seed, solver=solver, verbose=0)
    # negated 5-fold cross-validated F1 score, because GPOpt minimizes its objective
    return -cross_val_score(estimator, X_train, y_train,
                            scoring='f1_macro', cv=5).mean()
def optimize_lsboost(X_train, y_train, solver="ridge"):
    # objective function for hyperparams tuning
    def crossval_objective(x):
        return lsboost_cv(X_train=X_train,
                          y_train=y_train,
                          n_estimators=int(x[0]),
                          learning_rate=x[1],
                          n_hidden_features=int(x[2]),
                          reg_lambda=x[3],
                          dropout=x[4],
                          tolerance=x[5],
                          n_clusters=int(x[6]),
                          solver=solver)
    gp_opt = gp.GPOpt(objective_func=crossval_objective,
                      lower_bound=np.array([ 10, 0.001,   5, 1e-2,   0,    0, 0]),
                      upper_bound=np.array([250,   0.4, 250,  1e4, 0.7, 1e-1, 4]),
                      params_names=["n_estimators", "learning_rate",
                                    "n_hidden_features", "reg_lambda",
                                    "dropout", "tolerance", "n_clusters"],
                      n_init=10, n_iter=190, seed=123)
    return {'parameters': gp_opt.optimize(verbose=2, abs_tol=1e-2), 'opt_object': gp_opt}
opt_objects_lsboost.append(optimize_lsboost(X_train_kpca, y_train, solver="ridge"))
opt_objects_lsboost.append(optimize_lsboost(X_train_rf, y_train, solver="ridge"))
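Before moving on to the decision-boundary plots, one tuned configuration can be sanity-checked on the held-out split. A minimal sketch, reusing the objects defined above (the casts to int mirror what's done in the next section):
# Refit the tuned model on the training split, score on the test split
best = dict(opt_objects_lsboost[0]['parameters'].best_params)
# integer-valued hyperparameters come back as floats from GPOpt
for param in ("n_estimators", "n_hidden_features", "n_clusters"):
    best[param] = int(best[param])
clf = ms.LSBoostClassifier(**best)
clf.fit(X_train_kpca, y_train)
print(accuracy_score(y_test, clf.predict(X_test_kpca)))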
3 - Graphs
display(opt_objects_lsboost[0]['parameters'].best_params)
display(opt_objects_lsboost[1]['parameters'].best_params)
# Cast the integer-valued hyperparameters back to int
for res in opt_objects_lsboost:
    for param in ("n_estimators", "n_hidden_features", "n_clusters"):
        res['parameters'].best_params[param] = int(res['parameters'].best_params[param])
{'n_estimators': 221.10595703125,
'learning_rate': 0.12772097778320313,
'n_hidden_features': 45.053253173828125,
'reg_lambda': 2496.6505697631837,
'dropout': 0.2851226806640625,
'tolerance': 0.0047698974609375,
'n_clusters': 3.1986083984375}
{'n_estimators': 193.544921875,
'learning_rate': 0.3466668701171875,
'n_hidden_features': 208.9971923828125,
'reg_lambda': 1866.4632116699217,
'dropout': 0.37947998046875,
'tolerance': 0.01290283203125,
'n_clusters': 3.04443359375}
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
classifiers = [
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    ms.LSBoostClassifier(**opt_objects_lsboost[0]['parameters'].best_params),
    ms.LSBoostClassifier(**opt_objects_lsboost[1]['parameters'].best_params),
]
names = ["rf", "gb", "lsboost_pca", "lsboost_rf"]
figure = plt.figure(figsize=(27, 9))
i = 1
datasets = [(X_kpca, y), (X_rf, y)]
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds[0], ds[1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=42
    )
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(["#FF0000", "#0000FF"])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
    # Plot the testing points
    ax.scatter(
        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"
    )
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf = make_pipeline(StandardScaler(), clf)
        clf.fit(X_train, y_train)
        try:
            score = clf.score(X_test, y_test)
        except Exception:  # no scoring method available yet for prediction sets
            score = np.mean(clf.predict_proba(X_test).argmax(axis=1) == y_test)
        DecisionBoundaryDisplay.from_estimator(
            clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )
        # Plot the training points
        ax.scatter(
            X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
        )
        # Plot the testing points
        ax.scatter(
            X_test[:, 0],
            X_test[:, 1],
            c=y_test,
            cmap=cm_bright,
            edgecolors="k",
            alpha=0.6,
        )
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            ("%.2f" % score).lstrip("0"),
            size=15,
            horizontalalignment="right",
        )
        i += 1

plt.tight_layout()
plt.show()