Today, give the Techtonique web app a try: a tool designed to help you make informed, data-driven decisions using Mathematics, Statistics, Machine Learning, and Data Visualization. Here is a tutorial with audio, video, code, and slides: https://moudiki2.gumroad.com/l/nrhgb
Remark: I have no affiliation with these projects, but I love using quarto and Visual Studio Code's Jupyter To Markdown extension for these posts. Just wanted to mention that.
Today:

- I'll show you how to use Python's nnetsauce (version 0.17.0) to download data from the R universe API.
- I'll use nnetsauce's automated Machine Learning (AutoML) to adjust a model to the Affairs data set, also studied in this post from 2023-12-25.
- I'll explain the model's decisions and obtain prediction intervals on the test set.
0 - Install packages
pip install nnetsauce the-teller imbalanced-learn matplotlib IPython
1 - Import data from R universe API
import nnetsauce as ns
import teller as tr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from time import time
downloader = ns.Downloader()
df = downloader.download(pkgname="AER", dataset="Affairs", source="https://cran.r-universe.dev/")
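For reference, nnetsauce's Downloader wraps the r-universe data API. A rough equivalent using pandas alone might look like the sketch below; the endpoint path (/{package}/data/{dataset}/csv) is my assumption about the r-universe API, not something taken from the nnetsauce source.

# hypothetical direct download, assuming the r-universe CSV endpoint
# exists in this form (see lead-in above)
url = "https://cran.r-universe.dev/AER/data/Affairs/csv"  # assumed endpoint
df_alt = pd.read_csv(url)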
print(f"===== df: \n {df} \n")
print(f"===== df.dtypes: \n {df.dtypes}")
===== df: 
     affairs  gender    age  yearsmarried children  religiousness  education  \
0          0    male  37.00         10.00       no              3         18
1          0  female  27.00          4.00       no              4         14
2          0  female  32.00         15.00      yes              1         12
3          0    male  57.00         15.00      yes              5         18
4          0    male  22.00          0.75       no              2         17
..       ...     ...    ...           ...      ...            ...        ...
596        1    male  22.00          1.50      yes              1         12
597        7  female  32.00         10.00      yes              2         18
598        2    male  32.00         10.00      yes              2         17
599        2    male  22.00          7.00      yes              3         18
600        1  female  32.00         15.00      yes              3         14

     occupation  rating
0             7       4
1             6       4
2             1       4
3             6       5
4             6       3
..          ...     ...
596           2       5
597           5       4
598           6       5
599           6       2
600           1       5

[601 rows x 9 columns] 

===== df.dtypes: 
affairs            int64
gender            object
age              float64
yearsmarried     float64
children          object
religiousness      int64
education          int64
occupation         int64
rating             int64
dtype: object
The data set is studied extensively in this post.
2 - Data preprocessing
# one-hot encode the categorical columns
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
df_char_encoded = encoder.fit_transform(df[['gender', 'children']])
# drop the reference categories to avoid redundant (collinear) columns
df_char_encoded.drop(columns=['gender_female', 'children_no'], inplace=True)
# numeric covariates, with the response 'affairs' excluded
df_minus_char = df.drop(columns=['affairs', 'gender', 'children'])
X = pd.concat((df_minus_char, df_char_encoded), axis=1)
y = df['affairs'].values.astype(np.float64)
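A quick sanity check of the resulting design matrix; the expected column names follow directly from the encoding above.

# expected columns: age, yearsmarried, religiousness, education,
# occupation, rating, gender_male, children_yes
print(X.columns.tolist())
print(X.shape, y.shape)  # (601, 8) and (601,)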
Upsampling (the response is dominated by zeros, so the rarer values are oversampled):
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)
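RandomOverSampler treats each distinct value of the response as a class and replicates rows of the rarer classes until all counts match the majority class. A quick way to see its effect:

# distinct response values and their counts, before and after oversampling
print(np.unique(y, return_counts=True))
print(np.unique(y_res, return_counts=True))  # counts should now be equal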
Split data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res,
test_size=0.2,
random_state=13)
3 - Automated Machine Learning with nnetsauce
# build the AutoML regressor
clf = ns.LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None,
n_hidden_features=10)
# adjust on training and evaluate on test data
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
Models leaderboard
display(models)
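The leaderboard `models` is a pandas DataFrame with one row per fitted model. Assuming it follows nnetsauce's usual layout (columns such as RMSE, R-Squared and Time Taken; the exact names are my assumption here), the top-ranked models can be inspected with:

# peek at the best-ranked models (column names assumed, see lead-in)
print(models.head())
# models.sort_values(by="RMSE").head()  # explicit sort, if needed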
4 - Explaining the best model's "decisions"
Notice the best model obtained here: a CustomRegressor with an ExtraTreesRegressor base learner.
model_dictionary = clf.provide_models(X_train, X_test, y_train, y_test)
regr = model_dictionary['CustomRegressor(ExtraTreesRegressor)']
regr
CustomRegressor(n_hidden_features=10, obj=ExtraTreesRegressor(random_state=42))
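What does CustomRegressor add on top of ExtraTreesRegressor? In nnetsauce, the base learner is fitted on the original covariates augmented with n_hidden_features nonlinear transformations of them (a quasi-randomized network layer). The sketch below is purely conceptual, with weights and activation chosen for illustration, not nnetsauce's actual internals:

# conceptual sketch of the feature augmentation (illustrative only)
rng = np.random.default_rng(42)
W = rng.uniform(size=(X_train.shape[1], 10))  # 10 = n_hidden_features
H = np.maximum(X_train.values @ W, 0.0)       # nonlinear hidden features, ReLU-like
X_aug = np.hstack([X_train.values, H])        # the base learner sees [X, g(XW)]
print(X_aug.shape)                            # original columns + 10 hidden features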
# creating the explainer
expr0 = tr.Explainer(obj=regr)
expr0
Explainer(obj=CustomRegressor(n_hidden_features=10, obj=ExtraTreesRegressor(random_state=42)))
start = time()
# creating an Explainer for the fitted object `regr`
expr = tr.Explainer(obj=regr)
# covariates' effects
expr.fit(X_test.values, y_test, X_names=X_test.columns.to_list(), method="ci")
# summary of results
expr.summary()
# timing
print(f"\n Elapsed: {time()-start}")
Score (rmse): 
 1.083

Residuals: 
     Min     1Q  Median    3Q   Max
  -12.00  -0.00    0.00  0.00  1.33

Tests on marginal effects (Jackknife): 
               Estimate  Std. Error  95% lbound  95% ubound  Pr(>|t|)     
gender_male        4.24        2.16       -0.00        8.49      0.05    .
yearsmarried       0.18        0.03        0.12        0.24      0.00  ***
occupation         0.11        0.07       -0.02        0.25      0.10    -
education          0.08        0.04       -0.00        0.17      0.06    .
age               -0.06        0.01       -0.09       -0.03      0.00  ***
religiousness     -0.45        0.10       -0.65       -0.25      0.00  ***
children_yes      -0.48        0.27       -1.01        0.05      0.08    .
rating            -0.48        0.09       -0.66       -0.30      0.00  ***

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘-’ 1

Multiple R-squared: 0.929, Adjusted R-squared: 0.928

 Elapsed: 118.04078912734985
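For intuition about the table above: the estimate for each covariate is essentially an average numerical derivative of the fitted model with respect to that covariate over the test set, and the jackknife provides the standard errors. Here is a minimal sketch of the point estimate for yearsmarried; the step size eps is my own choice, and the result is sensitive to it for tree-based learners, which are piecewise constant:

# crude average numerical derivative of regr w.r.t. 'yearsmarried' (illustrative only)
eps = 0.1  # arbitrary step size
j = X_test.columns.get_loc("yearsmarried")
X_shifted = X_test.values.copy()
X_shifted[:, j] += eps
effect = np.mean((regr.predict(X_shifted) - regr.predict(X_test.values)) / eps)
print(effect)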
The results are quite similar to those obtained in the previous study, with the same factors being associated with the response in the same way.
5 - Prediction intervals on the test set
pi = tr.PredictionInterval(regr, method="splitconformal", level=0.95)
pi.fit(X_train, y_train)
preds = pi.predict(X_test, return_pi=True)
pred = preds[0]
y_lower = preds[1]
y_upper = preds[2]
# compute and display the average coverage
print(f"coverage rate = {np.mean((y_test >= y_lower) & (y_test <= y_upper))}")
coverage rate = 0.9575645756457565
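Split conformal prediction is simple enough to sketch by hand: fit the model on one half of the training data, compute absolute residuals on the held-out half, and use their 95% quantile as the half-width of every prediction interval. Below is a minimal sketch (my own implementation, assuming the estimator is scikit-learn compatible, which nnetsauce models are designed to be; this is not necessarily teller's exact procedure):

from sklearn.base import clone

# split-conformal intervals by hand (illustrative sketch)
X_fit, X_cal, y_fit, y_cal = train_test_split(X_train, y_train,
                                              test_size=0.5, random_state=0)
regr_sc = clone(regr).fit(X_fit.values, y_fit)
abs_res = np.abs(y_cal - regr_sc.predict(X_cal.values))  # calibration residuals
q = np.quantile(abs_res, 0.95)  # conformal quantile (finite-sample correction omitted)
pred_sc = regr_sc.predict(X_test.values)
lower_sc, upper_sc = pred_sc - q, pred_sc + q  # constant-width 95% intervals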
import warnings
warnings.filterwarnings('ignore')
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
split_color = 'green'
local_color = 'gray'
np.random.seed(1)
def plot_func(x,
              y,
              y_u=None,
              y_l=None,
              pred=None,
              shade_color="",
              method_name="",
              title=""):
    fig = plt.figure()
    # test set observations
    plt.plot(x, y, 'k.', alpha=.3, markersize=10,
             fillstyle='full', label=u'Test set observations')
    # shaded prediction interval
    if (y_u is not None) and (y_l is not None):
        plt.fill(np.concatenate([x, x[::-1]]),
                 np.concatenate([y_u, y_l[::-1]]),
                 alpha=.3, fc=shade_color, ec='None',
                 label=method_name + ' Prediction interval')
    # point predictions
    if pred is not None:
        plt.plot(x, pred, 'k--', lw=2, alpha=0.9,
                 label=u'Predicted value')
    #plt.ylim([-2.5, 7])
    plt.xlabel('$X$')
    plt.ylabel('$Y$')
    plt.legend(loc='upper right')
    plt.title(title)
    plt.show()
max_idx = 50
plot_func(x=range(max_idx),
          y=y_test[0:max_idx],
          y_u=y_upper[0:max_idx],
          y_l=y_lower[0:max_idx],
          pred=pred[0:max_idx],
          shade_color=split_color,
          title=f"Split conformal ({max_idx} first points in test set)")