Today, give a try to Techtonique web app, a tool designed to help you make informed, data-driven decisions using Mathematics, Statistics, Machine Learning, and Data Visualization. Here is a tutorial with audio, video, code, and slides: https://moudiki2.gumroad.com/l/nrhgb. 100 API requests are now (and forever) offered to every user every month, no matter the pricing tier.

Remark: I have no affiliation with these projects, but I love using quarto and Visual Studio Code’s extension Jupyter To Markdown for these posts. Just wanted to say that.

Today:

I’ll show you how to use Python’s nnetsauce (version 0.17.0) to download data from the R universe API.
I’ll use nnetsauce’s automated Machine Learning (AutoML) to adjust a model to the Affairs data set also studied in this post from 2023-12-25.
Understand the model’s decisions and obtain prediction intervals on the test set.

0 - Install packages

pip install nnetsauce the-teller imbalanced-learn matplotlib IPython

1- Import data from R universe API

import nnetsauce as ns
import teller as tr 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from time import time

downloader = ns.Downloader()

df = downloader.download(pkgname="AER", dataset="Affairs", source = "https://cran.r-universe.dev/")

print(f"===== df: \n {df} \n")
print(f"===== df.dtypes: \n {df.dtypes}")

===== df: 
      affairs  gender   age  yearsmarried children  religiousness  education  \
0          0    male 37.00         10.00       no              3         18   
1          0  female 27.00          4.00       no              4         14   
2          0  female 32.00         15.00      yes              1         12   
3          0    male 57.00         15.00      yes              5         18   
4          0    male 22.00          0.75       no              2         17   
..       ...     ...   ...           ...      ...            ...        ...   
596        1    male 22.00          1.50      yes              1         12   
597        7  female 32.00         10.00      yes              2         18   
598        2    male 32.00         10.00      yes              2         17   
599        2    male 22.00          7.00      yes              3         18   
600        1  female 32.00         15.00      yes              3         14   

     occupation  rating  
0             7       4  
1             6       4  
2             1       4  
3             6       5  
4             6       3  
..          ...     ...  
596           2       5  
597           5       4  
598           6       5  
599           6       2  
600           1       5  

[601 rows x 9 columns] 

===== df.dtypes: 
 affairs            int64
gender            object
age              float64
yearsmarried     float64
children          object
religiousness      int64
education          int64
occupation         int64
rating             int64
dtype: object

The data set is extensively in studied in this post.

2 - Data preprocessing

encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
df_char_encoded = encoder.fit_transform(df[['gender', 'children']])
df_char_encoded.drop(columns = ['gender_female', 'children_no'], axis=1, inplace=True)
df_minus_char = df.drop(columns=['affairs', 'gender', 'children'])

X = pd.concat((df_minus_char, df_char_encoded), axis=1)
y = np.float64(df['affairs'].values)

Upsampling

ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)

Split data into training test and test set

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res,
                                                    test_size=0.2, 
                                                    random_state=13)

3 - Automated Machine Learning with `nnetsauce`

# build the AutoML regressor 
clf = ns.LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None, 
n_hidden_features=10)

# adjust on training and evaluate on test data
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/32 [00:00<?, ?it/s]  3%|▎         | 1/32 [00:00<00:03,  7.76it/s]  9%|▉         | 3/32 [00:00<00:02, 13.29it/s] 22%|██▏       | 7/32 [00:00<00:01, 16.00it/s] 28%|██▊       | 9/32 [00:00<00:02,  9.88it/s] 38%|███▊      | 12/32 [00:00<00:01, 13.48it/s] 50%|█████     | 16/32 [00:01<00:01, 14.66it/s] 62%|██████▎   | 20/32 [00:01<00:00, 18.59it/s] 72%|███████▏  | 23/32 [00:02<00:01,  6.32it/s] 78%|███████▊  | 25/32 [00:02<00:01,  6.57it/s] 84%|████████▍ | 27/32 [00:03<00:01,  4.74it/s] 97%|█████████▋| 31/32 [00:03<00:00,  7.31it/s]100%|██████████| 32/32 [00:03<00:00,  8.57it/s]

Models leaderboard

display(models)

4 - Explaining best model’s “decisions”

Notice the best here.

model_dictionary = clf.provide_models(X_train, X_test, y_train, y_test)
regr = model_dictionary['CustomRegressor(ExtraTreesRegressor)']
regr

CustomRegressor(n_hidden_features=10, obj=ExtraTreesRegressor(random_state=42))

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# creating the explainer
expr0 = tr.Explainer(obj=regr)
expr0

Explainer(obj=CustomRegressor(n_hidden_features=10,
                              obj=ExtraTreesRegressor(random_state=42)))

start = time()
# creating an Explainer for the fitted object `regr1`
expr = tr.Explainer(obj=regr)
# covariates' effects
expr.fit(X_test.values, y_test, X_names=X_test.columns.to_list(), method="ci")
# summary of results
expr.summary()
# timing
print(f"\n Elapsed: {time()-start}")

Score (rmse): 
 1.083


Residuals: 
   Min    1Q  Median   3Q  Max
-12.00 -0.00    0.00 0.00 1.33


Tests on marginal effects (Jackknife): 
              Estimate Std. Error 95% lbound 95% ubound Pr(>|t|)     
gender_male       4.24       2.16      -0.00       8.49     0.05    .
yearsmarried      0.18       0.03       0.12       0.24     0.00  ***
occupation        0.11       0.07      -0.02       0.25     0.10    -
education         0.08       0.04      -0.00       0.17     0.06    .
age              -0.06       0.01      -0.09      -0.03     0.00  ***
religiousness    -0.45       0.10      -0.65      -0.25     0.00  ***
children_yes     -0.48       0.27      -1.01       0.05     0.08    .
rating           -0.48       0.09      -0.66      -0.30     0.00  ***


Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘-’ 1


Multiple R-squared:  0.929, Adjusted R-squared:  0.928

 Elapsed: 118.04078912734985

The results are quite similar obtained in the previous study, with the same factors being associated with the same responses.

5 - Prediction interval on test set

pi = tr.PredictionInterval(regr, method="splitconformal", level=0.95)
pi.fit(X_train, y_train)
preds = pi.predict(X_test, return_pi=True)

pred = preds[0]
y_lower = preds[1]
y_upper = preds[2]

# compute and display the average coverage
print(f"coverage rate = {np.mean((y_test >= y_lower) & (y_test <= y_upper))}")

coverage rate = 0.9575645756457565

import warnings
warnings.filterwarnings('ignore')

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import numpy as np
np.warnings.filterwarnings('ignore')


split_color = 'green'
local_color = 'gray'

np.random.seed(1)

def plot_func(x,
              y,
              y_u=None,
              y_l=None,
              pred=None,
              shade_color="",
              method_name="",
              title=""):

    fig = plt.figure()

    plt.plot(x, y, 'k.', alpha=.3, markersize=10,
             fillstyle='full', label=u'Test set observations')

    if (y_u is not None) and (y_l is not None):
        plt.fill(np.concatenate([x, x[::-1]]),
                 np.concatenate([y_u, y_l[::-1]]),
                 alpha=.3, fc=shade_color, ec='None',
                 label = method_name + ' Prediction interval')

    if pred is not None:
        plt.plot(x, pred, 'k--', lw=2, alpha=0.9,
                 label=u'Predicted value')

    #plt.ylim([-2.5, 7])
    plt.xlabel('$X$')
    plt.ylabel('$Y$')
    plt.legend(loc='upper right')
    plt.title(title)

    plt.show()

max_idx = 50
plot_func(x = range(max_idx),
          y = y_test[0:max_idx],
          y_u = y_upper[0:max_idx],
          y_l = y_lower[0:max_idx],
          pred = pred[0:max_idx],
          shade_color=split_color,
          title = f"Split conformal ({max_idx} first points in test set)")

xxx

Comments powered by Talkyard.

@misc{ tmoudiki20240219, author = { T. Moudiki }, title = { A plethora of datasets at your fingertips Part3: how many times do couples cheat on each other? }, url = { https://thierrymoudiki.github.io/blog/2024/02/19/python/quasirandomizednn/explainableml/nnetsauce-dl-data }, year = { 2024 } }

A plethora of datasets at your fingertips Part3: how many times do couples cheat on each other?