utils::install.packages("reticulate")

library(reticulate)

# Install necessary Python packages (https://github.com/Techtonique/survivalist)
system("pip uninstall -y survivalist")
system("pip install survivalist --upgrade --no-cache-dir --verbose")

# Import Python modules (/!\ this will work if the Python packages are in the environment, e.g in Colab)
np <- import("numpy")
pd <- import("pandas")
lgb <- import("lightgbm")
xgb <- import("xgboost")
survivalist <- import("survivalist")
sklearn <- import("sklearn")

# Load specific functions
load_whas500 <- survivalist$datasets$load_whas500
PIComponentwiseGenGradientBoostingSurvivalAnalysis <- survivalist$ensemble$PIComponentwiseGenGradientBoostingSurvivalAnalysis
PISurvivalCustom <- survivalist$custom$PISurvivalCustom
RidgeCV <- sklearn$linear_model$RidgeCV
Lasso <- sklearn$linear_model$Lasso
XGB <- xgb$XGBRegressor
LGBM <- lgb$LGBMRegressor
train_test_split <- sklearn$model_selection$train_test_split

# Function for one-hot encoding categorical columns
encode_categorical_columns <- function(df, categorical_columns = NULL) {
  if (is.null(categorical_columns)) {
    categorical_columns <- names(df)[sapply(df, is.character)]
  }
  df_encoded <- py_to_r(pd$get_dummies(r_to_py(df), columns = categorical_columns))
  return(df_encoded)
}

# Load data
data <- load_whas500()
X <- py_to_r(data[[1]])
y <- py_to_r(data[[2]])
X <- encode_categorical_columns(X)

# Split into training and test sets
# Perform the split in Python to avoid unnecessary conversions
split_data <- train_test_split(r_to_py(X), r_to_py(y), test_size = 0.2, random_state = 42L)
X_train <- split_data[[1]] # Keep X_train as a Python object
X_test <- split_data[[2]]  # Keep X_test as a Python object
y_train <- split_data[[3]] # Keep y_train as a Python object
y_test <- split_data[[4]]  # Keep y_test as a Python object


# Survival model
# Fit the models using Python objects directly, and specifying the type of uncertainty
estimator1 <- PIComponentwiseGenGradientBoostingSurvivalAnalysis(regr=RidgeCV(), type_pi="bootstrap")
estimator1$fit(X_train, y_train)

estimator2 <- PIComponentwiseGenGradientBoostingSurvivalAnalysis(regr=Lasso(), type_pi="kde")
estimator2$fit(X_train, y_train)

# set a random seed
estimator3 <- PISurvivalCustom(regr=XGB(), type_pi="bootstrap")
estimator3$fit(X_train, y_train)

# set a random seed
estimator4 <- PISurvivalCustom(regr=LGBM(), type_pi="kde")
estimator4$fit(X_train, y_train)

PIComponentwiseGenGradientBoostingSurvivalAnalysis(regr=RidgeCV(),
                                                   type_pi='bootstrap')

PIComponentwiseGenGradientBoostingSurvivalAnalysis(regr=Lasso(), type_pi='kde')

PISurvivalCustom(regr=XGBRegressor(base_score=None, booster=None,
                                   callbacks=None, colsample_bylevel=None,
                                   colsample_bynode=None, colsample_bytree=None,
                                   device=None, early_stopping_rounds=None,
                                   enable_categorical=False, eval_metric=None,
                                   feature_types=None, gamma=None,
                                   grow_policy=None, importance_type=None,
                                   interaction_constraints=None,
                                   learning_rate=None, max_bin=None,
                                   max_cat_threshold=None,
                                   max_cat_to_onehot=None, max_delta_step=None,
                                   max_depth=None, max_leaves=None,
                                   min_child_weight=None, missing=nan,
                                   monotone_constraints=None,
                                   multi_strategy=None, n_estimators=None,
                                   n_jobs=None, num_parallel_tree=None,
                                   random_state=None, ...),
                 type_pi='bootstrap')

PISurvivalCustom(regr=LGBMRegressor(), type_pi='kde')

# Predict survival functions for an individual
# Use X_test as a Python object
surv_funcs <- estimator1$predict_survival_function(X_test[1, , drop = FALSE]) # Indexing in Python starts from 0
surv_funcs2 <- estimator2$predict_survival_function(X_test[1, , drop = FALSE]) # Indexing in Python starts from 0
surv_funcs3 <- estimator3$predict_survival_function(X_test[1, , drop = FALSE]) # Indexing in Python starts from 0
surv_funcs4 <- estimator4$predict_survival_function(X_test[1, , drop = FALSE]) # Indexing in Python starts from 0

# Extract survival function values
times <- surv_funcs$mean[[1]]$x
surv_prob <- surv_funcs$mean[[1]]$y
lower_bound <- surv_funcs$lower[[1]]$y
upper_bound <- surv_funcs$upper[[1]]$y

times2 <- surv_funcs2$mean[[1]]$x
surv_prob2 <- surv_funcs2$mean[[1]]$y
lower_bound2 <- surv_funcs2$lower[[1]]$y
upper_bound2 <- surv_funcs2$upper[[1]]$y

times3 <- surv_funcs3$mean[[1]]$x
surv_prob3 <- surv_funcs3$mean[[1]]$y
lower_bound3 <- surv_funcs3$lower[[1]]$y
upper_bound3 <- surv_funcs3$upper[[1]]$y

times4 <- surv_funcs4$mean[[1]]$x
surv_prob4 <- surv_funcs4$mean[[1]]$y
lower_bound4 <- surv_funcs4$lower[[1]]$y
upper_bound4 <- surv_funcs4$upper[[1]]$y

par(mfrow=c(2, 2))
plot(times, surv_prob, type = "s", ylim = c(0, 1), xlab = "Time", ylab = "Survival Probability", main = "Survival Function for RidgeCV")
polygon(c(times, rev(times)), c(lower_bound, rev(upper_bound)), col = "lightblue", border = NA)
lines(times, surv_prob, type = "s", lwd = 2)
plot(times2, surv_prob2, type = "s", ylim = c(0, 1), xlab = "Time", ylab = "Survival Probability", main = "Survival Function for Lasso")
polygon(c(times2, rev(times2)), c(lower_bound2, rev(upper_bound2)), col = "pink", border = NA)
lines(times2, surv_prob2, type = "s", lwd = 2)
plot(times3, surv_prob3, type = "s", ylim = c(0, 1), xlab = "Time", ylab = "Survival Probability", main = "Survival Function for XGBoost")
polygon(c(times3, rev(times3)), c(lower_bound3, rev(upper_bound3)), col = "green", border = NA)
lines(times3, surv_prob3, type = "s", lwd = 2)
plot(times4, surv_prob4, type = "s", ylim = c(0, 1), xlab = "Time", ylab = "Survival Probability", main = "Survival Function for LightGBM")
polygon(c(times4, rev(times4)), c(lower_bound4, rev(upper_bound4)), col = "yellow", border = NA)
lines(times4, surv_prob4, type = "s", lwd = 2)

R version of survivalist: Probabilistic model-agnostic survival analysis using scikit-learn, xgboost, lightgbm (and conformal prediction)