H2O AutoML

So let’s start from the pipeline with the best model that we cross-validated before.

import pandas as pd
import numpy as np
import epyfun
import strom

import strom
# Load the stored result named "merge_strom_climate_data" via the strom helper.
strom_climate = strom.read_result("merge_strom_climate_data")
# Alternative source, kept for reference:
# strom_climate = pd.read_parquet("interim/strom_climate.parquet")

# Split the table into the response column "wd" (y) and every other column (X).
y = strom_climate["wd"]
X = strom_climate.drop(columns=["wd"])


# Column names of the form "<variable>_<statistic>" for the two "*_tu"
# variables, each listed in mean / min / max order.
dimensions = [
    f"{variable}_{statistic}"
    for variable in ("tt_tu", "rf_tu")
    for statistic in ("mean", "min", "max")
]

import h2o
from h2o.automl import H2OAutoML

# Initialize the H2O cluster. This must run before any H2OFrame is created,
# because frames live inside the cluster rather than in this Python process.
h2o.init()

# Load the data into an H2OFrame so that H2O can train and predict on it.
# NOTE(review): assumes strom_climate is a pandas DataFrame — confirm against
# what strom.read_result returns.
hf = h2o.H2OFrame(strom_climate)

# Response column to be modelled.
target = "wd"

# Predictor columns: every variable prefix below contributes its
# "_min", "_mean" and "_max" columns, in that order.
predictors = [
    f"{prefix}_{statistic}"
    for prefix in (
        "tt_tu",
        "rf_tu",
        "tt",
        "td",
        "vp_std",
        "tf_std",
        "p_std",
        "tt_std",
        "rf_std",
        "td_std",
        "r1",
        # Currently excluded: "rs_ind", "wrtr"
    )
    for statistic in ("min", "mean", "max")
]

# Run AutoML on the predictor/target columns of hf.
# The search is capped at 3600 seconds of runtime; the trailing 28800 is
# presumably a longer budget the author switches to for full runs.
# NOTE(review): per the H2O AutoML docs, a seed alone does not make a
# time-limited run reproducible (the number of models trained depends on the
# available compute) — confirm whether max_models should be used instead.
aml = H2OAutoML(seed=1, max_runtime_secs=3600) # 28800
aml.train(x=predictors, y=target, training_frame=hf)

# View the AutoML Leaderboard (only the first 10 rows are printed).
lb = aml.leaderboard
# Uncomment the next line to print every row of the leaderboard instead.
#print(lb.head(rows=lb.nrows))
print(lb.head(rows=10))

# Take the leader model from the AutoML run.
best_model = aml.leader

# Score the same frame that was used for training, so these are in-sample
# fitted values rather than out-of-sample predictions.
predictions = best_model.predict(hf)

# Pull the "predict" column back into pandas for the diagnostics below.
# NOTE(review): the resulting Series presumably carries a fresh positional
# index — confirm it lines up with y if downstream code aligns by index.
y_pred = predictions.as_data_frame(use_pandas=True)["predict"]

# Diagnostics from the project's strom helpers, all comparing the observed
# target y with the fitted values y_pred.
strom.scatter_fitted_observed(y, y_pred, strom_climate)

# Same comparison across several columns: the six "*_tu" columns already
# listed in `dimensions`, plus three additional mean columns.
strom.splom_fitted_observed(
    y,
    y_pred,
    strom_climate,
    dimensions=[*dimensions, "td_mean", "vp_std_mean", "r1_mean"],
).show()

# Residual diagnostics.
strom.residuals_fitted(y, y_pred)

strom.residuals_hist(y, y_pred)

strom.residuals_qq(y, y_pred)

# H2O's own explanation output for the leader model, computed on hf.
best_model.explain(hf)