import pandas as pd
import numpy as np
import epyfun
import strom
import strom
# Load the result stored under the name "merge_strom_climate_data".
strom_climate = strom.read_result("merge_strom_climate_data")
# Alternative source kept for reference: the interim parquet file.
# strom_climate = pd.read_parquet("interim/strom_climate.parquet")

# Separate the target column "wd" from all remaining columns.
X = strom_climate.drop(columns="wd")
y = strom_climate["wd"]

# Column names grouped as "dimensions".
# NOTE(review): not referenced in the visible part of this file; presumably
# meant for the plotting helpers further down -- confirm before removing.
dimensions = [
    "tt_tu_mean",
    "tt_tu_min",
    "tt_tu_max",
    "rf_tu_mean",
    "rf_tu_min",
    "rf_tu_max",
]

# H2O AutoML
So let’s start from the pipeline with the best model from the earlier cross-validation.
import h2o
from h2o.automl import H2OAutoML
# Initialize the H2O cluster. This has to happen before any H2OFrame is
# created, so keep it ahead of the upload below.
h2o.init()
# Load the data into an H2OFrame, the container H2O trains and predicts on.
# NOTE(review): assumes strom_climate is a pandas DataFrame -- confirm against
# whatever strom.read_result returns.
hf = h2o.H2OFrame(strom_climate)
# Response column for the AutoML run.
target = 'wd'

# Predictor columns: every stem below contributes its "_min", "_mean" and
# "_max" column, in exactly that order.
predictors = [
    f"{stem}_{stat}"
    for stem in (
        "tt_tu", "rf_tu",
        "tt", "td",
        "vp_std", "tf_std", "p_std",
        "tt_std", "rf_std", "td_std",
        "r1",
        # "rs_ind", "wrtr",  # excluded: these columns were commented out
    )
    for stat in ("min", "mean", "max")
]
# Run the AutoML search with a fixed seed and a 3600-second time budget.
# (The original trailing note reads "28800" -- presumably a longer budget
# used on other runs; confirm before changing.)
aml = H2OAutoML(max_runtime_secs=3600, seed=1)
aml.train(training_frame=hf, y=target, x=predictors)

# Leaderboard of the models AutoML produced; print its first ten rows.
# For the complete table use: print(lb.head(rows=lb.nrows))
lb = aml.leaderboard
print(lb.head(rows=10))
# Get the best model
best_model = aml.leader

# Make predictions on the strom_climate data. This is the same frame passed
# to aml.train, so these are fitted (in-sample) values.
predictions = best_model.predict(hf)

# Convert the predictions to pandas and keep only the "predict" column.
# NOTE(review): y keeps strom_climate's index while y_pred comes back from H2O
# with a fresh one; if the strom helpers align on index rather than position,
# the two may need to share an index -- confirm.
y_pred = h2o.as_list(predictions)["predict"]

# Diagnostics: fitted vs. observed values.
# NOTE(review): each statement below was the last expression of its own
# notebook cell, so its result was displayed implicitly there. Outside a
# notebook only the call with an explicit .show() is certain to render.
strom.scatter_fitted_observed(y, y_pred, strom_climate)

strom.splom_fitted_observed(
    y,
    y_pred,
    strom_climate,
    dimensions=[
        "tt_tu_mean",
        "tt_tu_min",
        "tt_tu_max",
        "rf_tu_mean",
        "rf_tu_min",
        "rf_tu_max",
        "td_mean",
        "vp_std_mean",
        "r1_mean",
    ],
).show()

# Residual diagnostics.
strom.residuals_fitted(y, y_pred)
strom.residuals_hist(y, y_pred)
strom.residuals_qq(y, y_pred)

# H2O's built-in explanation report for the leader model on the same frame.
best_model.explain(hf)