
Machine Learning Analysis in the Earth Systems Sciences¶
In this module, you are tasked with planning, implementing, and evaluating a machine learning solution for a real-world scenario. Given pre-configured code blocks and prepared data, you will create a problem statement, explore the data, experiment with model development, and ultimately make a recommendation on the utility of machine learning for your scenario.
Damaged weather station in western North Carolina¶
In the fall of 2024, a major hurricane devastated western North Carolina and Appalachia. This caused widespread damage, including damage to important weather observing instruments. Play the video below to learn more about the situation, and how machine learning might be a helpful tool.
Video opens in a new tab.
What is a data engineer?
Your team includes yourself, your team lead, and a data engineer. Data engineering is an emerging career that encompasses the collection, storage, and pre-processing of data in data science disciplines. You will see the type of work that the data engineer on your team does in Part 2: Data Handling.
Part 1: Problem Framing¶
Based on the information provided in the video, which type of machine learning analysis is most appropriate for this scenario?
Instructions
Execute the two cells below. After executing display_knowledgecheck(), select the corresponding button to check your understanding.
# First import the Python tools needed to display the buttons
# This cell may take a moment to complete
import ipywidgets as widgets
import matplotlib.pyplot as plt
from IPython.display import display, clear_output, HTML, IFrame
from analysis_tech import display_knowledgecheck
display_knowledgecheck()
Which type of machine learning analysis is most appropriate for this scenario?

Problem framing questions¶
As a part of the problem framing step, we must answer a series of questions to ensure we're creating the best solution for this scenario.
Does a simpler solution exist?
From the video, we know that your team has already completed a preliminary analysis that averaged values from stations near Mt. Mitchell. While these results showed some skill, there is room for improvement.
Can machine learning requirements be met?
The NC ECONet data provider has decades of hourly data available from several weather stations. This is sufficient for your model.
Which scientific question should be answered?
You will answer this question in Exercise 1 below.
Exercise 1
Open your Machine Learning Model Handbook to Exercise 1. Then type the scientific question to be answered for this situation.
Part 2: Data Handling¶

Recall that data handling is often the most time-consuming step of developing a machine learning model. Data handling comes in three parts:
- Locate data of interest
- Explore data
- Create a data splitting strategy
Your team's data engineer has located the data and completed the pre-processing for you already. You will continue with your own independent exploration of the data and then create a data splitting strategy.
Part 2a: Locate Data of Interest¶
You will be using other stations in the NC ECONet for this project. Below is a document that your team's data engineer has prepared for you describing the nature of the dataset that will be used to create the machine learning model.
Metadata Document for Western North Carolina Weather Station Data¶
General Information¶
Dataset Name: Western NC Weather Station Time-Series Data
Description: This dataset contains tabular time series data collected from multiple surface weather stations in Western North Carolina. The data includes atmospheric and environmental variables recorded at hourly intervals.
Date Range: January 1, 2015, to December 16, 2024
Geographic Coverage: Western North Carolina
Data Frequency: Hourly
Last Updated: Jan 1, 2025
Data Structure¶
File Format: .parquet
Number of Records: 69,760 per station per environmental variable (feature)
Columns (features):
- observation_datetime: Date and time of observation in UTC
Columns (features) per station (XXXX):
- XXXX_airtemp_degF (°F): Air temperature measured at 2 meters above ground level
- XXXX_windspeed_mph (mph): Average wind speed during the hour at 10 meters above ground level
- XXXX_windgust_mph (mph): Peak wind gust during the hour at 10 meters above ground level
- XXXX_rh_percent (%): Average relative humidity
- XXXX_precip_in (in): Total precipitation accumulated in the hour
Stations:
- BEAR (Bearwallow Mountain)
- BURN (Burnsville Tower)
- FRYI (Frying Pan Mountain)
- JEFF (Mount Jefferson Tower)
- MITC (Mount Mitchell State Park) - target station
- NCAT (North Carolina A&T University Research Farm)
- SALI (Piedmont Research Station)
- SASS (Sassafras Mountain)
- UNCA (University of North Carolina - Asheville Weather Tower)
- WINE (Wayah Bald Mountain)
Data Quality¶
Missing Data: Missing data (aside from MITC) was filled in using seasonal values and simple interpolation.
Outlier Handling: No outlier handling was done.
Data Provenance¶
Source: North Carolina State Climate Office ECONet (Citation)
Part 2b: Explore Data¶
While your data engineer colleague prepared the data for your model and created the metadata document, you will still need to familiarize yourself with the data before you use it as input to a machine learning algorithm. In this step, you will take a closer look at the potential features for your model with a few plots.
First, let's read the data into this workspace. The data resides on a remote THREDDS Data Server, which serves data to users without the need to manually download files to a local computer. When you execute the code cell below, you will load the Python library pandas, which includes the tools for reading the data from the THREDDS Data Server and opening it in this workspace.
Instructions
Execute the cell below.
This may take a moment to complete.
# Import the pandas Python library that can interpret the data file
import pandas as pd
# Location of the data on the THREDDS data server
file_path = 'https://thredds.ucar.edu/thredds/fileServer/cybertraining/CyberTraining_NC_ECOnet_data.parquet'
# Read data into this workspace
df = pd.read_parquet(file_path)
The target features (the features that we are trying to predict with the machine learning model) are temperature, relative humidity, wind speed, wind gust, and precipitation at the Mt. Mitchell station. Data from the other nearby stations are possible input features to the model.
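Before plotting, you can also take a quick numerical look at the table with a few standard pandas calls. This optional sketch assumes only the df you just loaded and the station-prefixed column names described in the metadata document:
# Optional: quick numerical inspection of the dataset
print(df.shape)  # (number of rows, number of columns)
# Identify the target columns: everything recorded at Mt. Mitchell (MITC)
target_cols = [col for col in df.columns if col.startswith('MITC_')]
print(target_cols)
# Summary statistics (min, max, mean, etc.) for the target features
print(df[target_cols].describe())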
Explore target features¶
Let's now explore just the target features at Mt. Mitchell.
Instructions
Execute the two cells below.
In the Mt. Mitchell plotting widget, select the environmental variable and plot type from the dropdowns, then select Plot to reveal the plot.
Repeat for any and all variables you want to explore to better understand the data at Mt. Mitchell.
from analysis_tech import display_mt_mitchell_weather_dashboard
display_mt_mitchell_weather_dashboard(df)

Explore input features¶
Now we will explore the input features. Below is a map of where the stations are located in relation to MITC. Western North Carolina is a part of the Appalachian Mountains in the eastern United States, so stations are located at a variety of elevations. To further explore the terrain in the area, click the image below to open an interactive 3D map.

Click to open interactive map
Instructions
Execute the two cells below.
In the Input Stations plotting widget, select the station, environmental variable, and plot type from the dropdowns. Then select Plot to reveal the plot.
Repeat for any and all variables you want to explore to better understand the data at each station.
from analysis_tech import display_input_stations_dashboard
display_input_stations_dashboard(df)

Compare stations¶
We can also directly compare stations in our dataset by plotting data from pairs of stations in a grid of plots. In these comparison grids, each scatter plot pairs the observations at two stations at a given time; for example, the temperature at MITC on the x-axis against the temperature at SASS on the y-axis. Stations with well-correlated variables will show points clustered tightly along a line with very little spread, whereas stations with poorly correlated variables will show considerable spread.


Where the x- and y-axes of a panel refer to the same station, the comparison grid displays a histogram instead. These are the same histograms you plotted previously, showing the distribution of all values at that station.
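As a numerical companion to the comparison grids, you can compute correlation coefficients directly with pandas. A minimal sketch, assuming the XXXX_airtemp_degF column naming from the metadata document:
# Optional: correlation of each station's air temperature with MITC
airtemp_cols = [col for col in df.columns if col.endswith('_airtemp_degF')]
correlations = df[airtemp_cols].corr()['MITC_airtemp_degF'].sort_values(ascending=False)
print(correlations)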
Instructions
Execute the two cells below.
In the Comparison Plot plotting widget, select an environmental variable from the dropdown, then select Plot to reveal the plot.
Repeat for any and all variables you want to explore.
from analysis_tech import display_correlation_plot_dashboard
display_correlation_plot_dashboard()

Exercise 2b
Open your Machine Learning Model Handbook to Exercise 2b. Then describe your exploratory data analysis of any target and input features of note. Include the following:
- Do variables follow diurnal or annual patterns generally as expected?
- Do the variables have the expected ranges of values? Do any variables appear to include major outliers?
- Which stations appear to be most correlated to the variables at Mt Mitchell? Why?
- Include any important plots to illustrate your conclusions. Limit yourself to 5 plots.
To copy a plot image, hold shift, right click on the image, then select Copy.
Part 2c: Create a data splitting strategy¶
Next, we create a data splitting strategy. Data splitting refers to the process of dividing the data into three groups: training, validation, and testing. Each of these groups plays a part in the iterative process of machine learning model development.
- Training data is the largest subset, usually around 60-80% of the total data, and is used to initially train the model.
- Validation data is roughly 10-20% of the total data, and is used to validate the effectiveness of the training process.
- Testing data is also roughly 10-20% of the total data, and is used to test the final refined model before using it on new, unseen data.
Each group should be kept separate to ensure that no single group biases the model. For this model, the data will be split chronologically into these groups, but you decide the proportion of data in each group. Input your percentages in the blanks below, ensuring all percentages sum to 100%.
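To make the idea concrete, below is a minimal sketch of a chronological 70/15/15 split by row position. This is illustrative only; the split_data_temporal function provided later in this module does this work for you and also separates the post-hurricane "true test" period:
# Illustrative only: a simple chronological split by row position
n = len(df)
train_end = int(n * 0.70)  # first 70% of the record for training
val_end = int(n * 0.85)    # next 15% for validation
train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]  # final 15% for testing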
Instructions
Execute the two cells below.
In the Dataset Split Percentages widget, select the proportions of the total dataset you wish to use in each group by typing in each box. Use values 0-100, ensuring that the sum of all three boxes equals 100.
Select Submit after making your selection.
from analysis_tech import create_percentage_widget
widget, get_values = create_percentage_widget()

Instructions
Execute the three cells below to run the functions that split the data according to the percentages you submitted above.
Note: The "true test" group is the subset of times where MITC was offline, Sept 27, 2024 and onward.
# This is used to grab the values from the widget above (no need to change)
decimals = get_values()
training = decimals['training']
validation = decimals['validation']
testing = decimals['testing']
from analysis_tech import split_data_temporal
# Use the function
X_train, y_train, X_val, y_val, X_test, y_test, X_true_test, y_true_test = split_data_temporal(
    df,
    train_pct=training,
    val_pct=validation,
    test_pct=testing)
Data split summary:
Training period: 2017-01-01 00:00:00 to 2022-06-03 00:00:00
Training samples: 47489 (70.0% of pre-cutoff data)
Validation period: 2022-06-03 01:00:00 to 2023-08-01 00:00:00
Validation samples: 10176 (15.0% of pre-cutoff data)
Testing period: 2023-08-01 01:00:00 to 2024-09-28 00:00:00
Testing samples: 10176 (15.0% of pre-cutoff data)
True test period: 2024-09-28 01:00:00 to 2024-12-16 23:00:00
True test samples: 1919
Exercise 2c
- In your Machine Learning Model Handbook Exercise 2c, input your data splitting strategy.
Part 3: Model Development¶
Next begins the iterative process of creating, evaluating, and refining your machine learning model. You will start with an initial model, and keep track of your subsequent trials in your Machine Learning Model Handbook.

Part 3a: Choose Algorithm¶
First, you will choose an algorithm to train. You have two options: the MultiXGBRegressor and the MultiLinearRegressor. Both have pros and cons for this task. Choose one for your initial model, but you may choose to test the other algorithm in subsequent trials.
About the Algorithms
MultiXGBRegressor (XGBoost)
- Handles a Wide Range of Data Distributions: XGBoost is capable of modeling both linear and non-linear relationships, making it suitable for data with complex, varied distributions.
- Prone to Overfitting: XGBoost can easily overfit to training data, especially when the dataset is small or noisy. This may lead to poor generalization when making predictions on new data.
MultiLinearRegressor
- Simple and Interpretable: As a linear model, it is easy to understand and interpret within the context of the physical world, making it a great choice for finding clear relationships between features and predictions.
- Struggles with Non-Uniform Data Distributions: For datasets with non-linear patterns or skewed distributions, multiple linear regression may fail to capture the underlying patterns, leading to biased or inaccurate predictions.
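For a sense of how these two options are built, both can be assembled from the libraries imported in the next code cell by wrapping a single-output regressor so that it predicts all five Mt. Mitchell targets at once. This is a minimal sketch of the idea; the MultiXGBRegressor and MultiLinearRegressor provided by this module may be implemented differently:
# Sketch: multi-output wrappers around single-output regressors
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
# MultiOutputRegressor fits one copy of the base regressor per target column
multi_xgb = MultiOutputRegressor(XGBRegressor())
multi_linear = MultiOutputRegressor(LinearRegression())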
Instructions
Execute the two cells below.
After executing algorithm_selection(), select the corresponding button to choose your desired algorithm.
# imports needed to run the machine learning workflow
from xgboost import XGBRegressor
import time
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from analysis_tech import algorithm_selection
selected_algo = algorithm_selection()

Part 3b: Choose input features¶
Given your data exploration, you must now choose the stations to use as input features for the algorithm you just selected. You may choose as many input stations as you'd like; however, recall that more stations do not always make a better model. Think strategically based on the evidence.

Click to open interactive map
Instructions
Execute the two cells below.
After executing create_station_selector(), select the stations you would like to use to train your model. You may select as many or as few as you consider necessary.
from analysis_tech import create_station_selector
station_selector = create_station_selector()

Instructions
Execute the cell below to commit your station selection. The output will also be used in describing subsequent evaluation metrics.
# To get selected stations at any time:
def get_selected_stations(selector):
    return [station for station, checkbox in selector.items() if checkbox.value]

selected = get_selected_stations(station_selector)
selected
['BURN', 'NCAT', 'UNCA']
This next block of code takes the full dataset and removes (filters) any stations that were not selected above. We do this for all four groups (training, validation, testing, and the true test).
The "true test" group is the subset of times when MITC was offline, from Sept 27, 2024 onward.
Instructions
Execute the two cells below. In the printout display, you will see the number of features (columns) in the original dataset, and the number of features in the filtered dataset.
from analysis_tech import filter_dataframe
X_train_filtered = filter_dataframe(X_train, selected)
X_val_filtered = filter_dataframe(X_val, selected)
X_test_filtered = filter_dataframe(X_test, selected)
X_true_test_filtered = filter_dataframe(X_true_test, selected)
Original DataFrame: 47 columns
Filtered DataFrame: 17 columns
Original DataFrame: 47 columns
Filtered DataFrame: 17 columns
Original DataFrame: 47 columns
Filtered DataFrame: 17 columns
Original DataFrame: 47 columns
Filtered DataFrame: 17 columns
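For reference, a station filter like filter_dataframe can be written in a few lines of pandas. The sketch below is a hypothetical reconstruction, not the actual analysis_tech implementation; it keeps columns belonging to the selected stations plus any columns not tied to a station:
# Hypothetical sketch of a station filter (not the actual analysis_tech code)
def filter_by_station(X, selected_stations):
    all_stations = ('BEAR', 'BURN', 'FRYI', 'JEFF', 'NCAT',
                    'SALI', 'SASS', 'UNCA', 'WINE')  # input stations from the metadata
    keep = [col for col in X.columns
            if not col.startswith(all_stations)         # keep station-independent columns
            or col.split('_')[0] in selected_stations]  # keep selected stations' columns
    print(f'Original DataFrame: {X.shape[1]} columns')
    print(f'Filtered DataFrame: {len(keep)} columns')
    return X[keep]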
Part 3c: Train the Algorithm¶
The training process is what transforms the machine learning algorithm into a supervised machine learning model. The cells below start the training process with all the decisions you previously made.
Instructions
Execute the two cells below.
After executing train_button(), select the Train Algorithm button to initiate the training process. A progress printout will display below the button while the process runs.
from analysis_tech import train_button
model_choice = selected_algo()
trained_model = train_button(model_choice, X_train_filtered, y_train)

Part 3d: Validate the Model¶
The validation step uses validation data to evaluate how well the training process performed. By using a separate dataset to evaluate performance, we get a better sense of how well the model can generalize to new inputs. We focus on two main evaluation metrics: Root Mean Square Error (RMSE) and R².
About the Evaluation Metrics
Root Mean Square Error (RMSE)
- A measure of how large a typical prediction error is
- Reports the typical magnitude of error in the original units (degrees, %, mph, etc.)
- The closer to 0, the better the model accuracy
- Better reflects the accuracy of predictions in real-world situations
- Dependent on the range of values (scale) of the dataset, making comparisons among variables more difficult
R²
- A measure of how well the model explains the variation in the dataset
- Uses a standardized scale (typically 0 to 1) for comparing models across different trials
- In some cases, R² may be negative. This means the model's predictions are worse than simply predicting the dataset average (a climatology prediction).
- The closer to 1, the better the model accuracy
- Assumes that the input data have a linear relationship
- Measures only statistical agreement between predictions and observations; by itself, it cannot distinguish good from bad predictions in the real world
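Both metrics are computed with standard scikit-learn functions, the same ones imported in the cell below. A minimal sketch with placeholder arrays standing in for observed and predicted values of a single target feature:
# Sketch: computing RMSE and R² for one target feature
import numpy as np
from sklearn.metrics import root_mean_squared_error, r2_score

y_true = np.array([50.0, 52.0, 47.0, 55.0])  # placeholder observed values
y_hat = np.array([51.0, 50.5, 48.0, 53.0])   # placeholder predicted values

rmse = root_mean_squared_error(y_true, y_hat)  # typical error, in original units
r2 = r2_score(y_true, y_hat)                   # fraction of variance explained
print(f'RMSE: {rmse:.4f}  R²: {r2:.4f}')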
Instructions
Execute the two cells below.
After executing model_eval_MITC(), your model's validation metrics will appear below as a printout.
# Import the Python libraries that calculate the evaluation metrics
import numpy as np
from sklearn.metrics import root_mean_squared_error, r2_score
from analysis_tech import model_eval_MITC
model_eval_MITC(trained_model(), X_val_filtered, y_val)
Validation Metrics
Model Type: MultiXGBRegressor
Stations used (3/9): BURN, NCAT, UNCA
RMSE for each target feature:
MITC_airtemp_degF: 4.4934
MITC_windspeed_mph: 6.9852
MITC_windgust_mph: 8.2759
MITC_rh_percent: 22.1142
MITC_precip_in: 0.0582
R² Score for each target feature:
MITC_airtemp_degF: 0.8872
MITC_windspeed_mph: 0.1141
MITC_windgust_mph: 0.3543
MITC_rh_percent: -0.2854
MITC_precip_in: 0.0829
Average R² Score: 0.23
Part 3e: Evaluate and Refine the Model¶
Examine the results of the model validation. What does each metric mean? Could they be improved? Review the descriptions of the evaluation metrics, then complete the next exercise.
Exercise 3e
Open your Machine Learning Model Handbook to Exercise 3e.
Paste your validation evaluation metrics in the designated box.
Then describe the results of your initial model validation. Include the following:
- Which variables have favorable evaluation metrics? Which variables don’t perform as well?
- How do you interpret these statistics in the context of the physical world?
- What changes will you make to try to improve these statistics in the next iteration?
Part 3f: Iterative Refinement Trials¶
Your first trial is complete! Now you'll create new trials to improve the evaluation metrics from the validation phase. You may complete as many trials as you like until you are satisfied with the evaluation metrics, or they no longer improve with new trials.
Instructions
Execute the code cells below, selecting your desired model configurations after executing each cell.
After each new trial, copy the validation metrics into your handbook document (see Exercise 3f).
You may complete as many trials in this section (3f) as you like, until you are satisfied with the evaluation metrics or they no longer improve.
New trial: Choose algorithm¶
selected_algo = algorithm_selection()

station_selector = create_station_selector()

# Execute this cell after selecting stations
selected = get_selected_stations(station_selector)
X_train_filtered = filter_dataframe(X_train, selected)
X_val_filtered = filter_dataframe(X_val, selected)
X_test_filtered = filter_dataframe(X_test, selected)
X_true_test_filtered = filter_dataframe(X_true_test, selected)
Original DataFrame: 47 columns
Filtered DataFrame: 22 columns
Original DataFrame: 47 columns
Filtered DataFrame: 22 columns
Original DataFrame: 47 columns
Filtered DataFrame: 22 columns
Original DataFrame: 47 columns
Filtered DataFrame: 22 columns
New trial: Train algorithm¶
model_choice = selected_algo()
trained_model = train_button(model_choice, X_train_filtered, y_train)

New trial: Validate model¶
model_eval_MITC(trained_model(), X_val_filtered, y_val)
Validation Metrics
Model Type: MultiLinearRegressor
Stations used (4/9): BURN, NCAT, SALI, WINE
RMSE for each target feature:
MITC_airtemp_degF: 3.6752
MITC_windspeed_mph: 6.4972
MITC_windgust_mph: 7.6509
MITC_rh_percent: 16.8714
MITC_precip_in: 0.0496
R² Score for each target feature:
MITC_airtemp_degF: 0.9246
MITC_windspeed_mph: 0.2336
MITC_windgust_mph: 0.4481
MITC_rh_percent: 0.2518
MITC_precip_in: 0.3336
Average R² Score: 0.44
Exercise 3f
- In your Machine Learning Model Handbook Exercise 3f, paste the full output of each of your validation trials, one per box.
- You may complete as many trials as you like until you are satisfied with the evaluation metrics, or they no longer improve with new trials. When complete, move on to the next part below.
Part 3g: Test Model¶
Important
For testing, your model needs to be in a state with your desired algorithm and input feature stations. If you haven't already, go back and run through the cells in Part 3f with your final choices one last time. This ensures that your final testing process will be executed with your desired choices.

At this point, you have a trained model with validation metrics you are satisfied with. Next, it's time to test the model on brand-new data: the testing dataset. The testing process mimics, in a final and unbiased way, how the model would be used in the real world.
Testing looks very similar to validation: the model makes predictions based on the input features in the testing dataset, and we calculate RMSE and R² as the testing metrics.
Instructions
Execute the cell below.
After executing model_eval_MITC(), your model's testing metrics will appear below as a printout.
model_eval_MITC(trained_model(), X_test_filtered, y_test, eval_type='Testing')
Testing Metrics
Model Type: MultiLinearRegressor
Stations used (4/9): BURN, NCAT, SALI, WINE
RMSE for each target feature:
MITC_airtemp_degF: 3.0410
MITC_windspeed_mph: 6.7678
MITC_windgust_mph: 8.2644
MITC_rh_percent: 12.7908
MITC_precip_in: 0.0371
R² Score for each target feature:
MITC_airtemp_degF: 0.9475
MITC_windspeed_mph: 0.4386
MITC_windgust_mph: 0.4357
MITC_rh_percent: 0.7173
MITC_precip_in: 0.2152
Average R² Score: 0.55
Part 3h: Evaluate and Justify¶
With your model trained, validated, and tested, you can now plot the predicted model output alongside the real data before the Mt. Mitchell station went offline. You may use this information to help you address your final model justification.
Instructions
Execute the three cells below.
The plot displays the historical and model-predicted data at Mt. Mitchell in the calendar year 2024.
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
y_pred = trained_model().predict(X_true_test_filtered)
from analysis_tech import plot_weather_comparison
fig, axs = plot_weather_comparison(
    df=df,
    y_pred=y_pred,
    transition_date=pd.Timestamp('2024-09-28')
)
plt.show()
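If you would also like to inspect or export the gap-filling predictions themselves, one possibility is to wrap them in a labeled DataFrame. This sketch assumes the filtered true-test frame keeps its datetime index and that y_train holds the five MITC target columns:
# Optional: label the offline-period predictions for inspection or export
predictions = pd.DataFrame(y_pred,
                           index=X_true_test_filtered.index,
                           columns=y_train.columns)
predictions.head()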
Your final decision¶
Given all your evaluation, it's time to make a final decision on whether you believe this model provides sufficient skill for the needs of the situation. Go back and review your problem statement. Does this model deliver the results needed?
Exercise 3h
Open your Machine Learning Model Handbook to Exercise 3h.
Paste your testing evaluation metrics in the designated box.
Then make a final decision on whether this model delivers on the results needed with supporting justification. Include the following:
- Which environmental variables had the best evaluation metrics? List some physical scientific reasons why this may be the case.
- Is this model ready for use in the real world? Why or why not?
- What other possible changes could further improve this model?
Conclusion¶
Scientific research rarely yields a simple and straightforward right answer. Instead, scientists analyze evidence, compare it to known physical processes, and make informed recommendations based on data and statistics. As you learned in Machine Learning Foundations in the Earth System Sciences, machine learning is not an exact science; rather, it generates approximations from large datasets. This makes evaluating model quality complex: what one scientist considers a high-performing model may be insufficient to another. What matters most is your ability to justify your results within the context of physical science and the real-world stakes. As you continue your studies, remember that these models are not just numbers; they are representations of the physical world.
Acknowledgements¶
This work was supported by NSF Unidata under award #2319979 from the US National Science Foundation. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation.
We thank the North Carolina State Climate Office for contributing NC ECONet data and media to this project.