
Machine Learning Applications in the Earth Systems Sciences¶
In this module, you will build upon the knowledge from Machine Learning Foundations in the Earth Systems Sciences by executing tasks with code and interactive widgets. Expect to leave this module with a deeper understanding of data exploration, the iterative process of machine learning development, and critically judging machine learning model output.
Optimize a precipitation classification machine learning model¶
Previously, Sam and the research team made their first supervised machine learning classification model. This proof-of-concept model showed some skill, but the team ultimately decided to ask for help before proceeding any further. This is where you come in. Play the video below to learn more about how you will help Sam and the team refine their model and ultimately make a decision on whether the model is appropriate for the task.
Video opens in a new tab.
What is a machine learning engineer?
Now that you know the process for developing a supervised machine learning model, you have a general idea of the tasks machine learning engineers take on. From problem framing through model evaluation, machine learning engineers work with decision makers and data specialists to design and develop machine learning models. In addition to the machine learning skills you have learned so far, machine learning engineers have skills in Python programming, data science, statistics, and, importantly, domain-specific knowledge.
Part 1: Problem Framing¶
Recall that the first step in developing a machine learning model is problem framing.

Sam and the team have already completed the problem framing step based on the task from their supervisor. Review their framing questions below.
Problem framing questions¶
Does a simpler solution exist?
Predicting the type of precipitation during a winter weather event relies on many co-varying variables, including differences in temperature and humidity between the surface and the clouds, as well as winds, to name a few. A simple conditional statement may not be the most effective tool.
Can machine learning requirements be met?
Sam will use a weather forecasting model in combination with citizen scientist precipitation type reports during winter weather events. They will have thousands of data records to use in development.
Which scientific question should be answered?
"Will the precipitation at a certain location and time be rain or snow?"
Part 2: Data Handling¶
Data handling is the multi-step process for preparing data for model development. During this phase, data are gathered, examined, and split into three groups for model development and evaluation.

Part 2a: Locate Data of Interest¶
Sam's team has already gathered the data to use in model development and shared it with you to review. They combined atmospheric reanalysis model output with citizen science precipitation type reports from mPing (NOAA NSSL, University of Oklahoma) such that every precipitation report corresponds to a set of environmental variables at the same time and location. These data are open access, so they did not need to gain special permission for use.
They also want to make this new dataset available for other scientists and researchers (like you!) to use and build upon their progress, so they have made the data available to everyone following the FAIR data principles.
FAIR data principles
FAIR data principles ensure that data are Findable, Accessible, Interoperable, and Reusable by the scientific community. Following FAIR data principles helps ensure that research is transparent, reusable, and contributes to the peer-review process that keeps science reliable and open to improvement.
To clearly document the source and nature of the data, they have created the following metadata document. Review this information before starting the next step: exploring the data.
Metadata Document for Precipitation Type Classification Data¶
General Information¶
Dataset Name: Precipitation Type Classification Data
Description: Combination of atmospheric reanalysis model output and mPing precipitation reports.
Date Range:
Geographic Coverage: Continental United States
Data Frequency: Irregular, one record per mPing report
Last Updated: 02/02/2025
Data Structure¶
File Format: .parquet
Number of Records: 5000
Columns (features):
- TEMP_C_0_m: Air temperature (°C) at 0 meters above ground level.
- TEMP_C_1000_m: Air temperature (°C) at 1000 meters above ground level.
- TEMP_C_5000_m: Air temperature (°C) at 5000 meters above ground level.
- T_DEWPOINT_C_0_m: Dewpoint (°C) at 0 meters above ground level.
- T_DEWPOINT_C_1000_m: Dewpoint (°C) at 1000 meters above ground level.
- T_DEWPOINT_C_5000_m: Dewpoint (°C) at 5000 meters above ground level.
- PRES_Pa_0_m: Environmental pressure (Pa) at 0 meters above ground level.
- PRES_Pa_1000_m: Environmental pressure (Pa) at 1000 meters above ground level.
- PRES_Pa_5000_m: Environmental pressure (Pa) at 5000 meters above ground level.
- UGRD_m/s_0_m: U-component (west to east) of wind speed (m/s) at 0 meters above ground level.
- UGRD_m/s_1000_m: U-component (west to east) of wind speed (m/s) at 1000 meters above ground level.
- UGRD_m/s_5000_m: U-component (west to east) of wind speed (m/s) at 5000 meters above ground level.
- VGRD_m/s_0_m: V-component (south to north) of wind speed (m/s) at 0 meters above ground level.
- VGRD_m/s_1000_m: V-component (south to north) of wind speed (m/s) at 1000 meters above ground level.
- VGRD_m/s_5000_m: V-component (south to north) of wind speed (m/s) at 5000 meters above ground level.
- ptype: Precipitation type reported to mPing ("rain" or "snow")
Data Quality¶
Missing Data: There is no missing data in this dataset
Outlier Handling: No outlier handling was done
Data Provenance¶
Sources: Our research group provided the reanalysis model output. Precipitation type reports are from mPing (NOAA NSSL, University of Oklahoma).
Part 2b: Explore Data¶
Now it's your turn to explore the data that the team has prepared. Familiarizing yourself with the data before starting any analysis lets you identify issues or limitations in the dataset before you start generating statistics or transforming the data. In this step, you will take a closer look at the input and target features with a few plots.
First, let's read the data into this workspace. To begin, we must import several Python packages, including all the tools for reading the data from the THREDDS Data Server and opening it in this workspace.
Instructions
Execute the cell below.
This may take a moment to complete.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Location of the data on the THREDDS data server
file_path ='https://thredds.ucar.edu/thredds/fileServer/cybertraining/sam_ptype.parquet'
# Read data into this workspace
df = pd.read_parquet(file_path)
Explore target features¶
Target features are the features we predict using a machine learning model. Since this is a classification task, the target features are the classes "rain" and "snow."
We can see how many rain and snow records are in the dataset using the code below.
Instructions
Execute the cell below to determine the total number of records (length, len()) and the number of rain and snow observations (value_counts()) in the dataset.
print("Total records in dataset:", len(df))
print(df["ptype"].value_counts())
Total records in dataset: 5000
snow    3222
rain    1778
Name: ptype, dtype: int64
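The counts show that snow reports outnumber rain reports, so the classes are somewhat imbalanced. If you would also like to see the class balance as fractions rather than counts, a small optional sketch using pandas is:
# Optional: class proportions instead of raw counts
print(df["ptype"].value_counts(normalize=True))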
Explore input features¶
Input features are the variables that the model uses to predict the target features. In this case, the input features are the environmental variables from the team's atmospheric reanalysis model.
As we explore the input features, we examine the following characteristics:
- Distribution of values
- Unusual values or outliers
- Correlation among variables
We'll start by visualizing the distribution of values as histograms. In the plotting widget below, you can choose to view all data, or visualize the differences in distribution by precipitation type.
Instructions
Execute the two cells below.
In the Input Features widget, select an input feature from the dropdown menu then select Update Plot to view the data. Repeat to view any and all input features of interest.
Optionally, adjust the opacity of the rain and snow classes for better viewing. After adjusting the sliders, select Update Plot.
from applications_tech import HistogramWidget
widget = HistogramWidget(df)
widget.display()

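The HistogramWidget handles the plotting for you, but if you would like to reproduce a similar view directly, a minimal sketch with seaborn (assuming the column names from the metadata document) is:
# Optional sketch: overlapping histograms of surface temperature by precipitation type
sns.histplot(data=df, x="TEMP_C_0_m", hue="ptype", bins=50, alpha=0.5)
plt.xlabel("Air temperature (°C) at 0 m above ground level")
plt.show()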
We can also supplement these graphical representations of spread with a summary statistics table. Examine the statistics to locate any unusual values, or those that do not seem to be physically plausible.
Instructions
Execute the cell below to generate a summary statistics table of the input features.
Use the left-right scroll bar to view all columns.
The table includes the following statistics.
label | definition |
---|---|
count | number of records |
mean | arithmetic mean |
std | standard deviation |
min | minimum value |
25%, 50%, 75% | 25th, 50th, and 75th percentiles of the distribution, respectively |
max | maximum value |
df.describe()
statistic | TEMP_C_0_m | TEMP_C_1000_m | TEMP_C_5000_m | T_DEWPOINT_C_0_m | T_DEWPOINT_C_1000_m | T_DEWPOINT_C_5000_m | UGRD_m/s_0_m | UGRD_m/s_1000_m | UGRD_m/s_5000_m | VGRD_m/s_0_m | VGRD_m/s_1000_m | VGRD_m/s_5000_m | PRES_Pa_0_m | PRES_Pa_1000_m | PRES_Pa_5000_m |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 |
mean | 3.632016 | -0.746313 | -19.925755 | 1.130736 | -2.172693 | -27.131464 | 0.445190 | 3.266774 | 17.300987 | -0.873734 | 1.544326 | 7.710204 | 97618.754410 | 862.006982 | 514.001548 |
std | 8.372067 | 8.707363 | 8.836249 | 8.183560 | 8.601649 | 12.731960 | 3.805889 | 9.232793 | 12.732507 | 3.654125 | 9.584163 | 13.679516 | 4327.897516 | 38.665003 | 27.168435 |
min | -21.328827 | -26.929945 | -50.282283 | -25.157257 | -46.479145 | -75.300519 | -13.402076 | -36.590512 | -27.972064 | -13.447640 | -40.431958 | -50.363994 | 66256.414062 | 581.237407 | 329.045904 |
25% | -1.437222 | -6.585131 | -26.066483 | -3.862354 | -7.727319 | -35.558317 | -2.092202 | -2.399888 | 8.198301 | -3.312952 | -4.517914 | -0.581500 | 97056.430915 | 856.920760 | 505.621304 |
50% | 1.536438 | -2.625801 | -19.572960 | -0.336419 | -3.439562 | -25.175794 | 0.106456 | 3.612947 | 17.815394 | -0.772208 | 1.322942 | 7.652779 | 98584.171875 | 870.712713 | 517.785398 |
75% | 6.498490 | 3.419899 | -13.744804 | 4.214766 | 2.339916 | -17.693843 | 2.994793 | 9.578508 | 26.319220 | 1.518725 | 7.466151 | 16.348368 | 99859.810250 | 882.060269 | 529.149799 |
max | 37.702423 | 28.435911 | 3.553120 | 26.039459 | 21.696786 | 2.395806 | 20.642594 | 32.714230 | 62.565679 | 11.930449 | 42.105092 | 64.503670 | 103254.549774 | 913.848392 | 563.221716 |
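One optional sanity check suggested by the table: at a given level, dewpoint should not exceed air temperature, so counting violations can flag records that are not physically plausible. This is only a sketch and is not part of Sam's workflow.
# Optional check: surface dewpoint greater than surface temperature is physically implausible
implausible = (df["T_DEWPOINT_C_0_m"] > df["TEMP_C_0_m"]).sum()
print("Records with dewpoint above temperature at 0 m:", implausible)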
Next we'll compare the input features more directly by plotting all records in a grid of plots. In these comparison grids, each scatter plot displays one input feature against another, one marker per record. For example, the temperature at 0 m on the x-axis and the dewpoint at 0 m on the y-axis. The scatter plot markers denote the precipitation type: blue triangles for rain and red circles for snow. Scatter plots that show distinct clustering of precipitation types suggest that those input variables may be better predictors of rain versus snow. Where rain and snow markers are uniformly mixed, the input features show reduced skill in differentiating rain and snow.

The comparison plot grid also displays histograms where the x- and y-axes show the same feature. These are the same histograms that you plotted previously, displaying the distribution of all records by precipitation type.
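The dashboard you will generate below is built for this module, but a similar comparison grid can be drawn with seaborn's pairplot. A minimal sketch, assuming a small subset of columns to keep the grid readable:
# Optional sketch: pairwise scatter plots with histograms on the diagonal
cols = ["TEMP_C_0_m", "T_DEWPOINT_C_0_m", "TEMP_C_5000_m"]
sns.pairplot(df[cols + ["ptype"]], hue="ptype", diag_kind="hist", corner=True)
plt.show()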
Instructions
Execute the cell below to generate the comparison plot.
from applications_tech import display_correlation_plot_dashboard
display_correlation_plot_dashboard()

Exercise 2b
Open your Machine Learning Model Handbook to Exercise 2b. Then describe your exploratory data analysis of any target and input features of note. Include the following:
- How many rain and snow records are in the dataset?
- Do the distributions of values make sense for the physical world?
- Are there any unexpected values?
- Which input features may be the strongest predictors of rain vs snow?
- Include any important plots to illustrate your conclusions. Limit yourself to 5 plots.
To copy a plot image, hold shift, right click on the image, then select Copy.
Part 2c: Create a data splitting strategy¶
Next we create a data splitting strategy. Data splitting refers to the process of dividing data into three groups: training, validation, and testing. Each of these groups represent a part of the iterative process for machine learning model development.
- Training data is the largest subset, usually around 60-80% of the total data, and is used to initially train the model.
- Validation data is roughly 10-20% of the total data, and is used to validate the effectiveness of the training process.
- Testing data is also roughly 10-20% of the total data, and is used to test the final refined model before using it on new, unseen data.
Each group must be kept separate from the others so that information from one group does not leak into another and bias the model evaluation. Sam and the team used the following percentages in their original model:
Group | Percent of total data |
---|---|
Training | 75% |
Validation | 15% |
Testing | 10% |
Instructions
Execute the cell below.
In the Dataset Split Percentages widget, input the percentages Sam used in their original model (above).
Select Submit after making your selection.
from applications_tech import create_percentage_widget
widget, get_values = create_percentage_widget()

Instructions
Execute the three cells below to split the data according to the percentages you submitted above.
decimals = get_values()
training = decimals['training']
validation = decimals['validation']
testing = decimals['testing']
from applications_tech import train_val_test_split
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(df,
y_col='ptype',
train_size=training,
val_size=validation,
test_size=testing)
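For reference, a three-way split like this is often built from scikit-learn's train_test_split applied twice. The sketch below reproduces the 75/15/10 percentages under that assumption; it is not the applications_tech implementation.
from sklearn.model_selection import train_test_split
# Separate the input features from the target feature
X = df.drop(columns="ptype")
y = df["ptype"]
# First hold out 10% of the records for testing, stratified by precipitation type
X_rest, X_test_sketch, y_rest, y_test_sketch = train_test_split(
    X, y, test_size=0.10, stratify=y, random_state=42)
# Then take 15% of the total (15/90 of the remainder) for validation
X_train_sketch, X_val_sketch, y_train_sketch, y_val_sketch = train_test_split(
    X_rest, y_rest, test_size=0.15 / 0.90, stratify=y_rest, random_state=42)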
Part 3: Model Development¶
Next begins the iterative process of creating, evaluating, and refining the machine learning model. You will start by recreating Sam's original model and critically evaluating its performance. Then, you will refine the model based on your own choices, keeping track of your trials in your Machine Learning Model Handbook.

Part 3a: Choose Algorithm¶
When selecting an algorithm, we want one that is appropriate for the data and isn't prone to overfitting. Sam's team considered the following two algorithms: LogisticRegression (Linear) and Random Forest. Despite its name, the LogisticRegression (Linear) algorithm is used for classification tasks: at its core, it predicts the likelihood of a record being rain or snow, then returns the most likely classification. You can read more about each algorithm in the information below.
About the Algorithms
LogisticRegression (Linear)
- Works well for simple problems: If your data is not too complex, it often gives good results.
- Fast & efficient: It trains quickly, even on large datasets.
- Less likely to overfit: It doesn’t easily memorize the training data.
- Struggles with complex patterns: If the decision boundary isn’t a straight line, logistic regression may not work well.
- Sensitive to outliers: A few extreme values can heavily influence the model, making it less reliable.
Random Forest
- Handles complex data well: Random Forest can model complex, nonlinear patterns in the data.
- Not sensitive to outliers: A few extreme values won't throw off the model like they might in a linear algorithm.
- Good accuracy: It's often more accurate than simpler models.
- Moderate risk of overfitting: Certain situations may cause the algorithm to create a decision boundary that is too complex, or overfits to the training data.
- Slower and more computationally expensive: For large datasets, this algorithm takes much more time and computational power compared to the linear algorithm.
Sam and the team chose the LogisticRegression (Linear) algorithm to train their original model. While you will initially use this algorithm to recreate the original model, you will have an opportunity later to test other options.
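Both options correspond to standard scikit-learn estimators. The exact settings used by algorithm_selection() are not documented here, so the sketch below only shows typical instantiations and should be treated as an assumption.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Hypothetical defaults; the course widget may configure these differently
linear_model = LogisticRegression(max_iter=1000)
forest_model = RandomForestClassifier(n_estimators=100, random_state=42)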
Instructions
Execute the two cells below.
After executing algorithm_selection(), select LogisticRegression (Linear) to match Sam's choice. You will learn more about both algorithms in a later step.
from applications_tech import algorithm_selection
selected_algo = algorithm_selection()

Part 3b: Choose input features¶
To recreate Sam's model, you'll use the same input features they used to create their original model. The features they chose are in the table below.
Sam's input features |
---|
TEMP_C_0_m |
T_DEWPOINT_C_0_m |
Instructions
Execute the two cells below.
After executing create_column_filter_widget(), select the input features used in Sam's model.
from applications_tech import create_column_filter_widget
widget, get_selected_columns = create_column_filter_widget()
display(widget)

Instructions
Execute the cell below to filter out any features that will not be used as input to the machine learning model.
X_train_filtered = X_train[get_selected_columns()]
X_val_filtered = X_val[get_selected_columns()]
X_test_filtered = X_test[get_selected_columns()]
Part 3c: Train the Algorithm¶
The training process is what transforms the machine learning algorithm into a supervised machine learning model. You will now train the algorithm to recreate Sam's model.
Instructions
Execute the two cells below.
After executing train_button(), select the Train Algorithm button to initiate the training process. A progress printout will display below the button while the process runs.
from applications_tech import train_button
model_choice = selected_algo()
trained_model = train_button(model_choice, X_train_filtered, y_train)

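Conceptually, the Train Algorithm button fits the chosen estimator to the filtered training data. Assuming model_choice is a scikit-learn estimator like those sketched in Part 3a, training reduces to a single call:
# Minimal sketch of the step the Train Algorithm button performs for you
model_choice.fit(X_train_filtered, y_train)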
Part 3d: Validate the Model¶
The validation step uses the separate validation dataset to evaluate how well the training process performed. The newly trained model takes the environmental variable input features from the validation dataset and outputs a rain or snow classification prediction. We can then take the predictions the model made and compare them to the known true classifications. By using a separate validation dataset to evaluate performance, we get a better sense of how well the model can generalize to new inputs.
Sam's team calculated the model's accuracy to determine how well the model performed, which you will also do in this step. Accuracy is defined as $$ \frac{\text{correct predictions}}{\text{total predictions}} $$
Recall that Sam's model returned an 88% accuracy initially.
Instructions
Execute the two cells below. The model accuracy should be close to the 88% that Sam and the team originally found.
# Get the model
model = trained_model()
y_pred = model.predict(X_val_filtered)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
# Get accuracy metric
accuracy = (accuracy_score(y_val, y_pred))*100
print(f"Original Model Accuracy (validation dataset): {accuracy:.2f}%")
Original Model Accuracy (validation dataset): 87.47%
Why does the model not produce 88% accuracy each time?
As discussed in the previous module, machine learning algorithms generate and test multiple candidate decision boundaries each time the training process is run. The boundary a trained model settles on is unlikely to be exactly the same each time because of randomness built into the process, such as how the data are shuffled and split and, for some algorithms, how the model is initialized. This is a demonstration of machine learning being an approximator and not an exact solution, like an algebraic formula.
Other evaluation metrics¶
While accuracy gives a broad view of how well the model performs, we don't know the specifics. Does the model struggle to classify just rain, just snow, or both? You will calculate additional statistics for Sam and the team to provide additional insight into the model's performance.
Confusion matrix¶
A confusion matrix is a visual representation of all the predictions the model made. In this two-class model, there can be four kinds of outputs:
- Predicted rain, and the true observation (target feature) was rain
- Predicted rain, but the true observation was snow
- Predicted snow, but the true observation was rain
- Predicted snow, and the true observation was snow
The confusion matrix shows the number of each of these types of outputs in a grid, so we can visualize the predictions that the model generated.
Instructions
Execute the two cells below to generate the confusion matrix.
from applications_tech import plot_confusion_matrix
fig = plot_confusion_matrix(model.classes_, y_true=y_val, y_pred=y_pred,
title='Original Model Confusion Matrix')
plt.show()
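If you would like the raw counts behind the figure, the confusion_matrix function imported earlier can provide them; a small optional sketch:
# Rows are the true class, columns are the predicted class
cm = confusion_matrix(y_val, y_pred, labels=["rain", "snow"])
print(pd.DataFrame(cm, index=["true rain", "true snow"], columns=["pred rain", "pred snow"]))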
The values in the confusion matrix can then be used to quantify additional model evaluation metrics. These metrics help answer specific questions about how well the model performs.
Precision¶
Precision is a measure of correct predictions for each class. It answers the question "When the model predicts rain (or snow), how often is it correct?" We calculate the precision of the rain class by
$$ \frac{\text{correct rain predictions}}{\text{correct rain predictions + incorrect rain predictions}} $$
For example, a rain precision value of 0.65 would tell us that of the times that the model predicted rain, it was correct only 65% of the time. A high-performing model would have a precision value close to 1.
Recall¶
Recall is a measure of how well the model predicts the correct classification for each class. It answers the question "Out of all actual rain (or snow) cases, how many did the model correctly predict?" We calculate it as
$$ \frac{\text{correct rain predictions}}{\text{total actual rain cases}} $$
For example, a rain recall value of 0.81 would tell us that of all of the true rain cases in the dataset, the model correctly predicted rain in 81% of them. A high-performing model would have a recall value close to 1.
Comparing Precision and Recall¶
These metrics may seem very similar, but they differ in the types of questions they answer. The table below demonstrates the differences in high and low precision and recall for one class (rain).
High Precision, Low Recall | High Recall, Low Precision | |
---|---|---|
What happens? | The model is very careful when predicting rain, but it misses a lot of actual rain cases. | The model catches almost all rain cases, but it also makes mistakes, calling snow "rain" a lot. |
Example | Predicts "rain" only when very sure, leading to fewer false alarms but missing real rain. | Predicts "rain" too often, ensuring real rain isn’t missed but causing more false alarms. |
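To connect these formulas back to the confusion matrix, the sketch below computes rain precision and recall directly from the cm array in the optional snippet above (rows are true classes, columns are predictions).
# Rain precision: correct rain predictions divided by all rain predictions
rain_precision = cm[0, 0] / cm[:, 0].sum()
# Rain recall: correct rain predictions divided by all actual rain cases
rain_recall = cm[0, 0] / cm[0, :].sum()
print(f"Rain precision: {rain_precision:.3f}, Rain recall: {rain_recall:.3f}")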
Instructions
Execute the cell below to generate the original model's precision and recall for each class.
precision, recall, _, _ = precision_recall_fscore_support(y_val, y_pred, average=None, labels=['rain', 'snow'])
print("Original Model Validation Metrics")
print(f"Rain Precision: {precision[0]:.3f}")
print(f"Snow Precision: {precision[1]:.3f}")
print(f"Rain Recall: {recall[0]:.3f}")
print(f"Snow Recall: {recall[1]:.3f}")
Original Model Validation Metrics
Rain Precision: 0.896
Snow Precision: 0.866
Rain Recall: 0.736
Snow Recall: 0.952
Part 3e: Evaluate and Refine the Model¶
Examine the results of the original model validation and determine what each metric means. Review the descriptions of the evaluation metrics, then complete the next exercise.
Exercise 3e
Open your Machine Learning Model Handbook to Exercise 3e.
Paste the validation evaluation metrics in the designated box.
Then describe the results of the original model validation. Include the following:
- How well does the model predict rain? Support your description with the evaluation metrics.
- How well does the model predict snow? Support your description with the evaluation metrics.
- How do you interpret these statistics in the context of the physical world?
- What changes will you make to try to improve these statistics in the next iteration?
Part 3f: Iterative Refinement Trials¶
Now that you have recreated Sam's original model, it's your turn to apply what you have learned to refine it. Next, you'll create new trials to improve the evaluation metrics from the validation phase. You may complete as many trials as you like until you are satisfied with the evaluation metrics, or they no longer improve with new trials.
Instructions
Execute the code cells below, selecting your desired model configurations after executing each cell.
After each new trial, you will copy the validation metrics in your handbook document. See Exercise 3f.
You may complete as many trials in this section (3f) as you like until you are satisfied with the evaluation metrics, or they no longer improve with new trials.
New trial: Choose algorithm¶
In the new trials, you have the option of choosing a new algorithm to train. Review the information about each of the algorithms in section 3a, then make your selection. Remember, you can run as many trials as you'd like.
selected_algo2 = algorithm_selection()

New trial: Choose input features¶
Sam used environmental variables at the surface level to train the algorithm. As you've seen from the Data Handling phase, the dataset also includes environmental variables at other levels that may provide additional predictive skill. Choose as many input features as you like, but recall that more data doesn't always produce a better model. Revisit the comparison plots from section 2b and be strategic in your selections.
widget, get_selected_columns2 = create_column_filter_widget()
display(widget)

X_train_filtered = X_train[get_selected_columns2()]
X_val_filtered = X_val[get_selected_columns2()]
X_test_filtered = X_test[get_selected_columns2()]
New trial: Train algorithm¶
Now, train the algorithm with your selected input features.
model_choice = selected_algo2()
trained_model2 = train_button(model_choice, X_train_filtered, y_train)

New trial: Validate model¶
Generate the corresponding confusion matrix and validation metrics.
from applications_tech import classification_model_eval
model2 = trained_model2()
validation_metrics = classification_model_eval(model2, X_val_filtered, y_val, title='Validation')
Validation Metrics
Model Type: RandomForestClassifier
Input Features: TEMP_C_0_m, VGRD_m/s_0_m, T_DEWPOINT_C_1000_m, TEMP_C_5000_m, T_DEWPOINT_C_5000_m
Accuracy: 90.53%
Rain Precision: 0.927
Snow Precision: 0.896
Rain Recall: 0.799
Snow Recall: 0.965
Exercise 3f
- In your Machine Learning Model Handbook Exercise 3f, paste the full output of each of your validation trials, one per box.
- You may complete as many trials as you like until you are satisfied with the evaluation metrics, or they no longer improve with new trials. When complete, move on to the next part below.
Part 3g: Test Model¶
Important
For testing, your model needs to be in a state with your desired algorithm and input features. If you haven't already, go back and run through the cells in Part 3f with your final choices one last time. This ensures that the final testing process is executed with your desired configuration.
At this point, you have a trained model with validation metrics you are satisfied with. Next, it's time to test the model on brand new data: the testing dataset. The testing process mimics, in a final and unbiased way, how the model would be used in the real world.
Testing looks very similar to validation. You will again review the confusion matrix, accuracy, precision, and recall, but this time the predictions are made with the testing dataset.
Instructions
Execute the cell below.
After executing
classification_model_eval()
, your model's testing metrics will appear below as a printout.
test_metrics = classification_model_eval(model2, X_test_filtered, y_test, title='Testing')
Testing Metrics
Model Type: RandomForestClassifier
Input Features: TEMP_C_0_m, VGRD_m/s_0_m, T_DEWPOINT_C_1000_m, TEMP_C_5000_m, T_DEWPOINT_C_5000_m
Accuracy: 90.20%
Rain Precision: 0.918
Snow Precision: 0.894
Rain Recall: 0.802
Snow Recall: 0.959
Part 3h: Evaluate and Justify¶
Your final decision¶
Given all your evaluation, it's time to make a final recommendation to Sam and the team on whether you believe this model provides sufficient skill for the needs of the situation. Go back and review the problem statement. Does this model deliver the results needed?
Exercise 3h
Open your Machine Learning Model Handbook to Exercise 3h.
Paste your testing evaluation metrics in the designated box.
Then make a final decision on whether this model delivers on the results needed with supporting justification. Include the following:
- Which precipitation class(es) had the best evaluation metrics? List some physical scientific reasons why this may be the case.
- Is this model ready for use in the real world? Why or why not?
- What other possible changes could further improve this model?
Conclusion¶
No matter your final decision, the most important aspect is your ability to justify it using data and a robust evaluation. In real-world applications, decision makers rely on this type of analysis to assess risk, costs, and benefits. Rarely is there a single correct answer. What matters is the reasoning behind your choices and how well they align with the environmental context. As you continue learning and exploring new machine learning techniques, remember that each model you build is an opportunity to deepen your understanding. With each step, you gain the expertise to make informed, justifiable decisions that contribute to meaningful scientific and real-world advancements.
Acknowledgements¶
This work was supported by NSF Unidata under award #2319979 from the US National Science Foundation. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation.
We thank the NOAA National Severe Storms Laboratory for contributing mPing data to this project. We also thank NCAR MILES for data processing and support.