import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.decomposition import PCA
Our cleaning and processing steps are explained in our Exploratory notebook, linked here.
seattle2020 = pd.read_csv("https://facultyweb.cs.wwu.edu/~wehrwes/courses/csci141_21s/fp/data/WA_Seattle.csv", low_memory = False)
seattle2010 = pd.read_csv("10yearweather.csv", low_memory = False)
seattle2021 = pd.read_csv("2021weather.csv", low_memory = False)
dfs = [seattle2020, seattle2010]
seattle = pd.concat(dfs)
sea_mask = seattle["REPORT_TYPE"].str.contains("SOD")
seattle = seattle[sea_mask]
daily_cols = ["DailyAverageDewPointTemperature", "DailyAverageDryBulbTemperature", "DailyAverageRelativeHumidity",
"DailyAverageSeaLevelPressure", "DailyAverageStationPressure", "DailyAverageWetBulbTemperature",
"DailyAverageWindSpeed", "DailyCoolingDegreeDays", "DailyDepartureFromNormalAverageTemperature",
"DailyHeatingDegreeDays", "DailyMaximumDryBulbTemperature", "DailyMinimumDryBulbTemperature",
"DailyPeakWindDirection", "DailyPeakWindSpeed", "DailySnowDepth", "DailySnowfall",
"DailySustainedWindDirection", "DailySustainedWindSpeed",
"DailyPrecipitation"]
seattle_daily = seattle[daily_cols].dropna()
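# "T" marks a trace amount and a trailing "s" flags a suspect value in this NOAA data,
# so we treat traces as 0.0 and strip the flag before casting these columns to float.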
seattle_daily["DailySnowDepth"] = seattle_daily["DailySnowDepth"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle_daily["DailySnowfall"] = seattle_daily["DailySnowfall"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle_daily["DailyPrecipitation"] = seattle_daily["DailyPrecipitation"].str.replace("T", "0.0").str.rstrip("s").astype(float)
sea21_mask = seattle2021["REPORT_TYPE"].str.contains("SOD")
seattle2021 = seattle2021[sea21_mask]
seattle2021 = seattle2021[daily_cols].dropna()
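# Apply the same trace/suspect-flag cleanup to the 2021 hold-out set; here the wind
# columns also contain these markers.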
seattle2021["DailyPeakWindDirection"] = seattle2021["DailyPeakWindDirection"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySnowfall"] = seattle2021["DailySnowfall"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailyPrecipitation"] = seattle2021["DailyPrecipitation"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailyPeakWindSpeed"] = seattle2021["DailyPeakWindSpeed"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySustainedWindDirection"] = seattle2021["DailySustainedWindDirection"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySustainedWindSpeed"] = seattle2021["DailySustainedWindSpeed"].str.replace("T", "0.0").str.rstrip("s").astype(float)
X = seattle_daily.drop("DailyPrecipitation", axis=1)
y = seattle_daily["DailyPrecipitation"]
X_test_2021 = seattle2021.drop("DailyPrecipitation", axis=1)
y_test_2021 = seattle2021["DailyPrecipitation"]
We decided to compare 12 different regression models using our predetermined evaluation metrics to determine which model to run on our testing data. To do this effectively, we upgraded the find_train() function from our Exploratory notebook: it now takes a "zoo" of different regression models, trains each one on X and y, and outputs our evaluation metrics (R2, MAE, RMSE) for every model, formatted into a DataFrame. We used a StandardScaler() to center and scale our features.
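For reference, StandardScaler() standardizes each feature independently: z = (x - mean) / std. A minimal sketch of its effect on a toy column (not our data):
from sklearn.preprocessing import StandardScaler

# Toy column with values 1, 2, 3: standardizing gives zero mean and unit variance.
demo = StandardScaler().fit_transform([[1.0], [2.0], [3.0]])
print(demo.ravel())  # roughly [-1.22, 0.0, 1.22]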
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
Our zoo includes our baseline model (which always predicts the mean) and a variety of regression models. For the models that seemed most promising, we added hyperparameters tuned to improve their scores.
zoo = {
    "Baseline": DummyRegressor(strategy="mean"),
    "DT": DecisionTreeRegressor(),
    "SVR": SVR(kernel='rbf', C=1.0, gamma='scale'),
    "KNN": KNeighborsRegressor(n_neighbors=3),
    "NN": MLPRegressor((8, 8), solver='lbfgs', random_state=2, max_iter=4500000),
    "LR": LinearRegression(),
    "KR": KernelRidge(),
    "BR": BayesianRidge(),
    "GBR": GradientBoostingRegressor(n_estimators=25, max_features='auto'),
    "RFR": RandomForestRegressor(n_estimators=10),
    "Lasso": Lasso(alpha=2.0),
    "Ridge": Ridge(tol=1)
}
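Our tuning here was informal: we hand-adjusted the settings above and compared the resulting scores. As a sketch of a more systematic alternative (not something we ran), scikit-learn's GridSearchCV could search a small grid for one of these models; the parameter values below are illustrative guesses:
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.1, 1.0, 10.0], "gamma": ["scale", "auto"]}
search = GridSearchCV(SVR(kernel='rbf'), param_grid, scoring='r2', cv=5)
# search.fit(...) on the scaled training data would then expose search.best_params_.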
def find_train(X, y, zoo):
    reg_model = []
    r2_train = []
    r2_val = []
    mae_train = []
    mae_val = []
    rmse_train = []
    rmse_val = []
    # Split before scaling so the scaler is fit on the training rows only.
    X_train, X_etc, y_train, y_etc = train_test_split(X, y, random_state=12, test_size=0.25, shuffle=True)
    X_val, X_test, y_val, y_test = train_test_split(X_etc, y_etc, random_state=12, train_size=.8, shuffle=True)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    # Fit every model in the zoo and record R2, MAE, and RMSE on the training and validation sets.
    for model in zoo:
        regressor = zoo[model]
        regressor.fit(X_train, y_train)
        y_train_pred = regressor.predict(X_train)
        y_val_pred = regressor.predict(X_val)
        reg_model.append(model)
        r2_train.append(round(r2_score(y_train, y_train_pred), 4))
        r2_val.append(round(r2_score(y_val, y_val_pred), 4))
        mae_train.append(round(mean_absolute_error(y_train, y_train_pred), 4))
        mae_val.append(round(mean_absolute_error(y_val, y_val_pred), 4))
        rmse_train.append(round(np.sqrt(mean_squared_error(y_train, y_train_pred)), 4))
        rmse_val.append(round(np.sqrt(mean_squared_error(y_val, y_val_pred)), 4))
    return make_table(reg_model, r2_train, r2_val, mae_train, mae_val, rmse_train, rmse_val)
We made a second evaluation environment that performs polynomial feature expansion on X and computes the same evaluation metrics in a DataFrame. This lets us compare each model's scores with and without polynomial feature expansion.
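As a quick illustration of what the expansion does (a toy example, not our data): with degree 2, two features a and b become [1, a, b, a^2, a*b, b^2].
from sklearn.preprocessing import PolynomialFeatures

# One row with a=2, b=3 expands to [1, a, b, a^2, a*b, b^2].
print(PolynomialFeatures(degree=2).fit_transform([[2.0, 3.0]]))
# [[1. 2. 3. 4. 6. 9.]]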
def find_train2(X, y, zoo):
    reg_model = []
    r2_train = []
    r2_val = []
    mae_train = []
    mae_val = []
    rmse_train = []
    rmse_val = []
    X_train, X_etc, y_train, y_etc = train_test_split(X, y, random_state=12, test_size=0.2, shuffle=True)
    X_val, X_test, y_val, y_test = train_test_split(X_etc, y_etc, random_state=12, train_size=.8, shuffle=True)
    # Fit the degree-2 polynomial expansion on the training split, then apply it to the validation split.
    poly = PolynomialFeatures(degree=2)
    transTrainx = poly.fit_transform(X_train)
    transValx = poly.transform(X_val)
    for model in zoo:
        regressor = zoo[model].fit(transTrainx, y_train)
        y_train_pred = regressor.predict(transTrainx)
        y_val_pred = regressor.predict(transValx)
        reg_model.append(model)
        r2_train.append(round(r2_score(y_train, y_train_pred), 4))
        r2_val.append(round(r2_score(y_val, y_val_pred), 4))
        mae_train.append(round(mean_absolute_error(y_train, y_train_pred), 4))
        mae_val.append(round(mean_absolute_error(y_val, y_val_pred), 4))
        rmse_train.append(round(np.sqrt(mean_squared_error(y_train, y_train_pred)), 4))
        rmse_val.append(round(np.sqrt(mean_squared_error(y_val, y_val_pred)), 4))
    return make_table(reg_model, r2_train, r2_val, mae_train, mae_val, rmse_train, rmse_val)
The function make_table() returns a DataFrame with each model in the zoo and its evaluation metrics on both the training and validation sets.
def make_table(reg_model, r2_train, r2_val, mae_train, mae_val, rmse_train, rmse_val):
    dic = {
        "Model": reg_model,
        "R2 train": r2_train,
        "R2 val": r2_val,
        "MAE train": mae_train,
        "MAE val": mae_val,
        "RMSE train": rmse_train,
        "RMSE val": rmse_val
    }
    return pd.DataFrame(dic)
This table shows the initial scores of all our models.
initial = find_train(X,y,zoo)
initial
| | Model | R2 train | R2 val | MAE train | MAE val | RMSE train | RMSE val |
|---|---|---|---|---|---|---|---|
0 | Baseline | 0.0000 | -0.0012 | 0.1511 | 0.1600 | 0.2389 | 0.2678 |
1 | DT | 1.0000 | 0.1907 | 0.0000 | 0.1128 | 0.0000 | 0.2408 |
2 | SVR | 0.5618 | 0.4936 | 0.0956 | 0.1104 | 0.1582 | 0.1904 |
3 | KNN | 0.6523 | 0.3665 | 0.0671 | 0.1006 | 0.1409 | 0.2130 |
4 | NN | 0.6950 | 0.3504 | 0.0703 | 0.0977 | 0.1319 | 0.2157 |
5 | LR | 0.4024 | 0.3843 | 0.1151 | 0.1267 | 0.1847 | 0.2100 |
6 | KR | 0.1809 | 0.1863 | 0.1417 | 0.1531 | 0.2162 | 0.2414 |
7 | BR | 0.4020 | 0.3836 | 0.1146 | 0.1260 | 0.1848 | 0.2101 |
8 | GBR | 0.5709 | 0.4560 | 0.0858 | 0.0988 | 0.1565 | 0.1974 |
9 | RFR | 0.8986 | 0.4936 | 0.0357 | 0.0954 | 0.0761 | 0.1904 |
10 | Lasso | 0.0000 | -0.0012 | 0.1511 | 0.1600 | 0.2389 | 0.2678 |
11 | Ridge | 0.4019 | 0.3835 | 0.1146 | 0.1260 | 0.1848 | 0.2101 |
This shows that the top model is SVR, with an R2 train score of 0.5618 and a validation score of 0.4936. We chose the top model as the one with the highest validation R2, preferring the smallest difference between training and validation R2. To make model selection easier, we also pulled these numbers and the top model out of the table; a sketch of one way to do that follows.
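A sketch of one such helper (hypothetical, not part of our evaluation code): rank by validation R2 and break ties with the train/validation gap.
def top_model(table):
    # Highest validation R2 first; among ties, prefer the smallest train/validation gap.
    ranked = table.assign(gap=(table["R2 train"] - table["R2 val"]).abs())
    return ranked.sort_values(["R2 val", "gap"], ascending=[False, True]).iloc[0]

print(top_model(initial)[["Model", "R2 train", "R2 val"]])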
We ran the same data through our polynomial evaluation environment and found that, with polynomial feature expansion applied, BR (BayesianRidge) is the best model.
poly_table_1 = find_train2(X,y,zoo)
poly_table_1
| | Model | R2 train | R2 val | MAE train | MAE val | RMSE train | RMSE val |
|---|---|---|---|---|---|---|---|
0 | Baseline | 0.0000 | -0.0011 | 0.1536 | 0.1636 | 0.0596 | 0.0911 |
1 | DT | 1.0000 | 0.1738 | 0.0000 | 0.1201 | 0.0000 | 0.0752 |
2 | SVR | 0.3420 | 0.2731 | 0.1085 | 0.1197 | 0.0392 | 0.0662 |
3 | KNN | 0.6322 | 0.2462 | 0.0718 | 0.1183 | 0.0219 | 0.0686 |
4 | NN | -1.1267 | -0.8641 | 0.2547 | 0.2751 | 0.1267 | 0.1697 |
5 | LR | 0.5624 | 0.3617 | 0.0925 | 0.1160 | 0.0261 | 0.0581 |
6 | KR | 0.5594 | 0.4762 | 0.0926 | 0.1105 | 0.0263 | 0.0477 |
7 | BR | 0.5094 | 0.4810 | 0.1002 | 0.1104 | 0.0292 | 0.0472 |
8 | GBR | 0.6280 | 0.4436 | 0.0825 | 0.1015 | 0.0222 | 0.0506 |
9 | RFR | 0.9064 | 0.4768 | 0.0352 | 0.1000 | 0.0056 | 0.0476 |
10 | Lasso | 0.3783 | 0.3378 | 0.1150 | 0.1268 | 0.0370 | 0.0603 |
11 | Ridge | 0.5595 | 0.4765 | 0.0927 | 0.1105 | 0.0262 | 0.0477 |
The polynomial BR model has a smaller gap between its training and validation scores, which we liked. However, the original SVR model scored higher on both the training and validation sets.
Next, based on the regression coefficient and PCA analysis from our Exploratory notebook, we decided to train the models on only the ten columns with the strongest relationships to precipitation amount. We ran this reduced feature set through our first evaluation environment.
top_cols = ["DailyAverageStationPressure", "DailyAverageSeaLevelPressure", "DailyAverageWetBulbTemperature",
            "DailyAverageDewPointTemperature", "DailySnowfall", "DailySnowDepth", "DailyAverageRelativeHumidity",
            "DailyPeakWindSpeed", "DailyDepartureFromNormalAverageTemperature", "DailyMaximumDryBulbTemperature"]
X2 = seattle_daily[top_cols]
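# As a rough sanity check on this selection (simple absolute correlations with precipitation,
# not the coefficient/PCA analysis from the Exploratory notebook):
num = seattle_daily.apply(pd.to_numeric, errors="coerce")
corr = num.corr()["DailyPrecipitation"].abs().drop("DailyPrecipitation")
print(corr.sort_values(ascending=False).head(10))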
top_cols_table = find_train(X2,y,zoo)
top_cols_table
| | Model | R2 train | R2 val | MAE train | MAE val | RMSE train | RMSE val |
|---|---|---|---|---|---|---|---|
0 | Baseline | 0.0000 | -0.0012 | 0.1511 | 0.1600 | 0.2389 | 0.2678 |
1 | DT | 1.0000 | 0.1469 | 0.0000 | 0.1140 | 0.0000 | 0.2472 |
2 | SVR | 0.5192 | 0.4879 | 0.1074 | 0.1210 | 0.1657 | 0.1915 |
3 | KNN | 0.6735 | 0.3375 | 0.0663 | 0.1056 | 0.1365 | 0.2178 |
4 | NN | 0.6029 | 0.4976 | 0.0762 | 0.0943 | 0.1505 | 0.1897 |
5 | LR | 0.4015 | 0.3811 | 0.1152 | 0.1271 | 0.1848 | 0.2105 |
6 | KR | 0.1800 | 0.1854 | 0.1416 | 0.1527 | 0.2163 | 0.2415 |
7 | BR | 0.4014 | 0.3810 | 0.1149 | 0.1267 | 0.1848 | 0.2106 |
8 | GBR | 0.5719 | 0.4555 | 0.0855 | 0.0989 | 0.1563 | 0.1975 |
9 | RFR | 0.9050 | 0.4806 | 0.0346 | 0.0949 | 0.0736 | 0.1929 |
10 | Lasso | 0.0000 | -0.0012 | 0.1511 | 0.1600 | 0.2389 | 0.2678 |
11 | Ridge | 0.4011 | 0.3808 | 0.1148 | 0.1264 | 0.1849 | 0.2106 |
This SVR model scores higher than the polynomial BR model and shows a smaller train/validation gap than the initial SVR model, which suggests that dropping the remaining columns helps the model.
Next, we combined both strategies and ran our top ten columns through the polynomial feature environment. This shows that the highest-scoring model is Linear Regression, which we ended up using as our final model.
poly_top_ten = find_train2(X2,y,zoo)
poly_top_ten
| | Model | R2 train | R2 val | MAE train | MAE val | RMSE train | RMSE val |
|---|---|---|---|---|---|---|---|
0 | Baseline | 0.0000 | -0.0011 | 0.1536 | 0.1636 | 0.0596 | 0.0911 |
1 | DT | 1.0000 | 0.1975 | 0.0000 | 0.1211 | 0.0000 | 0.0731 |
2 | SVR | 0.4270 | 0.3881 | 0.1073 | 0.1176 | 0.0341 | 0.0557 |
3 | KNN | 0.6581 | 0.2287 | 0.0700 | 0.1152 | 0.0204 | 0.0702 |
4 | NN | 0.1496 | 0.1414 | 0.1450 | 0.1504 | 0.0507 | 0.0782 |
5 | LR | 0.5416 | 0.5158 | 0.0915 | 0.1033 | 0.0273 | 0.0441 |
6 | KR | 0.5360 | 0.5106 | 0.0921 | 0.1032 | 0.0276 | 0.0446 |
7 | BR | 0.5141 | 0.4887 | 0.0969 | 0.1076 | 0.0290 | 0.0465 |
8 | GBR | 0.6082 | 0.4559 | 0.0834 | 0.1005 | 0.0233 | 0.0495 |
9 | RFR | 0.8983 | 0.4624 | 0.0367 | 0.1033 | 0.0061 | 0.0489 |
10 | Lasso | 0.3758 | 0.3367 | 0.1151 | 0.1263 | 0.0372 | 0.0604 |
11 | Ridge | 0.5362 | 0.5118 | 0.0920 | 0.1029 | 0.0276 | 0.0444 |
This Linear Regression model has higher scores than the polynomial SVR and a smaller train/validation gap than our initial model. Even though the initial SVR model scored higher on the training set (0.5618 vs. 0.5416), it scored lower on the validation set (0.4936 vs. 0.5158), so we opted for the Linear Regression model; the smaller gap between its scores suggests less risk of overfitting.
Our next step was to build our final linear regression model with polynomial feature expansion on the reduced column set.
X_train, X_etc, y_train, y_etc = train_test_split(X2, y, random_state=12, test_size=0.2, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_etc, y_etc, random_state=12, train_size=.8, shuffle=True)
print("Shapes of data splits:")
print("X_Train:", X_train.shape)
print("X_Val:", X_val.shape)
print("X_Test:", X_test.shape)
print("y_Train:", y_train.shape)
print("y_Val:", y_val.shape)
print("y_Test:", y_test.shape)
poly = PolynomialFeatures(degree = 2)
transTrainx = poly.fit_transform(X_train)
transValx = poly.transform(X_val)
regressor = LinearRegression().fit(transTrainx, y_train)
y_train_pred = regressor.predict(transTrainx)
y_val_pred = regressor.predict(transValx)
print("\nTrain Score:", round(regressor.score(transTrainx, y_train), 4))
print("Val Score:", round(regressor.score(transValx, y_val),4))
print("Train MAE:", round(mean_absolute_error(y_train, y_train_pred),4))
print("Val MAE:", round(mean_squared_error(y_val, y_val_pred),4))
print("Train RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, y_val_pred)))
Shapes of data splits:
X_Train: (3098, 10)
X_Val: (620, 10)
X_Test: (155, 10)
y_Train: (3098,)
y_Val: (620,)
y_Test: (155,)

Train Score: 0.5416
Val Score: 0.5158
Train MAE: 0.0915
Val MAE: 0.1033
Train RMSE: 0.1652700487419283
Val RMSE: 0.20995026659693175
We plotted our linear regression model's predictions against the ground truth for the training and validation sets. Overall, the model performed best on the training set.
graph = sns.regplot(x=y_train_pred,y=y_train)
graph.set_title("Training Set Pred vs Ground Truth")
graph.set(xlabel='Predicted Values', ylabel='Ground Truth');
graph = sns.regplot(x=y_val_pred,y=y_val)
graph.set_title("Validation Set Pred vs Ground Truth")
graph.set(xlabel='Predicted Values', ylabel='Ground Truth');
Our next step was to run our model on both of our testing sets.
X_test = poly.transform(X_test)
y_test_pred = regressor.predict(X_test)
print("Test Score:", regressor.score(X_test, y_test))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
Test Score: 0.504189627414269 Test RMSE: 0.171635458592776
On our first testing set, our R2 score was 0.504, with a root mean squared error of 0.172. Compared to our validation set, it scored slightly worse, but had a smaller RMSE.
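For comparability with the MAE columns in the tables above, the test MAE can be reported the same way (a small addition; we do not record its value here):
print("Test MAE:", round(mean_absolute_error(y_test, y_test_pred), 4))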
Below is a scatterplot of how our model performed on the first testing set.
graph = sns.regplot(x=y_test_pred,y=y_test)
graph.set(xlabel='Predicted Values', ylabel='Ground Truth')
graph.set_title("Test Set Pred vs Ground Truth");
Next, we ran our model on our testing set of 2021 data, which was a year our model had never seen before.
X_test_2021 = seattle2021[top_cols]
X_test_2021 = poly.transform(X_test_2021)
y_test_2021_pred = regressor.predict(X_test_2021)
print("Test 2021 Score:", regressor.score(X_test_2021, y_test_2021))
print("Test 2021 RMSE:", np.sqrt(mean_squared_error(y_test_2021, y_test_2021_pred)))
Test 2021 Score: 0.43479193847866815 Test 2021 RMSE: 0.17568191092076288
This time, our model had an R2 score of 0.435 and an RMSE of 0.176. Its R2 was lower than on any of our other sets, but its RMSE was similar to that of the first testing set.
graph = sns.regplot(x=y_test_2021_pred,y=y_test_2021);
graph.set_title("2021 Test Set Pred vs Ground Truth")
graph.set(xlabel='Predicted Values', ylabel='Ground Truth');
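A possible follow-up check (a sketch we did not run) is to compare total predicted and observed precipitation for 2021, to see whether the model is biased overall:
# Rough bias check: compare predicted vs. observed 2021 totals (values not shown here).
print("Predicted 2021 total:", round(float(y_test_2021_pred.sum()), 2))
print("Observed 2021 total:", round(float(y_test_2021.sum()), 2))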