import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.decomposition import PCA


seattle2020 = pd.read_csv("https://facultyweb.cs.wwu.edu/~wehrwes/courses/csci141_21s/fp/data/WA_Seattle.csv", low_memory = False)
seattle2010 = pd.read_csv("10yearweather.csv", low_memory = False)
seattle2021 = pd.read_csv("2021weather.csv", low_memory = False)

dfs = [seattle2020, seattle2010]
seattle = pd.concat(dfs)

sea_mask = seattle["REPORT_TYPE"].str.contains("SOD")
seattle = seattle[sea_mask]

daily_cols = ["DailyAverageDewPointTemperature", "DailyAverageDryBulbTemperature", "DailyAverageRelativeHumidity", 
        "DailyAverageSeaLevelPressure", "DailyAverageStationPressure", "DailyAverageWetBulbTemperature", 
        "DailyAverageWindSpeed", "DailyCoolingDegreeDays", "DailyDepartureFromNormalAverageTemperature", 
        "DailyHeatingDegreeDays", "DailyMaximumDryBulbTemperature", "DailyMinimumDryBulbTemperature", 
        "DailyPeakWindDirection", "DailyPeakWindSpeed", "DailySnowDepth", "DailySnowfall", 
        "DailySustainedWindDirection", "DailySustainedWindSpeed", 
        "DailyPrecipitation"]
seattle_daily = seattle[daily_cols].dropna()

seattle_daily["DailySnowDepth"] = seattle_daily["DailySnowDepth"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle_daily["DailySnowfall"] = seattle_daily["DailySnowfall"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle_daily["DailyPrecipitation"] = seattle_daily["DailyPrecipitation"].str.replace("T", "0.0").str.rstrip("s").astype(float)


sea21_mask = seattle2021["REPORT_TYPE"].str.contains("SOD")
seattle2021 = seattle2021[sea21_mask]
seattle2021 = seattle2021[daily_cols].dropna()
seattle2021["DailyPeakWindDirection"] = seattle2021["DailyPeakWindDirection"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySnowfall"] = seattle2021["DailySnowfall"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailyPrecipitation"] = seattle2021["DailyPrecipitation"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailyPeakWindSpeed"] = seattle2021["DailyPeakWindSpeed"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySustainedWindDirection"] = seattle2021["DailySustainedWindDirection"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySustainedWindSpeed"] = seattle2021["DailySustainedWindSpeed"].str.replace("T", "0.0").str.rstrip("s").astype(float)


X = seattle_daily.drop("DailyPrecipitation", axis=1)
y = seattle_daily["DailyPrecipitation"]

X_test_2021 = seattle2021.drop("DailyPrecipitation", axis=1)
y_test_2021 = seattle2021["DailyPrecipitation"]


from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor


zoo = {
    "Baseline": DummyRegressor(strategy="mean"),
    "DT": DecisionTreeRegressor(),
    "SVR": SVR(kernel='rbf', C=1.0, gamma='scale'),
    "KNN": KNeighborsRegressor(n_neighbors=3),
    "NN": MLPRegressor((8,8), solver='lbfgs', random_state=2, max_iter = 4500000),
    "LR": LinearRegression(),
    "KR": KernelRidge(),
    "BR": BayesianRidge(),
    "GBR": GradientBoostingRegressor(n_estimators=25, max_features = 'auto'),
    "RFR": RandomForestRegressor(n_estimators = 10),
    "Lasso": Lasso(alpha=2.0),
    "Ridge": Ridge(tol=1)
}


def find_train(X, y, zoo):
    reg_model = []
    r2_train = []
    r2_val = []
    mae_train = []
    mae_val = []
    rmse_train = []
    rmse_val = []
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    X_train, X_etc, y_train, y_etc = train_test_split(X, y, random_state=12, test_size=0.25, shuffle=True)
    X_val, X_test, y_val, y_test = train_test_split(X_etc, y_etc, random_state=12, train_size=.8, shuffle=True)
    
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    
    for model in zoo:
        regressor = zoo[model]
        regressor.fit(X_train, y_train)
        y_train_pred = regressor.predict(X_train)
        y_val_pred = regressor.predict(X_val)
        reg_model.append(model)
        r2_train.append(round(r2_score(y_train, y_train_pred),4))
        r2_val.append(round(r2_score(y_val, y_val_pred),4))
        mae_train.append(round(mean_absolute_error(y_train, y_train_pred),4))
        mae_val.append(round(mean_absolute_error(y_val, y_val_pred),4))
        rmse_train.append(round(np.sqrt(mean_squared_error(y_train, y_train_pred)),4))
        rmse_val.append(round(np.sqrt(mean_squared_error(y_val, y_val_pred)),4))
        
    return make_table(reg_model, r2_train, r2_val, mae_train, mae_val, rmse_train, rmse_val)


def find_train2(X,y,zoo):
    reg_model = []        
    r2_train = []
    r2_val = []
    mae_train = []
    mae_val = []
    rmse_train = []
    rmse_val = []
    

    X_train, X_etc, y_train, y_etc = train_test_split(X, y, random_state=12, test_size=0.2, shuffle=True)
    X_val, X_test, y_val, y_test = train_test_split(X_etc, y_etc, random_state=12, train_size=.8, shuffle=True)
    

    poly = PolynomialFeatures(degree = 2)
    transTrainx = poly.fit_transform(X_train)
    transValx = poly.fit_transform(X_val)
    for model in zoo:
        regressor = zoo[model].fit(transTrainx, y_train)
       
        y_train_pred = regressor.predict(transTrainx)
        y_val_pred = regressor.predict(transValx)
        
        
        reg_model.append(model)
        r2_train.append(round(r2_score(y_train, y_train_pred),4))
        r2_val.append(round(r2_score(y_val, y_val_pred),4))
        mae_train.append(round(mean_absolute_error(y_train, y_train_pred),4))
        mae_val.append(round(mean_absolute_error(y_val, y_val_pred),4))
        rmse_train.append(round(mean_squared_error(y_train, y_train_pred),4))
        rmse_val.append(round(mean_squared_error(y_val, y_val_pred),4))
    return make_table(reg_model, r2_train, r2_val, mae_train, mae_val, rmse_train, rmse_val)


def make_table(reg_model, r2_train, r2_val, mae_train, mae_val, rmse_train, rmse_val):
    dic = {
        "Model": reg_model,
        "R2 train": r2_train,
        "R2 val": r2_val,
        "MAE train": mae_train,
        "MAE val": mae_val,
        "RMSE train": rmse_train,
        "RMSE val": rmse_val
    }
    return pd.DataFrame(dic)


initial = find_train(X,y,zoo)
initial


poly_table_1 = find_train2(X,y,zoo)
poly_table_1

/home/whitev4/311/data311_env/lib/python3.6/site-packages/sklearn/linear_model/_coordinate_descent.py:532: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.45308854350305694, tolerance: 0.018459296233053625
  positive)


top_cols = ["DailyAverageStationPressure", "DailyAverageSeaLevelPressure", "DailyAverageWetBulbTemperature",
            "DailyAverageDewPointTemperature", "DailySnowfall", "DailySnowDepth", "DailyAverageRelativeHumidity"
            ,"DailyPeakWindSpeed", "DailyDepartureFromNormalAverageTemperature", "DailyMaximumDryBulbTemperature"]
X2 = seattle_daily[top_cols]


top_cols_table = find_train(X2,y,zoo)
top_cols_table


poly_top_ten = find_train2(X2,y,zoo)
poly_top_ten


X_train, X_etc, y_train, y_etc = train_test_split(X2, y, random_state=12, test_size=0.2, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_etc, y_etc, random_state=12, train_size=.8, shuffle=True)
print("Shapes of data splits:")
print("X_Train:", X_train.shape)
print("X_Val:", X_val.shape)
print("X_Test:", X_test.shape)
print("y_Train:", y_train.shape)
print("y_Val:", y_val.shape)
print("y_Test:", y_test.shape)

poly = PolynomialFeatures(degree = 2)
transTrainx = poly.fit_transform(X_train)
transValx = poly.fit_transform(X_val)
regressor = LinearRegression().fit(transTrainx, y_train)
       
y_train_pred = regressor.predict(transTrainx)
y_val_pred = regressor.predict(transValx)
print("\nTrain Score:", round(regressor.score(transTrainx, y_train), 4))
print("Val Score:", round(regressor.score(transValx, y_val),4))
print("Train MAE:", round(mean_absolute_error(y_train, y_train_pred),4))
print("Val MAE:", round(mean_squared_error(y_val, y_val_pred),4))
print("Train RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, y_val_pred)))

Shapes of data splits:
X_Train: (3098, 10)
X_Val: (620, 10)
X_Test: (155, 10)
y_Train: (3098,)
y_Val: (620,)
y_Test: (155,)

Train Score: 0.5416
Val Score: 0.5158
Train MAE: 0.0915
Val MAE: 0.0441
Train RMSE: 0.1652700487419283
Val RMSE: 0.20995026659693175


graph = sns.regplot(x=y_train_pred,y=y_train)
graph.set_title("Training Set Pred vs Ground Truth")
graph.set(xlabel='Predicted Values', ylabel='Ground Truth');


graph = sns.regplot(x=y_val_pred,y=y_val)
graph.set_title("Validation Set Pred vs Ground Truth")
graph.set(xlabel='Predicted Values', ylabel='Ground Truth');


X_test = poly.fit_transform(X_test)

y_test_pred = regressor.predict(X_test)


print("Test Score:", regressor.score(X_test, y_test))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))

Test Score: 0.504189627414269
Test RMSE: 0.171635458592776


graph = sns.regplot(x=y_test_pred,y=y_test)
graph.set(xlabel='Predicted Values', ylabel='Ground Truth')
graph.set_title("Test Set Pred vs Ground Truth");


X_test_2021 = seattle2021[top_cols]


X_test_2021 = poly.fit_transform(X_test_2021)

y_test_2021_pred = regressor.predict(X_test_2021)


print("Test 2021 Score:", regressor.score(X_test_2021, y_test_2021))
print("Test 2021 RMSE:", np.sqrt(mean_squared_error(y_test_2021, y_test_2021_pred)))

Test 2021 Score: 0.43479193847866815
Test 2021 RMSE: 0.17568191092076288


graph = sns.regplot(x=y_test_2021_pred,y=y_test_2021);
graph.set_title("2021 Test Set Pred vs Ground Truth")
graph.set(xlabel='Predicted Values', ylabel='Ground Truth');

	Model	R2 train	R2 val	MAE train	MAE val	RMSE train	RMSE val
0	Baseline	0.0000	-0.0012	0.1511	0.1600	0.2389	0.2678
1	DT	1.0000	0.1907	0.0000	0.1128	0.0000	0.2408
2	SVR	0.5618	0.4936	0.0956	0.1104	0.1582	0.1904
3	KNN	0.6523	0.3665	0.0671	0.1006	0.1409	0.2130
4	NN	0.6950	0.3504	0.0703	0.0977	0.1319	0.2157
5	LR	0.4024	0.3843	0.1151	0.1267	0.1847	0.2100
6	KR	0.1809	0.1863	0.1417	0.1531	0.2162	0.2414
7	BR	0.4020	0.3836	0.1146	0.1260	0.1848	0.2101
8	GBR	0.5709	0.4560	0.0858	0.0988	0.1565	0.1974
9	RFR	0.8986	0.4936	0.0357	0.0954	0.0761	0.1904
10	Lasso	0.0000	-0.0012	0.1511	0.1600	0.2389	0.2678
11	Ridge	0.4019	0.3835	0.1146	0.1260	0.1848	0.2101

	Model	R2 train	R2 val	MAE train	MAE val	RMSE train	RMSE val
0	Baseline	0.0000	-0.0011	0.1536	0.1636	0.0596	0.0911
1	DT	1.0000	0.1738	0.0000	0.1201	0.0000	0.0752
2	SVR	0.3420	0.2731	0.1085	0.1197	0.0392	0.0662
3	KNN	0.6322	0.2462	0.0718	0.1183	0.0219	0.0686
4	NN	-1.1267	-0.8641	0.2547	0.2751	0.1267	0.1697
5	LR	0.5624	0.3617	0.0925	0.1160	0.0261	0.0581
6	KR	0.5594	0.4762	0.0926	0.1105	0.0263	0.0477
7	BR	0.5094	0.4810	0.1002	0.1104	0.0292	0.0472
8	GBR	0.6280	0.4436	0.0825	0.1015	0.0222	0.0506
9	RFR	0.9064	0.4768	0.0352	0.1000	0.0056	0.0476
10	Lasso	0.3783	0.3378	0.1150	0.1268	0.0370	0.0603
11	Ridge	0.5595	0.4765	0.0927	0.1105	0.0262	0.0477

	Model	R2 train	R2 val	MAE train	MAE val	RMSE train	RMSE val
0	Baseline	0.0000	-0.0012	0.1511	0.1600	0.2389	0.2678
1	DT	1.0000	0.1469	0.0000	0.1140	0.0000	0.2472
2	SVR	0.5192	0.4879	0.1074	0.1210	0.1657	0.1915
3	KNN	0.6735	0.3375	0.0663	0.1056	0.1365	0.2178
4	NN	0.6029	0.4976	0.0762	0.0943	0.1505	0.1897
5	LR	0.4015	0.3811	0.1152	0.1271	0.1848	0.2105
6	KR	0.1800	0.1854	0.1416	0.1527	0.2163	0.2415
7	BR	0.4014	0.3810	0.1149	0.1267	0.1848	0.2106
8	GBR	0.5719	0.4555	0.0855	0.0989	0.1563	0.1975
9	RFR	0.9050	0.4806	0.0346	0.0949	0.0736	0.1929
10	Lasso	0.0000	-0.0012	0.1511	0.1600	0.2389	0.2678
11	Ridge	0.4011	0.3808	0.1148	0.1264	0.1849	0.2106

	Model	R2 train	R2 val	MAE train	MAE val	RMSE train	RMSE val
0	Baseline	0.0000	-0.0011	0.1536	0.1636	0.0596	0.0911
1	DT	1.0000	0.1975	0.0000	0.1211	0.0000	0.0731
2	SVR	0.4270	0.3881	0.1073	0.1176	0.0341	0.0557
3	KNN	0.6581	0.2287	0.0700	0.1152	0.0204	0.0702
4	NN	0.1496	0.1414	0.1450	0.1504	0.0507	0.0782
5	LR	0.5416	0.5158	0.0915	0.1033	0.0273	0.0441
6	KR	0.5360	0.5106	0.0921	0.1032	0.0276	0.0446
7	BR	0.5141	0.4887	0.0969	0.1076	0.0290	0.0465
8	GBR	0.6082	0.4559	0.0834	0.1005	0.0233	0.0495
9	RFR	0.8983	0.4624	0.0367	0.1033	0.0061	0.0489
10	Lasso	0.3758	0.3367	0.1151	0.1263	0.0372	0.0604
11	Ridge	0.5362	0.5118	0.0920	0.1029	0.0276	0.0444

Model Training¶

Data 311 Final Project¶

Alyssa, Bryan, and Vivian¶

Data Cleaning & Processing¶

Model Selection¶

Initial scores¶

Top model: SVR 0.5618 0.4936¶

Polynomial feature expansion¶

Top model: BR 0.5094 0.4810¶

Top 10 cols¶

Best model: SVR 0.5192 0.4879¶

Polynomial & top 10 cols¶

Best model: LR 0.5416, 0.5158¶

Testing our Model¶