import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor
We requested data from NOAA's LCD database. The data was collected from SeaTac airport, and we started off with 3 datasets: one spanning 2010-2019, one from 2020, and one from 2021. We combined the first two to get a dataset from 2010 to 2020, which we used for training, validating, and testing our model, and saved the 2021 dataset for testing our final model.
seattle2020 = pd.read_csv("https://facultyweb.cs.wwu.edu/~wehrwes/courses/csci141_21s/fp/data/WA_Seattle.csv", low_memory = False)
seattle2010 = pd.read_csv("10yearweather.csv", low_memory = False)
seattle2021 = pd.read_csv("2021weather.csv", low_memory = False)
dfs = [seattle2020, seattle2010]
seattle = pd.concat(dfs)
seattle
STATION | DATE | REPORT_TYPE | SOURCE | AWND | BackupDirection | BackupDistance | BackupDistanceUnit | BackupElements | BackupElevation | ... | ShortDurationPrecipitationValue100 | ShortDurationPrecipitationValue120 | ShortDurationPrecipitationValue150 | ShortDurationPrecipitationValue180 | Sunrise | Sunset | TStorms | WindEquipmentChangeDate | DYHF | DYTS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 72793024233 | 2020-01-01T00:53:00 | FM-15 | 7 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
1 | 72793024233 | 2020-01-01T01:53:00 | FM-15 | 7 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
2 | 72793024233 | 2020-01-01T02:53:00 | FM-15 | 7 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
3 | 72793024233 | 2020-01-01T03:53:00 | FM-15 | 7 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
4 | 72793024233 | 2020-01-01T04:00:00 | FM-12 | 4 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
131333 | 72793024233 | 2019-12-31T22:19:00 | FM-16 | 7 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
131334 | 72793024233 | 2019-12-31T22:53:00 | FM-15 | 7 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
131335 | 72793024233 | 2019-12-31T23:53:00 | FM-15 | 7 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
131336 | 72793024233 | 2019-12-31T23:59:00 | SOD | 6 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 757.0 | 1628.0 | NaN | 2007-05-17 | NaN | NaN |
131337 | 72793024233 | 2019-12-31T23:59:00 | SOM | 6 | 6.7 | NaN | NaN | NaN | NaN | NaN | ... | 0.64 | 0.71 | 0.82 | 0.9 | NaN | NaN | NaN | 2007-05-17 | NaN | NaN |
144259 rows × 126 columns
Because this dataset contains hourly, daily, and monthly data, first, we had to omit data we didn't need. We made a new dataset containing all of the columns with daily measurements, and used .dropna()
to remove rows with NaN values. This brought our dataset size down from 144,259 rows to 3,873.
sea_mask = seattle["REPORT_TYPE"].str.contains("SOD")
seattle = seattle[sea_mask]
daily_cols = ["DailyAverageDewPointTemperature", "DailyAverageDryBulbTemperature", "DailyAverageRelativeHumidity",
"DailyAverageSeaLevelPressure", "DailyAverageStationPressure", "DailyAverageWetBulbTemperature",
"DailyAverageWindSpeed", "DailyCoolingDegreeDays", "DailyDepartureFromNormalAverageTemperature",
"DailyHeatingDegreeDays", "DailyMaximumDryBulbTemperature", "DailyMinimumDryBulbTemperature",
"DailyPeakWindDirection", "DailyPeakWindSpeed", "DailySnowDepth", "DailySnowfall",
"DailySustainedWindDirection", "DailySustainedWindSpeed",
"DailyPrecipitation"]
seattle_daily = seattle[daily_cols].dropna()
Next, we cleaned our data. Three of our columns contained the letter "T", representing trace amounts of snowdepth, snowfall, and precipitation. We changed these values to 0.0 similar to how we did in Lab 2, and ensured that each column contained only floats.
seattle_daily["DailySnowDepth"] = seattle_daily["DailySnowDepth"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle_daily["DailySnowfall"] = seattle_daily["DailySnowfall"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle_daily["DailyPrecipitation"] = seattle_daily["DailyPrecipitation"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle_precip = seattle_daily["DailyPrecipitation"]
seattle_daily
DailyAverageDewPointTemperature | DailyAverageDryBulbTemperature | DailyAverageRelativeHumidity | DailyAverageSeaLevelPressure | DailyAverageStationPressure | DailyAverageWetBulbTemperature | DailyAverageWindSpeed | DailyCoolingDegreeDays | DailyDepartureFromNormalAverageTemperature | DailyHeatingDegreeDays | DailyMaximumDryBulbTemperature | DailyMinimumDryBulbTemperature | DailyPeakWindDirection | DailyPeakWindSpeed | DailySnowDepth | DailySnowfall | DailySustainedWindDirection | DailySustainedWindSpeed | DailyPrecipitation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
33 | 44.0 | 50.0 | 79.0 | 29.84 | 29.35 | 47.0 | 14.3 | 0.0 | 9.2 | 15.0 | 53.0 | 46.0 | 220.0 | 33.0 | 0.0 | 0.0 | 220.0 | 25.0 | 0.00 |
74 | 38.0 | 46.0 | 81.0 | 30.07 | 29.59 | 41.0 | 9.2 | 0.0 | 5.1 | 19.0 | 50.0 | 41.0 | 170.0 | 27.0 | 0.0 | 0.0 | 180.0 | 20.0 | 0.21 |
109 | 41.0 | 54.0 | 72.0 | 29.89 | 29.41 | 47.0 | 9.7 | 0.0 | 13.0 | 11.0 | 62.0 | 46.0 | 220.0 | 39.0 | 0.0 | 0.0 | 220.0 | 28.0 | 0.39 |
148 | 38.0 | 43.0 | 78.0 | 30.22 | 29.74 | 41.0 | 14.1 | 0.0 | 1.9 | 22.0 | 46.0 | 40.0 | 190.0 | 38.0 | 0.0 | 0.0 | 210.0 | 30.0 | 0.10 |
182 | 37.0 | 45.0 | 77.0 | 30.27 | 29.78 | 41.0 | 14.3 | 0.0 | 3.8 | 20.0 | 48.0 | 42.0 | 200.0 | 33.0 | 0.0 | 0.0 | 200.0 | 26.0 | 0.14 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
131149 | 33.0 | 35.0 | 92.0 | 30.13 | 29.63 | 34.0 | 6.0 | 0.0 | -5.3 | 30.0 | 39.0 | 30.0 | 120.0 | 15.0 | 0.0 | 0.0 | 120.0 | 12.0 | 0.05 |
131187 | 38.0 | 41.0 | 88.0 | 30.18 | 29.69 | 40.0 | 5.3 | 0.0 | 0.7 | 24.0 | 45.0 | 37.0 | 190.0 | 17.0 | 0.0 | 0.0 | 190.0 | 14.0 | 0.01 |
131219 | 40.0 | 44.0 | 87.0 | 30.20 | 29.70 | 42.0 | 4.2 | 0.0 | 3.6 | 21.0 | 47.0 | 41.0 | 140.0 | 12.0 | 0.0 | 0.0 | 160.0 | 8.0 | 0.00 |
131298 | 41.0 | 46.0 | 83.0 | 30.31 | 29.82 | 44.0 | 9.9 | 0.0 | 5.5 | 19.0 | 48.0 | 43.0 | 200.0 | 24.0 | 0.0 | 0.0 | 200.0 | 20.0 | 0.04 |
131336 | 44.0 | 50.0 | 84.0 | 29.90 | 29.41 | 47.0 | 14.7 | 0.0 | 9.4 | 15.0 | 53.0 | 46.0 | 210.0 | 37.0 | 0.0 | 0.0 | 190.0 | 26.0 | 0.26 |
3873 rows × 19 columns
seattle_daily.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3873 entries, 33 to 131336 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DailyAverageDewPointTemperature 3873 non-null float64 1 DailyAverageDryBulbTemperature 3873 non-null float64 2 DailyAverageRelativeHumidity 3873 non-null float64 3 DailyAverageSeaLevelPressure 3873 non-null float64 4 DailyAverageStationPressure 3873 non-null float64 5 DailyAverageWetBulbTemperature 3873 non-null float64 6 DailyAverageWindSpeed 3873 non-null float64 7 DailyCoolingDegreeDays 3873 non-null float64 8 DailyDepartureFromNormalAverageTemperature 3873 non-null float64 9 DailyHeatingDegreeDays 3873 non-null float64 10 DailyMaximumDryBulbTemperature 3873 non-null float64 11 DailyMinimumDryBulbTemperature 3873 non-null float64 12 DailyPeakWindDirection 3873 non-null float64 13 DailyPeakWindSpeed 3873 non-null float64 14 DailySnowDepth 3873 non-null float64 15 DailySnowfall 3873 non-null float64 16 DailySustainedWindDirection 3873 non-null float64 17 DailySustainedWindSpeed 3873 non-null float64 18 DailyPrecipitation 3873 non-null float64 dtypes: float64(19) memory usage: 605.2 KB
We repeated these data cleaning steps on our 2021 testing dataset:
sea21_mask = seattle2021["REPORT_TYPE"].str.contains("SOD")
seattle2021 = seattle2021[sea21_mask]
seattle2021 = seattle2021[daily_cols].dropna()
seattle2021["DailyPeakWindDirection"] = seattle2021["DailyPeakWindDirection"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySnowfall"] = seattle2021["DailySnowfall"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailyPrecipitation"] = seattle2021["DailyPrecipitation"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailyPeakWindSpeed"] = seattle2021["DailyPeakWindSpeed"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySustainedWindDirection"] = seattle2021["DailySustainedWindDirection"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021["DailySustainedWindSpeed"] = seattle2021["DailySustainedWindSpeed"].str.replace("T", "0.0").str.rstrip("s").astype(float)
seattle2021.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 273 entries, 42 to 9447 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DailyAverageDewPointTemperature 273 non-null float64 1 DailyAverageDryBulbTemperature 273 non-null float64 2 DailyAverageRelativeHumidity 273 non-null float64 3 DailyAverageSeaLevelPressure 273 non-null float64 4 DailyAverageStationPressure 273 non-null float64 5 DailyAverageWetBulbTemperature 273 non-null float64 6 DailyAverageWindSpeed 273 non-null float64 7 DailyCoolingDegreeDays 273 non-null float64 8 DailyDepartureFromNormalAverageTemperature 273 non-null float64 9 DailyHeatingDegreeDays 273 non-null float64 10 DailyMaximumDryBulbTemperature 273 non-null float64 11 DailyMinimumDryBulbTemperature 273 non-null float64 12 DailyPeakWindDirection 273 non-null float64 13 DailyPeakWindSpeed 273 non-null float64 14 DailySnowDepth 273 non-null float64 15 DailySnowfall 273 non-null float64 16 DailySustainedWindDirection 273 non-null float64 17 DailySustainedWindSpeed 273 non-null float64 18 DailyPrecipitation 273 non-null float64 dtypes: float64(19) memory usage: 42.7 KB
We renamed the columns to shortened versions of their names to improve the readability of the plots we wanted to make, because all of the data is daily.
seattle_daily.rename({'DailyAverageStationPressure': 'AvgStationPressure',
'DailyAverageSeaLevelPressure': 'AvgSeaLevelPressure',
"DailyAverageWetBulbTemperature":"AvgWetBulbTemp",
"DailyAverageDewPointTemperature":"AvgDewPointTemp",
"DailySnowfall": "Snowfall",
"DailyAverageRelativeHumidity":"AvgRelHumidity",
"DailySnowDepth":"SnowDepth",
"DailyPeakWindSpeed":"PeakWindSpeed",
"DailyDepartureFromNormalAverageTemperature":"DepartureAvgTemp",
"DailyMaximumDryBulbTemperature":"MaxDryBulbTemp",
"DailyHeatingDegreeDays":"HeatingDegreeDays",
"DailyAverageWindSpeed":"AvgWindSpeed",
"DailyAverageDryBulbTemperature":"AvgDryBulbTemp",
"DailySustainedWindSpeed":"SustainedWindSpeed",
"DailyCoolingDegreeDays":"CoolingDegreeDays",
"DailySustainedWindDirection":"SustainedWindDirection",
"DailyPeakWindDirection":"PeakWindDirection",
"DailyMinimumDryBulbTemperature":"MinDryBulbTemp" }, axis=1, inplace=True)
To explore our data, we first used pairplot()
to display scatter plots of each input feature against daily precipitation. These showed us that there were some pretty strong relationships with precipitation amounts. The scatterplots are sorted by correlation coefficient, which we calculate below. As a sanity check, we can see that the last three graphs all have to do with wind, which intuitively don't have a large impact on precipitation amounts. The distributions gradually appear more scattered as the correlation coefficients decrease.
g = sns.pairplot(seattle_daily, x_vars=['AvgStationPressure', 'AvgSeaLevelPressure', "AvgWetBulbTemp","AvgDewPointTemp",
"Snowfall","AvgRelHumidity", "SnowDepth", "PeakWindSpeed", "DepartureAvgTemp",
"MaxDryBulbTemp", "HeatingDegreeDays","AvgWindSpeed","AvgDryBulbTemp",
"SustainedWindSpeed","CoolingDegreeDays", "SustainedWindDirection",
"PeakWindDirection","MinDryBulbTemp"],
y_vars = ["DailyPrecipitation"])
g.fig.suptitle("Input Features vs Daily Precipitation",x=0.15, y=1.1);
To explore correlation coefficients, we created a basic LinearRegression()
model.
X = seattle_daily.drop("DailyPrecipitation", axis=1)
y = seattle_precip
X_train, X_etc, y_train, y_etc = train_test_split(X, y, random_state=12, test_size=0.2, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_etc, y_etc, random_state=12, train_size=.8, shuffle=True)
print("X_Train:", X_train.shape)
print("X_Val:", X_val.shape)
print("X_Test:", X_test.shape)
print("y_Train:", y_train.shape)
print("y_Val:", y_val.shape)
print("y_Test:", y_test.shape)
X_Train: (3098, 18) X_Val: (620, 18) X_Test: (155, 18) y_Train: (3098,) y_Val: (620,) y_Test: (155,)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred_train = regressor.predict(X_train)
y_pred_val = regressor.predict(X_val)
Next, we displayed the coefficients, sorted by absolute value, in a bar graph for comparison and in a table.
coefficients = pd.concat([pd.DataFrame(X.columns, columns = ["Input Features"]), pd.DataFrame(np.transpose(regressor.coef_), columns = ["Coefficients"])], axis = 1)
coefficients["Abs Coeff"] = round(abs(coefficients["Coefficients"]), 8)
coefficients = coefficients.sort_values(by="Abs Coeff", ascending=False)
new_df = coefficients.drop("Coefficients", axis=1)
g=sns.barplot(x="Input Features", y="Abs Coeff", data=new_df)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_title("Coefficients of Input Features")
g;
table = plt.table(cellText=new_df.values,
cellLoc='center',
rowLabels=new_df.index,
colLabels=new_df.columns,
bbox=(1.2,0,0.8,1.2))
table.auto_set_font_size(False)
table.set_fontsize(12)
table.auto_set_column_width(True)
plt.show()
This shows us that daily average station pressure and sea level pressure have the strongest relationships with daily precipitation. The rest of the columns have slight correlations.
Next, we used PCA analysis to graph the explained variance ratio, or the percentage of variance that is attributed by each of the components.
scaler = preprocessing.StandardScaler(copy=True).fit(X)
Xscaled = scaler.transform(X)
pca = PCA(n_components = Xscaled.shape[1])
pca.fit(Xscaled)
pca.explained_variance_ratio_
ax = sns.lineplot(x=range(X.shape[1]), y=np.cumsum(pca.explained_variance_ratio_))
ax.set(xlabel="# of Input Features", ylabel="% Variance Explained", title="Amount of Variance Explained by Input Features");
This shows us that the percentage of explained variance hits a peak at around 9 or 10 input features, after which it appears to plateau. Combining this with our coefficient analysis, it seems likely that we could train our model on the ten columns with the highest regression coefficients to potentially improve the accuracy of our model.
We decided to make our baseline the mean amount of daily precipitation. We created a model for our baseline and calculated its R squared, MAE, and RMSE scores to give us an idea of what we wanted our final model to outperform.
The R2 score is the percentage of variance in the response variable that is explained by our linear model. MAE, or mean absolute error, is the mean of the absolute values of the differences between the true value and the predicted value for each data point. The RMSE is the square root of the average squared difference between the estimated values and the actual value. Due to squaring, RMSE gives a higher weight to large errors.
seattle_daily["DailyPrecipitation"].mean()
0.11595404079524917
seattle_daily["DailyPrecipitation"].std()
0.2542551610819193
The mean is 0.1159, which will be our baseline.
baseline = DummyRegressor(strategy="mean").fit(X_train, y_train)
base_train_pred = baseline.predict(X_train)
base_val_pred = baseline.predict(X_val)
print("Base train score:", baseline.score(X_train, y_train))
print("Base val score:", baseline.score(X_val, y_val))
print("Base train MAE:", mean_absolute_error(y_train, base_train_pred))
print("Base val MAE:", mean_absolute_error(y_val, base_val_pred))
print("Base train RMSE:", np.sqrt(mean_squared_error(y_train, base_train_pred, squared=False)))
print("Base val RMSE:", np.sqrt(mean_squared_error(y_val, base_val_pred, squared=False)))
Base train score: 0.0 Base val score: -0.0011437518561792093 Base train MAE: 0.153595632826693 Base val MAE: 0.16359759678460611 Base train RMSE: 0.4940642517995237 Base val RMSE: 0.5494458356892034
Because we predict the mean, the R2 score of 0.0 on our training set makes sense, because variance is defined as the deviation of the observations from the mean. The negative score on our validation set shows that the baseline performed worse, because the mean daily precipitation on our validation set was different than the mean of our training set. On the training set, MAE=0.15 implies that, on average, the baseline's distance from the true value is 0.15. On the validation set, the average distance from the ground truth is 0.16. For our training set, the square root of the average of squared differences between the ground truth and baseline values is 0.49, while it's 0.55 for the validation set. This penalizes higher residuals.
We created an initial evaluation environment. It takes in X
and y
and creates a baseline regression model and a simple linear regression model and prints our evaluation metrics for the training and validation sets.
def find_train(X,y):
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
X_train, X_etc, y_train, y_etc = train_test_split(X, y, random_state=12, test_size=0.2, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_etc, y_etc, random_state=12, train_size=.8, shuffle=True)
regressor = LinearRegression().fit(X_train, y_train)
y_train_pred = regressor.predict(X_train)
y_val_pred = regressor.predict(X_val)
print("Model train score:", regressor.score(X_train, y_train))
print("Model val score:", regressor.score(X_val, y_val))
print("Model train MAE:", mean_absolute_error(y_train, y_train_pred))
print("Model val MAE:", mean_absolute_error(y_val, y_val_pred))
print("Model train MSE:", mean_squared_error(y_train, y_train_pred, squared=False))
print("Model val MSE:", mean_squared_error(y_val, y_val_pred, squared=False))
print("Model train RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred, squared=False)))
print("Model val RMSE:", np.sqrt(mean_squared_error(y_val, y_val_pred, squared=False)))
find_train(X,y)
Model train score: 0.4033587124094692 Model val score: 0.3566012526733855 Model train MAE: 0.11766030860787145 Model val MAE: 0.13046639722423403 Model train MSE: 0.18854868793692955 Model val MSE: 0.24201465058595784 Model train RMSE: 0.4342219339657194 Model val RMSE: 0.4919498456000955
This shows that a basic linear regression model on our data has a coefficient of determination of 40% on our training set and 35% on our validation set. Our model's average distance from ground truth for the training and validation sets are 0.11 and 0.13, respectively, which are both lower than our baseline MAE scores. The same applies for the RMSE scores of 0.42 and 0.49.
From our exploratory analysis, we will try applying the top 10 columns with the highest regression coefficients to try and improve our model. The initial linear regression model didn't score super high, so we want to try different strategies to improve its score and potentially try other models.