My friend likes to bike 🚴
Building an interactive web app with Python, Bootstrap, and Flask
This is the code I used in the notebook to understand and clean the data. The data comes from a friend of mine who likes to bike.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
from sklearn.utils import shuffle
import plotly.graph_objs as go
import plotly.express as px
%matplotlib inline
df = pd.read_csv('data/activities.csv')
orig = pd.read_csv('data/activities.csv')  # keep an untouched copy for reference
df = shuffle(df)  # shuffle rows so any ordering in the export doesn't leak into later splits
print("number of rows: "+ str(len(df)))
df.head(1)
quarter_nulls = list(df.columns[df.isnull().sum() <= 0.25*len(df)])  # keep columns with at most 25% missing values
#quarter_nulls
df = df[quarter_nulls]
df.columns
Going to remove the columns below because they carry no information or duplicate other columns.
discard = ['Activity Name', 'Activity ID', 'Commute', 'Filename', 'Commute.1','Distance', 'Elapsed Time.1', 'Bike']
df = df.drop(discard, axis = 1)
df.head(1)
df.describe()
plt.figure(figsize=(15, 10))
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt='.2f')  # numeric_only is required on pandas >= 2.0
Interesting correlations:
- Elevation Low/High with Athlete Weight and Bike Weight
- Calories with Moving Time, Distance.1, and Elevation Gain
- Elevation High/Low with Average Grade
- Elevation Gain with Elapsed Time, Moving Time, and Distance.1
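To put rough numbers behind these observations, here is a small sketch that ranks the strongest absolute pairwise correlations (nothing beyond what df.corr() already computed):
c = df.corr(numeric_only=True).abs()
mask = np.triu(np.ones(c.shape, dtype=bool))  # blank the upper triangle and diagonal to avoid duplicate pairs
pairs = c.where(~mask).unstack().dropna().sort_values(ascending=False)
print(pairs.head(10))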
df.hist(figsize=(15, 20));
cat_df = df.select_dtypes(include=['object']) #choose categorical columns
cat_df
Let's clean these up 🧹
time = pd.to_datetime(df['Activity Date'])  # parse the date strings into datetimes
yr, mon, d, h = [], [], [], []
for i in time:
    yr.append(i.year)
    mon.append(i.month)
    d.append(i.day)
    h.append(i.hour)
len(yr)
time.head(4)
df['Year'] = yr
df['Month'] = mon
df['Day'] = d
df['Hour'] = h
df = df.drop(['Activity Date'], axis=1) # Drop original Date value
df.head(3)
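As an aside, the same fields can be pulled out without the explicit loop using pandas' .dt accessor; this reassigns the same values and is shown only as the idiomatic alternative:
# Vectorized equivalent of the extraction loop above
df['Year'], df['Month'] = time.dt.year, time.dt.month
df['Day'], df['Hour'] = time.dt.day, time.dt.hour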
print("Unique Activity Gear values: " + str(df['Activity Gear'].unique()))
print("Unique Activity Gear values: " + str(df['Activity Type'].unique()))
def create_dummy_df(df, cat_cols, dummy_na=False):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool whether you want to dummy NA values or not

    OUTPUT:
    df - new dataframe with the following characteristics:
    1. contains all columns that were not specified as categorical
    2. removes all the original columns in cat_cols
    3. dummy columns for each of the categorical columns in cat_cols
    4. uses a prefix of the column name with an underscore (_) as separator
    5. if dummy_na is True, it also contains dummy columns for NaN values
    '''
    for col in cat_cols:
        try:
            df = pd.concat([df.drop(col, axis=1),
                            pd.get_dummies(df[col], prefix=col, prefix_sep='_',
                                           drop_first=True, dummy_na=dummy_na)], axis=1)
        except KeyError:  # skip columns that aren't in the frame
            continue
    return df
df = create_dummy_df(df, ['Activity Type'], dummy_na = True)
df.head(3)
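For intuition on what create_dummy_df produced, here is the same transform on a tiny made-up column (the toy values are invented for illustration; drop_first removes the alphabetically first category):
toy = pd.DataFrame({'Activity Type': ['Ride', 'Run', 'Ride', 'Hike']})
print(pd.get_dummies(toy['Activity Type'], prefix='Activity Type', prefix_sep='_', drop_first=True, dummy_na=True))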
still_null = list(df.columns[df.isnull().sum() != 0])  # columns that still contain nulls
still_null
Based on their histograms, it seems like a good idea to use:
- median imputation for: Athlete Weight, Bike Weight, Elevation Low, Elevation High
- mean imputation for: Elevation Gain, Average Watts, Calories, Max Speed, Max Grade
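One way to sanity-check that split is to look at skewness, since strongly skewed columns are better served by the median (a quick sketch; skew() ignores NaNs by default):
mean_cols = ['Elevation Gain', 'Average Watts', 'Calories', 'Max Speed', 'Max Grade']
median_cols = ['Athlete Weight', 'Bike Weight', 'Elevation Low', 'Elevation High']
print(df[mean_cols + median_cols].skew())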
fill_mean = lambda col: col.fillna(col.mean())  # function for imputing the mean
fill_median = lambda col: col.fillna(col.median())  # function for imputing the median
# imputation with the mean
fill_df = df[['Elevation Gain', 'Average Watts', 'Calories', 'Max Speed', 'Max Grade']].apply(fill_mean, axis=0)
fill_df = pd.concat([fill_df, df.drop(['Elevation Gain', 'Average Watts', 'Calories', 'Max Speed', 'Max Grade'], axis=1)], axis=1)
# imputation on median
fill_df_med = df[['Athlete Weight', 'Bike Weight', 'Elevation Low', 'Elevation High']].apply(fill_median, axis=0)
filled_df = pd.concat([fill_df.drop(['Athlete Weight', 'Bike Weight', 'Elevation Low', 'Elevation High'], axis = 1), fill_df_med], axis=1)
# Alternative solution to null values by dropping all
dropped_df = df.dropna()
filled_df.head(2)
dropped_df.head(2)
filled_df = filled_df.dropna() #Note: can change this to na -> no bike (on foot)
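# The alternative flagged in the note above, as a sketch: instead of dropping
# rows with a missing 'Activity Gear', label them with a made-up category,
# e.g. 'On foot' (hypothetical label, not present in the data):
# filled_df['Activity Gear'] = filled_df['Activity Gear'].fillna('On foot')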
still_null = list(filled_df.columns[filled_df.isnull().sum() != 0])
still_null_2 = list(dropped_df.columns[dropped_df.isnull().sum() != 0])
assert(still_null_2 == [])
assert(still_null == [])
y = filled_df['Distance.1']
X = filled_df.drop(['Distance.1'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state=42)
lm_model = LinearRegression()  # note: the old normalize=True option was removed in scikit-learn 1.2; scale features beforehand if needed
lm_model.fit(X_train, y_train)
test_pred = lm_model.predict(X_test)
train_pred = lm_model.predict(X_train)
r2_test = r2_score(y_test, test_pred)
r2_train = r2_score(y_train, train_pred)
print("test r2: "+str(r2_test))
print("train r2: "+str(r2_train))
bike_df = filled_df.drop(['Bike Weight'], axis=1) # Drop bike weight so it doesn't cheat
y = bike_df['Activity Gear']
X = bike_df.drop(['Activity Gear'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state=42)
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=2000, max_leaf_nodes=32, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_test_pred = rnd_clf.predict(X_test)
test_acc = np.sum(y_test_pred == y_test)/len(y_test)
print("test accuracy: "+str(test_acc))
from sklearn.preprocessing import LabelEncoder
#https://stackoverflow.com/questions/65549588/shap-treeexplainer-for-randomforest-multiclass-what-is-shap-valuesi
labels = [
"Fixie",
"Kestrel 200 SCI Older Road Bike",
"Vilano Aluminum Road Bike 21 Speed Shimano",
"Gusto",
]
le = LabelEncoder()
z = le.fit_transform(labels)
encoding_scheme = dict(zip(z, labels))
print(encoding_scheme)
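Worth double-checking: LabelEncoder sorts labels alphabetically, and the classifier's classes_ follow the same sorted order, so (with the list-of-arrays SHAP output used below) shap_values[i] should correspond to rnd_clf.classes_[i]:
print(rnd_clf.classes_)  # index i here should match shap_values[i] below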
sum(y == 'Kestrel 200 SCI Older Road Bike')  # how many rides used the Kestrel
import shap
explainer = shap.TreeExplainer(rnd_clf)
shap_values = explainer.shap_values(X)
# SHAP plot for Gusto (class index 1 in the sorted encoding above)
shap.summary_plot(shap_values[1], X)
shap.summary_plot(shap_values[2], X)  # Kestrel 200 SCI Older Road Bike
shap.summary_plot(shap_values[3], X)  # Vilano Aluminum Road Bike 21 Speed Shimano
bike_df = filled_df.drop(['Bike Weight', 'Year', 'Athlete Weight'], axis=1) # Drop columns that let the model cheat by identifying the bike or the era directly
y = bike_df['Activity Gear']
X = bike_df.drop(['Activity Gear'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=2000, max_leaf_nodes=32, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_test_pred = rnd_clf.predict(X_test)
test_acc = np.sum(y_test_pred == y_test)/len(y_test)
print("test accuracy: "+str(test_acc))
explainer = shap.TreeExplainer(rnd_clf)
shap_values = explainer.shap_values(X)
# Gusto (class index 1 again)
shap.summary_plot(shap_values[1], X)
shap.dependence_plot('Max Grade', shap_values[1], X, interaction_index='Elevation Low')
What the heck makes Elevation Low such a good predictor? Maybe my friend preferred certain bikes for rides in the mountains.
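One way to eyeball that hunch directly is to compare Elevation Low across bikes (a quick sketch on the cleaned frame):
print(filled_df.groupby('Activity Gear')['Elevation Low'].describe())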
# Plotly tests
df_i = px.data.iris()
features = ["sepal_width", "sepal_length", "petal_width", "petal_length"]
fig = px.scatter_matrix(
    df_i,
    dimensions=features,
    color="species"
)
fig.update_traces(diagonal_visible=False)
fig.show()
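The same plot applied to the bike data itself, as a sketch (these particular columns are just a plausible pick from the frame above):
fig = px.scatter_matrix(
    filled_df,
    dimensions=['Distance.1', 'Calories', 'Elevation Gain', 'Max Speed'],
    color='Activity Gear'
)
fig.update_traces(diagonal_visible=False)
fig.show()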