WEB-APP HERE

Note that the web-app takes about a min to load!

GIT REPOSITORY HERE

This is just the code I used in the notebook to understand and clean the data. The data comes from a friend of mine who likes to bike.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
from sklearn.utils import shuffle
import plotly.graph_objs as go
import plotly.express as px
%matplotlib inline
# Read the activity export once: `orig` keeps the rows in file order and
# `df` is a shuffled copy used for the rest of the analysis.
orig = pd.read_csv('data/activities.csv')
# NOTE(review): shuffle() is called without random_state, so the row order
# differs from run to run.
df = shuffle(orig)
print(f"number of rows: {len(df)}")
df.head(1)
number of rows: 142
Activity ID Activity Date Activity Name Activity Type Activity Description Elapsed Time Distance Relative Effort Commute Activity Gear ... Gear Precipitation Probability Precipitation Type Cloud Cover Weather Visibility UV Index Weather Ozone translation missing: en-US.lib.export.portability_exporter.activities.horton_values.jump_count translation missing: en-US.lib.export.portability_exporter.activities.horton_values.total_grit translation missing: en-US.lib.export.portability_exporter.activities.horton_values.avg_flow
47 723876967 Sep 24, 2016, 10:54:54 PM Afternoon Ride Ride NaN 12735 29.55 NaN False Vilano Aluminum Road Bike 21 Speed Shimano ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

1 rows × 77 columns

# Keep only the columns in which at most a quarter of the rows are null.
null_counts = df.isnull().sum()
max_allowed_nulls = 0.25 * len(df)
quater_nulls = df.columns[null_counts <= max_allowed_nulls].tolist()
df = df[quater_nulls]
df.columns
Index(['Activity ID', 'Activity Date', 'Activity Name', 'Activity Type',
       'Elapsed Time', 'Distance', 'Commute', 'Activity Gear', 'Filename',
       'Athlete Weight', 'Bike Weight', 'Elapsed Time.1', 'Moving Time',
       'Distance.1', 'Max Speed', 'Elevation Gain', 'Elevation Low',
       'Elevation High', 'Max Grade', 'Average Grade', 'Average Watts',
       'Calories', 'Commute.1', 'Bike'],
      dtype='object')

We are going to drop the following columns: discard = ['Activity Name', 'Activity ID', 'Commute', 'Filename', 'Commute.1','Distance', 'Elapsed Time.1', 'Bike']
They are either uninformative or duplicate another column.

# Columns removed from the working frame before any modelling.
discard = [
    'Activity Name',
    'Activity ID',
    'Commute',
    'Filename',
    'Commute.1',
    'Distance',
    'Elapsed Time.1',
    'Bike',
]
df = df.drop(columns=discard)
df.head(1)
Activity Date Activity Type Elapsed Time Activity Gear Athlete Weight Bike Weight Moving Time Distance.1 Max Speed Elevation Gain Elevation Low Elevation High Max Grade Average Grade Average Watts Calories
47 Sep 24, 2016, 10:54:54 PM Ride 12735 Vilano Aluminum Road Bike 21 Speed Shimano 63.502899 11.0 7946.0 29549.900391 12.3 11.1737 1.2 13.2 38.299999 -0.002369 53.709999 475.859314
# Summary statistics (count, mean, std, quartiles, min/max) for the remaining columns.
df.describe()
Elapsed Time Athlete Weight Bike Weight Moving Time Distance.1 Max Speed Elevation Gain Elevation Low Elevation High Max Grade Average Grade Average Watts Calories
count 142.000000 128.000000 121.000000 142.000000 142.000000 136.000000 137.000000 135.000000 135.000000 136.000000 142.000000 126.000000 132.000000
mean 8307.028169 65.011994 8.886777 5964.485915 34740.822462 13.735294 262.614366 26.635556 95.854814 27.179412 0.271421 111.032301 764.836008
std 5371.122774 5.263799 1.400711 3368.764442 21521.125565 3.903418 270.558590 47.756425 103.870238 15.398105 3.187175 28.786978 489.682759
min 204.000000 55.000000 7.500000 182.000000 0.000000 0.000000 0.000000 -18.000000 6.900000 0.000000 -0.752807 49.716900 26.208254
25% 3414.500000 60.000000 7.500000 2949.000000 17527.250488 11.800000 62.490898 -1.000000 21.850000 14.350000 -0.003579 91.731985 383.654442
50% 8070.500000 68.000000 9.000000 6047.500000 31560.699219 13.700000 166.636993 0.900000 101.099998 22.300000 0.000000 114.522282 673.522827
75% 11301.000000 68.000000 11.000000 7957.250000 50011.325195 15.300000 358.088989 72.400002 125.799999 42.899999 0.010629 130.717503 1066.323883
max 28317.000000 70.000000 11.000000 16708.000000 91705.296875 36.299999 1455.640015 382.299988 1092.099976 50.000000 37.947071 182.307999 2375.330322
# Correlation heatmap of the numeric columns only. The frame still contains
# text columns (date, type, gear) at this point, and DataFrame.corr() raises
# on non-numeric data in pandas >= 2.0, so they are excluded explicitly.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, fmt='.2f', ax=ax)
<AxesSubplot:>

Interesting Correlations:

  • Elevation Low, High with Athlete Weight, Bike Weight
  • Calories and Moving Time and Distance.1 and Elevation Gain
  • Elevation High, Low and Average Grade
  • Elevation Gain with Elapsed time, Moving Time, Distance
# One histogram per numeric column. Let pandas build the subplot grid itself:
# passing a single pre-made axes makes pandas clear that figure and emit a
# UserWarning, so only the figure size is specified here.
df.hist(figsize=(15, 20));
<ipython-input-141-c15b20fe6ff7>:1: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  df.hist(ax = plt.figure(figsize = (15,20)).gca());

Categorical Columns

# Object-dtype columns are the categorical / free-text ones.
cat_df = df.select_dtypes(include='object')
cat_df
Activity Date Activity Type Activity Gear
47 Sep 24, 2016, 10:54:54 PM Ride Vilano Aluminum Road Bike 21 Speed Shimano
8 May 23, 2015, 10:26:06 PM Ride NaN
94 Sep 29, 2018, 4:19:38 PM Ride Gusto
55 Mar 22, 2017, 3:44:50 PM Ride Kestrel 200 SCI Older Road Bike
40 Apr 16, 2016, 5:57:42 PM Ride Vilano Aluminum Road Bike 21 Speed Shimano
... ... ... ...
30 Mar 16, 2016, 6:25:36 PM Ride Vilano Aluminum Road Bike 21 Speed Shimano
66 Jun 23, 2017, 11:25:10 PM Ride Kestrel 200 SCI Older Road Bike
62 May 9, 2017, 10:33:30 PM Ride Kestrel 200 SCI Older Road Bike
91 Aug 22, 2018, 9:34:35 PM Ride Gusto
35 Mar 23, 2016, 5:35:32 AM Run NaN

142 rows × 3 columns

Lets clean these up 🧹

# Parse the activity timestamp and split it into separate calendar features.
time = df['Activity Date'].astype('datetime64[ns]')
stamp = time.dt
yr, mon, d, h = (
    stamp.year.tolist(),
    stamp.month.tolist(),
    stamp.day.tolist(),
    stamp.hour.tolist(),
)
for feature_name, feature_values in (('Year', yr), ('Month', mon), ('Day', d), ('Hour', h)):
    df[feature_name] = feature_values
df = df.drop(columns=['Activity Date'])  # the raw date string is no longer needed
df.head(3)
Activity Type Elapsed Time Activity Gear Athlete Weight Bike Weight Moving Time Distance.1 Max Speed Elevation Gain Elevation Low Elevation High Max Grade Average Grade Average Watts Calories Year Month Day Hour
47 Ride 12735 Vilano Aluminum Road Bike 21 Speed Shimano 63.502899 11.0 7946.0 29549.900391 12.3 11.173700 1.2 13.200000 38.299999 -0.002369 53.709999 475.859314 2016 9 24 22
8 Ride 11734 NaN 56.699001 NaN 10057.0 59956.300781 14.6 825.666992 -2.4 101.099998 46.500000 0.079558 130.302002 1461.148682 2015 5 23 22
94 Ride 4696 Gusto 68.000000 7.5 4127.0 27227.500000 14.0 158.414581 75.0 158.199997 11.000000 0.235424 109.483162 580.913513 2018 9 29 16
# Show the distinct values of the two remaining categorical columns.
print("Unique Activity Gear values: " + str(df['Activity Gear'].unique()))
# The label previously said "Activity Gear" although this prints 'Activity Type'.
print("Unique Activity Type values: " + str(df['Activity Type'].unique()))
Unique Activity Gear values: ['Gusto' 'Kestrel 200 SCI Older Road Bike' nan
 'Vilano Aluminum Road Bike 21 Speed Shimano' 'Fixie']
Unique Activity Gear values: ['Ride' 'Hike' 'Run' 'Workout' 'Walk']
def create_dummy_df(df, cat_cols, dummy_na=False):
    '''
    Replace categorical columns with dummy (indicator) columns.

    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool whether you want to dummy NA values or not

    OUTPUT:
    df - new dataframe with following characteristics:
        1. contains all columns that were not specified as categorical
        2. removes all the original columns in cat_cols
        3. dummy columns for each of the categorical columns in cat_cols
           (the first level of each column is dropped, drop_first=True)
        4. use a prefix of the column name with an underscore (_) for separating
        5. if dummy_na is True - it also contains dummy columns for NaN values

    Names in cat_cols that are not columns of df are skipped. Any other
    failure now propagates instead of being silently swallowed. The input
    dataframe itself is not modified.
    '''
    for col in cat_cols:
        # Keep the best-effort contract for absent columns, but do it
        # explicitly rather than through a bare `except`, which also hid
        # genuine errors (and even KeyboardInterrupt).
        if col not in df.columns:
            continue
        dummies = pd.get_dummies(
            df[col],
            prefix=col,
            prefix_sep='_',
            drop_first=True,
            dummy_na=dummy_na,
        )
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    return df

# One-hot encode 'Activity Type'; dummy_na=True also adds an
# 'Activity Type_nan' indicator column for missing values.
df = create_dummy_df(df, ['Activity Type'], dummy_na = True)
df.head(3)
Elapsed Time Activity Gear Athlete Weight Bike Weight Moving Time Distance.1 Max Speed Elevation Gain Elevation Low Elevation High ... Calories Year Month Day Hour Activity Type_Ride Activity Type_Run Activity Type_Walk Activity Type_Workout Activity Type_nan
47 12735 Vilano Aluminum Road Bike 21 Speed Shimano 63.502899 11.0 7946.0 29549.900391 12.3 11.173700 1.2 13.200000 ... 475.859314 2016 9 24 22 1 0 0 0 0
8 11734 NaN 56.699001 NaN 10057.0 59956.300781 14.6 825.666992 -2.4 101.099998 ... 1461.148682 2015 5 23 22 1 0 0 0 0
94 4696 Gusto 68.000000 7.5 4127.0 27227.500000 14.0 158.414581 75.0 158.199997 ... 580.913513 2018 9 29 16 1 0 0 0 0

3 rows × 23 columns

Null Values

# NOTE: despite its name, this list holds the columns that still contain at
# least one null value (null count != 0).
no_nulls = list(df.columns[df.isnull().sum() != 0])
no_nulls
['Activity Gear',
 'Athlete Weight',
 'Bike Weight',
 'Max Speed',
 'Elevation Gain',
 'Elevation Low',
 'Elevation High',
 'Max Grade',
 'Average Watts',
 'Calories']

Based on their histograms, it seems like a good idea to use:

  • Imputation on median: [Athlete Weight, Bike Weight, Elevation Low, Elevation High]
  • Imputation on mean: [Elevation Gain, Average Watts, Calories, Max Speed, Max Grade]
def fill_mean(col):
    """Return *col* with its nulls replaced by the column mean."""
    return col.fillna(col.mean())


def fill_median(col):
    """Return *col* with its nulls replaced by the column median."""
    return col.fillna(col.median())


mean_cols = ['Elevation Gain', 'Average Watts', 'Calories', 'Max Speed', 'Max Grade']
median_cols = ['Athlete Weight', 'Bike Weight', 'Elevation Low', 'Elevation High']

# Mean imputation: the imputed columns are placed first, followed by the rest.
# NOTE(review): the statistics are computed on the whole frame, i.e. before
# any train/test split is made further down.
mean_imputed = df[mean_cols].apply(fill_mean, axis=0)
fill_df = pd.concat([mean_imputed, df.drop(mean_cols, axis=1)], axis=1)

# Median imputation: the imputed columns are appended at the end.
fill_df_med = df[median_cols].apply(fill_median, axis=0)
filled_df = pd.concat([fill_df.drop(median_cols, axis=1), fill_df_med], axis=1)

# Alternative treatment of nulls: drop every row that has any.
dropped_df = df.dropna()
filled_df.head(2)
Elevation Gain Average Watts Calories Max Speed Max Grade Elapsed Time Activity Gear Moving Time Distance.1 Average Grade ... Hour Activity Type_Ride Activity Type_Run Activity Type_Walk Activity Type_Workout Activity Type_nan Athlete Weight Bike Weight Elevation Low Elevation High
8 825.666992 130.302002 1461.148682 14.600000 46.500000 11734 NaN 10057.0 59956.300781 0.079558 ... 22 1 0 0 0 0 56.699001 9.0 -2.4 101.099998
28 41.823601 128.156006 1058.558350 19.200001 24.700001 8448 Vilano Aluminum Road Bike 21 Speed Shimano 7408.0 53329.601562 -0.015939 ... 15 1 0 0 0 0 60.000000 11.0 -1.0 42.200001

2 rows × 23 columns

# Preview of the alternative frame where rows with any null were dropped.
dropped_df.head(2)
Elapsed Time Activity Gear Athlete Weight Bike Weight Moving Time Distance.1 Max Speed Elevation Gain Elevation Low Elevation High ... Calories Year Month Day Hour Activity Type_Ride Activity Type_Run Activity Type_Walk Activity Type_Workout Activity Type_nan
101 11327 Gusto 68.000000 7.5 7993.0 54209.500000 11.5 240.664948 74.800003 113.900002 ... 975.611206 2019 5 15 21 1 0 0 0 0
44 5335 Vilano Aluminum Road Bike 21 Speed Shimano 67.131599 11.0 5038.0 38688.300781 11.8 24.196800 0.000000 13.900000 ... 770.871704 2016 8 29 17 1 0 0 0 0

2 rows × 23 columns

# Remove the rows that still have a null after imputation, then verify that
# neither frame has any column with nulls left.
filled_df = filled_df.dropna() #Note: can change this to na -> no bike (on foot)
no_nulls = filled_df.columns[filled_df.isnull().sum() != 0].tolist()
no_nulls_2 = dropped_df.columns[dropped_df.isnull().sum() != 0].tolist()
assert no_nulls_2 == []
assert no_nulls == []

Linear Regression

# Predict 'Distance.1' from the remaining features.
y = filled_df['Distance.1']
# 'Activity Gear' still holds the raw bike names (strings), which
# LinearRegression cannot be fitted on, so it is left out of the features.
X = filled_df.drop(['Distance.1', 'Activity Gear'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2. For ordinary least squares with an intercept, rescaling the features
# does not change the fitted predictions (full-rank case), so the plain
# estimator is used.
lm_model = LinearRegression()
lm_model.fit(X_train, y_train)
LinearRegression(normalize=True)
# Score the fitted linear model on both splits; a large gap between the two
# R^2 values indicates overfitting.
train_pred = lm_model.predict(X_train)
test_pred = lm_model.predict(X_test)
r2_train = r2_score(y_train, train_pred)
r2_test = r2_score(y_test, test_pred)
print(f"test r2: {r2_test}")
print(f"train r2: {r2_train}")
test r2: 0.6550060428615112
train r2: 0.9398003894033616
# Classification task: predict which bike ('Activity Gear') was used.
# 'Bike Weight' is removed first so the model cannot read the answer from it.
bike_df = filled_df.drop(columns=['Bike Weight'])
y = bike_df['Activity Gear']
X = bike_df.drop(columns=['Activity Gear'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.15)

Random Forests

from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split to predict the bike label.
# NOTE(review): no random_state is passed, so results may vary between runs.
rnd_clf = RandomForestClassifier(n_estimators=2000, max_leaf_nodes=32, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
RandomForestClassifier(max_leaf_nodes=32, n_estimators=2000, n_jobs=-1)
# Fraction of held-out rows whose predicted bike matches the true one.
y_test_pred = rnd_clf.predict(X_test)
n_correct = np.sum(y_test_pred == y_test)
test_acc = n_correct / len(y_test)
print(f"test accuracy: {test_acc}")
test accuracy: 0.8421052631578947
from sklearn.preprocessing import LabelEncoder

#https://stackoverflow.com/questions/65549588/shap-treeexplainer-for-randomforest-multiclass-what-is-shap-valuesi

# Take the labels from the fitted classifier instead of hard-coding them.
# The per-class SHAP outputs follow the order of rnd_clf.classes_, so this
# keeps the printed index -> bike mapping in sync with the model even when a
# bike is absent from the training split.
labels = [str(label) for label in rnd_clf.classes_]
le = LabelEncoder()
z = le.fit_transform(labels)
# Plain ints as keys so the mapping prints cleanly.
encoding_scheme = {int(code): label for code, label in zip(z, labels)}
print(encoding_scheme)
{0: 'Fixie', 2: 'Kestrel 200 SCI Older Road Bike', 3: 'Vilano Aluminum Road Bike 21 Speed Shimano', 1: 'Gusto'}
# Number of rows recorded with the Kestrel bike.
(y == 'Kestrel 200 SCI Older Road Bike').sum()
39
import shap

# Explain the fitted random forest with SHAP.
# NOTE(review): X here is the full feature frame (train and test rows), not
# only the held-out split.
explainer = shap.TreeExplainer(rnd_clf)
shap_values = explainer.shap_values(X)
# NOTE(review): the indexing below assumes shap_values is a per-class
# sequence; confirm against the installed shap version.
# SHAP plot for Gusto (index 1 in the encoding printed earlier)
shap.summary_plot(shap_values[1], X)
# Index 2 -> Kestrel 200 SCI Older Road Bike in that encoding
shap.summary_plot(shap_values[2], X)
# Index 3 -> Vilano Aluminum Road Bike 21 Speed Shimano in that encoding
shap.summary_plot(shap_values[3], X)

Make it harder for computer to guess

# Harder variant of the bike classification: besides 'Bike Weight', also
# remove 'Year' and 'Athlete Weight' from the features.
bike_df = filled_df.drop(columns=['Bike Weight', 'Year', 'Athlete Weight'])
y = bike_df['Activity Gear']
X = bike_df.drop(columns=['Activity Gear'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.15)
# Same forest configuration as before, refitted on the reduced feature set.
rnd_clf = RandomForestClassifier(max_leaf_nodes=32, n_estimators=2000, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
RandomForestClassifier(max_leaf_nodes=32, n_estimators=2000, n_jobs=-1)
# Fraction of held-out rows whose predicted bike matches the true one.
y_test_pred = rnd_clf.predict(X_test)
n_correct = np.sum(y_test_pred == y_test)
test_acc = n_correct / len(y_test)
print(f"test accuracy: {test_acc}")
test accuracy: 0.5263157894736842
# Re-run SHAP on the refitted forest.
explainer = shap.TreeExplainer(rnd_clf)
shap_values = explainer.shap_values(X)
# Gusto (index 1 in the encoding printed earlier)
shap.summary_plot(shap_values[1], X)
# Dependence of the index-1 SHAP values on 'Max Grade', with
# 'Elevation Low' passed as the interaction feature.
shap.dependence_plot('Max Grade', shap_values[1], X, interaction_index='Elevation Low')

What the heck makes Elevation Low a good guessing tool? Maybe my friend preferred certain bikes for the more mountainous rides.

# Plotly tests
import plotly.express as px

# Scatter-plot matrix of the four iris measurements, coloured by species,
# used only to try out plotly.
features = ["sepal_width", "sepal_length", "petal_width", "petal_length"]
df_i = px.data.iris()

fig = px.scatter_matrix(df_i, dimensions=features, color="species")
fig.update_traces(diagonal_visible=False)  # hide the diagonal panels
fig.show()