Human Activity Recognition Using Smartphones Dataset

In [134]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RF

# load train data set
X = np.loadtxt('UCI_HAR_Dataset/train/X_train.txt')
Y = np.loadtxt('UCI_HAR_Dataset/train/Y_train.txt')

# load test (validation)
X_test = np.loadtxt('UCI_HAR_Dataset/test/X_test.txt')
Y_test = np.loadtxt('UCI_HAR_Dataset/test/Y_test.txt')
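
The activity names used later for plot labels also ship with the dataset; a minimal sketch, assuming the standard activity_labels.txt file sits next to the train/ and test/ folders:

In [ ]:
# Id-to-name mapping for the six activity classes (assumes the dataset's standard layout)
activities = pd.read_csv('UCI_HAR_Dataset/activity_labels.txt',
                         sep=r'\s+', header=None, names=['id', 'activity'])
print(activities)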

Random Forest

In [175]:
clf = RF(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
clf = clf.fit(X, Y)
print("Training set score: %f" % clf.score(X, Y))
print("Test set score: %f" % clf.score(X_test, Y_test))
Training set score: 0.999048
Test set score: 0.910757
In [176]:
Y_predict = clf.predict(X_test)

Confusion matrix

In [214]:
import itertools

import matplotlib.pyplot as plt
import seaborn as sns


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#        print("Normalized confusion matrix")
#    else:
#        print('Confusion matrix, without normalization')

#    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [215]:
import itertools
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(Y_test, Y_predict)
np.set_printoptions(precision=2)


class_names = ['WALKING', 
    'WALKING_UPSTAIRS',
    'WALKING_DOWNSTAIRS', 
    'SITTING', 
    'STANDING', 
    'LAYING']
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')
plt.grid(visible=False)
plt.show()
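
The normalize=True branch of the same function shows per-class rates instead of raw counts, for example:

In [ ]:
# Row-normalized view: each row sums to 1, so the diagonal shows per-class recall
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Confusion matrix, normalized')
plt.grid(visible=False)
plt.show()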
In [225]:
clf1 = RF(n_estimators=30, max_depth=30, min_samples_split=5, random_state=0)
clf1 = clf1.fit(X, Y)
In [226]:
Y1_predict = clf1.predict(X_test)
print("Training set score: %f" % clf1.score(X, Y))
print("Test set score: %f" % clf1.score(X_test, Y_test))
Training set score: 0.999864
Test set score: 0.926026

Confusion matrix my way (dataframes)

In [218]:
df_uns = (pd.DataFrame({"test": Y_test, "predict": Y1_predict,
                        "value": np.ones(len(Y_test), dtype=int)}, dtype=int)
          .groupby(["test", "predict"]).sum()
          .unstack(1).fillna(0).astype(int))
df_uns
Out[218]:
         value
predict      1    2    3    4    5    6
test
1          485    7    4    0    0    0
2           46  415   10    0    0    0
3           26   48  346    0    0    0
4            0    0    0  443   48    0
5            0    0    0   42  490    0
6            0    0    0    0    0  537
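
As an aside, pd.crosstab builds the same cross-tabulation in a single call; a minimal sketch using the arrays defined above:

In [ ]:
# Equivalent cross-tabulation of true vs. predicted labels
pd.crosstab(pd.Series(Y_test, name="test").astype(int),
            pd.Series(Y1_predict, name="predict").astype(int))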
In [219]:
plot_confusion_matrix(confusion_matrix(Y_test, Y1_predict), classes=class_names,
                      title='Confusion matrix, RF')
plt.grid(visible=False)
plt.show()

Filtering out less important features

In [221]:
import seaborn as sns

# Histogram + KDE of the per-feature importances from the fitted forest
sns.distplot(clf1.feature_importances_)
plt.title("Distribution of feature importances")
plt.ylabel('Density')
plt.xlabel('Importance')
plt.show()
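
The importances can also be tied back to feature names, which the dataset ships in features.txt (assumed path); a minimal sketch:

In [ ]:
# Top-10 features by importance, labelled by name (assumes the standard features.txt layout)
feature_names = pd.read_csv('UCI_HAR_Dataset/features.txt', sep=r'\s+',
                            header=None, names=['id', 'name'])['name']
print(pd.Series(clf1.feature_importances_, index=feature_names.values).nlargest(10))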
In [170]:
sel_index = clf1.feature_importances_ > 0.0005
Xs = X[:,sel_index]
Xs_test = X_test[:,sel_index]
Xs.shape
Out[170]:
(7352, 224)

The 5e-4 threshold keeps 224 of the 561 features. Next, run the RF classifier on this reduced data set.

In [222]:
clf_sel = RF(n_estimators=30, max_depth=None, min_samples_split=5, random_state=0)
clf_sel = clf_sel.fit(Xs, Y)
In [197]:
Ys_predict = clf_sel.predict(Xs_test)
In [199]:
df_sel = (pd.DataFrame({"test": Y_test, "predict": Ys_predict,
                        "value": np.ones(len(Y_test), dtype=int)}, dtype=int)
          .groupby(["test", "predict"]).sum()
          .unstack(1).fillna(0).astype(int))

df_sel
Out[199]:
         value
predict      1    2    3    4    5    6
test
1          480    4   12    0    0    0
2           44  419    8    0    0    0
3           14   36  370    0    0    0
4            0    0    0  421   70    0
5            0    0    0   43  489    0
6            0    0    0    0    0  537
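
For reference, scikit-learn's SelectFromModel performs the same importance-threshold selection; a minimal sketch reusing the already-fitted clf1:

In [ ]:
from sklearn.feature_selection import SelectFromModel

# Keep features whose importance in the fitted forest exceeds 5e-4
selector = SelectFromModel(clf1, threshold=0.0005, prefit=True)
Xs_alt = selector.transform(X)
Xs_alt.shape  # expected to match Xs.shape from the manual masking above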
In [174]:
# Optional: 2-D t-SNE embedding of the training set (left disabled; note it would overwrite Y)
#from tsne import tsne
#Y = tsne(X,2,50,30.0)
In [21]:
#colors = [cols[int(x)] for x in np.loadtxt('UCI_HAR_Dataset/train/Y_train.txt')]
#plt.scatter(Y[:,0],Y[:,1],5,colors)
#plt.show()
#plt.savefig("har_tsne.ps")

MLP (Neural Network)

In [203]:
from sklearn.neural_network import MLPClassifier
In [229]:
mlp = MLPClassifier(hidden_layer_sizes=(18,), max_iter=100, alpha=1e-5,
                    solver='sgd', verbose=0, tol=1e-9, random_state=1,
                    learning_rate_init=.025)

mlp.fit(X, Y)
print("Training set score: %f" % mlp.score(X, Y))
print("Test set score: %f" % mlp.score(X_test, Y_test))
Training set score: 0.984358
Test set score: 0.960638

Compare 0.96 for the MLP with 0.92 for the RF!

In [205]:
Ynn_predict = mlp.predict(X_test)
In [206]:
df_mpl = (pd.DataFrame({"test": Y_test, "predict": Ynn_predict,
                        "value": np.ones(len(Y_test), dtype=int)}, dtype=int)
          .groupby(["test", "predict"]).sum()
          .unstack(1).fillna(0).astype(int))

df_mpl
Out[206]:
         value
predict      1    2    3    4    5    6
test
1          491    1    4    0    0    0
2           23  441    7    0    0    0
3            4   12  404    0    0    0
4            0    2    0  461   28    0
5            0    0    0   34  498    0
6            1    0    0    0    0  536

Comparison with RF

In [231]:
df_mpl - df_uns
Out[231]:
         value
predict      1    2    3    4    5    6
test
1            6   -6    0    0    0    0
2          -23   26   -3    0    0    0
3          -22  -36   58    0    0    0
4            0    2    0   18  -20    0
5            0    0    0   -8    8    0
6            1    0    0    0    0   -1
In [232]:
plot_confusion_matrix(confusion_matrix(Y_test, Ynn_predict), classes=class_names,
                      title='Confusion matrix, MLP NN')
plt.grid(visible=False)
plt.show()
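
A per-class summary via classification_report makes the same comparison in precision/recall terms; a minimal sketch using the predictions computed above:

In [ ]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for both models (class_names assumed to follow label order 1..6)
print("Random Forest:")
print(classification_report(Y_test, Y1_predict, target_names=class_names))
print("MLP:")
print(classification_report(Y_test, Ynn_predict, target_names=class_names))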
In [ ]: