Human Activity Recognition Using Smartphones Dataset

In [134]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RF

# load train data set
X = np.loadtxt('UCI_HAR_Dataset/train/X_train.txt')
Y = np.loadtxt('UCI_HAR_Dataset/train/Y_train.txt')

# load test (validation)
X_test = np.loadtxt('UCI_HAR_Dataset/test/X_test.txt')
Y_test = np.loadtxt('UCI_HAR_Dataset/test/Y_test.txt')
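
The activity names used later for plot labels also ship with the dataset; a minimal sketch, assuming the standard activity_labels.txt file sits next to the train/ and test/ folders:

In [ ]:
# Id-to-name mapping for the six activity classes (assumes the dataset's standard layout)
activities = pd.read_csv('UCI_HAR_Dataset/activity_labels.txt',
                         sep=r'\s+', header=None, names=['id', 'activity'])
print(activities)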

Random Forest

In [175]:
clf = RF(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
clf = clf.fit(X, Y)
print("Training set score: %f" % clf.score(X, Y))
print("Test set score: %f" % clf.score(X_test, Y_test))
Training set score: 0.999048
Test set score: 0.910757
In [176]:
Y_predict = clf.predict(X_test)

Confusion matrix

In [214]:
import itertools

import matplotlib.pyplot as plt
import seaborn as sns


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#        print("Normalized confusion matrix")
#    else:
#        print('Confusion matrix, without normalization')

#    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [215]:
import itertools
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(Y_test, Y_predict)
np.set_printoptions(precision=2)


class_names = ['WALKING', 
    'WALKING_UPSTAIRS',
    'WALKING_DOWNSTAIRS', 
    'SITTING', 
    'STANDING', 
    'LAYING']
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')
plt.grid(visible=False)
plt.show()
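
The normalize=True branch of the same function shows per-class rates instead of raw counts, for example:

In [ ]:
# Row-normalized view: each row sums to 1, so the diagonal shows per-class recall
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Confusion matrix, normalized')
plt.grid(visible=False)
plt.show()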
In [225]:
clf1 = RF(n_estimators=30, max_depth=30, min_samples_split=5, random_state=0)
clf1 = clf1.fit(X, Y)
In [226]:
Y1_predict = clf1.predict(X_test)
print("Training set score: %f" % clf1.score(X, Y))
print("Test set score: %f" % clf1.score(X_test, Y_test))
Training set score: 0.999864
Test set score: 0.926026

Confusion matrix my way (dataframes)

In [218]:
df_uns = (pd.DataFrame({"test": Y_test, "predict": Y1_predict,
                        "value": np.ones(len(Y_test), dtype=int)}, dtype=int)
          .groupby(["test", "predict"]).sum()
          .unstack(1).fillna(0).astype(int))
df_uns
Out[218]:
         value
predict      1    2    3    4    5    6
test
1          485    7    4    0    0    0
2           46  415   10    0    0    0
3           26   48  346    0    0    0
4            0    0    0  443   48    0
5            0    0    0   42  490    0
6            0    0    0    0    0  537
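
As an aside, pd.crosstab builds the same cross-tabulation in a single call; a minimal sketch using the arrays defined above:

In [ ]:
# Equivalent cross-tabulation of true vs. predicted labels
pd.crosstab(pd.Series(Y_test, name="test").astype(int),
            pd.Series(Y1_predict, name="predict").astype(int))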
In [219]:
plot_confusion_matrix(confusion_matrix(Y_test, Y1_predict), classes=class_names,
                      title='Confusion matrix, RF')
plt.grid(visible=False)
plt.show()

Filtering out less important features

In [221]:
import seaborn as sns

# Histogram + KDE of the per-feature importances from the fitted forest
sns.distplot(clf1.feature_importances_)
plt.title("Distribution of feature importances")
plt.ylabel('Density')
plt.xlabel('Importance')
plt.show()
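
The importances can also be tied back to feature names, which the dataset ships in features.txt (assumed path); a minimal sketch:

In [ ]:
# Top-10 features by importance, labelled by name (assumes the standard features.txt layout)
feature_names = pd.read_csv('UCI_HAR_Dataset/features.txt', sep=r'\s+',
                            header=None, names=['id', 'name'])['name']
print(pd.Series(clf1.feature_importances_, index=feature_names.values).nlargest(10))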
In [170]:
sel_index = clf1.feature_importances_ > 0.0005
Xs = X[:,sel_index]
Xs_test = X_test[:,sel_index]
Xs.shape
Out[170]:
(7352, 224)

The 5e-4 threshold keeps 224 of the 561 features. Next, run the RF classifier on this reduced data set.

In [222]:
clf_sel = RF(n_estimators=30, max_depth=None, min_samples_split=5, random_state=0)
clf_sel = clf_sel.fit(Xs, Y)
In [197]:
Ys_predict = clf_sel.predict(Xs_test)
In [199]:
df_sel = (pd.DataFrame({"test": Y_test, "predict": Ys_predict,
                        "value": np.ones(len(Y_test), dtype=int)}, dtype=int)
          .groupby(["test", "predict"]).sum()
          .unstack(1).fillna(0).astype(int))

df_sel
Out[199]:
         value
predict      1    2    3    4    5    6
test
1          480    4   12    0    0    0
2           44  419    8    0    0    0
3           14   36  370    0    0    0
4            0    0    0  421   70    0
5            0    0    0   43  489    0
6            0    0    0    0    0  537
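
For reference, scikit-learn's SelectFromModel performs the same importance-threshold selection; a minimal sketch reusing the already-fitted clf1:

In [ ]:
from sklearn.feature_selection import SelectFromModel

# Keep features whose importance in the fitted forest exceeds 5e-4
selector = SelectFromModel(clf1, threshold=0.0005, prefit=True)
Xs_alt = selector.transform(X)
Xs_alt.shape  # expected to match Xs.shape from the manual masking above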
In [174]:
# Optional: 2-D t-SNE embedding of the training set (left disabled; note it would overwrite Y)
#from tsne import tsne
#Y = tsne(X,2,50,30.0)
In [21]:
#colors = [cols[int(x)] for x in np.loadtxt('UCI_HAR_Dataset/train/Y_train.txt')]
#plt.scatter(Y[:,0],Y[:,1],5,colors)
#plt.show()
#plt.savefig("har_tsne.ps")

MLP (Neural Network)

In [203]:
from sklearn.neural_network import MLPClassifier
In [229]:
mlp = MLPClassifier(hidden_layer_sizes=(18,), max_iter=100, alpha=1e-5,
                    solver='sgd', verbose=0, tol=1e-9, random_state=1,
                    learning_rate_init=.025)

mlp.fit(X, Y)
print("Training set score: %f" % mlp.score(X, Y))
print("Test set score: %f" % mlp.score(X_test, Y_test))
Training set score: 0.984358
Test set score: 0.960638

Compare 0.96 for the MLP with 0.92 for the RF!

In [205]:
Ynn_predict = mlp.predict(X_test)
In [206]:
df_mpl = (pd.DataFrame({"test": Y_test, "predict": Ynn_predict,
                        "value": np.ones(len(Y_test), dtype=int)}, dtype=int)
          .groupby(["test", "predict"]).sum()
          .unstack(1).fillna(0).astype(int))

df_mpl
Out[206]:
         value
predict      1    2    3    4    5    6
test
1          491    1    4    0    0    0
2           23  441    7    0    0    0
3            4   12  404    0    0    0
4            0    2    0  461   28    0
5            0    0    0   34  498    0
6            1    0    0    0    0  536

Comparison with RF

In [231]:
df_mpl - df_uns
Out[231]:
         value
predict      1    2    3    4    5    6
test
1            6   -6    0    0    0    0
2          -23   26   -3    0    0    0
3          -22  -36   58    0    0    0
4            0    2    0   18  -20    0
5            0    0    0   -8    8    0
6            1    0    0    0    0   -1
In [232]:
plot_confusion_matrix(confusion_matrix(Y_test, Ynn_predict), classes=class_names,
                      title='Confusion matrix, MLP NN')
plt.grid(visible=False)
plt.show()
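
A per-class summary via classification_report makes the same comparison in precision/recall terms; a minimal sketch using the predictions computed above:

In [ ]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for both models (class_names assumed to follow label order 1..6)
print("Random Forest:")
print(classification_report(Y_test, Y1_predict, target_names=class_names))
print("MLP:")
print(classification_report(Y_test, Ynn_predict, target_names=class_names))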
In [ ]: