Six activity labels: WALKING, WALKING_UPSTAIRS, WALKING_DOWNSTAIRS, SITTING, STANDING, LAYING.
Features are normalized and bounded within [-1, 1].
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RF
# Training split: feature matrix and the matching label vector.
X = np.loadtxt('UCI_HAR_Dataset/train/X_train.txt')
Y = np.loadtxt('UCI_HAR_Dataset/train/Y_train.txt')

# Test split, used below as the validation set.
X_test = np.loadtxt('UCI_HAR_Dataset/test/X_test.txt')
Y_test = np.loadtxt('UCI_HAR_Dataset/test/Y_test.txt')

# Baseline random forest: 10 trees, unlimited depth, fixed seed so the
# run is repeatable.
clf = RF(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
clf.fit(X, Y)  # fit() returns the estimator itself, so no re-binding is needed

# Mean accuracy on the data the model was fitted on, then on the test split.
train_accuracy = clf.score(X, Y)
print("Training set score: %f" % train_accuracy)
test_accuracy = clf.score(X_test, Y_test)
print("Test set score: %f" % test_accuracy)

# Predictions on the test split, reused for the confusion matrix below.
Y_predict = clf.predict(X_test)
import matplotlib.pyplot as plt
import seaborn as sns
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are labelled 'True label' and columns
        'Predicted label' on the resulting plot.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, optional
        When True, every row is divided by its row sum before plotting, so
        the image, the colour bar and the cell annotations all show
        fractions. Rows that sum to zero are left as zeros.
    title : str, optional
        Title placed above the plot.
    cmap : matplotlib colormap, optional
        Colormap used for the heat map.

    Returns
    -------
    None
        The plot is drawn through ``pyplot``; the caller is responsible for
        calling ``plt.show()`` or saving the figure.
    """
    cm = np.asarray(cm)

    if normalize:
        # Normalize BEFORE drawing: otherwise the image and colour bar show
        # raw counts while the annotations show fractions.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)  # avoid NaN for empty rows

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    # np.ndindex walks (row, col) pairs in the same row-major order as the
    # nested ranges would, without relying on a module-level itertools import.
    for i, j in np.ndindex(cm.shape):
        # Fractions are shortened to two decimals; raw counts are passed as-is.
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it can make room for the axis labels.
    plt.tight_layout()
import itertools
from sklearn.metrics import confusion_matrix
# Confusion matrix of the baseline forest's predictions on the test split.
cnf_matrix = confusion_matrix(Y_test, Y_predict)
np.set_printoptions(precision=2)

# Tick labels for the plots.
# NOTE(review): presumably listed in the same order as the numeric labels in
# the Y files - confirm against the dataset's activity label mapping.
class_names = [
    'WALKING',
    'WALKING_UPSTAIRS',
    'WALKING_DOWNSTAIRS',
    'SITTING',
    'STANDING',
    'LAYING',
]

plot_confusion_matrix(
    cnf_matrix,
    classes=class_names,
    title='Confusion matrix, without normalization',
)
plt.grid(visible=False)  # grid lines would cut through the heat map cells
plt.show()
# Second forest: 30 trees, depth capped at 30, at least 5 samples to split.
clf1 = RF(
    n_estimators=30,
    max_depth=30,
    min_samples_split=5,
    random_state=0,
)
clf1.fit(X, Y)  # fit() returns the estimator itself
Y1_predict = clf1.predict(X_test)

train_accuracy = clf1.score(X, Y)
print("Training set score: %f" % train_accuracy)
test_accuracy = clf1.score(X_test, Y_test)
print("Test set score: %f" % test_accuracy)

# Count table of (true label, predicted label) pairs: one row per test sample
# with a unit weight, summed per pair and pivoted so predictions become
# columns. Pairs that never occur are filled with 0.
rf_pairs = pd.DataFrame(
    data={
        "test": Y_test,
        "predict": Y1_predict,
        "value": np.ones(len(Y_test), dtype=int),
    },
    dtype=int,
)
df_uns = (
    rf_pairs
    .groupby(["test", "predict"])
    .sum()
    .unstack(1)
    .fillna(0)
    .astype(int)
)
df_uns

plot_confusion_matrix(
    confusion_matrix(Y_test, Y1_predict),
    classes=class_names,
    title='Confusion matrix, RF',
)
plt.grid(visible=False)
plt.show()
import seaborn as sns
# Histogram (with a KDE overlay) of the per-feature importances of clf1.
# sns.distplot is deprecated; histplot is its replacement, and its default
# y-axis is a count, which is what the 'Number of features' label describes
# (distplot drew a density instead).
sns.histplot(clf1.feature_importances_, kde=True)
plt.title("Distribution of feature importances")
plt.ylabel('Number of features')
plt.xlabel('Importance')
plt.show()

# Keep only the features whose importance exceeds the cut-off, and apply the
# same column mask to the training and test matrices.
importance_threshold = 0.0005
sel_index = clf1.feature_importances_ > importance_threshold
Xs = X[:, sel_index]
Xs_test = X_test[:, sel_index]
Xs.shape
We choose an importance threshold of 5e-4, which filters out roughly half of the features, and then run the RF classifier on the filtered data set.
# Forest trained only on the columns kept by the importance mask.
clf_sel = RF(
    n_estimators=30,
    max_depth=None,
    min_samples_split=5,
    random_state=0,
)
clf_sel.fit(Xs, Y)  # fit() returns the estimator itself
Ys_predict = clf_sel.predict(Xs_test)

# Count table of (true label, predicted label) pairs for this model, built
# the same way as the tables for the other models.
sel_pairs = pd.DataFrame(
    data={
        "test": Y_test,
        "predict": Ys_predict,
        "value": np.ones(len(Y_test), dtype=int),
    },
    dtype=int,
)
df_sel = (
    sel_pairs
    .groupby(["test", "predict"])
    .sum()
    .unstack(1)
    .fillna(0)
    .astype(int)
)
df_sel
#from tsne import tsne
#Y = tsne(X,2,50,30.0)
#colors = [cols[int(x)] for x in np.loadtxt('UCI_HAR_Dataset/train/Y_train.txt')]
#plt.scatter(Y[:,0],Y[:,1],5,colors)
#plt.show()
#plt.savefig("har_tsne.ps")
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with a single hidden layer of 18 units, trained with
# SGD for at most 100 iterations; the seed is fixed so the run is repeatable.
mlp = MLPClassifier(
    hidden_layer_sizes=(18,),
    max_iter=100,
    alpha=1e-5,
    solver='sgd',
    verbose=0,
    tol=1e-9,
    random_state=1,
    learning_rate_init=0.025,
)
mlp.fit(X, Y)

# Mean accuracy on the training data, then on the test split.
train_accuracy = mlp.score(X, Y)
print("Training set score: %f" % train_accuracy)
test_accuracy = mlp.score(X_test, Y_test)
print("Test set score: %f" % test_accuracy)
Compare the MLP's score of 0.96 with 0.92 for RF!
Ynn_predict = mlp.predict(X_test)

# Count table of (true label, predicted label) pairs for the MLP, built the
# same way as df_uns so the two tables can be subtracted cell by cell.
mlp_pairs = pd.DataFrame(
    data={
        "test": Y_test,
        "predict": Ynn_predict,
        "value": np.ones(len(Y_test), dtype=int),
    },
    dtype=int,
)
df_mpl = (
    mlp_pairs
    .groupby(["test", "predict"])
    .sum()
    .unstack(1)
    .fillna(0)
    .astype(int)
)
df_mpl

# Per-cell difference in counts: MLP minus the random forest (clf1).
df_mpl - df_uns

plot_confusion_matrix(
    confusion_matrix(Y_test, Ynn_predict),
    classes=class_names,
    title='Confusion matrix, MLP NN',
)
plt.grid(visible=False)
plt.show()