import pandas as pd
import numpy as np
import seaborn as sns
import os
from scipy import stats
import glob
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from matplotlib.pyplot import figure
%matplotlib inline
# Load the training metadata file (maps data files to subjects and labels).
df_train_csv = pd.read_csv('files/train.csv')
# Unique subject identifiers, in descending-frequency order — identical
# ordering to the original dict(value_counts()).keys() round-trip.
df_train_csv_subjects = df_train_csv['Subject'].value_counts().index.tolist()
# Accumulates one labelled DataFrame per matched instance file.
list_of_dfs = []
def parse_subject_files(subject, all_files):
    '''
    Input: Subject in the train file and all the instances(files) of that subject
    1. Loop through all the files(instances) of the subject present the subject folder
    2. Validating the subject in the train file.
    3. Add the list of dataframes into list_of_dfs

    Side effects: appends matched, labelled DataFrames to the module-level
    ``list_of_dfs``; reads the module-level ``df_train_csv``.
    '''
    print(subject, ' : started processing.....')
    # Basenames listed in the train file ('Subject/file.csv' -> 'file.csv').
    # Hoisted out of the loop and made a set for O(1) membership tests.
    known_files = set(df_train_csv['Datafile'].str.partition('/')[2])
    for filename in all_files:
        # os.path.basename works on both POSIX and Windows paths; the
        # original split(sep='\\') raised IndexError on POSIX paths.
        base = os.path.basename(filename)
        if base in known_files:
            # Only read files that are actually referenced by the train file.
            df = pd.read_csv(filename, header=None)
            current_label = (df_train_csv[df_train_csv['Datafile'] ==
                             ''.join([subject, '/', base])]
                             ['Label'].values[0])
            df['Label'] = current_label
            list_of_dfs.append(df)
    print(subject, ' : completed processing')
# Walk every subject listed in the train file and parse its instance files.
for subject in df_train_csv_subjects:
    path = r'/Users/sanyu/OneDrive - Jacobs University/2nd Sem/Data Mining/bbdc2019/files/' + subject
    all_files = glob.glob(path + "/*.csv")
    parse_subject_files(subject, all_files)
type(list_of_dfs)
# Stack every per-file DataFrame row-wise into one training frame.
df_train_main = pd.concat(list_of_dfs, axis=0)
# Rename the columns
# A 'ground truth' column named as Label has been added as our y-variable
# Fixed typo: 'ACC loewr Z' -> 'ACC lower Z' (consistent with the other
# ACC lower X/Y columns; no other code referenced the misspelling).
df_train_main.columns = ['EMG1',
                         'EMG2',
                         'EMG3',
                         'EMG4',
                         'Airborne',
                         'ACC upper X',
                         'ACC upper Y',
                         'ACC upper Z',
                         'Goniometer X',
                         'ACC lower X',
                         'ACC lower Y',
                         'ACC lower Z',
                         'Goniometer Y',
                         'Gyro upper X',
                         'Gyro upper Y',
                         'Gyro upper Z',
                         'Gyro lower X',
                         'Gyro lower Y',
                         'Gyro lower Z',
                         'Label']
df_train_main
# Encode the string class labels to integer codes, in place.
label_encoder = LabelEncoder()
df_train_main['Label'] = label_encoder.fit_transform(df_train_main['Label'])
# Outlier removal: keep only rows where every column is within 1 standard
# deviation of its column mean.
# NOTE(review): |z| < 1 is extremely aggressive (3 is the usual cutoff) and
# the z-score is computed over the encoded Label column too, so rows of
# rare classes can be dropped as "outliers" — confirm both are intended.
z_scores = stats.zscore(df_train_main)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 1).all(axis=1)
df_train_main = df_train_main[filtered_entries]
# Features: all sensor columns; target: column 19 ('Label').
x = df_train_main.iloc[:,:-1].values
y = df_train_main.iloc[:,19].values
y = y.reshape(-1,1)
# One-hot encode the integer labels (the classifiers below are fed
# multilabel-indicator targets).
onehotencoder = OneHotEncoder(categories='auto')
y = onehotencoder.fit_transform(y).toarray()
y.shape
# Train/test split (80/20), standardisation, then dimensionality reduction.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

# Fit the scaler on the training portion only; apply the same transform
# to the held-out set to avoid leakage.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Project onto the top 8 principal components (empirically chosen as the
# optimal number of columns).
pca = PCA(n_components=8)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
# For the original dataframe
# (Opens a large canvas; the correlation heatmap itself is left commented out.)
figure(num=None, figsize=(20, 18), dpi=80, facecolor='w', edgecolor='k')
#sns.heatmap(df_train_main.corr(), annot=True, fmt=".2%")
# after PCA we found 8 columns which can be selected
figure(num=None, figsize=(8, 5), dpi=80, facecolor='w', edgecolor='k')
#sns.heatmap(pd.DataFrame(x_train).corr(), annot=True)
# Peek at the first few PCA-transformed training rows.
pd.DataFrame(x_train).head()
figure(num=None, figsize=(8, 5), dpi=80, facecolor='w', edgecolor='k')
plt.figure(figsize=(8,8),facecolor='red',edgecolor='blue')
#df['N'].hist(by=df['Letter'], figsize = (16,18))
# Per-class histograms of every feature column in the (filtered) frame.
df_train_main.groupby('Label').hist(figsize = (8,5))
import warnings
import numpy as np
import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
from sklearn import neighbors
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# filter warnings
# NOTE(review): blanket suppression hides genuine problems; the original
# called filterwarnings("ignore") twice — the duplicate call is removed
# (it was a no-op).
warnings.filterwarnings("ignore")
# Fit a 3-nearest-neighbours classifier on the PCA-reduced features.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
from sklearn import metrics
# predict the response
y_pred = knn.predict(x_test)
# Exact-match accuracy on the one-hot encoded labels.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
from sklearn import tree
# Fit a decision-tree classifier (fixed seed) and render the tree.
dt = tree.DecisionTreeClassifier(random_state=0)
dt = dt.fit(x_train, y_train)
tree.plot_tree(dt)
from sklearn.tree import DecisionTreeRegressor
# NOTE(review): a *regressor* is fit on the one-hot class indicators here,
# rebinding `dt` and discarding the classifier above — presumably
# exploratory; confirm this is intentional.
dt = DecisionTreeRegressor().fit(x_train, y_train)
predicted = dt.predict(x_test)
expected = y_test
plt.scatter(expected, predicted)
plt.plot([0, 50], [0, 50], '--k')
# NOTE(review): this reprints the *KNN* accuracy — `y_pred` still holds the
# KNN predictions from the earlier cell, not the tree's output.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# 5-fold cross-validation of a default-parameter KNN on the training split.
from sklearn.model_selection import cross_val_score
knn_cv = KNeighborsClassifier()
cross_val_score(knn_cv, x_train, y_train, cv=5)
from sklearn.neural_network import MLPClassifier
# Three hidden layers of 8 units (matching the 8 PCA components).
mlp = MLPClassifier(hidden_layer_sizes=(8, 8, 8), max_iter=1000)
mlp.fit(x_train, y_train)
predictions = mlp.predict(x_test)
predictions
from sklearn.metrics import classification_report, multilabel_confusion_matrix
# confusion_matrix() raises ValueError for multilabel-indicator (one-hot)
# targets; use the per-class multilabel variant instead.
print(multilabel_confusion_matrix(y_test, predictions))
print(classification_report(y_test,predictions))
# Visualise the per-class precision/recall/f1 as a heatmap and save it.
clf_report = classification_report(y_test,predictions,output_dict=True)
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)
plt.savefig('MLP.png')
from sklearn.metrics import precision_score
# precision_score defaults to average='binary', which raises on multilabel
# targets; micro-averaging aggregates TP/FP over all classes.
print("Precision score: {}".format(precision_score(y_test,predictions,average='micro')))