In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from scipy import stats
import glob
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from matplotlib.pyplot import figure
%matplotlib inline
In [2]:
# Read the training file
df_train_csv = pd.read_csv('files/train.csv')
In [3]:
# Extract the individual subjects from training set
df_train_csv_subjects = list(dict(df_train_csv['Subject'].value_counts()).keys())
In [4]:
list_of_dfs = [] # To store the dataframes

def parse_subject_files(subject,all_files):
    '''
    Input: Subject in the train file and all the instances(files) of that subject
    
    1. Loop through all the files(instances) of the subject present the subject folder
    2. Validating the subject in the train file.
    3. Add the list of dataframes into list_of_dfs
    
    '''
    print(subject,' : started processing.....')
    
    for filename in all_files:
        df = pd.read_csv(filename, header=None)
        if filename.split(sep='\\')[1] in list(df_train_csv['Datafile'].str
                                                                      .partition('/')[2]):
            current_label = (df_train_csv[df_train_csv['Datafile']==
                                          (''.join([subject,'/',filename.split(sep='\\')[1]]))]
                                          ['Label'].values[0])
            df['Label'] = current_label
            list_of_dfs.append(df)
            
    print(subject,' : completed processing')
In [5]:
# Loop through each subject in the train file and call the parse_subject_files definition
for i in range(len(df_train_csv_subjects)):
    path = r'/Users/sanyu/OneDrive - Jacobs University/2nd Sem/Data Mining/bbdc2019/files/'+df_train_csv_subjects[i]
    all_files = glob.glob(path + "/*.csv")
    parse_subject_files(df_train_csv_subjects[i],all_files)
Subject06  : started processing.....
Subject06  : completed processing
Subject07  : started processing.....
Subject07  : completed processing
Subject19  : started processing.....
Subject19  : completed processing
Subject03  : started processing.....
Subject03  : completed processing
Subject13  : started processing.....
Subject13  : completed processing
Subject02  : started processing.....
Subject02  : completed processing
Subject12  : started processing.....
Subject12  : completed processing
Subject17  : started processing.....
Subject17  : completed processing
Subject09  : started processing.....
Subject09  : completed processing
Subject05  : started processing.....
Subject05  : completed processing
Subject04  : started processing.....
Subject04  : completed processing
Subject18  : started processing.....
Subject18  : completed processing
Subject11  : started processing.....
Subject11  : completed processing
Subject08  : started processing.....
Subject08  : completed processing
Subject16  : started processing.....
Subject16  : completed processing
In [ ]:
type(list_of_dfs)
In [6]:
# Concat all the files in a single frame
df_train_main = pd.concat(list_of_dfs, axis=0)
In [7]:
# Rename the columns
# A 'ground truth' column named as Label has been added as our y-variable
df_train_main.columns = ['EMG1',
                'EMG2',
                'EMG3',
                'EMG4',
                'Airborne',
                'ACC upper X',
                'ACC upper Y',
                'ACC upper Z',
                'Goniometer X',
                'ACC lower X',
                'ACC lower Y',
                'ACC loewr Z',
                'Goniometer Y',
                'Gyro upper X',
                'Gyro upper Y',
                'Gyro upper Z',
                'Gyro lower X',
                'Gyro lower Y',
                'Gyro lower Z',
                'Label']
In [1]:
df_train_main
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-61c865e338ff> in <module>
----> 1 df_train_main

NameError: name 'df_train_main' is not defined
In [8]:
label_encoder = LabelEncoder()
df_train_main['Label'] = label_encoder.fit_transform(df_train_main['Label'])
In [9]:
z_scores = stats.zscore(df_train_main)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 1).all(axis=1)
df_train_main = df_train_main[filtered_entries]
In [10]:
x = df_train_main.iloc[:,:-1].values
y = df_train_main.iloc[:,19].values

y = y.reshape(-1,1)

onehotencoder = OneHotEncoder(categories='auto')
y = onehotencoder.fit_transform(y).toarray()
In [ ]:
y.shape
In [11]:
# Splitting the data into train-test set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)

# Feature Scaling 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
In [12]:
# Principal Component Analysis
from sklearn.decomposition import PCA
pca = PCA(n_components = 8) #found this to be the optimal number of columns 
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
In [ ]:
# For the original dataframe
figure(num=None, figsize=(20, 18), dpi=80, facecolor='w', edgecolor='k')
#sns.heatmap(df_train_main.corr(), annot=True, fmt=".2%")
In [ ]:
# after PCA we found 8 columns which can be selected 
figure(num=None, figsize=(8, 5), dpi=80, facecolor='w', edgecolor='k')
#sns.heatmap(pd.DataFrame(x_train).corr(), annot=True)
In [ ]:
pd.DataFrame(x_train).head()
In [ ]:
figure(num=None, figsize=(8, 5), dpi=80, facecolor='w', edgecolor='k')
plt.figure(figsize=(8,8),facecolor='red',edgecolor='blue')
#df['N'].hist(by=df['Letter'], figsize = (16,18))
df_train_main.groupby('Label').hist(figsize = (8,5))
In [ ]:
import warnings 
import numpy as np
import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
from sklearn import neighbors
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# filter warnings
warnings.filterwarnings("ignore")

warnings.filterwarnings("ignore")

knn = KNeighborsClassifier(n_neighbors=3)    
knn.fit(x_train, y_train)
from sklearn import metrics
# predict the response
y_pred = knn.predict(x_test)

#print(metrics.accuracy_score(y, y_pred))



# evaluate and return  accuracy return accuracy_score(y_test, pred)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
In [ ]:
from sklearn import tree
dt = tree.DecisionTreeClassifier(random_state=0)
dt = dt.fit(x_train, y_train)
tree.plot_tree(dt)
In [ ]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor().fit(x_train, y_train)
predicted = dt.predict(x_test)
expected = y_test

plt.scatter(expected, predicted) 

plt.plot([0, 50], [0, 50], '--k') 
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
In [ ]:
cross_val = KNeighborsClassifier()
from sklearn.model_selection import cross_val_score
cross_val_score(cross_val, x_train, y_train, cv=5)
In [13]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(8, 8, 8), max_iter=1000)
mlp.fit(x_train, y_train)
Out[13]:
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(8, 8, 8), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
In [14]:
predictions = mlp.predict(x_test)
In [ ]:
predictions
In [18]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-bb5247431d77> in <module>
      1 from sklearn.metrics import classification_report, confusion_matrix
----> 2 print(confusion_matrix(y_test,predictions))
      3 print(classification_report(y_test,predictions))

~\Anaconda3\lib\site-packages\sklearn\metrics\classification.py in confusion_matrix(y_true, y_pred, labels, sample_weight)
    253     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    254     if y_type not in ("binary", "multiclass"):
--> 255         raise ValueError("%s is not supported" % y_type)
    256 
    257     if labels is None:

ValueError: multilabel-indicator is not supported
In [20]:
from sklearn.metrics import multilabel_confusion_matrix
multilabel_confusion_matrix(y_test, predictions)
Out[20]:
array([[[306938,   9872],
        [ 54645,  12865]],

       [[366861,      0],
        [ 17459,      0]],

       [[359948,    120],
        [ 24230,     22]],

       [[362811,      0],
        [ 21509,      0]],

       [[360276,    293],
        [ 23483,    268]],

       [[365047,      0],
        [ 19273,      0]],

       [[347521,   1525],
        [ 32900,   2374]],

       [[336005,   1689],
        [ 43246,   3380]],

       [[331347,   1326],
        [ 49181,   2466]],

       [[317955,  15353],
        [  6356,  44656]],

       [[358050,    263],
        [ 25621,    386]]], dtype=int64)
In [22]:
clf_report = classification_report(y_test,predictions,output_dict=True)
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)
plt.savefig('MLP.png')
In [21]:
from sklearn.metrics import precision_score
print("Precision score: {}".format(precision_score(y_test,predictions)))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-f291ffa63554> in <module>
      1 from sklearn.metrics import precision_score
----> 2 print("Precision score: {}".format(precision_score(y_test,predictions)))

~\Anaconda3\lib\site-packages\sklearn\metrics\classification.py in precision_score(y_true, y_pred, labels, pos_label, average, sample_weight)
   1567                                                  average=average,
   1568                                                  warn_for=('precision',),
-> 1569                                                  sample_weight=sample_weight)
   1570     return p
   1571 

~\Anaconda3\lib\site-packages\sklearn\metrics\classification.py in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
   1413         raise ValueError("beta should be >0 in the F-beta score")
   1414     labels = _check_set_wise_labels(y_true, y_pred, average, labels,
-> 1415                                     pos_label)
   1416 
   1417     # Calculate tp_sum, pred_sum, true_sum ###

~\Anaconda3\lib\site-packages\sklearn\metrics\classification.py in _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
   1252             raise ValueError("Target is %s but average='binary'. Please "
   1253                              "choose another average setting, one of %r."
-> 1254                              % (y_type, average_options))
   1255     elif pos_label not in (None, 1):
   1256         warnings.warn("Note that pos_label (set to %r) is ignored when "

ValueError: Target is multilabel-indicator but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted', 'samples'].