'''
@@ Author: Sanyukta Suman
Which kind of data set did you choose for your project?
Answer- I have chosen the dataset which contains movie reviews from Rotten Tomatoes.
There are four features in this data set. They are:
1. Date - Date the review is posted.
2. Rating - User rating out of 5 (where 1 is the lowest and 5 is the highest).
3. Helpful - How many users think the review has been helpful.
4. Review - Text review description.
------
------ The objective of this project is multi-class classification: instead of a binary
positive/negative prediction, we will classify phrases on a sentiment scale of 0 to 4,
where 0 is the lowest sentiment (negative) and 4 is the highest sentiment (positive). -------
'''
# Describe the RV-function
'''The RV-function of the dataset is the procedure of giving a rating, a measure of helpfulness, and typing the review text.'''
#Define Data Value Space
'''Data value set(DVS) ={PhraseId, SentenceId, Phrase, Sentiment}'''
#importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import unicodedata, re, string
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.manifold import TSNE
from scipy import stats
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import warnings
warnings.filterwarnings("ignore")
# Read the review data from tab-separated files; only the first 100 rows of
# each split are loaded (nrows=100), so everything below runs on a small sample.
df_train = pd.read_csv("train.tsv", sep = "\t", nrows=100)
df_test = pd.read_csv("test.tsv", sep = "\t", nrows=100)
# Notebook-style inspection cells: in a plain script these bare expressions are
# evaluated and discarded (only Jupyter displays them).
df_test.head()
df_test.shape
df_train.head()
df_train.shape
# Train data description (dtypes, non-null counts, memory usage)
df_train.info()
# First phrase in the training data
df_train ['Phrase'][0]
# Rows belonging to the first sentence (SentenceId == 1): the full review plus
# every sub-phrase extracted from it.
df_train.loc[df_train['SentenceId'] == 1]
'''In the train data, phrases are taken out of the context of the individual movie
reviews and split into smaller parts; each part is assigned a sentiment category.
The data is fairly clean. Before pre-processing the data, let's look at the data
distribution to see whether the classes in the training data are fairly distributed.
'''
# Data distribution analysis: how many phrases fall in each sentiment class
# (0 = most negative ... 4 = most positive).
# FIX: the original first bound senti_count to an empty list that was
# immediately overwritten — dead assignment removed.
senti_count = df_train['Sentiment'].value_counts()
senti_count
# Bar plot of class frequencies.
dist = df_train.groupby(["Sentiment"]).size()
fig, ax = plt.subplots(figsize=(12,8))
# FIX: pass x/y as keywords — positional x/y for seaborn plotting functions
# were deprecated in 0.12 and later removed.
sns.barplot(x=dist.keys(), y=dist.values, ax=ax)
plt.savefig('senti_count')
from scipy.stats import norm, kurtosis
# Kurtosis of the training frame's columns.
# NOTE(review): df_train still contains the text column 'Phrase' here; passing
# the whole frame to scipy.stats.kurtosis only works if it coerces the input —
# confirm, or select the numeric columns explicitly.
kurtosis(df_train)
# Box plot per input variable; PhraseId is dropped since it is just an id.
df_train.drop('PhraseId', axis=1).plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False, figsize=(9,9),
title='Box Plot for each input variable')
#plt.savefig('fruits_box')
plt.show()
'''The Sentiment class seems to follow a normal distribution, with the most frequent class
being 2.'''
'''Data in the form of text needs to be tokenized into a numeric format.
Before converting words into numbers, I will filter out spaces and punctuation and use lemmatization to reduce dimensionality.
Note: At this point, I will not filter out stop words, because movie phrases such as
"this movie is shit" and "this movie is the shit" have totally different meanings; therefore I want to keep this
information, since it will help me analyze the sentiment of the review better.'''
# Peek at the raw phrases before cleaning.
df_train['Phrase'][:10]
import string
string.punctuation
# Strip punctuation in a single C-level pass (str.translate) and lowercase.
df_train.Phrase=df_train.Phrase.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)).lower())
# Tokenize each phrase by splitting on single spaces; note that consecutive
# spaces produce empty-string tokens.
df_train.Phrase=df_train.Phrase.str.split(' ')
df_train['Phrase'][:10]
# Stem every token with the Porter stemmer.
from nltk.stem import PorterStemmer
porter = PorterStemmer()
df_train.Phrase = df_train.Phrase.apply(lambda toks: [porter.stem(w) for w in toks])
# Remove English stopwords.
# FIX: the original comprehension iterated over the Series itself, so each
# `w` was a whole token *list* — a list is never equal to a stopword, so no
# word was ever removed, and the Series was silently replaced by a plain
# Python list (losing the index). Filter per token instead.
# NOTE(review): stopwords are matched *after* Porter stemming, so stemmed
# forms that no longer equal the stopword (e.g. "thi" from "this") slip
# through — confirm whether stopword removal should precede stemming.
stopwords_e = stopwords.words('english')
stopset = set(stopwords_e)  # O(1) membership instead of scanning the list
df_train.Phrase = df_train.Phrase.apply(lambda toks: [w for w in toks if w not in stopset])
df_train.Phrase.head()
# Lemmatize words (WordNet).
lemmar = WordNetLemmatizer()
df_train.Phrase = df_train.Phrase.apply(lambda toks: [lemmar.lemmatize(w) for w in toks])
# Stem once more with the Snowball stemmer.
snow = SnowballStemmer('english')
df_train.Phrase = df_train.Phrase.apply(lambda toks: [snow.stem(w) for w in toks])
df_train['Phrase'][:10]
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
# token_pattern keeps single-character tokens too (the default pattern drops
# one-character words).
vector=TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')
# Re-join the token lists into whitespace-separated strings, since
# TfidfVectorizer expects raw text input.
df_train.Phrase=df_train.Phrase.apply(lambda x: ' '.join(x))
# Learn vocabulary / idf weights, then build the TF-IDF feature matrix.
vector1=vector.fit(df_train.Phrase)
df_train_feature=vector1.transform(df_train.Phrase)
# Dense view of the sparse matrix — display only.
df_train_feature.toarray()
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# One-vs-rest logistic regression over the 5 sentiment classes.
lr = LogisticRegression(multi_class='ovr')
lr = lr.fit(df_train_feature, df_train.Sentiment)
## Coefficient matrix (one row per class)
lr.coef_
# Predictions on the *training* data — an optimistic estimate of performance.
train_predict = lr.predict(df_train_feature)
## the number of data in each class
df_train.Sentiment.value_counts().sort_index()
## number of data in predict result
np.unique(train_predict, return_counts=True)
## Plot predict result
plt.figure(figsize=(5,5))
ax = plt.axes()
ax.set_title('Number of sentiment class in logistic regression')
# Keyword arg: positional data was deprecated/removed in recent seaborn.
sns.countplot(x=train_predict)
# FIX: classification_report expects (y_true, y_pred); the original passed
# them swapped, which silently exchanges per-class precision and recall.
print(classification_report(df_train.Sentiment, train_predict))
from sklearn.tree import DecisionTreeClassifier
# Decision tree on the same TF-IDF features (trained and evaluated on train).
ds = DecisionTreeClassifier()
ds.fit(df_train_feature, df_train.Sentiment)
print(ds.feature_importances_)
ds_train_pred = ds.predict(df_train_feature)
# True class counts for comparison with the predicted counts below.
df_train.Sentiment.value_counts().sort_index()
## Number of predict class
np.unique(ds_train_pred, return_counts=True)
plt.figure(figsize=(10,5))
ax = plt.axes()
ax.set_title('Number of sentiment class in decision tree')
# Keyword arg: positional data was deprecated/removed in recent seaborn.
sns.countplot(x=ds_train_pred)
# FIX: classification_report expects (y_true, y_pred); the original passed
# them swapped, which silently exchanges per-class precision and recall.
print(classification_report(df_train.Sentiment, ds_train_pred))
from sklearn.ensemble import RandomForestClassifier
# Random forest on the same TF-IDF features (trained and evaluated on train).
rf = RandomForestClassifier()
rf.fit(df_train_feature, df_train.Sentiment)
print(rf.feature_importances_)
rf_train_pred = rf.predict(df_train_feature)
plt.figure(figsize=(10,5))
ax = plt.axes()
ax.set_title('Random forest')
# Keyword arg: positional data was deprecated/removed in recent seaborn.
sns.countplot(x=rf_train_pred)
# FIX: classification_report expects (y_true, y_pred); the original passed
# them swapped, which silently exchanges per-class precision and recall.
print(classification_report(df_train.Sentiment, rf_train_pred))
## Import every package needed by the reusable pipeline below (re-imported
## here so this section can run as a standalone notebook cell).
from scipy import stats
import string
from nltk.corpus import stopwords
stopwords_e = stopwords.words('english')
from nltk.stem import SnowballStemmer
snow = SnowballStemmer('english')
from sklearn import svm
from sklearn.linear_model import LogisticRegression
# FIX: RandomForestClassifier was imported twice; duplicate removed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
# Shared vectorizer: fitted inside EstimatorSelection.fit, reused in .predict.
vector = TfidfVectorizer(stop_words='english')
## Preprocess function
def data_preprocess(text):
    """Clean one raw phrase for vectorization.

    Lowercases, strips punctuation, drops English stopwords, Snowball-stems
    each remaining token, and re-joins the tokens with single spaces.

    Relies on the module-level ``stopwords_e`` (list of English stopwords)
    and ``snow`` (SnowballStemmer) being initialised.
    """
    # str.translate performs the punctuation strip in one C-level pass instead
    # of the original per-character Python loop with an O(k) membership scan
    # over string.punctuation — identical result, much faster on long text.
    text_nonpunc = text.lower().translate(str.maketrans('', '', string.punctuation))
    # split(' ') (not split()) keeps empty tokens from runs of spaces, exactly
    # like the original; '' is not a stopword, so it survives and stems to ''.
    text_rmstop = [tok for tok in text_nonpunc.split(' ') if tok not in stopwords_e]
    text_stem = [snow.stem(tok) for tok in text_rmstop]
    return ' '.join(text_stem)
## Notice: Class name and the first def should have a blank line
class EstimatorSelection:
    """Fit several sklearn estimators on the same TF-IDF features and
    compare their training-set classification reports.

    Relies on the module-level ``vector`` (TfidfVectorizer) and
    ``data_preprocess`` helper.
    """

    def __init__(self, models):
        self.models = models
        self.keys = models.keys()
        self.results = {}       # model name -> classification_report dict
        self.modelfit = {}      # model name -> fitted estimator
        self.modelpredict = {}  # kept for backward compatibility (unset here)

    def fit(self, x, y):
        """Preprocess ``x``, fit the shared vectorizer on it, then fit every
        model and record its training-set classification report."""
        cleaned = x.apply(data_preprocess)
        features = vector.fit_transform(cleaned)
        for name in self.keys:
            estimator = self.models[name]
            self.modelfit[name] = estimator.fit(features, y)
            predictions = estimator.predict(features)
            self.results[name] = classification_report(y, predictions, output_dict=True)

    def predict(self, test_x):
        """Return a DataFrame with one column of predictions per model."""
        cleaned = test_x.apply(data_preprocess)
        features = vector.transform(cleaned)
        columns = [
            pd.DataFrame(self.modelfit[name].predict(features), columns=[name])
            for name in self.keys
        ]
        return pd.concat(columns, axis=1)

    def summary(self):
        """Macro-average precision/recall/f1 (one row) per fitted model."""
        rows = [
            pd.DataFrame(self.results[name]['macro avg'], index=[name])
            for name in self.keys
        ]
        return pd.concat(rows).iloc[:, :3]
## Models to compare on the test data
models = {
    'LogisticClassifier': LogisticRegression(multi_class='ovr'),
    'RandomforestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier()
}
model_compare = EstimatorSelection(models)
model_compare.fit(df_train.Phrase, df_train.Sentiment)
# Macro-average precision/recall/f1 per model (on the training data).
summary = model_compare.summary()
summary
# Per-model predictions on the (unlabelled) test phrases.
predict_result = model_compare.predict(df_test.Phrase)
predict_result
# Reshape wide -> long for plotting: one row per (case, model) pair.
predict_result1 = predict_result.reset_index().rename(columns={'index': 'case'})
# FIX: the original ran this melt twice in a row, and then repeated the
# reset_index/melt pair once more after the plot — all duplicates were dead
# code and have been removed.
predict_result2 = pd.melt(predict_result1, id_vars='case',
                          value_vars=['LogisticClassifier', 'RandomforestClassifier', 'DecisionTreeClassifier'])
predict_result2
# Count predictions per (model, predicted class).
predict_result3 = predict_result2.groupby(['variable', 'value']).size().reset_index().rename(columns={0: 'count'})
predict_result3
plt.figure(figsize=(10,5))
ax = plt.axes()
ax.set_title('Number of class for each methods')
sns.barplot(x='value', y='count', hue='variable', data=predict_result3)