!pip install gensim tensorflow wordcloud
!pip install -q tensorflow-hub
!pip install xgboost
!pip install keras
!pip install nltk
# 'string' is part of the Python standard library; no installation is needed
!pip install tqdm
import gzip
import gensim
import os
import sys
import json
import shutil
import time
import re
import tarfile
import zipfile
import numpy as np
import pandas as pd
import collections
import math
import random
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec, FastText
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
import tensorflow as tf
import tensorflow_hub as hub
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,Dense,Flatten,GlobalMaxPooling1D,LSTM,Dropout, Activation,Bidirectional
from keras.layers.convolutional import Conv1D,MaxPooling1D
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, precision_recall_curve
from sklearn import utils
"""
We will ignore FutureWarning and DeprecationWarning
"""
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
"""
We will ignore warnings
"""
warnings.filterwarnings("ignore")
if not sys.warnoptions:
warnings.simplefilter("ignore")
program_start_time=time.time()
# Locations of the raw IMDB archive, the extraction folder and the
# derived JSON cache file.
data_source_url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
datasets = '/datasets/'
data = 'data_'
MOVIE = 'movie'
data_file_path = os.getcwd() + datasets + 'aclImdb_v1.tar.gz'
data_folder = os.getcwd() + datasets + data + MOVIE
data_json_file = data + MOVIE + '.json'
def check_if_file_exists(file):
    '''
    Return True when `file` exists and can be opened for reading,
    otherwise print a hint and return False.

    Bug fix: the original opened the file and never closed it, leaking a
    file handle on every successful check; `with` closes it deterministically.
    '''
    try:
        with open(file, 'r'):
            return True
    except FileNotFoundError:
        print('Please make sure file: ' + file + ' is present before continuing')
        return False
def check_if_dir_exists(directory):
    '''
    Return True when `directory` names an existing directory.
    '''
    return os.path.isdir(directory)
def store_json(write_this_data):
    '''
    Serialize `write_this_data` as JSON into the module-level
    `data_json_file` (used as a cache on first processing).

    Bug fix: the original `open(...).write(...)` never closed the file
    handle; the `with` block guarantees the data is flushed and closed.
    '''
    with open(data_json_file, 'w') as json_out:
        json_out.write(json.dumps(write_this_data))
def return_data_json():
    '''
    Read the module-level `data_json_file` cache and return the parsed
    JSON object.
    '''
    with open(data_json_file, encoding='utf-8') as data_file:
        return json.load(data_file)
# Download data source
# Download the archive once, then unpack it once.
if not check_if_file_exists(data_file_path):
    print('Start of data download')
    # Bug fix: the original called wget.download() but `wget` was never
    # imported; use the standard library downloader instead.
    import urllib.request
    urllib.request.urlretrieve(data_source_url, data_file_path)
    print('Download complete')
else:
    print('Data file already exists. Not downloading again!')
if not check_if_dir_exists(data_folder):
    with tarfile.open(data_file_path) as tar:
        tar.extractall(path=data_folder)
else:
    print('Data folder exists. Won\'t copy again!')
data = {}
all_reviews = []


def _collect_reviews(dirpath, rating):
    '''Append a {"review", "rating"} dict for every file under `dirpath`.'''
    for root, dirs, files in os.walk(dirpath):
        for name in files:
            full = os.path.join(root, name)
            with open(full, 'r') as f:
                # NOTE(review): no explicit encoding — relies on the platform
                # default; confirm UTF-8 on the target system.
                review_text = f.read()
            all_reviews.append({"review": review_text, "rating": rating})


if not check_if_file_exists(data_json_file):
    # Bug fix: the original loops assigned the file contents to a local
    # named `string`, shadowing the imported `string` module that
    # review_preprocess() relies on later (string.ascii_lowercase).
    # The four duplicated walk-loops are collapsed into one helper.
    # Positive reviews are labelled 1, negative reviews 0.
    _collect_reviews(data_folder + "/aclImdb/train/pos/", 1)
    _collect_reviews(data_folder + "/aclImdb/train/neg/", 0)
    _collect_reviews(data_folder + "/aclImdb/test/pos/", 1)
    _collect_reviews(data_folder + "/aclImdb/test/neg/", 0)
    data[MOVIE] = all_reviews
    store_json(data)
pre_loaded_data = return_data_json()
len(pre_loaded_data[MOVIE])
# Creating review dataframe and data clean up
# Build the dataframe and clean it: drop NaNs, then drop duplicate rows.
movie_df = pd.DataFrame(pre_loaded_data[MOVIE])
print('Before Cleanup : Shape of the Data Frame : {}'.format(movie_df.shape))
print('Remove missing values.')
movie_df.dropna(inplace=True)
movie_df.reset_index(drop=True, inplace=True)
print('Drop rows with duplicate data.')
# Bug fix: drop_duplicates() returned a new frame that was discarded, so
# duplicates were never removed; apply it in place and re-index so the
# frame keeps a contiguous 0..n-1 index as downstream code expects.
movie_df.drop_duplicates(inplace=True)
movie_df.reset_index(drop=True, inplace=True)
print('After Cleanup : Shape of the Data Frame : {}'.format(movie_df.shape))
print('Counting null data per column.')
movie_df.isnull().sum()
# Let us look at the data types of columns
# Column dtypes of the review dataframe.
movie_df.dtypes
"""
Ratings
"""
# Distinct rating labels (binary sentiment).
movie_df.rating.unique()
# Let us explore the data a bit using head(), tail(), info(), describe()
# Quick exploration of the dataframe contents and summary statistics.
movie_df.head()
movie_df.tail()
movie_df.info()
movie_df.describe()
movie_df.describe(include='object')
movie_df.describe(include='all')
Creating a new column called "review_length" which is the length of the review column.
# Character count of each review, used in the length-distribution plots below.
movie_df['review_length'] = movie_df['review'].apply(len)
movie_df.head()
# Using FacetGrid from the seaborn library to create a grid of two histograms of review_length based off of the ratings
# Histogram of review lengths, one panel per rating.
sns.set_style('darkgrid')
# Fix: seaborn renamed FacetGrid's `size` parameter to `height`
# (deprecated in 0.9, removed later); `size=5` fails on current seaborn.
g = sns.FacetGrid(movie_df, col='rating', height=5)
g.map(plt.hist, 'review_length', bins=50)
plt.show()
# Let's try to explain why the x-axis goes all the way to 14000ish; this must mean there is some really long review!
movie_df.review_length.describe()
# Creating a boxplot of review_length for each rating category.
# Boxplot of review length per rating category.
plt.figure(figsize=(10,8))
sns.boxplot(x='rating',y='review_length',data=movie_df,palette='rainbow')
plt.title("Boxplot of review length for each rating category.",fontsize=16)
plt.xlabel("Rating",fontsize=14)
plt.ylabel("Review Length",fontsize=14)
plt.show()
# Creating a countplot of the number of occurrences for each type of rating.
# Class balance: number of reviews per rating.
plt.figure(figsize=(10,8))
sns.countplot(x='rating',data=movie_df,palette='winter')
plt.title("Number of occurrences for each type of rating",fontsize=16)
plt.xlabel("Rating",fontsize=14)
plt.ylabel("Number of occurrences",fontsize=14)
plt.show()
# Pre-processing of review text
def review_preprocess(review):
    """
    Clean one raw review string and return it as a single space-joined
    string of lowercase, stopword-free words.

    Steps:
    1. Remove HTML tags from the review
    2. Remove URLs (both http... and www... forms)
    3. Lowercase the entire review
    4. Tokenize into words
    5. Strip every character that is not an ASCII lowercase letter
    6. Drop tokens that became empty
    7. Drop English stopwords
    8. Join the remaining words back into one sentence
    """
    en_stops = set(stopwords.words('english'))
    # Remove HTML tags.
    clean = re.compile('<.*?>')
    review_without_tag = re.sub(clean, '', review)
    # Remove URLs.  Bug fix: the second substitution previously ran on
    # `review_without_tag` again, silently discarding the http-URL removal.
    review_without_tag_and_url = re.sub(r"http\S+", "", review_without_tag)
    review_without_tag_and_url = re.sub(r"www\S+", "", review_without_tag_and_url)
    # Make the entire string lowercase.
    review_lowercase = review_without_tag_and_url.lower()
    # Split the string into word tokens.
    list_of_words = word_tokenize(review_lowercase)
    # Keep only ASCII lowercase letters inside each token (removes
    # punctuation, digits and accented characters).
    list_of_words_without_punctuation = [
        ''.join(this_char for this_char in this_string
                if this_char in string.ascii_lowercase)
        for this_string in list_of_words
    ]
    # Remove tokens that became empty strings.
    list_of_words_without_punctuation = list(filter(None, list_of_words_without_punctuation))
    # Remove stopwords.
    filtered_word_list = [w for w in list_of_words_without_punctuation if w not in en_stops]
    # Re-join the cleaned words into one sentence.
    return ' '.join(filtered_word_list)
"""
Here is the original reviews:
"""
movie_df['review'].tail()
# Applying pre-processing to reviews
# Clean every review in place and report how long it took.
start_time=time.time()
movie_df['review']=movie_df['review'].apply(review_preprocess)
print('Elapsed time for review preprocessing : ',((time.time()-start_time)/60),' in minutes')
"""
Here is the reviews after preprocessing :
"""
movie_df['review'].tail()
# Concatenate every cleaned review into one big text blob.
reviews = movie_df['review'].str.cat(sep=' ')
"""
function to split review into word
"""
tokens = word_tokenize(reviews)
vocabulary = set(tokens)
print('Number of vocabulary : {}'.format(len(vocabulary)))
# Word frequencies over the whole corpus; show the 50 most common.
frequency_distribution = nltk.FreqDist(tokens)
sorted(frequency_distribution,key=frequency_distribution.__getitem__, reverse=True)[0:50]
# Word cloud weighted by corpus frequency.
wordcloud = WordCloud().generate_from_frequencies(frequency_distribution)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
def split_train_test(x, y):
    """
    Split into 80% train / 10% validation / 10% test with a fixed seed,
    and also return every review (train + validation + test) concatenated
    for embedding training.
    """
    SEED = 2000
    x_train, x_rest, y_train, y_rest = train_test_split(
        x, y, test_size=.2, random_state=SEED)
    x_validation, x_test, y_validation, y_test = train_test_split(
        x_rest, y_rest, test_size=.5, random_state=SEED)
    combined = pd.concat([x_train, x_validation, x_test])
    return x_train, y_train, x_test, y_test, x_validation, y_validation, combined
# Features are the cleaned reviews; the target is the binary rating.
X = movie_df.review
y = movie_df.rating
x_train, y_train, x_test, y_test, x_validation, y_validation, all_reviews = split_train_test(X, y)
# Reload previously processed model if exists
# Reuse a previously trained Word2Vec model when one is on disk;
# otherwise tokenize every review and train a fresh model.
list_of_tokenized_reviews = []
skip_modeling = False
filename_to_save_model = MOVIE + ".model"
if check_if_file_exists(filename_to_save_model):
    skip_modeling = True
if not skip_modeling:
    for one_sentence in all_reviews:
        list_of_tokenized_reviews.append(gensim.utils.simple_preprocess(one_sentence))
    # 150-dim vectors, 10-word context window, words seen < 2 times dropped.
    model = Word2Vec(list_of_tokenized_reviews, size=150, window=10, min_count=2, workers=10)
    model.save(filename_to_save_model)
    model = Word2Vec.load(filename_to_save_model)
else:
    model = Word2Vec.load(filename_to_save_model)
"""
look up top 10 words similar to the word 'terrible'.
"""
w1 = "terrible"
model.wv.most_similar(positive=w1)
"""
look up top 10 words similar to 'excellent'
"""
w1 = ["excellent"]
model.wv.most_similar (positive=w1)
"""
look up top 3 words similar to 'movie'
"""
w1 = ["movie"]
model.wv.most_similar (positive=w1,topn=3)
"""
look up top 5 words similar to 'worst'
"""
w1 = ["worst"]
model.wv.most_similar (positive=w1,topn=5)
"""
similarity between two different words
"""
model.wv.similarity(w1="great",w2="worse")
"""
similarity between two identical words
"""
model.wv.similarity(w1="outstanding",w2="outstanding")
"""
similarity between two related words
"""
model.wv.similarity(w1="excellent",w2="outstanding")
"""
Which one is the odd one out in this list?
"""
model.wv.doesnt_match(["best","great","good","disapointed"])
"""
Which one is the odd one out in this list?
"""
model.wv.doesnt_match(["movie","film","show","book"])
def word_vectors_plot(model, input_word, word_list):
    """
    Seaborn plot results of query word and most similar words, alongwith other words in corpus
    """
    # One 150-dim row per plotted word; colors: query=blue, its 8
    # nearest neighbours=green, the extra contrast words=red.
    word_arrays = np.empty((0, 150), dtype='f')
    word_tags = [input_word]
    color_list = ['blue']
    """
    Creating Vector of query word
    """
    word_arrays = np.append(word_arrays, model.wv.__getitem__([input_word]), axis=0)
    """
    Find similar words
    """
    similar_words = model.wv.most_similar([input_word],topn=8)
    """
    Insert word vector for similar words into array
    """
    for word_score in similar_words:
        word_vector = model.wv.__getitem__([word_score[0]])
        word_tags.append(word_score[0])
        color_list.append('green')
        word_arrays = np.append(word_arrays, word_vector, axis=0)
    """
    Insert word vectors for other words into array
    """
    for word in word_list:
        word_vector = model.wv.__getitem__([word])
        word_tags.append(word)
        color_list.append('red')
        word_arrays = np.append(word_arrays, word_vector, axis=0)
    """
    Dimensionality from 150 to 17 dimensions with PCA
    """
    # PCA first to denoise/speed up t-SNE on the small word set.
    reduce = PCA(n_components=17).fit_transform(word_arrays)
    """
    Finds t-SNE coordinates for 2 dimensions
    """
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduce)
    """
    Sets everything up to plot
    """
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_tags,
                       'color': color_list})
    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)
    """
    Original plot
    """
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                  }
                     )
    """
    Annotating word in plots
    """
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line],
                weight='normal'
                ).set_size(15)
    # Pad the axis limits so the text annotations stay inside the figure.
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)
    plt.title('t-SNE viz for input: {}'.format(input_word.title()),fontsize=16)
# t-SNE plots: the query word, its 8 nearest neighbours (green) and a
# set of contrast words (red).  For the last three calls the contrast
# words are neighbours ranked 9-16 of the same query word.
word_vectors_plot(model, 'action', ['good', 'performance', 'cool', 'life', 'issue', 'bore', 'buy', 'love'])
word_vectors_plot(model, "excellent", [t[0] for t in model.wv.most_similar(positive=["excellent"],
topn=16)][8:])
word_vectors_plot(model, "comedy", [t[0] for t in model.wv.most_similar(positive=["comedy"],
topn=16)][8:])
word_vectors_plot(model, "story", [t[0] for t in model.wv.most_similar(positive=["story"],
topn=16)][8:])
# Word cloud of the 20 words closest to "outstanding" (positive pole).
pos_lst=[t[0] for t in model.wv.most_similar(positive=["outstanding"],topn=20)]
pos_wrd=' '.join(pos_lst)
print(pos_wrd)
wordcloud = WordCloud(background_color='white',
                      max_words=200000,
                      max_font_size=400).generate(pos_wrd)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# Word cloud of the 20 words closest to "awful" (negative pole).
neg_lst=[n[0] for n in model.wv.most_similar(positive=["awful"],topn=20)]
neg_wrd=' '.join(neg_lst)
print(neg_wrd)
wordcloud = WordCloud(background_color='black',
                      max_words=100,
                      max_font_size=50).generate(neg_wrd)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
"""
Function to generate feature vectors
"""
def generate_feature_vectors(doc, model):
    """
    Return a (1, 150) row vector that is the mean of the Word2Vec
    vectors of every in-vocabulary word of `doc`.

    Bug fix: when no word of `doc` is in the model vocabulary the
    original divided by zero; now the zero vector is returned instead.
    """
    vec = np.zeros(150).reshape((1, 150))
    count = 0
    for word in gensim.utils.simple_preprocess(doc):
        if model.__contains__(word.strip()):
            count = count + 1
            vec += model[word.strip()]
    if count:
        vec = vec / count
    return vec
def generate_features(model, data):
    """Stack one averaged word-vector row per document of `data`."""
    rows = [generate_feature_vectors(doc, model) for doc in data]
    return np.concatenate(rows)
"""
Generating train, test and validation vectors
"""
training_vectors = generate_features(model, x_train)
test_vectors = generate_features(model, x_test)
validation_vectors = generate_features(model, x_validation)
lr = LogisticRegression()
lr.fit(training_vectors, y_train)
print("***** Word2Vec Word Embedding Based Sentiment Analysis using LogisticRegression *******\n")
print("LogisticRegression Performance : \n")
print('Train-Set Score : {:.4f}'.format(lr.score(training_vectors, y_train)))
print('Train-Set Accuracy : {:.4f}'.format(accuracy_score(y_train,lr.predict(training_vectors))))
print("\nEvaluation on Validation-Set : ")
pred_val = lr.predict(validation_vectors)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
print('Validation-Set Score : {:.4f}'.format(lr.score(validation_vectors, y_validation)))
print('Validation-Set Accuracy:{:.4f}'.format(accuracy_score(y_validation, pred_val)))
print("\nEvaluation on Test-Set : ")
pred = lr.predict(test_vectors)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
print('Test-Set Score : {:.4f}'.format(lr.score(test_vectors, y_test)))
print('Test-Set Accuracy:{:.4f}'.format(accuracy_score(y_test, pred)))
svm = SVC(kernel='linear')
svm.fit(training_vectors, y_train)
print("***** Word2Vec Word Embedding Based Sentiment Analysis using SVC *******\n")
print("SVC with linear kernel Performance : \n")
print('Train-Set Score : {:.4f}'.format(svm.score(training_vectors, y_train)))
print('Train-Set Accuracy : {:.4f}'.format(accuracy_score(y_train,svm.predict(training_vectors))))
print("\nEvaluation on Validation-Set : ")
pred_val = svm.predict(validation_vectors)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
print('Validation-Set Score : {:.4f}'.format(svm.score(validation_vectors, y_validation)))
print('Validation-Set Accuracy:{:.4f}'.format(accuracy_score(y_validation, pred_val)))
print("\nEvaluation on Test-Set : ")
pred = svm.predict(test_vectors)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
print('Test-Set Score : {:.4f}'.format(svm.score(test_vectors, y_test)))
print("Test-Set Accuracy: {:.4f}".format(accuracy_score(y_test, pred)))
xgb = XGBClassifier()
xgb.fit(training_vectors, y_train)
print("***** Word2Vec Word Embedding Based Sentiment Analysis using XGBClassifier *******\n")
print("XGBClassifier Performance : \n")
print('Train-Set Score : {:.4f}'.format(xgb.score(training_vectors, y_train)))
print('Train-Set Accuracy : {:.4f}'.format(accuracy_score(y_train,xgb.predict(training_vectors))))
print("\nEvaluation on Validation-Set : ")
pred_val = xgb.predict(validation_vectors)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
print('Validation-Set Score : {:.4f}'.format(xgb.score(validation_vectors, y_validation)))
print('Validation-Set Accuracy:{:.4f}'.format(accuracy_score(y_validation, pred_val)))
print("\nEvaluation on Test-Set : ")
pred = xgb.predict(test_vectors)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
print("Test-Set Score : {:.4f}".format(xgb.score(test_vectors, y_test)))
print("Test-Set Accuracy: {:.4f}".format(accuracy_score(y_test, pred)))
"""
Create the tokenizer
"""
number_of_words=len(vocabulary)
tokenizer = Tokenizer(num_words=number_of_words)
"""
Fit the tokenizer
"""
tokenizer.fit_on_texts(x_train)
"""
Sequence encode
"""
X_token_train = tokenizer.texts_to_sequences(x_train)
X_token_test = tokenizer.texts_to_sequences(x_test)
X_token_validation = tokenizer.texts_to_sequences(x_validation)
"""
Adding 1 because of reserved 0 index
"""
vocabulary_size = len(tokenizer.word_index) + 1
print("x_train[2] : ",x_train[2])
print("\n X_token_train[2] : ",X_token_train[2])
print("\n vocab_size : ",vocabulary_size)
"""
Checking the index of each word by looking at the word_index dictionary of the Tokenizer object
"""
for word in ['famous','cartoon','studios', 'love', 'baby']:
print('{} : {}'.format(word, tokenizer.word_index[word]))
"""
Pad sequences
"""
max_length = 1500
X_token_train = pad_sequences(X_token_train, padding='post', maxlen=max_length)
X_token_test = pad_sequences(X_token_test, padding='post', maxlen=max_length)
X_token_validation = pad_sequences(X_token_validation, padding='post', maxlen=max_length)
%%time
"""
Create model
"""
# CNN with a trainable embedding learned from scratch.
embedding_dimension = 100
keras_cnn_model = Sequential()
keras_cnn_model.add(Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_dimension,
                              input_length=max_length))
keras_cnn_model.add(Conv1D(128, 5, activation='relu'))
keras_cnn_model.add(GlobalMaxPooling1D())
keras_cnn_model.add(Dense(10, activation='relu'))
keras_cnn_model.add(Dense(1, activation='sigmoid'))
"""
Compile network
"""
keras_cnn_model.compile(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])
keras_cnn_model.summary()
%%time
"""
Fit network
"""
keras_cnn_model.fit(X_token_train, y_train,
                    epochs=5,
                    verbose=False,
                    validation_data=(X_token_validation, y_validation),
                    batch_size=10)
"""
Evaluate
"""
print("\n **** Sentiment Analysis Using Keras Convolutional Neural Networks(CNN) ****\n")
loss, accuracy = keras_cnn_model.evaluate(X_token_train, y_train, verbose=False)
print("Train-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Validation-Set : ")
# NOTE(review): predict_classes() was removed in newer TF/Keras —
# this code requires the older Keras API; confirm the pinned version.
pred_val=keras_cnn_model.predict_classes(X_token_validation)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
loss, accuracy = keras_cnn_model.evaluate(X_token_validation, y_validation, verbose=False)
print("Validation-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Test-Set : ")
pred=keras_cnn_model.predict_classes(X_token_test)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
loss, accuracy = keras_cnn_model.evaluate(X_token_test, y_test, verbose=False)
print("Test-Set Accuracy: {:.4f}".format(accuracy))
"""
Vocabulary size
"""
num_of_words = list(model.wv.vocab)
print('Vocabulary size : %d' % len(num_of_words))
%%time
"""
Save model in ASCII
"""
file_name = 'movie_embedding_word2vec.txt'
model.wv.save_word2vec_format(file_name, binary=False)
"""
Load word embedding
"""
def load_word_embedding(file_name):
    """
    Parse a text-format word2vec file and return {word: float32 vector}.

    The first line of the file is the "<vocab_size> <dim>" header and is
    skipped.  Bug fix: the original opened the file without ever closing
    it; a `with` block releases the handle deterministically.
    """
    word_embedding = dict()
    with open(file_name, 'r') as file:
        # Skip the header line; each remaining line is "word v1 v2 ...".
        lines = file.readlines()[1:]
    for line in lines:
        line_parts = line.split()
        word_embedding[line_parts[0]] = np.asarray(line_parts[1:], dtype='float32')
    return word_embedding
"""
Create a weight matrix for the Embedding layer
"""
def get_embedding_weight_matrix(wrd_embedding, vocabulary):
    """
    Build the (len(vocabulary)+1, 150) weight matrix for a Keras
    Embedding layer: row `i` holds the pre-trained vector of the word
    that the tokenizer mapped to index `i` (row 0 stays zero for the
    reserved padding index, as do rows of out-of-embedding words).
    """
    # All-zero matrix; unknown words simply keep their zero row.
    weight_matrix = np.zeros((len(vocabulary) + 1, 150))
    for token, row_index in vocabulary.items():
        pretrained = wrd_embedding.get(token)
        if pretrained is None:
            continue
        weight_matrix[row_index] = pretrained
    return weight_matrix
"""
Load embedding from file
"""
raw_w2v_embedding = load_word_embedding('movie_embedding_word2vec.txt')
print('Completed creation of raw word2vec word embedding')
"""
Get weight vectors in the right order
"""
embedding_weight_vectors = get_embedding_weight_matrix(raw_w2v_embedding, tokenizer.word_index)
print('Completed creation of embedding weight vectors')
"""
Create the embedding layer
"""
embedding_layer = Embedding(vocabulary_size,
150,
weights=[embedding_weight_vectors],
input_length=max_length,
trainable=False)
print('Completed creation of embedding layer')
"""
Create model
"""
keras_cnn_w2v_model = Sequential()
keras_cnn_w2v_model.add(embedding_layer)
keras_cnn_w2v_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
keras_cnn_w2v_model.add(MaxPooling1D(pool_size=2))
keras_cnn_w2v_model.add(Flatten())
keras_cnn_w2v_model.add(Dense(1, activation='sigmoid'))
keras_cnn_w2v_model.summary()
"""
Compile network
"""
keras_cnn_w2v_model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
"""
Fit network
"""
keras_cnn_w2v_model.fit(X_token_train, y_train,
epochs=5,
verbose=False,
validation_data=(X_token_validation, y_validation),
batch_size=10)
"""
Evaluate
"""
print("\n **** Sentiment Analysis Using Pre-trained Word2Vec Word Embedding To Keras CNN ****\n")
loss, accuracy = keras_cnn_w2v_model.evaluate(X_token_train, y_train, verbose=False)
print("Train-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Validation-Set : ")
pred_val=keras_cnn_w2v_model.predict_classes(X_token_validation)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
loss, accuracy = keras_cnn_w2v_model.evaluate(X_token_validation, y_validation, verbose=False)
print("Validation-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Test-Set : ")
pred=keras_cnn_w2v_model.predict_classes(X_token_test)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
loss, accuracy = keras_cnn_w2v_model.evaluate(X_token_test, y_test, verbose=False)
print("Test-Set Accuracy: {:.4f}".format(accuracy))
%%time
"""
Create model
"""
# CNN feature extractor followed by a bidirectional LSTM, on the frozen
# pre-trained Word2Vec embedding.
keras_cnn_bidir_lstm_w2v_model = Sequential()
keras_cnn_bidir_lstm_w2v_model.add(Embedding(vocabulary_size,
                                             150,
                                             weights=[embedding_weight_vectors],
                                             input_length=max_length,
                                             trainable=False))
keras_cnn_bidir_lstm_w2v_model.add(Conv1D(128, 5, activation='relu'))
keras_cnn_bidir_lstm_w2v_model.add(MaxPooling1D(pool_size=2))
keras_cnn_bidir_lstm_w2v_model.add(Bidirectional(LSTM(64)))
keras_cnn_bidir_lstm_w2v_model.add(Dropout(0.5))
keras_cnn_bidir_lstm_w2v_model.add(Dense(1, activation='sigmoid'))
"""
Compile network
"""
keras_cnn_bidir_lstm_w2v_model.compile(loss='binary_crossentropy',
                                       optimizer='adam',
                                       metrics=['accuracy'])
keras_cnn_bidir_lstm_w2v_model.summary()
%%time
"""
Fit train data
"""
keras_cnn_bidir_lstm_w2v_model.fit(X_token_train, y_train,
                                   epochs=5,
                                   verbose=False,
                                   validation_data=(X_token_validation, y_validation),
                                   batch_size=10)
"""
Evaluate
"""
print("**** Sentiment Analysis Using Pre-trained Word2Vec Word Embedding To Keras CNN And Bidirectional LSTM ****\n")
loss, accuracy = keras_cnn_bidir_lstm_w2v_model.evaluate(X_token_train, y_train, verbose=False)
print("Train-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Validation-Set : ")
# NOTE(review): predict_classes() requires the older Keras API.
pred_val=keras_cnn_bidir_lstm_w2v_model.predict_classes(X_token_validation)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
loss, accuracy = keras_cnn_bidir_lstm_w2v_model.evaluate(X_token_validation, y_validation, verbose=False)
print("Validation-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Test-Set : ")
pred=keras_cnn_bidir_lstm_w2v_model.predict_classes(X_token_test)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
loss, accuracy = keras_cnn_bidir_lstm_w2v_model.evaluate(X_token_test, y_test, verbose=False)
print("Test-Set Accuracy: {:.4f}".format(accuracy))
"""
Create model
"""
keras_bidir_lstm_w2v_model = Sequential()
keras_bidir_lstm_w2v_model.add(Embedding(vocabulary_size,
150,
weights=[embedding_weight_vectors],
input_length=max_length,
trainable=False))
keras_bidir_lstm_w2v_model.add(Bidirectional(LSTM(64)))
keras_bidir_lstm_w2v_model.add(Dropout(0.5))
keras_bidir_lstm_w2v_model.add(Dense(1, activation='sigmoid'))
"""
Compile network
"""
keras_bidir_lstm_w2v_model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
keras_bidir_lstm_w2v_model.summary()
%%time
"""
Fit train data
"""
keras_bidir_lstm_w2v_model.fit(X_token_train, y_train,
epochs=5,
verbose=False,
validation_data=(X_token_validation, y_validation),
batch_size=10)
"""
Evaluate
"""
print("**** Sentiment Analysis Using Pre-trained Word2Vec Word Embedding To Keras Bidirectional LSTM ****\n")
loss, accuracy = keras_bidir_lstm_w2v_model.evaluate(X_token_train, y_train, verbose=False)
print("Train-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Validation-Set : ")
pred_val=keras_bidir_lstm_w2v_model.predict_classes(X_token_validation)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
loss, accuracy = keras_bidir_lstm_w2v_model.evaluate(X_token_validation, y_validation, verbose=False)
print("Validation-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Test-Set : ")
pred=keras_bidir_lstm_w2v_model.predict_classes(X_token_test)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
loss, accuracy = keras_bidir_lstm_w2v_model.evaluate(X_token_test, y_test, verbose=False)
print("Test-Set Accuracy: {:.4f}".format(accuracy))
# Below we will see the usage of the gensim python library with comparisons of each of these models and their combinations.
"""
Function to labelize the reviews
"""
def labelize_review(reviews, label):
    """
    Wrap every review in a gensim LabeledSentence whose tag is
    '<label>_<index>', so Doc2Vec can address each document by its
    original dataframe index.
    """
    labelized_review = []
    for indx, rvw in zip(reviews.index, reviews):
        tag = label + '_%s' % indx
        labelized_review.append(LabeledSentence(rvw.split(), [tag]))
    return labelized_review
"""
labelize the reviews
"""
all_reviews_d2v = labelize_review(all_reviews, 'all')
%%time
"""
Create doc2vec DBOW model
"""
dbow_model = Doc2Vec(dm=0,
size=150,
negative=5,
min_count=2,
workers=10,
alpha=0.065,
min_alpha=0.065)
dbow_model.build_vocab([review for review in tqdm(all_reviews_d2v)])
%%time
"""
Train the model
"""
for epoch in range(3):
dbow_model.train(utils.shuffle([review for review in tqdm(all_reviews_d2v)]),
total_examples=len(all_reviews_d2v),
epochs=1)
dbow_model.alpha -= 0.002
dbow_model.min_alpha = dbow_model.alpha
"""
Function to generate vectors from corpus
"""
def generate_vectors(model, corpus, size):
vectors = np.zeros((len(corpus), size))
n = 0
for indx in corpus.index:
prefix = 'all_' + str(indx)
vectors[n] = model.docvecs[prefix]
n += 1
return vectors
"""
Generating train, test and validation vectors
"""
train_vectors_dbow = generate_vectors(dbow_model, x_train, 150)
test_vectors_dbow = generate_vectors(dbow_model, x_test, 150)
validation_vectors_dbow = generate_vectors(dbow_model, x_validation, 150)
def _eval_classifier(clf, heading, clf_label, train_X, val_X, test_X):
    """Print train/validation/test score, accuracy, classification report and
    confusion matrix for an already-fitted classifier *clf*.

    Relies on the module-level label arrays y_train, y_validation, y_test.
    """
    print("**** {} ****\n".format(heading))
    print("{} Performance : \n".format(clf_label))
    print('Train-Set Score : {:.4f}'.format(clf.score(train_X, y_train)))
    print('Train-Set Accuracy : {:.4f}'.format(accuracy_score(y_train, clf.predict(train_X))))
    for split, X, y in (('Validation', val_X, y_validation), ('Test', test_X, y_test)):
        print("\nEvaluation on {}-Set : ".format(split))
        predictions = clf.predict(X)
        print("Classification report:\n {}".format(classification_report(y, predictions)))
        print("Confusion matrix:\n {}".format(confusion_matrix(y, predictions)))
        print('{}-Set Score : {:.4f}'.format(split, clf.score(X, y)))
        # Consistent ' : ' spacing (original mixed 'Accuracy:' and 'Accuracy: ').
        print('{}-Set Accuracy : {:.4f}'.format(split, accuracy_score(y, predictions)))

logreg_dbow = LogisticRegression()
logreg_dbow.fit(train_vectors_dbow, y_train)
_eval_classifier(logreg_dbow,
                 "Doc2Vec Distributed Bag Of Words(DBOW) Based Sentiment Analysis using LogisticRegression",
                 "LogisticRegression",
                 train_vectors_dbow, validation_vectors_dbow, test_vectors_dbow)

svm_dbow = SVC(kernel='linear')
svm_dbow.fit(train_vectors_dbow, y_train)
_eval_classifier(svm_dbow,
                 "Doc2Vec Distributed Bag Of Words(DBOW) Based Sentiment Analysis using SVC",
                 "SVC with linear kernel",
                 train_vectors_dbow, validation_vectors_dbow, test_vectors_dbow)

xgb_dbow = XGBClassifier()
xgb_dbow.fit(train_vectors_dbow, y_train)
_eval_classifier(xgb_dbow,
                 "Doc2Vec Distributed Bag Of Words(DBOW) Based Sentiment Analysis using XGBClassifier",
                 "XGBClassifier",
                 train_vectors_dbow, validation_vectors_dbow, test_vectors_dbow)
%%time
"""
Create Doc2Vec DMC model
"""
dmc_model = Doc2Vec(dm=1,
dm_concat=1,
size=150,
window=10,
negative=5,
min_count=2,
workers=10,
alpha=0.065,
min_alpha=0.065)
dmc_model.build_vocab([review for review in tqdm(all_reviews_d2v)])
%%time
"""
Train the model
"""
for epoch in range(3):
dmc_model.train(utils.shuffle([review for review in tqdm(all_reviews_d2v)]),
total_examples=len(all_reviews_d2v),
epochs=1)
dmc_model.alpha -= 0.002
dmc_model.min_alpha = dmc_model.alpha
"""
Generating train, test and validation vectors
"""
train_vectors_dmc = generate_vectors(dmc_model, x_train, 150)
test_vectors_dmc = generate_vectors(dmc_model, x_test, 150)
validation_vectors_dmc = generate_vectors(dmc_model, x_validation, 150)
def _eval_classifier(clf, heading, clf_label, train_X, val_X, test_X):
    """Print train/validation/test score, accuracy, classification report and
    confusion matrix for an already-fitted classifier *clf*.

    Relies on the module-level label arrays y_train, y_validation, y_test.
    """
    print("**** {} ****\n".format(heading))
    print("{} Performance : \n".format(clf_label))
    print('Train-Set Score : {:.4f}'.format(clf.score(train_X, y_train)))
    print('Train-Set Accuracy : {:.4f}'.format(accuracy_score(y_train, clf.predict(train_X))))
    for split, X, y in (('Validation', val_X, y_validation), ('Test', test_X, y_test)):
        print("\nEvaluation on {}-Set : ".format(split))
        predictions = clf.predict(X)
        print("Classification report:\n {}".format(classification_report(y, predictions)))
        print("Confusion matrix:\n {}".format(confusion_matrix(y, predictions)))
        print('{}-Set Score : {:.4f}'.format(split, clf.score(X, y)))
        # Consistent ' : ' spacing (original mixed 'Accuracy:' and 'Accuracy: ').
        print('{}-Set Accuracy : {:.4f}'.format(split, accuracy_score(y, predictions)))

# Headings fix the original "Distributed Momory" typo.
logreg_dmc = LogisticRegression()
logreg_dmc.fit(train_vectors_dmc, y_train)
_eval_classifier(logreg_dmc,
                 "Doc2Vec Distributed Memory (concatenated) Based Sentiment Analysis using LogisticRegression",
                 "LogisticRegression",
                 train_vectors_dmc, validation_vectors_dmc, test_vectors_dmc)

svm_dmc = SVC(kernel='linear')
svm_dmc.fit(train_vectors_dmc, y_train)
_eval_classifier(svm_dmc,
                 "Doc2Vec Distributed Memory (concatenated) Based Sentiment Analysis using SVC",
                 "SVC with linear kernel",
                 train_vectors_dmc, validation_vectors_dmc, test_vectors_dmc)

xgb_dmc = XGBClassifier()
xgb_dmc.fit(train_vectors_dmc, y_train)
_eval_classifier(xgb_dmc,
                 "Doc2Vec Distributed Memory (concatenated) Based Sentiment Analysis using XGBClassifier",
                 "XGBClassifier",
                 train_vectors_dmc, validation_vectors_dmc, test_vectors_dmc)
%%time
"""
Create doc2vec DMM model
"""
dmm_model = Doc2Vec(dm=1,
dm_mean=1,
size=150,
window=10,
negative=5,
min_count=2,
workers=10,
alpha=0.065,
min_alpha=0.065)
dmm_model.build_vocab([review for review in tqdm(all_reviews_d2v)])
%%time
"""
Train the model
"""
for epoch in range(3):
dmm_model.train(utils.shuffle([review for review in tqdm(all_reviews_d2v)]),
total_examples=len(all_reviews_d2v),
epochs=1)
dmm_model.alpha -= 0.002
dmm_model.min_alpha = dmm_model.alpha
"""
Generating train, test and validation vectors
"""
train_vectors_dmm = generate_vectors(dmm_model, x_train, 150)
test_vectors_dmm = generate_vectors(dmm_model, x_test, 150)
validation_vectors_dmm = generate_vectors(dmm_model, x_validation, 150)
def _eval_classifier(clf, heading, clf_label, train_X, val_X, test_X):
    """Print train/validation/test score, accuracy, classification report and
    confusion matrix for an already-fitted classifier *clf*.

    Relies on the module-level label arrays y_train, y_validation, y_test.
    """
    print("**** {} ****\n".format(heading))
    print("{} Performance : \n".format(clf_label))
    print('Train-Set Score : {:.4f}'.format(clf.score(train_X, y_train)))
    print('Train-Set Accuracy : {:.4f}'.format(accuracy_score(y_train, clf.predict(train_X))))
    for split, X, y in (('Validation', val_X, y_validation), ('Test', test_X, y_test)):
        print("\nEvaluation on {}-Set : ".format(split))
        predictions = clf.predict(X)
        print("Classification report:\n {}".format(classification_report(y, predictions)))
        print("Confusion matrix:\n {}".format(confusion_matrix(y, predictions)))
        print('{}-Set Score : {:.4f}'.format(split, clf.score(X, y)))
        # Consistent ' : ' spacing (original mixed 'Accuracy:' and 'Accuracy: ').
        print('{}-Set Accuracy : {:.4f}'.format(split, accuracy_score(y, predictions)))

logreg_dmm = LogisticRegression()
logreg_dmm.fit(train_vectors_dmm, y_train)
_eval_classifier(logreg_dmm,
                 "Doc2Vec Distributed Memory(mean) Based Sentiment Analysis using LogisticRegression",
                 "LogisticRegression",
                 train_vectors_dmm, validation_vectors_dmm, test_vectors_dmm)

svm_dmm = SVC(kernel='linear')
svm_dmm.fit(train_vectors_dmm, y_train)
_eval_classifier(svm_dmm,
                 "Doc2Vec Distributed Memory(mean) Based Sentiment Analysis using SVC",
                 "SVC with linear kernel",
                 train_vectors_dmm, validation_vectors_dmm, test_vectors_dmm)

xgb_dmm = XGBClassifier()
xgb_dmm.fit(train_vectors_dmm, y_train)
_eval_classifier(xgb_dmm,
                 "Doc2Vec Distributed Memory(mean) Based Sentiment Analysis using XGBClassifier",
                 "XGBClassifier",
                 train_vectors_dmm, validation_vectors_dmm, test_vectors_dmm)
Now that I have the document vectors from three different models, I can concatenate them in combinations to see how this affects performance. Below I define a function to concatenate document vectors from different models.
"""
Function to concatenate document vectors from different models
"""
def generate_concat_vectors(model1,model2, corpus, size):
vectors = np.zeros((len(corpus), size))
n = 0
for indx in corpus.index:
prefix = 'all_' + str(indx)
vectors[n] = np.append(model1.docvecs[prefix],model2.docvecs[prefix])
n += 1
return vectors
"""
Generating train, test and validation document vectors
"""
train_vectors_dbow_dmc = generate_concat_vectors(dbow_model,dmc_model, x_train, 300)
test_vectors_dbow_dmc = generate_concat_vectors(dbow_model,dmc_model, x_test, 300)
validation_vectors_dbow_dmc = generate_concat_vectors(dbow_model,dmc_model, x_validation, 300)
def _eval_classifier(clf, heading, clf_label, train_X, val_X, test_X):
    """Print train/validation/test score, accuracy, classification report and
    confusion matrix for an already-fitted classifier *clf*.

    Relies on the module-level label arrays y_train, y_validation, y_test.
    """
    print("**** {} ****\n".format(heading))
    print("{} Performance : \n".format(clf_label))
    print('Train-Set Score : {:.4f}'.format(clf.score(train_X, y_train)))
    print('Train-Set Accuracy : {:.4f}'.format(accuracy_score(y_train, clf.predict(train_X))))
    for split, X, y in (('Validation', val_X, y_validation), ('Test', test_X, y_test)):
        print("\nEvaluation on {}-Set : ".format(split))
        predictions = clf.predict(X)
        print("Classification report:\n {}".format(classification_report(y, predictions)))
        print("Confusion matrix:\n {}".format(confusion_matrix(y, predictions)))
        print('{}-Set Score : {:.4f}'.format(split, clf.score(X, y)))
        # Consistent ' : ' spacing (original mixed 'Accuracy:' and 'Accuracy: ').
        print('{}-Set Accuracy : {:.4f}'.format(split, accuracy_score(y, predictions)))

logreg_dbow_dmc = LogisticRegression()
logreg_dbow_dmc.fit(train_vectors_dbow_dmc, y_train)
_eval_classifier(logreg_dbow_dmc,
                 "Combination of Doc2Vec DBOW And DMC Based Sentiment Analysis using LogisticRegression",
                 "LogisticRegression",
                 train_vectors_dbow_dmc, validation_vectors_dbow_dmc, test_vectors_dbow_dmc)

svm_dbow_dmc = SVC(kernel='linear')
svm_dbow_dmc.fit(train_vectors_dbow_dmc, y_train)
_eval_classifier(svm_dbow_dmc,
                 "Combination of Doc2Vec DBOW And DMC Based Sentiment Analysis using SVC",
                 "SVC with linear kernel",
                 train_vectors_dbow_dmc, validation_vectors_dbow_dmc, test_vectors_dbow_dmc)

xgb_dbow_dmc = XGBClassifier()
xgb_dbow_dmc.fit(train_vectors_dbow_dmc, y_train)
_eval_classifier(xgb_dbow_dmc,
                 "Combination of Doc2Vec DBOW And DMC Based Sentiment Analysis using XGBClassifier",
                 "XGBClassifier",
                 train_vectors_dbow_dmc, validation_vectors_dbow_dmc, test_vectors_dbow_dmc)
"""
Generating train, test and validation document vectors
"""
train_vectors_dbow_dmm = generate_concat_vectors(dbow_model,dmm_model, x_train, 300)
test_vectors_dbow_dmm = generate_concat_vectors(dbow_model,dmm_model, x_test, 300)
validation_vectors_dbow_dmm = generate_concat_vectors(dbow_model,dmm_model, x_validation, 300)
def _eval_classifier(clf, heading, clf_label, train_X, val_X, test_X):
    """Print train/validation/test score, accuracy, classification report and
    confusion matrix for an already-fitted classifier *clf*.

    Relies on the module-level label arrays y_train, y_validation, y_test.
    """
    print("**** {} ****\n".format(heading))
    print("{} Performance : \n".format(clf_label))
    print('Train-Set Score : {:.4f}'.format(clf.score(train_X, y_train)))
    print('Train-Set Accuracy : {:.4f}'.format(accuracy_score(y_train, clf.predict(train_X))))
    for split, X, y in (('Validation', val_X, y_validation), ('Test', test_X, y_test)):
        print("\nEvaluation on {}-Set : ".format(split))
        predictions = clf.predict(X)
        print("Classification report:\n {}".format(classification_report(y, predictions)))
        print("Confusion matrix:\n {}".format(confusion_matrix(y, predictions)))
        print('{}-Set Score : {:.4f}'.format(split, clf.score(X, y)))
        # Consistent ' : ' spacing (original mixed 'Accuracy:' and 'Accuracy: ').
        print('{}-Set Accuracy : {:.4f}'.format(split, accuracy_score(y, predictions)))

logreg_dbow_dmm = LogisticRegression()
logreg_dbow_dmm.fit(train_vectors_dbow_dmm, y_train)
_eval_classifier(logreg_dbow_dmm,
                 "Combination of Doc2Vec DBOW And DMM Based Sentiment Analysis using LogisticRegression",
                 "LogisticRegression",
                 train_vectors_dbow_dmm, validation_vectors_dbow_dmm, test_vectors_dbow_dmm)

svm_dbow_dmm = SVC(kernel='linear')
svm_dbow_dmm.fit(train_vectors_dbow_dmm, y_train)
_eval_classifier(svm_dbow_dmm,
                 "Combination of Doc2Vec DBOW And DMM Based Sentiment Analysis using SVC",
                 "SVC with linear kernel",
                 train_vectors_dbow_dmm, validation_vectors_dbow_dmm, test_vectors_dbow_dmm)

xgb_dbow_dmm = XGBClassifier()
xgb_dbow_dmm.fit(train_vectors_dbow_dmm, y_train)
_eval_classifier(xgb_dbow_dmm,
                 "Combination of Doc2Vec DBOW And DMM Based Sentiment Analysis using XGBClassifier",
                 "XGBClassifier",
                 train_vectors_dbow_dmm, validation_vectors_dbow_dmm, test_vectors_dbow_dmm)
"""
Create model
"""
keras_d2v_combo_dbow_dmc_model = Sequential()
keras_d2v_combo_dbow_dmc_model.add(Dense(128, activation='relu', input_dim=300))
keras_d2v_combo_dbow_dmc_model.add(Dense(128, activation='relu'))
keras_d2v_combo_dbow_dmc_model.add(Dense(1, activation='sigmoid'))
"""
Compile network
"""
keras_d2v_combo_dbow_dmc_model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
keras_d2v_combo_dbow_dmc_model.summary()
"""
Fit network
"""
keras_d2v_combo_dbow_dmc_model.fit(train_vectors_dbow_dmc, y_train,
validation_data=(validation_vectors_dbow_dmc, y_validation),
epochs=5,
batch_size=10,
verbose=False)
"""
Evaluate
"""
print("**** Sentiment Analysis Using Combination of Doc2Vec DBOW And DMC Document Embedding and Keras Neural Network ****\n")
loss, accuracy = keras_d2v_combo_dbow_dmc_model.evaluate(train_vectors_dbow_dmc, y_train, verbose=False)
print("Train-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Validation-Set : ")
pred_val=keras_d2v_combo_dbow_dmc_model.predict_classes(validation_vectors_dbow_dmc)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
loss, accuracy = keras_d2v_combo_dbow_dmc_model.evaluate(validation_vectors_dbow_dmc, y_validation, verbose=False)
print("Validation-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Test-Set : ")
pred=keras_d2v_combo_dbow_dmc_model.predict_classes(test_vectors_dbow_dmc)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
loss, accuracy = keras_d2v_combo_dbow_dmc_model.evaluate(test_vectors_dbow_dmc, y_test, verbose=False)
print("Test-Set Accuracy: {:.4f}".format(accuracy))
"""
Create model
"""
keras_d2v_combo_dbow_dmm_model = Sequential()
keras_d2v_combo_dbow_dmm_model.add(Dense(128, activation='relu', input_dim=300))
keras_d2v_combo_dbow_dmm_model.add(Dense(128, activation='relu'))
keras_d2v_combo_dbow_dmm_model.add(Dense(1, activation='sigmoid'))
"""
Compile network
"""
keras_d2v_combo_dbow_dmm_model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
keras_d2v_combo_dbow_dmm_model.summary()
"""
Fit network
"""
keras_d2v_combo_dbow_dmm_model.fit(train_vectors_dbow_dmm, y_train,
validation_data=(validation_vectors_dbow_dmm, y_validation),
epochs=5,
batch_size=10,
verbose=False)
"""
Evaluate
"""
print("**** Sentiment Analysis Using Combination of Doc2Vec DBOW And DMM Document Embedding and Keras Neural Network ****\n")
loss, accuracy = keras_d2v_combo_dbow_dmm_model.evaluate(train_vectors_dbow_dmm, y_train, verbose=False)
print("Train-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Validation-Set : ")
pred_val=keras_d2v_combo_dbow_dmm_model.predict_classes(validation_vectors_dbow_dmm)
print("Classification report:\n {}".format(classification_report(y_validation, pred_val)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_validation, pred_val)))
loss, accuracy = keras_d2v_combo_dbow_dmm_model.evaluate(validation_vectors_dbow_dmm, y_validation, verbose=False)
print("Validation-Set Accuracy: {:.4f}".format(accuracy))
print("\nEvaluation on Test-Set : ")
pred=keras_d2v_combo_dbow_dmm_model.predict_classes(test_vectors_dbow_dmm)
print("Classification report:\n {}".format(classification_report(y_test, pred)))
print("Confusion matrix:\n {}".format(confusion_matrix(y_test, pred)))
loss, accuracy = keras_d2v_combo_dbow_dmm_model.evaluate(test_vectors_dbow_dmm, y_test, verbose=False)
print("Test-Set Accuracy: {:.4f}".format(accuracy))
print("Time elapsed : ",(round(((time.time()-program_start_time)/3600),2))," in hours")