!pip install wget
!pip install clean-text
!pip install torchtext==0.6.0
!pip install sentencepiece
!pip install transformers==2.8.0
import re
import os
import csv
import time
import math
import json
import random
import collections
import numpy as np
import pandas as pd
from cleantext import clean
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchtext import data
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel, GPT2Tokenizer, GPT2LMHeadModel
from torchsummary import summary
from torchtext.data.metrics import bleu_score
from tqdm.autonotebook import tqdm
Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
from google.colab import drive
Mounted at /content/gdrive/
Data Preprocessing
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
Batch Size
isTrain = False
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: integer seed applied to Python's ``random`` module, NumPy's
            global generator and PyTorch (CPU and every CUDA device).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Documented as a silent no-op when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    # Restrict cuDNN to deterministic kernels so GPU runs are repeatable.
    torch.backends.cudnn.deterministic = True
def textPreprocess(txt):
    """Normalise raw text into lowercase, space-separated alphanumeric words.

    The text is lowercased and trimmed, passed through ``clean`` (unicode
    repair plus transliteration to the closest ASCII form), and then a fixed
    sequence of regex substitutions replaces unwanted spans with a single
    space. The result is trimmed before being returned.
    """
    # (pattern, replacement) pairs; they are applied strictly in this order.
    substitutions = (
        (u"\ufe0f", r" "),                     # U+FE0F variation selector
        (r"https?://[A-Za-z0-9./]*", r" "),    # URLs
        (r"[-.!?()_]+", r" "),                 # specific special characters
        (r"[^0-9a-zA-Z]+", r" "),              # everything except alphanumerics
        (r"\s+", r" "),                        # extra spaces left by earlier steps
    )
    cleaned = clean(txt.lower().strip(), fix_unicode=True, to_ascii=True)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
Required Directories Creation
Required functions for directory creation
def check_if_dir_exists(directory):
    """Return True when ``directory`` is an existing directory, else False."""
    return os.path.isdir(directory)


def make_dir(directory):
    """Create ``directory`` unless it already exists, and report the outcome.

    Missing parent directories are created as well. Exactly one of the two
    status messages is printed per call.
    """
    if not check_if_dir_exists(directory):
        # exist_ok guards against the directory appearing between the check
        # and the creation.
        os.makedirs(directory, exist_ok=True)
        print("Directory %s created successfully." %directory)
    else:
        print("Directory %s exists." %directory)
print("We are in:",os.getcwd())
Required directory creation
chatbot_dir="/content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta"
os.chdir("/content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta")
chatbot_data_dir = chatbot_dir + "/ChatBot_Data/"
chatbot_result_dir = chatbot_dir + "/ChatBot_Results/"
chatbot_checkpoint_dir = chatbot_dir + "/ChatBot_Checkpoint/"
We are in: /content Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta exists. Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Data/ exists. Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Results/ exists. Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Checkpoint/ exists. /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Data/
Listing Directories
def list_dir(dir_path):
    """Print a header line followed by the name of every entry in ``dir_path``.

    Entries are printed one per line, in the order ``os.listdir`` returns them.
    """
    print("Directory %s contains : " %dir_path)
    for dir_or_file in os.listdir(dir_path):
        print(dir_or_file)
List created directories
print('Current directory : ', os.getcwd(),'\n')
Current directory : /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta contains : ChatBot_Data ChatBot_Results ChatBot_Checkpoint .vector_cache Images Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Data/ contains : .kaggle dev_v2.1.json.gz dev_v2.1.json question_asked.tsv QAWhats.tsv twcs.csv QA_Pair.tsv Apple_QA_Pair.tsv Q_asked question_asked Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Checkpoint/ contains : QA_Seq2Seq_ChatBot_BestModel.pt QA_Seq2Seq_ChatBot.pt QA_GPT_Seq2seq_ChatBot.pt QA_GPT_Seq2seq_Results.csv Apple_QA_Seq2Seq_ChatBot.pt Apple_QA_Seq2Seq_ChatBot_BestModel.pt Apple_QA_GPT_Seq2seq_ChatBot.pt Apple_QA_GPT_Seq2seq_Results.csv
Batch Size, Embedding Size, Hidden Size
Sentence start, end and pad token
QA Pair, word to integer map And integer to word map Creation Function
def getVocab(qa_df):
    """Build the QA pairs and the word <-> integer vocabularies.

    Args:
        qa_df: dataframe with text columns ``question`` and ``answer``.

    Returns:
        Tuple ``(qa_pairs, vocab2idx, idx2vocab)`` where ``qa_pairs`` is a list
        of ``(question, answer)`` tuples, ``vocab2idx`` maps each word to an
        integer id and ``idx2vocab`` is the inverse mapping. ``PAD_TOKEN``,
        ``SOS_TOKEN`` and ``EOS_TOKEN`` always receive ids 0, 1 and 2.
    """
    # Question Answer Pair Creation (column-wise, avoiding per-row iloc lookups).
    qa_pairs = list(zip(qa_df['question'], qa_df['answer']))
    print('Number of question and answer pairs) : ',len(qa_pairs))

    # Creation of word to integer map.
    special_tokens = (PAD_TOKEN, SOS_TOKEN, EOS_TOKEN)
    vocab = {word
             for pair in qa_pairs
             for sentence in pair
             for word in sentence.split(" ")}
    # A special token that also occurs in the text must not take an extra id,
    # otherwise the largest id would exceed len(vocab2idx) - 1.
    vocab.difference_update(special_tokens)
    print('Number of vocab : ',len(vocab))

    vocab2idx = {token: idx for idx, token in enumerate(special_tokens)}
    # Sorted order keeps ids identical across interpreter runs: str hashes are
    # randomised per process, so raw set order would not match the ids a saved
    # checkpoint was trained with.
    for idx, word in enumerate(sorted(vocab), len(special_tokens)):
        vocab2idx[word] = idx
    print('Number of keys in vocab2idx : ',len(vocab2idx))

    # Creation of integer to word map.
    idx2vocab = {idx: word for word, idx in vocab2idx.items()}
    print('Number of keys in idx2vocab : ',len(idx2vocab))
    return qa_pairs, vocab2idx, idx2vocab
QA DataSet Creation Function
class QADataset(Dataset):
    """Dataset serving (question, answer) text pairs as tensors of word ids."""

    def __init__(self, lang_pairs, vocab2idx):
        # Raw text pairs and the word -> id lookup used to encode them.
        self.lang_pairs = lang_pairs
        self.vocab2idx = vocab2idx

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.lang_pairs)

    def addToken(self, txt, flag):
        """Append EOS_TOKEN to ``txt``; also prepend SOS_TOKEN when ``flag`` is truthy."""
        prefix = SOS_TOKEN + " " if flag else ""
        return prefix + txt + " " + EOS_TOKEN

    def getTensor(self, txt):
        """Encode a space-separated string as a 1-D int64 tensor of word ids.

        Raises:
            KeyError: if a word is not present in ``vocab2idx``.
        """
        lookup = self.vocab2idx
        word_ids = [lookup[token] for token in txt.split(" ")]
        return torch.tensor(word_ids, dtype=torch.int64)

    def __getitem__(self, idx):
        """Return ``(question_ids, answer_ids)`` for the pair at ``idx``.

        The question is wrapped in SOS/EOS; the answer only gets a trailing EOS.
        """
        x = self.getTensor(self.addToken(self.lang_pairs[idx][0], True))
        y = self.getTensor(self.addToken(self.lang_pairs[idx][1], False))
        return x, y
Pad items in the batch to the length of the longest item in the batch
def collate(batch):
    """Right-pad the questions and answers of ``batch`` to the batch maximum.

    Args:
        batch: sequence of ``(question, answer)`` pairs of 1-D tensors.

    Returns:
        ``((X, Y), Y)`` where ``X`` stacks the padded questions and ``Y`` the
        padded answers; padding positions hold ``PAD``.
    """
    def pad_and_stack(position, target_len):
        # Pad each sequence on the right up to target_len, then stack them.
        padded = []
        for pair in batch:
            seq = pair[position]
            padded.append(F.pad(seq, (0, target_len - seq.size(0)), value=PAD))
        return torch.stack(padded)

    longest_question = max(pair[0].size(0) for pair in batch)
    longest_answer = max(pair[1].size(0) for pair in batch)
    X = pad_and_stack(0, longest_question)
    Y = pad_and_stack(1, longest_answer)
    return (X, Y), Y
Load Refined Data
print('Shape of qa_final_df : ',qa_final_df.shape)
Create qa_pairs, vocab2idx, idx2vocab
qa_pairs, vocab2idx, idx2vocab = getVocab(qa_final_df)
PAD = vocab2idx[PAD_TOKEN]
SOS = vocab2idx[SOS_TOKEN]
EOS = vocab2idx[EOS_TOKEN]
Create QA DataSet
qa_dataset = QADataset(qa_pairs, vocab2idx)
Data Split
train_size, test_size = round(len(qa_dataset)*0.8),len(qa_dataset)-round(len(qa_dataset)*0.8)
train_dataset, test_dataset = torch.utils.data.random_split(qa_dataset, [train_size, test_size])
Create DataLoader
train_loader = DataLoader(train_dataset, batch_size = BATCH_SZ, shuffle = True, collate_fn = collate)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SZ, collate_fn = collate)
print("\n\nHow does the tensor look?\n",train_dataset[8])
Shape of qa_final_df : (96750, 2) Number of question and answer pairs) : 96750 Number of vocab : 35075 Number of keys in vocab2idx : 35078 Number of keys in idx2vocab : 35078 How does the tensor look? (tensor([ 1, 1530, 31549, 2713, 33282, 22879, 4114, 3806, 9206, 34541, 12057, 3820, 12720, 23014, 12187, 12057, 3475, 2]), tensor([28294, 12187, 20625, 4868, 2639, 3456, 7267, 12859, 12057, 9073, 30162, 23118, 20185, 31533, 2378, 28581, 12070, 4251, 13627, 2]))
Attention Mechanism Layers
class Attention(nn.Module):
    """Parameter-free scaled dot-product attention over encoder states."""

    def __init__(self):
        super().__init__()

    def dot_score(self, hidden_encoded, hidden_decoded):
        """Score each encoder step against the decoder state.

        hidden_encoded : (B, T, D)
        hidden_decoded : (B, D)
        returns        : (B, T, 1), dot products divided by sqrt(D)
        """
        scale = np.sqrt(hidden_encoded.size(2))
        query = hidden_decoded.unsqueeze(2)
        return torch.bmm(hidden_encoded, query) / scale

    def forward(self, hidden_encoded, hidden_decoded, mask=None):
        """Return the attention context and the attention weights.

        hidden_encoded   : (B, T, D)
        hidden_decoded   : (B, D)
        mask             : (B, T) boolean, True where a time step is in use
        context          : (B, D)
        attention_weight : (B, T, 1)
        """
        scores = self.dot_score(hidden_encoded, hidden_decoded)
        if mask is not None:
            # Masked-out steps get a very low score so softmax gives them ~0.
            scores[~mask] = float(-1000)

        # Normalise over the time axis.
        attention_weight = F.softmax(scores, dim=1)

        # (B, T, D) * (B, T, 1), summed over T, gives (B, D).
        weighted_states = hidden_encoded * attention_weight
        context = weighted_states.sum(dim=1)
        return context, attention_weight
def maskedFill(input, time_dimension=1, fill=0):
    """Return a boolean mask of the time steps in ``input`` that are in use.

    Args:
        input: tensor whose first axis is the batch axis.
        time_dimension: axis that is kept in the mask (default 1).
        fill: value that marks an unused (padded) entry.

    Returns:
        ``input != fill`` when no axis is left to reduce (e.g. a (B, T) input).
        Otherwise a tensor with every axis except the batch and time axes
        reduced away, True wherever at least one entry of that time step
        differs from ``fill`` (e.g. (B, T, D) -> (B, T)).
    """
    # Reduce over every non-batch axis except the time axis itself; keeping
    # the time axis is what makes the result (B, T) rather than (B,).
    dimensions = [d for d in range(1, len(input.shape)) if d != time_dimension]
    with torch.no_grad():
        not_fill = (input != fill)
        if len(dimensions) == 0:
            return not_fill
        mask = torch.sum(not_fill, dim=dimensions) > 0
    return mask
class EncoderRNN(nn.Module):
def __init__(self, embeddding_size, hidden_size, n_layers=1, bidirectional=True):
super(EncoderRNN, self).__init__()
self.encoder_layer = nn.GRU(input_size = embeddding_size,
hidden_size = hidden_size//2,
num_layers = n_layers,
bidirectional = bidirectional)
def forward(self, question_embd, question_len):
Pack the sequences as question sequences are of varying length.
embed_packed = pack_padded_sequence(question_embd,
enc_outs, h_enc = self.encoder_layer(embed_packed)
As bidirectional : (B, T, 2, D//2)
enc_outs, _ = pad_packed_sequence(enc_outs)
batch_size, time_step = question_embd.size(0), question_embd.size(1)
(B, T, 2, D//2) to (B, T, D)
enc_outs = enc_outs.view(batch_size, time_step , -1)
hidden_size = enc_outs.size(2)
Reshaping the h_enc as (n_layers, directions, batch_size, hidden_size).
Take the last layer's output.
h_enc = h_enc.view(-1, 2, batch_size, hidden_size//2)[-1,:,:,:]
Reordering to (B, 2, D/2) and reshaping to (B, D)
h_enc = h_enc.permute(1, 0, 2).reshape(batch_size, -1)
return enc_outs, h_enc
class AttentionDecoderRNN(nn.Module):
def __init__(self,vocab_size, embedding_size, hidden_size, n_layers=1):
super(AttentionDecoderRNN, self).__init__()
The decoder is uni-directional and uses GRUCells so that decoding can be done one step at a time.
self.decoder_layers = nn.ModuleList([nn.GRUCell(embedding_size, hidden_size)] +
[nn.GRUCell(hidden_size, hidden_size) for i in range(n_layers-1)])
self.attention = Attention()
Prediction, a fully connected network to convert the attention context and decoded context to a predicted next token
self.prediction = nn.Sequential(nn.Linear(2*hidden_size, hidden_size),
nn.Linear(hidden_size, hidden_size),
nn.Linear(hidden_size, vocab_size)
def forward(self, decoder_input, h_previous, encoded_outs, mask):
for layer in range(len(self.decoder_layers)):
next_hidden_state = self.decoder_layers[layer](decoder_input, h_previous[layer])
h_previous[layer], decoder_input = next_hidden_state, next_hidden_state
(B, D)
answer_decoded = decoder_input
Attention mechanism, to get relevant information from the previous encoded states.
(B, T, 1)
attention_context, attention_weights = self.attention(encoded_outs, answer_decoded, mask=mask)
Concatenating the attention context and the decoded context.
(B, D) + (B, D) to (B, 2*D)
pred_token = torch.cat((attention_context, answer_decoded), dim=1)
Predict the next token.
(B, 2*D) to (B, V)
pred_token = self.prediction(pred_token)
return attention_weights, pred_token, h_previous
The heart of chatbot is a sequence-to-sequence (seq2seq) model. The goal of a seq2seq model is to take a variable-length question sequence as an input, and return a variable-length answer sequence as an output.
Components :
I have used nn.Embedding
layer to convert tokens into feature vectors.
Next, I have used nn.GRU
, an encoding RNN that takes a tensor of shape $(B, T, D)$, since it expects all $T$ items at once. As the entire question is available up front, I used a bidirectional nn.GRU
For decoder RNN I have used nn.GRUCell
, a uni-directional decoding RNN, as it generates the output one item at a time.
To prevent an infinite loop in the case of a bad prediction, I have set a limit of 22 decode_steps
to control the maximum number of decoding steps.
class Seq2SeqAttention(nn.Module):
def __init__(self, vocab_size, embedding_size, hidden_size, pad_idx=None, n_layers=1, decode_steps=22):
super(Seq2SeqAttention, self).__init__()
self.pad_idx = pad_idx
self.hidden_size = hidden_size
self.n_layers = n_layers
self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=self.pad_idx)
self.encoder = EncoderRNN(embedding_size,
self.attndecoder = AttentionDecoderRNN(vocab_size,
self.decode_steps = decode_steps
def forward(self, question):
question either (B, T) or ((B, T), (B, T'))
if isinstance(question, tuple):
question, answer = question
answer = None
batch_size, time_dimension = question.size(0), question.size(1)
(B, T) to (B, T, D)
embeded = self.embedding(question)
device = self.embedding.weight.device
mask = maskedFill(embeded)
question_lengths = mask.sum(dim=1).view(-1)
encoded_outs, h_encoded = self.encoder(embeded, question_lengths)
Attention Decoding
encoded_outs : encoded feature vectors of the question data.
h_encoded : the initial input for the decoder.
h_previous = [h_encoded for _ in range(self.n_layers)]
attention_weights, predictions = [], []
Last token of question, EOS marker as the first input for the decoder.
decoder_input = self.embedding(question[:,-1])
decode_steps = self.decode_steps
Training : Given Question and Answer pairs gives exact decode length.
Testing : Given decode_steps.
if answer is not None:
decode_steps = answer.size(1)
Either Teacher Forcing OR Auto-Regressive
teacher_forcing = np.random.choice((True,False))
for decode_step in range(decode_steps):
(B, D)
decoder_in = decoder_input
attention_weight, pred_token, h_previous = self.attndecoder(decoder_in, h_previous, encoded_outs, mask)
Selecting the token for the next time step.
torch.no_grad() : In-order to prevent the gradient to pass through the question tokens.
with torch.no_grad():
if self.training:
if answer is not None and teacher_forcing:
Teacher Forcing : next correct token.
next_token = answer[:,decode_step].squeeze()
Auto-Regressive : next token based on the prediction.
next_token = torch.multinomial(F.softmax(pred_token, dim=1), 1)[:,-1]
For testing : selecting most likely token.
next_token = torch.argmax(pred_token, dim=1)
Next token is the decoder input for next time step further time step's token prediction.
decoder_input = self.embedding(next_token.to(device))
prediction, attention_score = torch.stack(predictions, dim=1), torch.stack(attention_weights, dim=1).squeeze()
return prediction if self.training else prediction, attention_score
checkpointFile = chatbot_checkpoint_dir +'Apple_QA_Seq2Seq_ChatBot.pt'
checkpointFileBestModel = chatbot_checkpoint_dir +'Apple_QA_Seq2Seq_ChatBot_BestModel.pt'
seq2seq_model = Seq2SeqAttention(vocab_size = len(vocab2idx),
embedding_size = EMBEDDING_SIZE,
hidden_size = HIDDEN_SIZE,
pad_idx = PAD,
n_layers = NUM_LAYERS
Gradient Cliping
for param in seq2seq_model.parameters():
param.register_hook(lambda grad: torch.clamp(grad, -10, 10))
Load State Dict of Best Model
checkpoint_dict = torch.load(checkpointFileBestModel)
seq2seq_model = seq2seq_model.eval().cpu()
getWords = lambda x : [idx2vocab[idx] for idx in x.cpu().numpy()]
From test_dataset
getting question_tensor
, answer_tensor
Passing question_tensor
to seq2seq model in eval mode and with torch.no_grad()
to prevent gradient updates. This gives the predicted answer tensor.
Converting predicted answer tensor and answer_tensor
to string using idx to vocabulary mapping.
Computing the BLEU score between the reference answer and the predicted answer.
Function that calculate BLEU Score
def calculateBleuScore(model):
    """Compute the corpus-level BLEU score of ``model`` over ``test_dataset``.

    Each test question is decoded with gradients disabled, the arg-max token
    at every step is mapped back to words with ``getWords``, and the
    predictions are scored against the reference answers.

    Args:
        model: callable returning ``(predictions, attention_scores)`` for a
            batched question tensor.

    Returns:
        The value returned by ``bleu_score`` for the whole test set.
    """
    answers, pred_answers = [], []
    for idx in tqdm(range(len(test_dataset)), desc="BLEU", disable=False):
        question_tensor, answer_tensor = test_dataset[idx]
        with torch.no_grad():
            predictions, attn_score = model(question_tensor.unsqueeze(0))
        pred = torch.argmax(predictions, dim=2)
        ans_words, pred_ans = getWords(answer_tensor), getWords(pred[0,:])
        # bleu_score takes, for every candidate, a list of reference token
        # lists; each question here has exactly one reference answer.
        answers.append([ans_words])
        pred_answers.append(pred_ans)
    return bleu_score(pred_answers, answers)
Calculate BLEU Score
bleu = calculateBleuScore(seq2seq_model)
print('BLEU Score : {:.4f}'.format(bleu))
BLEU Score : 0.4861
From test_dataset
getting question_tensor
, answer_tensor
Passing question_tensor
to seq2seq model in eval mode and with torch.no_grad()
to prevent gradient updates. This gives the predicted answer tensor.
Converting predicted answer tensor and answer_tensor
to string using idx to vocabulary mapping.
Count the number of common words between them.
Calculate Precision and Recall.
Calculate F1 Score based on the following formula.
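F1 Score takes into account co-occurring words regardless of their order.
F1 Score : $F_1 = \dfrac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$, where Precision = (number of common words) / (number of predicted words) and Recall = (number of common words) / (number of reference-answer words); the score is 0 when there are no common words.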
def calculate_f1_score(model):
    """Compute a word-overlap F1 score for every item of ``test_dataset``.

    For each test question the model's arg-max prediction is converted to
    words and compared with the reference answer as bags of words (order is
    ignored, repeated words are counted).

    Args:
        model: callable returning ``(predictions, attention_scores)`` for a
            batched question tensor.

    Returns:
        List with one F1 score per test item, in dataset order.
    """
    f1_scores = []
    for idx in tqdm(range(len(test_dataset)), desc="F1 Score", disable=False):
        question_tensor, answer_tensor = test_dataset[idx]
        with torch.no_grad():
            predictions, attention_score = model(question_tensor.unsqueeze(0))
        pred = torch.argmax(predictions, dim=2)
        answer_words, pred_ans = getWords(answer_tensor), getWords(pred[0,:])
        # Multiset intersection: a word counts as often as it appears in both.
        overlap = collections.Counter(answer_words) & collections.Counter(pred_ans)
        number_of_common_words = sum(overlap.values())
        if number_of_common_words == 0:
            # No overlap: F1 is 0 by definition and the ratio below would be 0/0.
            f1_score = 0
        else:
            precision = 1.0 * number_of_common_words / len(pred_ans)
            recall = 1.0 * number_of_common_words / len(answer_words)
            f1_score = (2 * precision * recall) / (precision + recall)
        f1_scores.append(f1_score)
    return f1_scores
f1_scores = calculate_f1_score(seq2seq_model)
f1_score =(sum(f1_scores)/len(f1_scores))
print('F1 Score : {:.4f}'.format(f1_score))
F1 Score : 0.9053
"ROUGE stands for Recall-Oriented Understudy for Gisting Evaluation. It includes measures to automatically determine the quality of a summary by comparing it to other (ideal) summaries created by humans. The measures count the number of overlapping units such as n-gram, word sequences, and word pairs between the computer-generated summary to be evaluated and the ideal summaries created by humans."
"Given two sequences X and Y, the longest common subsequence (LCS) of X and Y is a common subsequence with maximum length. [...] Unigram recall reflects the proportion of words in X (reference summary sentence) that are also present in Y (candidate summary sentence); while unigram precision is the proportion of words in Y that are also in X. Unigram recall and precision count all cooccurring words regardless their orders; while ROUGE-L counts only in-sequence co-occurrences."
ROUGE-L is one type of ROUGE measure. It is calculated from the longest common subsequence (LCS) between two sequences. It counts only in-sequence co-occurrences.
From test_dataset
getting question_tensor
, answer_tensor
Passing question_tensor
to seq2seq model in eval mode and with torch.no_grad()
to prevent gradient updates. This gives the predicted answer tensor.
Converting predicted answer tensor and answer_tensor
to string using idx to vocabulary mapping.
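Computing the ROUGE-L score between the reference answer and the predicted answer from the longest common subsequence (LCS) of the two sequences.
Applying ROUGE-L Score Formula : $R_{lcs} = \dfrac{LCS(X, Y)}{m}$, $P_{lcs} = \dfrac{LCS(X, Y)}{n}$, $F_{lcs} = \dfrac{(1 + \beta^2)\, R_{lcs}\, P_{lcs}}{R_{lcs} + \beta^2 P_{lcs}}$, where $X$ is the reference answer of length $m$, $Y$ is the predicted answer of length $n$, and $\beta = 1.2$.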
def longest_common_subsequence(str1, str2):
    """Return the length of the longest common subsequence of two sequences.

    Args:
        str1: first sequence (a string or a list of tokens).
        str2: second sequence of the same kind.

    Returns:
        Number of elements in the longest subsequence that appears, in order
        but not necessarily contiguously, in both inputs; 0 if either is empty.
    """
    rows, cols = len(str1), len(str2)
    # Grid of 0's with len(str2) + 1 columns and len(str1) + 1 rows; the extra
    # row and column represent the empty suffix.
    dp = [[0] * (cols + 1) for _ in range(rows + 1)]

    # Fill the grid from the bottom-right corner towards the top-left one.
    for col in reversed(range(cols)):
        for row in reversed(range(rows)):
            if str2[col] == str1[row]:
                # Matching elements extend the LCS of the two remaining suffixes.
                dp[row][col] = 1 + dp[row + 1][col + 1]
            else:
                # Otherwise drop one element from either sequence and keep the best.
                dp[row][col] = max(dp[row + 1][col], dp[row][col + 1])

    # The answer for the full sequences is in dp[0][0].
    return dp[0][0]
def rougel_score(ans, pred):
BETA, answers, pred_answers = 1.2, [], []
if len(pred)!=1 and len(ans)<=0:
for idx in range(min(len(pred),len(ans))):
pred_words, ans_words = pred[idx], ans[idx]
long_cmmn_subseq = longest_common_subsequence(ans_words, pred_words)
max_ans, max_pred = max(answers), max(pred_answers)
Rouge-L Score
return ((1 + BETA**2)* max_pred * max_ans)/float(max_ans + BETA**2 * max_pred) if (max_ans !=0 and max_pred !=0) else 0.0
def calculateRougeLScore(model):
answers, pred_answers = [], []
for idx in tqdm(range(len(test_dataset)), desc="ROUGE-L", disable=False):
question_tensor, answer_tensor = test_dataset[idx]
with torch.no_grad():
predictions, attn_score = model(question_tensor.unsqueeze(0))
pred = torch.argmax(predictions, dim=2)
answer_words, pred_ans = getWords(answer_tensor), getWords(pred[0,:])
return rougel_score(answers, pred_answers)
rouge_l_score = calculateRougeLScore(seq2seq_model)
print('ROUGE-L Score : {:.4f}'.format(rouge_l_score))
ROUGE-L Score : 0.9606
Start Conversation with the Bot
def bot_response(question):
    """Return the chatbot's reply to a raw user question.

    The question is preprocessed, wrapped in SOS/EOS tokens and encoded with
    ``vocab2idx``; the model's arg-max prediction is decoded back to words and
    every '<EOS>' word is dropped from the reply.

    Raises:
        KeyError: if the question contains a word that is not in ``vocab2idx``.
    """
    wrapped = SOS_TOKEN + " " + textPreprocess(question) + " " + EOS_TOKEN
    word_ids = [vocab2idx[w] for w in wrapped.split(" ")]
    question_tensor = torch.tensor(word_ids, dtype=torch.int64)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        predictions, attention_score = seq2seq_model(question_tensor.unsqueeze(0))

    pred = torch.argmax(predictions, dim=2)
    reply_words = [w for w in getWords(pred[0,:]) if w != '<EOS>']
    return " ".join(reply_words)
question = ''
print('Bot : Hi, Did you want to chat with me?')
while question.lower()[:3] != 'bye':
while True:
print('Me : ', end='')
question = input()
if question:
if question.lower()[:3] != 'bye':
response = bot_response(question)
print('Bot: ' + response)
print('Bot: Bye!! Stay safe. Have a nice day.')
except KeyError:
print("Sorry, I am not sure what you are talking about :/")
Bot : Hi, Did you want to chat with me? Me : My last os update is not working. Bot: we'd like to help dm us the details of the issues you're experiencing and we'll go from there <url> Me : This needs to be fixed as my music randomly pauses. Bot: we want to help which iphone and ios version are you using Me : iphone and ios version is 11. Bot: thanks for letting us know let's continue in dm <url> Me : Also after the update,unable to connect to wifi automatically. Bot: we want to help which device are you using Me : iphone. Bot: thanks for that info let's continue in dm <url> Me : Thanks for your help!!! Bot: you're welcome we're glad to hear reach out to us if you need any more help have a great Me : Bye!!! Bot: Bye!! Stay safe. Have a nice day.