!pip install wget
!pip install clean-text
!pip install torchtext==0.6.0
!pip install sentencepiece
!pip install transformers==2.8.0
import re
import os
import csv
import time
import math
import json
import random
import collections
import numpy as np
import pandas as pd
from cleantext import clean
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchtext import data
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel, GPT2Tokenizer, GPT2LMHeadModel
from torchsummary import summary
from torchtext.data.metrics import bleu_score
from tqdm.autonotebook import tqdm
Collecting wget
Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
Building wheel for wget (setup.py) ... done
Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=c25f5180749786fadf9dc538740b4196d14b2b9872562bcc1381362268a9a646
Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting clean-text
Downloading https://files.pythonhosted.org/packages/78/30/7013e9bf37e00ad81406c771e8f5b071c624b8ab27a7984cd9b8434bed4f/clean_text-0.3.0-py3-none-any.whl
Collecting ftfy<6.0,>=5.8
Downloading https://files.pythonhosted.org/packages/ff/e2/3b51c53dffb1e52d9210ebc01f1fb9f2f6eba9b3201fa971fd3946643c71/ftfy-5.8.tar.gz (64kB)
|████████████████████████████████| 71kB 4.8MB/s
Collecting emoji
Downloading https://files.pythonhosted.org/packages/ff/1c/1f1457fe52d0b30cbeebfd578483cedb3e3619108d2d5a21380dfecf8ffd/emoji-0.6.0.tar.gz (51kB)
|████████████████████████████████| 51kB 3.8MB/s
Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from ftfy<6.0,>=5.8->clean-text) (0.2.5)
Building wheels for collected packages: ftfy, emoji
Building wheel for ftfy (setup.py) ... done
Created wheel for ftfy: filename=ftfy-5.8-cp36-none-any.whl size=45612 sha256=d1d22c28a0635375c1f1dbbd729b4686c5af3360fb9fc9c5848789f39589d819
Stored in directory: /root/.cache/pip/wheels/ba/c0/ef/f28c4da5ac84a4e06ac256ca9182fc34fa57fefffdbc68425b
Building wheel for emoji (setup.py) ... done
Created wheel for emoji: filename=emoji-0.6.0-cp36-none-any.whl size=49716 sha256=3977498e45f2ad13ac6946ec2bebf2e0d436a9a61976696f233bff5f23b09e32
Stored in directory: /root/.cache/pip/wheels/46/2c/8b/9dcf5216ca68e14e0320e283692dce8ae321cdc01e73e17796
Successfully built ftfy emoji
Installing collected packages: ftfy, emoji, clean-text
Successfully installed clean-text-0.3.0 emoji-0.6.0 ftfy-5.8
Collecting torchtext==0.6.0
Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
|████████████████████████████████| 71kB 4.7MB/s
Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchtext==0.6.0) (1.7.0+cu101)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torchtext==0.6.0) (1.18.5)
Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from torchtext==0.6.0) (4.41.1)
Collecting sentencepiece
Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
|████████████████████████████████| 1.1MB 13.3MB/s
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from torchtext==0.6.0) (1.15.0)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from torchtext==0.6.0) (2.23.0)
Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from torch->torchtext==0.6.0) (0.8)
Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch->torchtext==0.6.0) (0.16.0)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.6/dist-packages (from torch->torchtext==0.6.0) (3.7.4.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext==0.6.0) (2020.11.8)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext==0.6.0) (1.24.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext==0.6.0) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->torchtext==0.6.0) (3.0.4)
Installing collected packages: sentencepiece, torchtext
Found existing installation: torchtext 0.3.1
Uninstalling torchtext-0.3.1:
Successfully uninstalled torchtext-0.3.1
Successfully installed sentencepiece-0.1.94 torchtext-0.6.0
Requirement already satisfied: sentencepiece in /usr/local/lib/python3.6/dist-packages (0.1.94)
Collecting transformers==2.8.0
Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
|████████████████████████████████| 573kB 11.4MB/s
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers==2.8.0) (2.23.0)
Collecting boto3
Downloading https://files.pythonhosted.org/packages/51/71/8025cafe9780b6102b9c564b75a0865781e84699d4d2c0d458e5664560b6/boto3-1.16.27.tar.gz (97kB)
|████████████████████████████████| 102kB 8.3MB/s
Collecting tokenizers==0.5.2
Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
|████████████████████████████████| 3.7MB 26.0MB/s
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers==2.8.0) (2019.12.20)
Collecting sacremoses
Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
|████████████████████████████████| 890kB 48.1MB/s
Requirement already satisfied: dataclasses; python_version < "3.7" in /usr/local/lib/python3.6/dist-packages (from transformers==2.8.0) (0.8)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers==2.8.0) (1.18.5)
Requirement already satisfied: sentencepiece in /usr/local/lib/python3.6/dist-packages (from transformers==2.8.0) (0.1.94)
Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers==2.8.0) (3.0.12)
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers==2.8.0) (4.41.1)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers==2.8.0) (2.10)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers==2.8.0) (1.24.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers==2.8.0) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers==2.8.0) (2020.11.8)
Collecting botocore<1.20.0,>=1.19.27
Downloading https://files.pythonhosted.org/packages/e4/e6/f41a1936b7602d9badb66ae9677e5d4cfc8bd9955a9a1618a0945f2f5b1b/botocore-1.19.27-py2.py3-none-any.whl (7.0MB)
|████████████████████████████████| 7.0MB 37.9MB/s
Collecting jmespath<1.0.0,>=0.7.1
Downloading https://files.pythonhosted.org/packages/07/cb/5f001272b6faeb23c1c9e0acc04d48eaaf5c862c17709d20e3469c6e0139/jmespath-0.10.0-py2.py3-none-any.whl
Collecting s3transfer<0.4.0,>=0.3.0
Downloading https://files.pythonhosted.org/packages/69/79/e6afb3d8b0b4e96cefbdc690f741d7dd24547ff1f94240c997a26fa908d3/s3transfer-0.3.3-py2.py3-none-any.whl (69kB)
|████████████████████████████████| 71kB 8.5MB/s
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers==2.8.0) (1.15.0)
Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers==2.8.0) (7.1.2)
Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers==2.8.0) (0.17.0)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.20.0,>=1.19.27->boto3->transformers==2.8.0) (2.8.1)
Building wheels for collected packages: boto3, sacremoses
Building wheel for boto3 (setup.py) ... done
Created wheel for boto3: filename=boto3-1.16.27-py2.py3-none-any.whl size=128454 sha256=d0ae36ea063530c3eeae2d08f6e5281f32cf99fe9301f330d6f35c0229bf51b3
Stored in directory: /root/.cache/pip/wheels/f7/4a/29/e5b74fd7012b8322191f5db368a910a78b013ce96bf4259d50
Building wheel for sacremoses (setup.py) ... done
Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=0e7149c2896f0d67879b69ee733aa2748b619f5feda9625bf98474bc9e853898
Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45
Successfully built boto3 sacremoses
ERROR: botocore 1.19.27 has requirement urllib3<1.27,>=1.25.4; python_version != "3.4", but you'll have urllib3 1.24.3 which is incompatible.
Installing collected packages: jmespath, botocore, s3transfer, boto3, tokenizers, sacremoses, transformers
Successfully installed boto3-1.16.27 botocore-1.19.27 jmespath-0.10.0 s3transfer-0.3.3 sacremoses-0.0.43 tokenizers-0.5.2 transformers-2.8.0
Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
from google.colab import drive
drive.mount('/content/gdrive/',force_remount=True)
Mounted at /content/gdrive/
Data Preprocessing
"""
Device
"""
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
"""
Batch Size
"""
BATCH_SIZE= 6
isTrain = False
def set_seed(seed):
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
set_seed(915)
def textPreprocess(txt):
"""
Convert to Lowercase and Trim the text
"""
txt = txt.lower().strip()
"""
Fix various unicode errors
transliterate to closest ASCII representation
"""
txt = clean(txt, fix_unicode=True, to_ascii=True)
"""
Removing zero-width character
"""
txt = re.sub(u"\ufe0f", r" ", txt)
"""
Remove URL
"""
txt = re.sub(r"https?://[A-Za-z0-9./]*", r" ", txt)
"""
Remove Specific Special character
"""
txt = re.sub(r"[-.!?()_]+", r" ", txt)
"""
Remove remaining special characters and punctuation, keeping only alphanumeric characters.
"""
txt = re.sub(r"[^0-9a-zA-Z]+", r" ", txt)
"""
Remove extra spaces introduced by the previous processing steps.
"""
txt = re.sub(r"\s+", r" ", txt).strip()
return txt
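As a quick sanity check, here is the combined effect of these steps on a made-up support tweet (the example string is purely illustrative):
sample = "Hey @AppleSupport!! My iPhone won't charge... see https://example.com/help"
print(textPreprocess(sample))
# roughly: "hey applesupport my iphone won t charge see"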
Required Directories Creation
"""
Required functions for directory creation
"""
def check_if_dir_exists(directory):
"""
Checks if 'directory' exists
"""
return(os.path.isdir(directory))
def make_dir(directory):
"""
Create directory
"""
if not check_if_dir_exists(directory):
os.mkdir(directory)
print("Directory %s created successfully." %directory)
else:
print("Directory %s exists." %directory)
print("We are in:",os.getcwd())
"""
Required directory creation
"""
chatbot_dir="/content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta"
make_dir(chatbot_dir)
os.chdir("/content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta")
chatbot_data_dir = chatbot_dir + "/ChatBot_Data/"
make_dir(chatbot_data_dir)
chatbot_result_dir = chatbot_dir + "/ChatBot_Results/"
make_dir(chatbot_result_dir)
chatbot_checkpoint_dir = chatbot_dir + "/ChatBot_Checkpoint/"
make_dir(chatbot_checkpoint_dir)
print(chatbot_data_dir)
We are in: /content Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta exists. Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Data/ exists. Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Results/ exists. Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Checkpoint/ exists. /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Data/
Listing Directories
def list_dir(dir_path):
"""
List directories for a given path
"""
print("Directory %s contains : " %dir_path)
for dir_or_file in os.listdir(dir_path):
print(dir_or_file)
print("\n")
"""
List created directories
"""
print('Current directory : ', os.getcwd(),'\n')
list_dir(chatbot_dir)
list_dir(chatbot_data_dir)
list_dir(chatbot_checkpoint_dir)
Current directory : /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta contains : ChatBot_Data ChatBot_Results ChatBot_Checkpoint .vector_cache Images Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Data/ contains : .kaggle dev_v2.1.json.gz dev_v2.1.json question_asked.tsv QAWhats.tsv twcs.csv QA_Pair.tsv Apple_QA_Pair.tsv Q_asked question_asked Directory /content/gdrive/My Drive/DATA_SCIENCE_Capstone_Project_ChatBot_Sofia_Dutta/ChatBot_Checkpoint/ contains : QA_Seq2Seq_ChatBot_BestModel.pt QA_Seq2Seq_ChatBot.pt QA_GPT_Seq2seq_ChatBot.pt QA_GPT_Seq2seq_Results.csv Apple_QA_Seq2Seq_ChatBot.pt Apple_QA_Seq2Seq_ChatBot_BestModel.pt Apple_QA_GPT_Seq2seq_ChatBot.pt Apple_QA_GPT_Seq2seq_Results.csv
Benefits of using a Transformer
The heart of the chatbot is a sequence-to-sequence (seq2seq) model. The goal of a seq2seq model is to take a variable-length question sequence as input and return a variable-length answer sequence as output.
Components:
A DistilBertModel (i.e. distilbert-base-uncased) embedding layer to convert tokens into feature vectors.
BERT: "Bidirectional Encoder Representations from Transformers".
DistilBERT: "a distilled version of BERT: smaller, faster, cheaper and lighter."
Next, for the encoder I have used nn.TransformerEncoder, an encoding transformer that takes a tensor of shape $(T, B, D)$ and expects all $T$ items at once. The purpose of the encoder is to encode a variable-length question sequence into a fixed-length context vector. This context vector carries semantic information about the question.
For the decoder I have used nn.TransformerDecoder together with GPT2LMHeadModel (i.e. distilgpt2). It generates the output one item at a time. The goal of the decoder is to take a word and a context vector as input and predict the next word in the sequence.
GPT-2: "Generative Pretrained Transformer 2"
GPT2LMHeadModel: "The GPT2 Model transformer with a language modeling head on top."
For embedding, DistilBertModel (i.e. distilbert-base-uncased) is paired with DistilBertTokenizer (i.e. distilbert-base-uncased) so that new strings are converted into DistilBERT inputs using exactly the same encoding process that was used when the original DistilBertModel was trained.
The number of hidden neurons $D$ that DistilBertModel uses is read from BERT_MODEL.config.dim; the .config attribute of a Hugging Face model holds the model configuration.
From BERT_TOKENIZER I take vocab_size, cls_token_id, sep_token_id, and pad_token_id as the vocabulary size, start-of-sentence token, end-of-sentence token, and padding token. Variable-length sentences are padded up to the maximum length in a batch with the pad token, so that all sequences end up the same size.
Some background on BERT and the other models, as described in their documentation and papers:
We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications.
"As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP), operating these large models in on-the-edge and/or under constrained computational training or inference budgets remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger counterparts."
GPT-2 : “Generative Pretrained Transformer 2”: “Generative” means the model was trained to predict (or “generate”) the next token in a sequence of tokens in an unsupervised way."
"GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset[1] of 8 million web pages. GPT-2 is trained with a simple objective: predict the next word, given all of the previous words within some text. The diversity of the dataset causes this simple goal to contain naturally occurring demonstrations of many tasks across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X the parameters and trained on more than 10X the amount of data."
Transformer models in general are computationally expensive. Hence, for a faster run I have used GPT2LMHeadModel (i.e. distilgpt2) for decoding one step at a time. It is paired with GPT2Tokenizer (i.e. distilgpt2) so that output tensors are converted back into strings using exactly the same GPT-2 tokenization that was used when the original GPT2LMHeadModel was trained.
The size of the embedding that GPT2LMHeadModel uses is read from GPT_MODEL.config.n_embd.
From GPT_TOKENIZER I take vocab_size and bos_token_id as the vocabulary size and start-of-sentence token, and eos_token_id serves as both the end-of-sentence token and the padding token.
def getField(tokenizer, isGPT):
use_vocab = False
tokenize = tokenizer.tokenize
preprocessing = tokenizer.convert_tokens_to_ids
unk_token = tokenizer.unk_token_id
if isGPT:
init_token=tokenizer.bos_token_id
eos_token=tokenizer.eos_token_id
pad_token=tokenizer.eos_token_id
else:
init_token = tokenizer.cls_token_id
eos_token = tokenizer.sep_token_id
pad_token = tokenizer.pad_token_id
return data.Field(use_vocab = use_vocab,
tokenize = tokenize,
preprocessing = preprocessing,
init_token = init_token,
eos_token = eos_token,
pad_token = pad_token,
unk_token = unk_token)
def getDataset(dataFile,device):
SOURCE = getField(BERT_TOKENIZER, False)
TARGET = getField(GPT_TOKENIZER, True)
data_fields = [('source', SOURCE), ('target', TARGET)]
qa_data = data.TabularDataset(path=dataFile, format='tsv', fields=data_fields)
train_data, test_data = qa_data.split()
train_data, validation_data = train_data.split()
SOURCE.build_vocab(train_data, min_freq = 2)
TARGET.build_vocab(train_data, min_freq = 2)
train_iterator, validation_iterator, test_iterator = data.BucketIterator.splits((train_data, validation_data, test_data),
batch_size = BATCH_SIZE,
sort_key = lambda x: len(x.source),
sort_within_batch = False,
device = device)
return data_fields, train_iterator, validation_iterator, test_iterator
"""
BERT
BERT_TOKENIZER, VOCAB_SIZE, BERT_MODEL, HIDDEN_SIZE
"""
BERT_TOKENIZER = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
VOCAB_SIZE = BERT_TOKENIZER.vocab_size
BERT_MODEL = DistilBertModel.from_pretrained('distilbert-base-uncased')
HIDDEN_SIZE = BERT_MODEL.config.dim
"""
GPT2
GPT_TOKENIZER, GPT_VOCAB_SIZE, PAD, GPT_MODEL, EMBEDDING_SIZE
"""
GPT_TOKENIZER = GPT2Tokenizer.from_pretrained('distilgpt2')
GPT_VOCAB_SIZE = GPT_TOKENIZER.vocab_size
PAD = GPT_TOKENIZER.eos_token_id
GPT_MODEL = GPT2LMHeadModel.from_pretrained('distilgpt2')
EMBEDDING_SIZE = GPT_MODEL.config.n_embd
DATA_FILE = chatbot_data_dir + 'Apple_QA_Pair.tsv'
data_fields, train_iterator, validation_iterator, test_iterator = getDataset(DATA_FILE, device)
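A quick way to sanity-check what the iterators produce (shapes vary per batch, since each batch is padded to its own longest sequence):
sample_batch = next(iter(train_iterator))
print(sample_batch.source.shape)  # (T_src, BATCH_SIZE) question token ids
print(sample_batch.target.shape)  # (T_tgt, BATCH_SIZE) answer token ids
print(BERT_TOKENIZER.decode(sample_batch.source[:, 0].tolist()))  # first question of the batch, as text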
The .config attribute of a Hugging Face model contains the model configuration. BERT_MODEL.config gives:
DistilBertConfig { "_num_labels": 2, "activation": "gelu", "architectures": [ "DistilBertForMaskedLM" ], "attention_dropout": 0.1, "bad_words_ids": null, "bos_token_id": null, "decoder_start_token_id": null, "dim": 768, "do_sample": false, "dropout": 0.1, "early_stopping": false, "eos_token_id": null, "finetuning_task": null, "hidden_dim": 3072, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "max_position_embeddings": 512, "min_length": 0, "model_type": "distilbert", "n_heads": 12, "n_layers": 6, "no_repeat_ngram_size": 0, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_past": true, "pad_token_id": 0, "prefix": null, "pruned_heads": {}, "qa_dropout": 0.1, "repetition_penalty": 1.0, "seq_classif_dropout": 0.2, "sinusoidal_pos_embds": false, "task_specific_params": null, "temperature": 1.0, "tie_weights_": true, "top_k": 50, "top_p": 1.0, "torchscript": false, "use_bfloat16": false, "vocab_size": 30522 }
GPT_MODEL.config
GPT2Config { "_num_labels": 1, "activation_function": "gelu_new", "architectures": [ "GPT2LMHeadModel" ], "attn_pdrop": 0.1, "bad_words_ids": null, "bos_token_id": 50256, "decoder_start_token_id": null, "do_sample": false, "early_stopping": false, "embd_pdrop": 0.1, "eos_token_id": 50256, "finetuning_task": null, "id2label": { "0": "LABEL_0" }, "initializer_range": 0.02, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0 }, "layer_norm_epsilon": 1e-05, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "gpt2", "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_layer": 6, "n_positions": 1024, "no_repeat_ngram_size": 0, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_past": true, "pad_token_id": null, "prefix": null, "pruned_heads": {}, "repetition_penalty": 1.0, "resid_pdrop": 0.1, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "torchscript": false, "use_bfloat16": false, "vocab_size": 50257 }
While dealing with sequential data like a sentence, we need to take the sequence order into account. Without information about the sequence order, randomly shuffled words in a sentence would undermine the model's ability to predict the next sequence. Hence, I am using positional encoding, the technique introduced with the Transformer architecture and further explored by Takase and Okazaki.
Purpose of Positional Encoding layer is to encode into embedded tensors the positional information capturing the order of sequences.
Let's say $\boldsymbol{h}_i \in \mathbb{R}^D$ represents the embedding for a token $x_i$ fed into the network. For a sequence of input symbols we have a sequence of embeddings $\boldsymbol{h}_1$, $\boldsymbol{h}_2$, $\ldots, \boldsymbol{h}_t, \ldots$, $\boldsymbol{h}_T$. We form a position vector $P(t)$ that we can add to our embeddings to create an improved embedding $\tilde{\boldsymbol{h}_t}$ containing information about both the original content $x_t$ and its location as the $t$'th item in the input.
Now, we can proceed with $\tilde{\boldsymbol{h}_1}, \tilde{\boldsymbol{h}_2}, \ldots, \tilde{\boldsymbol{h}_t}, \ldots, \tilde{\boldsymbol{h}_T}$ as the inputs to the rest of our network, knowing that the sequential nature has been placed inside of the embeddings themselves.
To avoid giving equal weight to the content (the original embedding $\boldsymbol{h}_t$) and the position $P(t)$, we up-weight the importance of the content over the position.
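Concretely, the implementation below uses the standard sinusoidal form
$$P(t)_{2i} = \sin\left(\frac{t}{10000^{2i/D}}\right), \qquad P(t)_{2i+1} = \cos\left(\frac{t}{10000^{2i/D}}\right)$$
and forms the improved embedding as $\tilde{\boldsymbol{h}_t} = \sqrt{D}\,\boldsymbol{h}_t + P(t)$; the $\sqrt{D}$ factor is the content up-weighting mentioned above.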
class PositionalEncoding(nn.Module):
"""
Element-wise adding information regarding the relative or absolute position of the tokens in the sequence.
"""
def __init__(self, d_model, dropout=0.1, max_len=5000, batch_first=False):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
self.d_model = d_model
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
self.batch_first = batch_first
def forward(self, x):
"""
input shape (B, T, D) to (T, B, D)
"""
if self.batch_first:
x = x.permute(1, 0, 2)
x = x *np.sqrt(self.d_model) + self.pe[:x.size(0), :]
x = self.dropout(x)
"""
going back to (B, T, D) shape
"""
if self.batch_first:
x = x.permute(1, 0, 2)
return x
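A minimal shape check for this layer, using a dummy tensor rather than real data:
pos_enc = PositionalEncoding(d_model=HIDDEN_SIZE, dropout=0.1)
dummy = torch.zeros(10, BATCH_SIZE, HIDDEN_SIZE)  # (T, B, D)
print(pos_enc(dummy).shape)  # torch.Size([10, 6, 768]) -- same shape, positional information added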
I have used nn.TransformerEncoder for the encoding transformer; it takes a tensor of shape $(T, B, D)$ and expects all $T$ items at once.
The inputs to the encoder are the embedded question sequence $\boldsymbol{Q} = \boldsymbol{q}_1, \boldsymbol{q}_2, \ldots, \boldsymbol{q}_T$.
Positional encoding is applied to the embedded feature vectors produced by DistilBertModel (via DistilBertTokenizer) to include information about sequence ordering and produce better results.
The first input to the decoder will be the output of the encoder transformer.
Finally, I return the encoder transformer outputs, which carry the semantic information about the question put to the bot.
The queries, keys, and values can come from different inputs. This is used to create a new building block called multi-headed attention.
The idea behind multi-headed attention is to run multiple attention mechanisms on a single input; the parallel attention computations are the "heads" of the attention. Say we want $H$ heads. To accomplish this we simply create multiple attention functions, computing
$$\text{head}_i = \text{Attention}(\boldsymbol{Q} W_i^Q, \boldsymbol{K} W_i^K, \boldsymbol{V} W_i^V), \qquad i = 1, \ldots, H$$
Finally, after concatenating the outputs, another projection $W^O$ is applied so that the result has the desired shape.
Final multi-headed attention function:
$$\text{MultiHead}(\boldsymbol{Q}, \boldsymbol{K}, \boldsymbol{V}) = \text{Concat}(\text{head}_1, \ldots, \text{head}_H)\, W^O$$
class EncoderTransformer(nn.Module):
def __init__(self, H, n_head=8, n_layers=1, dropout=0.5):
super(EncoderTransformer, self).__init__()
self.pos_encoder = PositionalEncoding(HIDDEN_SIZE, dropout)
self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=HIDDEN_SIZE, nhead=n_head), num_layers= n_layers)
def forward(self, embedded, input_mask):
embedded = self.pos_encoder(embedded)
"""
enc_outs : (T, B, H)
"""
enc_outs = self.encoder(embedded, input_mask)
return enc_outs
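A similar quick check for the encoder, using a dummy embedded input and the same square subsequent mask that GPTSeq2Seq builds further below:
enc = EncoderTransformer(HIDDEN_SIZE)
dummy_embedded = torch.zeros(12, BATCH_SIZE, HIDDEN_SIZE)  # (T, B, D)
dummy_mask = nn.Transformer().generate_square_subsequent_mask(12)  # (T, T)
print(enc(dummy_embedded, dummy_mask).shape)  # torch.Size([12, 6, 768])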
For decoding I have used nn.TransformerDecoder together with GPT2LMHeadModel (i.e. distilgpt2), decoding one step at a time in a token-by-token fashion.
GPT2LMHeadModel: "The GPT2 Model transformer with a language modeling head on top."
Positional encoding is applied to the transformer encoder output to include information about sequence ordering and improve the result.
nn.TransformerDecoder takes as input a positionally encoded sequence, the transformer encoder output as context/memory, and a subsequent (look-ahead) mask so that each position can only attend to earlier positions.
The Transformer has built-in multi-headed attention.
The input embeddings for the GPT-2 decoder GPT2LMHeadModel are populated from the outcome of the layers above.
The decoded output from GPT2LMHeadModel is passed back in as the next decoder input to predict the following token.
The DecoderTransformer produces predictions for the answer sequence, $\boldsymbol{\hat{A}} = \boldsymbol{\hat{a}}_1, \boldsymbol{\hat{a}}_2, \ldots, \boldsymbol{\hat{a}}_T$. Cross-entropy loss is calculated between $\boldsymbol{\hat{A}}$ and $\boldsymbol{A}$.
The decoder returns the predicted token scores for every decoding step.
class DecoderTransformer(nn.Module):
def __init__(self, hidden_size, n_head=8, n_layers=1, dropout=0.5):
super(DecoderTransformer, self).__init__()
self.pos_decoder = PositionalEncoding(EMBEDDING_SIZE, dropout)
self.output_mask = None
self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(d_model=EMBEDDING_SIZE, nhead=n_head), num_layers=n_layers)
self.gpt_model = GPT_MODEL
def forward(self, enc_memory, batch_size, answer, decode_steps, is_teacher_forcing):
"""
output_seq : (T, B, H)
"""
output_seq = self.pos_decoder(enc_memory)
if self.output_mask is None or self.output_mask.size(0) != output_seq.size(0):
self.output_mask = nn.Transformer().generate_square_subsequent_mask(output_seq.size(0)).to(output_seq.device)
"""
(T, B, H)
"""
decoded_in = self.decoder(tgt = output_seq, memory = enc_memory, tgt_mask = self.output_mask)
"""
Either Teacher Forcing OR Auto-Regressive.
"""
teacher_forcing = random.random() < is_teacher_forcing
h_previous = None
preds = torch.zeros(decode_steps, batch_size, GPT_VOCAB_SIZE, device=device)
"""
(B, T, H) Initial step
"""
h_decoded_outs, h_previous = self.gpt_model(input_ids=None, inputs_embeds=decoded_in.permute(1, 0, 2), past=h_previous)
for decode_step in range(decode_steps):
if decode_step > 0:
if self.training and teacher_forcing:
decoded_in = answer[decode_step].unsqueeze(0)
h_decoded_outs, h_previous = self.gpt_model(decoded_in.permute(1, 0), past=h_previous)
h_decoded_outs = h_decoded_outs.permute(1, 0, 2)
preds[decode_step] = h_decoded_outs[-1]
"""
The predicted token becomes the decoder input for the next time step's prediction.
"""
decoded_in = torch.argmax(h_decoded_outs[-1], 1).unsqueeze(0)
return preds
Components:
A DistilBertModel (i.e. distilbert-base-uncased) embedding layer to convert tokens into feature vectors.
BERT: "Bidirectional Encoder Representations from Transformers".
DistilBERT: "DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter."
Next, for the encoder I have used nn.TransformerEncoder, an encoding transformer that takes a tensor of shape $(T, B, D)$ and expects all $T$ items at once.
For the decoder I have used nn.TransformerDecoder together with GPT2LMHeadModel (i.e. distilgpt2). It generates the output one item at a time.
GPT2LMHeadModel: "The GPT2 Model transformer with a language modeling head on top."
Auto-Regressive: at each decoding step, the model's own previous prediction is fed back as the next decoder input.
Teacher Forcing: during training, the ground-truth answer token is fed in as the next decoder input instead of the model's prediction.
For DistilBertModel's frozen weights, I used the with torch.no_grad(): context instead of manually setting each parameter to requires_grad=False. Inside this context no operations are tracked by autograd, so no gradients flow back into BERT and its weights are never updated.
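A tiny illustration of the difference, on toy tensors unrelated to the model:
x = torch.randn(3, requires_grad=True)
y1 = (x * 2).sum()
print(y1.requires_grad)  # True  -- this operation is tracked and can be back-propagated
with torch.no_grad():
    y2 = (x * 2).sum()
print(y2.requires_grad)  # False -- inside no_grad() nothing is tracked, so no update can reach x through y2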
class GPTSeq2Seq(nn.Module):
def __init__(self, H=1024, n_residual_blocks=3, dropout=0.5, decode_steps=22):
super(GPTSeq2Seq, self).__init__()
self.decode_steps = decode_steps
self.bert_embedding = BERT_MODEL
self.input_mask = None
self.encoder = EncoderTransformer(H)
self.decoder = DecoderTransformer(H)
def forward(self, question, answer):
"""
Embedding: (T, B) to (T, B, H)
inputs : (T, B). BERT takes (B, T), so the dimensions are permuted first.
The Transformer expects a tensor of shape (T, B, H), hence the dimensions are swapped back afterwards.
embedded : (T, B, H)
"""
with torch.no_grad():
embedded = self.bert_embedding(question.permute(1, 0))[0].permute(1, 0, 2)
if self.input_mask is None or self.input_mask.size(0) != question.size(0):
self.input_mask = nn.Transformer().generate_square_subsequent_mask(question.size(0)).to(question.device)
"""
Encoding
"""
enc_memory = self.encoder(embedded, self.input_mask)
"""
Decoding
"""
decode_steps = self.decode_steps
"""
Training : the question/answer pair gives the exact decode length.
Testing : the configured decode_steps is used.
"""
if answer is not None:
decode_steps = answer.size(0) - 1
"""
Either Teacher Forcing OR Auto-Regressive
"""
teacher_forcing_ratio = 1 if self.training else 0
prediction = self.decoder(enc_memory, question.size(1), answer, decode_steps , teacher_forcing_ratio)
return prediction
gpt_seq2seq_model = GPTSeq2Seq()
def show_model_architecture(model_dict):
"""
Show the GPT2 Seq2seq model's architecture.
"""
print("*"*100)
print("GPT2 SEQ2SEQ Models Architechture".rjust(65))
print("*"*100 + "\n\n")
for m in model_dict:
print("*"*100)
print(m.rjust(55))
print("*"*100)
print(model_dict[m])
print("*"*100)
print('\n\n')
"""
Show the structure of all of the models of GPT2 Seq2seq models
"""
show_model_architecture({'GPT2_SEQ2SEQ':gpt_seq2seq_model})
**************************************************************************************************** GPT2 SEQ2SEQ Models Architechture **************************************************************************************************** **************************************************************************************************** GPT2_SEQ2SEQ **************************************************************************************************** GPTSeq2Seq( (bert_embedding): DistilBertModel( (embeddings): Embeddings( (word_embeddings): Embedding(30522, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (transformer): Transformer( (layer): ModuleList( (0): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (1): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (2): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (3): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (4): TransformerBlock( (attention): 
MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (5): TransformerBlock( (attention): MultiHeadSelfAttention( (dropout): Dropout(p=0.1, inplace=False) (q_lin): Linear(in_features=768, out_features=768, bias=True) (k_lin): Linear(in_features=768, out_features=768, bias=True) (v_lin): Linear(in_features=768, out_features=768, bias=True) (out_lin): Linear(in_features=768, out_features=768, bias=True) ) (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (ffn): FFN( (dropout): Dropout(p=0.1, inplace=False) (lin1): Linear(in_features=768, out_features=3072, bias=True) (lin2): Linear(in_features=3072, out_features=768, bias=True) ) (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) ) ) ) (encoder): EncoderTransformer( (pos_encoder): PositionalEncoding( (dropout): Dropout(p=0.5, inplace=False) ) (encoder): TransformerEncoder( (layers): ModuleList( (0): TransformerEncoderLayer( (self_attn): MultiheadAttention( (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=2048, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=2048, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) ) ) ) ) (decoder): DecoderTransformer( (pos_decoder): PositionalEncoding( (dropout): Dropout(p=0.5, inplace=False) ) (decoder): TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=2048, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=2048, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (gpt_model): GPT2LMHeadModel( (transformer): GPT2Model( (wte): Embedding(50257, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) 
(attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (lm_head): Linear(in_features=768, out_features=50257, bias=False) ) ) ) ****************************************************************************************************
Weight Initialization and Freezing the BERT and GPT Parameters
Weight freezing means not altering the parameters/coefficients of a layer. Gradients may still be calculated and back-propagated through the layer, but the gradient descent update makes no change to it - as if we had set the learning rate $\eta=0$.
It is not possible to use weight freezing for all layers of the network; that would mean there is nothing to train. Training is only meaningful if we adjust at least some of the parameters of the model. We will freeze the weights of the bert and gpt layers and only change the weights of the layers I added for fine-tuning. This implicitly assumes that the representations learned by BERT and GPT-2 are already good.
To do this, I set the requires_grad flag to False for the bert and gpt layers. No gradient is then saved for those parameters after back-propagation, so no changes occur when the optimizer performs the update step.
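Once frozen_weights_init below has been applied, a quick optional check is to count the trainable parameters:
trainable = sum(p.numel() for p in gpt_seq2seq_model.parameters() if p.requires_grad)
total = sum(p.numel() for p in gpt_seq2seq_model.parameters())
print(f"{trainable:,} trainable out of {total:,} parameters")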
Optimizer And Loss Function
nn.CrossEntropyLoss is used as the loss function. By passing the padding token id (here the GPT-2 eos_token_id) as ignore_index, nn.CrossEntropyLoss will not calculate any loss for padded positions.
"""
Weight initialization
"""
def frozen_weights_init(m):
for parameter_name, parameter in m.named_parameters():
# match the sub-module names used in GPTSeq2Seq: 'bert_embedding' (DistilBERT) and 'gpt_model' (GPT-2)
for w in ['bert_embedding.', 'gpt_model.']:
if parameter_name.find(w) != -1:
parameter.requires_grad = False
if parameter.requires_grad:
nn.init.normal_(parameter.data, mean=0, std=0.01)
frozen_weights_init(gpt_seq2seq_model)
optimizer = torch.optim.AdamW(gpt_seq2seq_model.parameters())
loss_func = nn.CrossEntropyLoss(ignore_index= PAD)
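A small, self-contained illustration of ignore_index with made-up logits and targets:
demo_logits = torch.randn(4, GPT_VOCAB_SIZE)  # predictions for 4 positions
demo_targets = torch.tensor([11, 22, PAD, PAD])  # the last two positions are padding
print(loss_func(demo_logits, demo_targets))  # loss is averaged over the two non-PAD positions only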
Let's denote $\boldsymbol{x}$ as the input feature, and $f()$ as the model. If there is a label associated with $\boldsymbol{x}$, we denote it as $y$. Our model takes in $\boldsymbol{x}$ and produces a prediction $\hat{y}$, so $\hat{y} = f(\boldsymbol{x})$. The model needs to adjust some parameters to provide better predictions, thus generating a better model. If $\Theta$ denotes all the parameters of a model, then $\hat{y} = f_\Theta(\boldsymbol{x})$ represents that the model's behavior depends on the value of its parameters $\Theta$, also known as the "state" of the model.
Our goal for training is to minimize the loss function which quantifies just how badly the model is doing at the goal of predicting the ground truth $y$. If $y$ is goal, and $\hat{y}$ is the prediction, then loss function is denoted by $\ell(y, \hat{y})$. If there is a training set with $N$ examples, the equation is:
$$\min_{\Theta} \sum_{i=1}^N \ell(f_\Theta(\boldsymbol{x}^{(i)}), y^{(i)}) $$The summation ($\sum_{i=1}^N$) is going over all $N$ pairs of input ($\boldsymbol{x}^{(i)}$) and output ($y^{(i)}$), and determining just how badly ($\ell(\cdot,\cdot)$) are doing. To create the best possible model $\Theta$ is adjusted using gradient descent. If $\Theta_k$ is the current state of our model, which needs to improve, then the next state $\Theta_{k+1}$, that hopefully reduces the loss of the model in terms of a mathematical equation is:
$$\Theta_{k+1} = \Theta_k - \eta \cdot \frac{1}{N} \sum_{i=1}^{N} \nabla_{\Theta}\ell(f_{\Theta_k}(\boldsymbol{x}^{(i)}), y^{(i)})$$The above equation is the mathematical representation of gradient descent. We follow the gradient ($\nabla$) to tell us how to adjust $\Theta$. As PyTorch provides APIs for automatic differentiation, we can easily compute $\nabla_{\Theta}$ and don't have to keep track of everything inside $\Theta$. $\eta$ is the learning rate, or step size.
For training we need :
def train_gpt_seq2seq(model, loss_func, train_iterator, val_iterator=None,
epochs=50, device="cpu", optimizer=None):
to_track = ["epoch", "total time", "train loss"]
if val_iterator is not None:
to_track.append("val loss")
total_train_time = 0
results = {}
for item in to_track:
results[item] = []
"""
Place the model on the correct compute resource (CPU or GPU)
"""
model.to(device)
best_val_loss = float('inf')
for epoch in tqdm(range(epochs), desc="Epoch", disable=False):
"""
Put our model in training mode
"""
model = model.train()
BERT_MODEL.eval()
GPT_MODEL.eval()
running_loss = 0.0
start = time.time()
for i, batch in enumerate(tqdm(train_iterator, desc="Train Batch Iterator", leave=False, disable=False)):
question = batch.source
answer = batch.target
"""
PyTorch accumulates gradients in a mutable data structure, so we need to reset it to a clean state
to avoid carrying over stale information from the previous iteration.
"""
optimizer.zero_grad()
"""
This just computed f_Θ(x(i))
target : (T, B)
y_hat : (T, B, D)
"""
y_hat = model(question = question,
answer = answer)
"""
Compute loss.
"""
loss = loss_func(y_hat.view(-1, y_hat.shape[-1]), answer[1:].view(-1))
"""
Compute ∇_Θ.
"""
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
"""
Now need to update all the parameters.
Θ_{k+1} = Θ_k − η * ∇_Θ ℓ(y_hat, y)
"""
optimizer.step()
"""
Capture Training Loss to plot it.
"""
running_loss += loss.item()
"""
end training epoch
"""
end = time.time()
total_train_time += (end-start)
results["epoch"].append( epoch )
results["total time"].append( total_train_time )
results["train loss"].append( (running_loss/len(train_iterator)) )
if val_iterator is None:
pass
else:
"""
Set the model to evaluation mode (e.g. disables dropout); no optimizer updates are performed during validation.
"""
model = model.eval()
val_running_loss = 0.0
for i, batch in enumerate(val_iterator):
question = batch.source
answer = batch.target
y_hat = model(question = question,
answer = answer )
loss = loss_func(y_hat.view(-1, y_hat.shape[-1]), answer[1:].view(-1))
"""
Capture validation Loss.
"""
val_running_loss += loss.item()
results["val loss"].append( (val_running_loss/len(val_iterator)) )
if val_running_loss < best_val_loss:
"""
Save best Model in separate checkpoint file.
"""
torch.save(model.state_dict(), chatbot_checkpoint_dir + 'Apple_QA_GPT_Seq2seq_ChatBot_BestModel.pt')
best_val_loss = val_running_loss
"""
Save Model in a checkpoint file.
"""
torch.save(model.state_dict(), chatbot_checkpoint_dir + 'Apple_QA_GPT_Seq2seq_ChatBot.pt')
result_df = pd.DataFrame.from_dict(results)
result_df.to_csv(chatbot_checkpoint_dir + "Apple_QA_GPT_Seq2seq_Results.csv")
return result_df
import torch
torch.cuda.empty_cache()
result_df = train_gpt_seq2seq(model = gpt_seq2seq_model,
loss_func = loss_func,
train_iterator = train_iterator,
epochs = 4,
device = device,
optimizer = optimizer
)
"""
Load State Dict
"""
checkpointFile = chatbot_checkpoint_dir + 'Apple_QA_GPT_Seq2seq_ChatBot.pt'
loaded_checkpoint = torch.load(checkpointFile)
gpt_seq2seq_model.load_state_dict(loaded_checkpoint)
gpt_seq2seq_model = gpt_seq2seq_model.eval()
"""
GPTSeq2seq ChatBot Training Losses
"""
loaded_results_df = pd.read_csv(chatbot_checkpoint_dir + "Apple_QA_GPT_Seq2seq_Results.csv")
sns.set(style='darkgrid')
plt.figure(figsize=(12,6))
plt.plot(loaded_results_df['train loss'], 'b-o')
plt.title("Training Loss of GPT SEQ2SEQ QABot", fontsize=16)
plt.xlabel('Number of Epochs', fontsize=14)
plt.xticks(range(4))
plt.ylabel('Training Losses', fontsize=14)
plt.show()
"""
Load saved model
"""
gpt_seq2seq_model = GPTSeq2Seq()
gpt_seq2seq_model.load_state_dict(torch.load(chatbot_checkpoint_dir + 'Apple_QA_GPT_Seq2seq_ChatBot.pt'))
<All keys matched successfully>
getQuestion = lambda x : BERT_TOKENIZER.decode(x.tolist())
getSentence = lambda x : GPT_TOKENIZER.decode(x.tolist())
gpt_seq2seq_model = gpt_seq2seq_model.eval().to(device)
def getPrediction(model,idx_lst):
answers, pred_answers = [], []
for i, batch in tqdm(enumerate(test_iterator), desc="Predict", disable=False):
question_tensor, answer_tensor = batch.source, batch.target
with torch.no_grad():
predictions = model(question = question_tensor,
answer = answer_tensor)
question_s, answer_s, pred_ans = getQuestion(question_tensor[:,0]), getSentence(answer_tensor[:,0]), getSentence(torch.argmax(predictions[:, 0, :], 1))
answers.append(answer_s[:-1])
pred_answers.append(pred_ans)
if i in idx_lst:
print("\nAnswer : ", answer_s,"\nPredicted : ", pred_ans)
print("*"*100,"\n\n")
"""
Predictions
"""
print("*"*15)
print("Predictions :")
print("*"*15)
getPrediction(gpt_seq2seq_model, [12,15,18,20,22,34,88,98,95,100,165,189,190,398,555,879,980,999])
*************** Predictions : ***************
Answer : <|endoftext|>try the steps in this article to correct the issue: <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to get this sorted out dm us <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>you're welcome if you have any further questions, feel free to reach out<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to get this sorted out dm us <url> <url> <url> <url> <url> <url **************************************************************************************************** Answer : <|endoftext|>got it have you tried to work through these steps: <url> if not, give them a shot and update us<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to get this sorted out dm us <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url **************************************************************************************************** Answer : <|endoftext|>let's create a backup of your data and update to ios to see if that helps here's how: <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to get this sorted out dm us <url> <url> <url> <url> <url> <url> **************************************************************************************************** Answer : <|endoftext|>heres what you can do to work around the issue until its fixed in a future software update: <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to get this sorted out dm us <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url **************************************************************************************************** Answer : <|endoftext|>we'd like to help which iphone are you using also, tell us more details of what you're experiencing<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to get started please dm us <url> <url> <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>we'd like to help if you're having issues with your apple product dm us more details and we'll get started <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to 
work on your dm us your current ios version number to proceed <url> <url> <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>no problem, were happy to help let us know if we can assist with anything in the future have a great day<|endoftext|><|endoftext|> Predicted : we'd like to work on your dm us your current ios version number to proceed <url> <url> < **************************************************************************************************** Answer : <|endoftext|>thanks for reaching out about this behavior lets work together to get this addressed to clarify, which device are you using approximately how long has this been happening let us know via a dm well go from there <url><|endoftext|><|endoftext|> Predicted : we'd like to work on your dm us your current ios version number to proceed <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>we can certainly take a look to begin, can you tell us more about the behavior you're experiencing<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your dm us your current ios version number to proceed <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>we want to help you with any issues youre having send us some details about whats going on in dm to get started <url><|endoftext|> Predicted : we'd like to assist dm us <url> <url> <url> <url> <url> <url> <url> **************************************************************************************************** Answer : <|endoftext|>we're here for you check out this article for help: <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your iphone, and ios version of ios software, please dm us <url> <url> <url> <url> <url> <url> <url> <url> <url> **************************************************************************************************** Answer : <|endoftext|>we'd like to help are you seeing that emoji when typing in any app or is it just in your frequently used 
emojis<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your iphone, and ios version of ios software, please dm us <url> <url> <url> <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>we'd like to take a further look at this with you please meet us in dm <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your iphone x, and ios version of ios is ios installed you can check by going to settings > general > about please dm us <url **************************************************************************************************** Answer : <|endoftext|>we're here to help take a look at this article and dm us if you need more help <url> <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your iphone x, which ios version is installed when did you first notice this issue dm us <url> <url> <url> <url> <url> <url> <url> <url> **************************************************************************************************** Answer : <|endoftext|>thanks for reaching out confirm your country for us in a dm using the link below and we'll continue <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to see how we can help to start, send us a dm with more information about what's happening <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>we'd like to help send us a dm with the ios version you are currently using and we can start there <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to help dm us and we'll continue <url> <url> <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>heres what you can do to work around the issue until its fixed in a future software update: <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to help dm us and we'll continue <url> <url> <url> <url> <url> <url> <url> <url> <url> < ****************************************************************************************************
def getPrediction(model, idx_lst):
    answers, pred_answers = [], []
    for i, batch in tqdm(enumerate(test_iterator), desc="Predict", disable=False):
        question_tensor, answer_tensor = batch.source, batch.target
        with torch.no_grad():
            predictions = model(question=question_tensor,
                                answer=answer_tensor)
        # Decode the first example of the batch by greedy argmax over the vocabulary at each timestep.
        question_s, answer_s, pred_ans = getQuestion(question_tensor[:, 0]), getSentence(answer_tensor[:, 0]), getSentence(torch.argmax(predictions[:, 0, :], 1))
        answers.append(answer_s[:-1])
        pred_answers.append(pred_ans)
        # Print a few selected examples for inspection.
        if i in idx_lst:
            print("\nAnswer : ", answer_s, "\nPredicted : ", pred_ans)
            print("*" * 100, "\n\n")
"""
Predictions
"""
print("*"*15)
print("Predictions :")
print("*"*15)
getPrediction(gpt_seq2seq_model, [180,200,323,489,555,698,755,888,930,1111])
*************** Predictions : ***************
Answer : <|endoftext|>are you seeing the same issue when you aren't connected to wi fi<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your iphone, and ios version of ios software, please dm us <url> <url> <url **************************************************************************************************** Answer : <|endoftext|>were here to help tell us a bit about what is happening and well work with you to find a solution<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your iphone, and ios version of ios software, please dm us <url> <url> <url> <url> <url> <url> <url> <url> <url> <url **************************************************************************************************** Answer : <|endoftext|>try this out: <url> that should help with the autocorrect issue dm us if it does not <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your iphone x is it's the current ios version number that you're using you can find itunes version number in settings > general **************************************************************************************************** Answer : <|endoftext|>we'd be glad to help meet up with us in dm with the issues you're facing, and we'll take it from there <url><|endoftext|> Predicted : we'd like to work on your iphone x, which ios version is installed when did you first notice this issue dm us <url **************************************************************************************************** Answer : <|endoftext|>we're here to help take a look at this article and dm us if you need more help <url> <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to work on your iphone x, which ios version is installed when did you first notice this issue dm us <url> <url> <url> <url> <url> <url> <url> <url> **************************************************************************************************** Answer : <|endoftext|>thanks for reaching out so we can provide you with the best steps, dm us which device and ios version you're using <url><|endoftext|> Predicted : we'd like to work on this dm us with which iphone you're using and the ios version number it's running <url> **************************************************************************************************** Answer : <|endoftext|>you're in the right place for help for the best support, are you getting any errors if so, what message is shown<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd 
like to see how we can help to start, send us a dm with more information about what's happening <url> <url> <url> **************************************************************************************************** Answer : <|endoftext|>that doesn't sound like expected behavior for your battery let us know how long you've been experiencing this in dm <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to see how we can help to start, send us a dm with more information about what's happening <url> <url> <url> <url> <url> <url> **************************************************************************************************** Answer : <|endoftext|>we'd like to help with this could you please send us a dm so we can gather some info <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to help dm us and we'll continue <url> <url> <url> <url> <url> <url> < **************************************************************************************************** Answer : <|endoftext|>let's take a closer look into this issue select the following link to join us in a dm and we'll continue there <url><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Predicted : we'd like to help send us a dm with more information, and we'll continue from there <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url> <url ****************************************************************************************************
From test_iterator, get question_tensor and answer_tensor.
Pass question_tensor to the gptseq2seq model in eval mode and under torch.no_grad() to prevent gradient computation. This gives the predicted answer tensor.
Convert the predicted answer tensor and answer_tensor to strings using GPT_TOKENIZER.
Compute the BLEU score between the reference answer and the predicted answer.
def calculateBleuScore(model):
    answers, pred_answers = [], []
    for i, batch in tqdm(enumerate(test_iterator), desc="BLEU", disable=False):
        question_tensor, answer_tensor = batch.source, batch.target
        with torch.no_grad():
            predictions = model(question=question_tensor,
                                answer=answer_tensor)
        answer_s, pred_ans = getSentence(answer_tensor[:, 0]), getSentence(torch.argmax(predictions[:, 0, :], 1))
        # bleu_score expects tokenized candidates and, per candidate, a list of tokenized references.
        answers.append([answer_s.split(" ")[:-1]])
        pred_answers.append(pred_ans.split(" "))
    return bleu_score(pred_answers, answers)
"""
Calculate BLEU Score
"""
bleu = calculateBleuScore(gpt_seq2seq_model)
print('BLEU Score : {:.4f}'.format(bleu))
BLEU Score : 0.4872
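For reference, torchtext's bleu_score takes a list of tokenized candidate sentences and, for each candidate, a list of tokenized reference sentences (multiple references per candidate are allowed), which is why calculateBleuScore wraps each reference in an extra list. A minimal sketch with made-up sentences (not from the dataset):
from torchtext.data.metrics import bleu_score

# Made-up tokenized sentences, purely to illustrate the expected nesting.
candidates = [["we", "would", "like", "to", "help", "you"],
              ["please", "dm", "us", "your", "ios", "version"]]
# One list of references per candidate.
references = [[["we", "would", "like", "to", "help", "you", "today"]],
              [["please", "dm", "us", "the", "ios", "version"]]]

print(bleu_score(candidates, references))  # corpus-level 4-gram BLEU by default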
The F1 score takes into account co-occurring words regardless of their order.
From test_iterator, get question_tensor and answer_tensor.
Pass question_tensor to the gptseq2seq model in eval mode and under torch.no_grad() to prevent gradient computation. This gives the predicted answer tensor.
Convert the predicted answer tensor and answer_tensor to strings using GPT_TOKENIZER.
Count the number of common words between them.
Calculate precision and recall.
Calculate the F1 score using the following formula.
F1 Score :
$$
\frac{2 \times precision \times recall}{precision + recall}
$$
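As a quick check of the formula on two made-up sentences (not from the dataset), the word-overlap F1 can be computed directly with collections.Counter, mirroring the function below:
import collections

# Hypothetical reference/prediction pair.
answer    = "we d like to help dm us"
predicted = "we d like to help please dm us your details"

common = sum((collections.Counter(answer.split(" "))
              & collections.Counter(predicted.split(" "))).values())  # 7 shared words
precision = common / len(predicted.split(" "))   # 7 / 10 = 0.70
recall    = common / len(answer.split(" "))      # 7 / 7  = 1.00
f1 = 2 * precision * recall / (precision + recall)
print(round(f1, 4))                              # ~0.8235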
def calculate_f1_score(model):
    f1_scores = []
    for i, batch in tqdm(enumerate(test_iterator), desc="F1 Score", disable=False):
        question_tensor, answer_tensor = batch.source, batch.target
        with torch.no_grad():
            predictions = model(question=question_tensor,
                                answer=answer_tensor)
        answer_s, pred_ans = getSentence(answer_tensor[:, 0]), getSentence(torch.argmax(predictions[:, 0, :], 1))
        # Count words that appear in both reference and prediction (multiset intersection).
        number_of_common_words = sum((collections.Counter(answer_s.split(" ")) & collections.Counter(pred_ans.split(" "))).values())
        if number_of_common_words == 0:
            f1_score = 0
        else:
            # Precision and recall are over word counts, not character counts.
            precision = 1.0 * number_of_common_words / len(pred_ans.split(" "))
            recall = 1.0 * number_of_common_words / len(answer_s.split(" "))
            f1_score = (2 * precision * recall) / (precision + recall)
        f1_scores.append(f1_score)
    return f1_scores
f1_scores = calculate_f1_score(gpt_seq2seq_model)
f1_score = sum(f1_scores)/len(f1_scores)
print('F1 Score : {:.4f}'.format(f1_score))
F1 Score : 0.9112
"Recall-Oriented Understudy for Gisting Evaluation. It includes measures to automatically determine the quality of a summary by comparing it to other (ideal) summaries created by humans. The measures count the number of overlapping units such as n-gram, word sequences, and word pairs between the computer-generated summary to be evaluated and the ideal summaries created by humans.
"Given two sequences X and Y, the longest common subsequence (LCS) of X and recall reflects the proportion of words in X (reference summary sentence) that are also present in Y (candidate summary sentence); while unigram precision is the proportion of words in Y that are also in X. Unigram recall and precision count all cooccurring words regardless their orders; while ROUGE-L counts only in-sequence co-occurrences."
ROUGE-L is one type of ROUGE measures. It is calculated by taking into account longest common subsequence (LCS) between two sequences.It counts only in-sequence co-occurrences.
From test_iterator, get question_tensor and answer_tensor.
Pass question_tensor to the gptseq2seq model in eval mode and under torch.no_grad() to prevent gradient computation. This gives the predicted answer tensor.
Convert the predicted answer tensor and answer_tensor to strings using GPT_TOKENIZER.
Compute the ROUGE-L score between the reference answer and the predicted answer from the longest common subsequence (LCS) of the two sequences.
Apply the ROUGE-L score formula, where R is LCS recall and P is LCS precision:
$$
\frac{(1+\beta^2) \times R \times P}{R + \beta^2 \times P}
$$
def longest_common_subsequence(str1, str2):
    """
    Make a grid of 0's with len(str2) + 1 columns and len(str1) + 1 rows.
    """
    dp = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]
    """
    Iterate up each column, starting from the last one.
    """
    for col in reversed(range(len(str2))):
        for row in reversed(range(len(str1))):
            if str2[col] == str1[row]:
                """
                The corresponding tokens for this cell are the same.
                """
                dp[row][col] = 1 + dp[row + 1][col + 1]
            else:
                """
                Otherwise they must be different.
                """
                dp[row][col] = max(dp[row + 1][col], dp[row][col + 1])
    """
    The original problem's answer is in dp[0][0]. Return it.
    """
    return dp[0][0]
def rougel_score(ans, pred):
    BETA, answers, pred_answers = 1.2, [], []
    if len(pred) == 0 or len(ans) == 0:
        return 0.0
    for idx in range(min(len(pred), len(ans))):
        pred_words, ans_words = pred[idx], ans[idx]
        # LCS-based recall (w.r.t. the reference) and precision (w.r.t. the prediction).
        long_cmmn_subseq = longest_common_subsequence(ans_words, pred_words)
        answers.append(long_cmmn_subseq / float(len(ans_words)))
        pred_answers.append(long_cmmn_subseq / float(len(pred_words)))
    max_ans, max_pred = max(answers), max(pred_answers)
    return ((1 + BETA**2) * max_pred * max_ans) / float(max_ans + BETA**2 * max_pred) if (max_ans != 0 and max_pred != 0) else 0.0
def calculateRougeLScore(model):
    answers, pred_answers = [], []
    for i, batch in tqdm(enumerate(test_iterator), desc="ROUGE-L", disable=False):
        question_tensor, answer_tensor = batch.source, batch.target
        with torch.no_grad():
            predictions = model(question=question_tensor,
                                answer=answer_tensor)
        answer_s, pred_ans = getSentence(answer_tensor[:, 0]), getSentence(torch.argmax(predictions[:, 0, :], 1))
        # Unlike bleu_score, rougel_score takes flat lists of tokens, so no extra nesting here.
        answers.append(answer_s.split(" ")[:-1])
        pred_answers.append(pred_ans.split(" "))
    return rougel_score(answers, pred_answers)
"""
Calculate ROUGE-L Score
"""
rouge_l_score = calculateRougeLScore(gpt_seq2seq_model)
print('ROUGE-L Score : {:.4f}'.format(rouge_l_score))
ROUGE-L Score : 0.9635
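For illustration, the longest_common_subsequence helper above can be applied to a single made-up reference/candidate pair (not from the dataset); since R equals P here, the F-measure reduces to that common value:
# Hypothetical, tokenized reference and candidate.
ref  = "we d like to help dm us".split(" ")
cand = "we d like to dm us please".split(" ")

lcs = longest_common_subsequence(ref, cand)          # LCS = "we d like to dm us" -> 6
R, P, BETA = lcs / len(ref), lcs / len(cand), 1.2    # R = P = 6/7
rouge_l = (1 + BETA**2) * R * P / (R + BETA**2 * P)
print(round(rouge_l, 4))                             # ~0.8571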
gpt_seq2seq_model = gpt_seq2seq_model.to(device).eval()

def get_response(model, question_asked):
    # Write the question to a single-row TSV so it can be loaded through the same torchtext pipeline.
    data_file = chatbot_data_dir + 'question_asked.tsv'
    tsv_writer = csv.writer(open(data_file, 'w', encoding='utf-8', newline=''), delimiter='\t')
    tsv_writer.writerow([question_asked])
    SOURCE = getField(BERT_TOKENIZER, False)
    question_data = data.TabularDataset(path=data_file,
                                        format='tsv',
                                        skip_header=False,
                                        fields=[('source', SOURCE)])
    iterator = data.BucketIterator(question_data,
                                   batch_size=BATCH_SIZE,
                                   sort_key=lambda x: len(x.source),
                                   sort_within_batch=False,
                                   device=device)
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            question_tensor = batch.source
            """
            pred : (T, B, H)
            """
            prediction = model(question_tensor, None)
            pred_s = getSentence(torch.argmax(prediction[:, 0, :], 1))
    return pred_s
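Before starting the interactive loop below, a single direct call with a made-up question can serve as a quick smoke test of the end-to-end write-TSV, load, and predict pipeline (the question text is hypothetical):
# Hypothetical question, purely to verify get_response end to end.
print(get_response(gpt_seq2seq_model, "my phone keeps restarting after the latest ios update"))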
question = ''
print('Bot : Hi, Did you want to chat with me?')
while question.lower()[:3] != 'bye':
    try:
        while True:
            print('Me : ', end='')
            question = input()
            if question:
                break
        if question.lower()[:3] != 'bye':
            response = get_response(gpt_seq2seq_model.to(device).eval(), question)
            print('Bot: ' + response)
        else:
            print('Bot : Bye!! Stay safe. Have a nice day.')
    except KeyError:
        print("Sorry, I am not sure what you are talking about :/")
Bot : Hi, Did you want to chat with me?
Me : My last os update is not working.
Bot: we want to help dm us and let us know what country you're in <url>
Me : Here is my dm attached. I am from USA
Bot: we'd like to help dm us and let us know what country you're in <url>
Me : what else I can do
Bot: we'd like to help dm us and we'll get started <url>
Me : Thanks
Bot: you're
Me : welcome
Bot: thanks for that information let's take a look at this further for you reach out to us in dm
Me : Bye
Bot : Bye!! Stay safe. Have a nice day.