diff --git a/.gitignore b/.gitignore index f393884..ff94ae9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .ipynb_checkpoints data/ +model/ __pycache__ KR* tempo/ @@ -8,3 +9,5 @@ tempo/ qualitative/ outputs/ *.ipynb + +*.json \ No newline at end of file diff --git a/README.md b/README.md index 7f8f286..95ed095 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,36 @@ # SOM-DST +Convert code pytorch-transformers to huggingface transformers + +``` +# Fixed Requirements + +# pip install torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html +torch==1.7.1+cu110 +transformers==3.0.2 +wget==3.2 +jsonlines +tqdm +``` + +``` +# 동일한 Parameter setting (MW 2.1) 성능 소폭 감소 (0.5309 -> 0.5275) +------------------------------ +op_code: 4, is_gt_op: False, is_gt_p_state: False, is_gt_gen: False +Epoch 0 joint accuracy : 0.5275515743756786 +Epoch 0 slot turn accuracy : 0.9732401375316211 +Epoch 0 slot turn F1: 0.9175307139165523 +Epoch 0 op accuracy : 0.9737830256966589 +Epoch 0 op F1 : {'delete': 0.018656716417910446, 'update': 0.8015826338020638, 'dontcare': 0.3235668789808917, 'carryover': 0.9862940159245958} +Epoch 0 op hit count : {'delete': 15, 'update': 7496, 'dontcare': 127, 'carryover': 207607} +Epoch 0 op all count : {'delete': 1576, 'update': 10595, 'dontcare': 581, 'carryover': 208288} +Final Joint Accuracy : 0.3713713713713714 +Final slot turn F1 : 0.9101975987924662 +Latency Per Prediction : 24.244383 ms +----------------------------- +``` + +## The original readme.md is as follows This code is the official pytorch implementation of [Efficient Dialogue State Tracking by Selectively Overwriting Memory](https://arxiv.org/abs/1911.03906).
> [Sungdong Kim](https://github.com/dsksd), [Sohee Yang](https://github.com/soheeyang), [Gyuwan Kim](mailto:gyuwan.kim@navercorp.com), [Sang-woo Lee](https://scholar.google.co.kr/citations?user=TMTTMuQAAAAJ)
@@ -96,10 +127,10 @@ taxi 0.5903426791277259 0.9803219106957396 ### Main results on MultiWOZ dataset (Joint Goal Accuracy) -|Model |MultiWOZ 2.0 |MultWOZ 2.1| -|-------------|------------|------------| -|SOM-DST Base | 51.72 | 53.01 | -|SOM-DST Large| 52.32 | 53.68 | +| Model | MultiWOZ 2.0 | MultWOZ 2.1 | +| ------------- | ------------ | ----------- | +| SOM-DST Base | 51.72 | 53.01 | +| SOM-DST Large | 52.32 | 53.68 | ## Citation diff --git a/evaluation.py b/evaluation.py index 8ccbb5e..560bb1c 100644 --- a/evaluation.py +++ b/evaluation.py @@ -5,9 +5,17 @@ """ from utils.data_utils import prepare_dataset, MultiWozDataset -from utils.data_utils import make_slot_meta, domain2id, OP_SET, make_turn_label, postprocessing +from utils.data_utils import ( + make_slot_meta, + domain2id, + OP_SET, + make_turn_label, + postprocessing, +) from utils.eval_utils import compute_prf, compute_acc, per_domain_join_accuracy -from pytorch_transformers import BertTokenizer, BertConfig + +# from pytorch_transformers import BertTokenizer, BertConfig +from transformers import BertTokenizer, BertConfig from model import SomDST import torch.nn as nn @@ -23,51 +31,82 @@ import json from copy import deepcopy -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def main(args): ontology = json.load(open(os.path.join(args.data_root, args.ontology_data))) slot_meta, _ = make_slot_meta(ontology) tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True) - data = prepare_dataset(os.path.join(args.data_root, args.test_data), - tokenizer, - slot_meta, args.n_history, args.max_seq_length, args.op_code) + data = prepare_dataset( + os.path.join(args.data_root, args.test_data), + tokenizer, + slot_meta, + args.n_history, + args.max_seq_length, + args.op_code, + ) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = 0.1 op2id = OP_SET[args.op_code] - model = SomDST(model_config, len(op2id), len(domain2id), op2id['update']) - ckpt = torch.load(args.model_ckpt_path, map_location='cpu') + model = SomDST(model_config, len(op2id), len(domain2id), op2id["update"]) + ckpt = torch.load(args.model_ckpt_path, map_location="cpu") model.load_state_dict(ckpt) model.eval() model.to(device) if args.eval_all: - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - False, False, False) - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - False, False, True) - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - False, True, False) - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - False, True, True) - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - True, False, False) - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - True, True, False) - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - True, False, True) - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - True, True, True) + model_evaluation( + model, data, tokenizer, slot_meta, 0, args.op_code, False, False, False + ) + model_evaluation( + model, data, tokenizer, slot_meta, 0, args.op_code, False, False, True + ) + model_evaluation( + model, data, tokenizer, slot_meta, 0, args.op_code, False, True, False + ) + model_evaluation( + model, data, tokenizer, slot_meta, 0, args.op_code, False, True, True + ) + model_evaluation( + model, data, tokenizer, slot_meta, 0, args.op_code, True, False, False + ) + model_evaluation( + model, data, tokenizer, slot_meta, 0, args.op_code, True, True, False + ) + model_evaluation( + model, data, tokenizer, slot_meta, 0, args.op_code, True, False, True + ) + model_evaluation( + model, data, tokenizer, slot_meta, 0, args.op_code, True, True, True + ) else: - model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, - args.gt_op, args.gt_p_state, args.gt_gen) - - -def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4', - is_gt_op=False, is_gt_p_state=False, is_gt_gen=False): + model_evaluation( + model, + data, + tokenizer, + slot_meta, + 0, + args.op_code, + args.gt_op, + args.gt_p_state, + args.gt_gen, + ) + + +def model_evaluation( + model, + test_data, + tokenizer, + slot_meta, + epoch, + op_code="4", + is_gt_op=False, + is_gt_p_state=False, + is_gt_gen=False, +): model.eval() op2id = OP_SET[op_code] id2op = {v: k for k, v in op2id.items()} @@ -91,19 +130,20 @@ def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4', if is_gt_p_state is False: i.last_dialog_state = deepcopy(last_dialog_state) - i.make_instance(tokenizer, word_dropout=0.) + i.make_instance(tokenizer, word_dropout=0.0) else: # ground-truth previous dialogue state last_dialog_state = deepcopy(i.gold_p_state) i.last_dialog_state = deepcopy(last_dialog_state) - i.make_instance(tokenizer, word_dropout=0.) + i.make_instance(tokenizer, word_dropout=0.0) input_ids = torch.LongTensor([i.input_id]).to(device) input_mask = torch.FloatTensor([i.input_mask]).to(device) segment_ids = torch.LongTensor([i.segment_id]).to(device) state_position_ids = torch.LongTensor([i.slot_position]).to(device) - d_gold_op, _, _ = make_turn_label(slot_meta, last_dialog_state, i.gold_state, - tokenizer, op_code, dynamic=True) + d_gold_op, _, _ = make_turn_label( + slot_meta, last_dialog_state, i.gold_state, tokenizer, op_code, dynamic=True + ) gold_op_ids = torch.LongTensor([d_gold_op]).to(device) start = time.perf_counter() @@ -111,12 +151,14 @@ def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4', with torch.no_grad(): # ground-truth state operation gold_op_inputs = gold_op_ids if is_gt_op else None - d, s, g = model(input_ids=input_ids, - token_type_ids=segment_ids, - state_positions=state_position_ids, - attention_mask=input_mask, - max_value=MAX_LENGTH, - op_ids=gold_op_inputs) + d, s, g = model( + input_ids=input_ids, + token_type_ids=segment_ids, + state_positions=state_position_ids, + attention_mask=input_mask, + max_value=MAX_LENGTH, + op_ids=gold_op_inputs, + ) _, op_ids = s.view(-1, len(op2id)).max(-1) @@ -133,20 +175,29 @@ def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4', if is_gt_gen: # ground_truth generation - gold_gen = {'-'.join(ii.split('-')[:2]): ii.split('-')[-1] for ii in i.gold_state} + gold_gen = { + "-".join(ii.split("-")[:2]): ii.split("-")[-1] for ii in i.gold_state + } else: gold_gen = {} - generated, last_dialog_state = postprocessing(slot_meta, pred_ops, last_dialog_state, - generated, tokenizer, op_code, gold_gen) + generated, last_dialog_state = postprocessing( + slot_meta, + pred_ops, + last_dialog_state, + generated, + tokenizer, + op_code, + gold_gen, + ) end = time.perf_counter() wall_times.append(end - start) pred_state = [] for k, v in last_dialog_state.items(): - pred_state.append('-'.join([k, v])) + pred_state.append("-".join([k, v])) if set(pred_state) == set(i.gold_state): joint_acc += 1 - key = str(i.id) + '_' + str(i.turn_id) + key = str(i.id) + "_" + str(i.turn_id) results[key] = [pred_state, i.gold_state] # Compute prediction slot accuracy @@ -159,7 +210,9 @@ def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4', slot_F1_count += count # Compute operation accuracy - temp_acc = sum([1 if p == g else 0 for p, g in zip(pred_ops, gold_ops)]) / len(pred_ops) + temp_acc = sum([1 if p == g else 0 for p, g in zip(pred_ops, gold_ops)]) / len( + pred_ops + ) op_acc += temp_acc if i.is_last_turn: @@ -191,14 +244,20 @@ def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4', tp = tp_dic[k] fn = fn_dic[k] fp = fp_dic[k] - precision = tp / (tp+fp) if (tp+fp) != 0 else 0 - recall = tp / (tp+fn) if (tp+fn) != 0 else 0 - F1 = 2 * precision * recall / float(precision + recall) if (precision + recall) != 0 else 0 + precision = tp / (tp + fp) if (tp + fp) != 0 else 0 + recall = tp / (tp + fn) if (tp + fn) != 0 else 0 + F1 = ( + 2 * precision * recall / float(precision + recall) + if (precision + recall) != 0 + else 0 + ) op_F1_score[k] = F1 print("------------------------------") - print('op_code: %s, is_gt_op: %s, is_gt_p_state: %s, is_gt_gen: %s' % \ - (op_code, str(is_gt_op), str(is_gt_p_state), str(is_gt_gen))) + print( + "op_code: %s, is_gt_op: %s, is_gt_p_state: %s, is_gt_gen: %s" + % (op_code, str(is_gt_op), str(is_gt_p_state), str(is_gt_gen)) + ) print("Epoch %d joint accuracy : " % epoch, joint_acc_score) print("Epoch %d slot turn accuracy : " % epoch, turn_acc_score) print("Epoch %d slot turn F1: " % epoch, slot_F1_score) @@ -210,31 +269,39 @@ def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4', print("Final slot turn F1 : ", final_slot_F1_score) print("Latency Per Prediction : %f ms" % latency) print("-----------------------------\n") - json.dump(results, open('preds_%d.json' % epoch, 'w')) + json.dump(results, open("preds_%d.json" % epoch, "w")) per_domain_join_accuracy(results, slot_meta) - scores = {'epoch': epoch, 'joint_acc': joint_acc_score, - 'slot_acc': turn_acc_score, 'slot_f1': slot_F1_score, - 'op_acc': op_acc_score, 'op_f1': op_F1_score, 'final_slot_f1': final_slot_F1_score} + scores = { + "epoch": epoch, + "joint_acc": joint_acc_score, + "slot_acc": turn_acc_score, + "slot_f1": slot_F1_score, + "op_acc": op_acc_score, + "op_f1": op_F1_score, + "final_slot_f1": final_slot_F1_score, + } return scores if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--data_root", default='data/mwz2.1', type=str) - parser.add_argument("--test_data", default='test_dials.json', type=str) - parser.add_argument("--ontology_data", default='ontology.json', type=str) - parser.add_argument("--vocab_path", default='assets/vocab.txt', type=str) - parser.add_argument("--bert_config_path", default='assets/bert_config_base_uncased.json', type=str) - parser.add_argument("--model_ckpt_path", default='outputs/model_best.bin', type=str) + parser.add_argument("--data_root", default="data/mwz2.1", type=str) + parser.add_argument("--test_data", default="test_dials.json", type=str) + parser.add_argument("--ontology_data", default="ontology.json", type=str) + parser.add_argument("--vocab_path", default="assets/vocab.txt", type=str) + parser.add_argument( + "--bert_config_path", default="assets/bert_config_base_uncased.json", type=str + ) + parser.add_argument("--model_ckpt_path", default="outputs/model_best.bin", type=str) parser.add_argument("--n_history", default=1, type=int) parser.add_argument("--max_seq_length", default=256, type=int) parser.add_argument("--op_code", default="4", type=str) - parser.add_argument("--gt_op", default=False, action='store_true') - parser.add_argument("--gt_p_state", default=False, action='store_true') - parser.add_argument("--gt_gen", default=False, action='store_true') - parser.add_argument("--eval_all", default=False, action='store_true') + parser.add_argument("--gt_op", default=False, action="store_true") + parser.add_argument("--gt_p_state", default=False, action="store_true") + parser.add_argument("--gt_gen", default=False, action="store_true") + parser.add_argument("--eval_all", default=False, action="store_true") args = parser.parse_args() main(args) diff --git a/model.py b/model.py index 7665c91..dd65daf 100644 --- a/model.py +++ b/model.py @@ -6,7 +6,9 @@ import torch import torch.nn as nn -from pytorch_transformers.modeling_bert import BertPreTrainedModel, BertModel + +# from pytorch_transformers.modeling_bert import BertPreTrainedModel, BertModel +from modeling_bert import BertPreTrainedModel, BertModel class SomDST(BertPreTrainedModel): @@ -14,23 +16,47 @@ def __init__(self, config, n_op, n_domain, update_id, exclude_domain=False): super(SomDST, self).__init__(config) self.hidden_size = config.hidden_size self.encoder = Encoder(config, n_op, n_domain, update_id, exclude_domain) - self.decoder = Decoder(config, self.encoder.bert.embeddings.word_embeddings.weight) + self.decoder = Decoder( + config, self.encoder.bert.embeddings.word_embeddings.weight + ) self.apply(self.init_weights) - def forward(self, input_ids, token_type_ids, - state_positions, attention_mask, - max_value, op_ids=None, max_update=None, teacher=None): - - enc_outputs = self.encoder(input_ids=input_ids, - token_type_ids=token_type_ids, - state_positions=state_positions, - attention_mask=attention_mask, - op_ids=op_ids, - max_update=max_update) - - domain_scores, state_scores, decoder_inputs, sequence_output, pooled_output = enc_outputs - gen_scores = self.decoder(input_ids, decoder_inputs, sequence_output, - pooled_output, max_value, teacher) + def forward( + self, + input_ids, + token_type_ids, + state_positions, + attention_mask, + max_value, + op_ids=None, + max_update=None, + teacher=None, + ): + + enc_outputs = self.encoder( + input_ids=input_ids, + token_type_ids=token_type_ids, + state_positions=state_positions, + attention_mask=attention_mask, + op_ids=op_ids, + max_update=max_update, + ) + + ( + domain_scores, + state_scores, + decoder_inputs, + sequence_output, + pooled_output, + ) = enc_outputs + gen_scores = self.decoder( + input_ids, + decoder_inputs, + sequence_output, + pooled_output, + max_value, + teacher, + ) return domain_scores, state_scores, gen_scores @@ -49,9 +75,15 @@ def __init__(self, config, n_op, n_domain, update_id, exclude_domain=False): self.n_domain = n_domain self.update_id = update_id - def forward(self, input_ids, token_type_ids, - state_positions, attention_mask, - op_ids=None, max_update=None): + def forward( + self, + input_ids, + token_type_ids, + state_positions, + attention_mask, + op_ids=None, + max_update=None, + ): bert_outputs = self.bert(input_ids, token_type_ids, attention_mask) sequence_output, pooled_output = bert_outputs[:2] state_pos = state_positions[:, :, None].expand(-1, -1, sequence_output.size(-1)) @@ -75,13 +107,23 @@ def forward(self, input_ids, token_type_ids, n = v.size(1) gap = max_update - n if gap > 0: - zeros = torch.zeros(1, 1*gap, self.hidden_size, device=input_ids.device) + zeros = torch.zeros( + 1, 1 * gap, self.hidden_size, device=input_ids.device + ) v = torch.cat([v, zeros], 1) else: - v = torch.zeros(1, max_update, self.hidden_size, device=input_ids.device) + v = torch.zeros( + 1, max_update, self.hidden_size, device=input_ids.device + ) gathered.append(v) decoder_inputs = torch.cat(gathered) - return domain_scores, state_scores, decoder_inputs, sequence_output, pooled_output.unsqueeze(0) + return ( + domain_scores, + state_scores, + decoder_inputs, + sequence_output, + pooled_output.unsqueeze(0), + ) class Decoder(nn.Module): @@ -90,22 +132,26 @@ def __init__(self, config, bert_model_embedding_weights): self.pad_idx = 0 self.hidden_size = config.hidden_size self.vocab_size = config.vocab_size - self.embed = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.pad_idx) + self.embed = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=self.pad_idx + ) self.embed.weight = bert_model_embedding_weights self.gru = nn.GRU(config.hidden_size, config.hidden_size, 1, batch_first=True) - self.w_gen = nn.Linear(config.hidden_size*3, 1) + self.w_gen = nn.Linear(config.hidden_size * 3, 1) self.sigmoid = nn.Sigmoid() self.dropout = nn.Dropout(config.dropout) for n, p in self.gru.named_parameters(): - if 'weight' in n: + if "weight" in n: p.data.normal_(mean=0.0, std=config.initializer_range) def forward(self, x, decoder_input, encoder_output, hidden, max_len, teacher=None): mask = x.eq(self.pad_idx) batch_size, n_update, _ = decoder_input.size() # B,J',5 # long state_in = decoder_input - all_point_outputs = torch.zeros(n_update, batch_size, max_len, self.vocab_size).to(x.device) + all_point_outputs = torch.zeros( + n_update, batch_size, max_len, self.vocab_size + ).to(x.device) result_dict = {} for j in range(n_update): w = state_in[:, j].unsqueeze(1) # B,1,D @@ -119,13 +165,17 @@ def forward(self, x, decoder_input, encoder_output, hidden, max_len, teacher=Non attn_history = nn.functional.softmax(attn_e, -1) # B,T # B,D * D,V => B,V - attn_v = torch.matmul(hidden.squeeze(0), self.embed.weight.transpose(0, 1)) # B,V + attn_v = torch.matmul( + hidden.squeeze(0), self.embed.weight.transpose(0, 1) + ) # B,V attn_vocab = nn.functional.softmax(attn_v, -1) # B,1,T * B,T,D => B,1,D context = torch.bmm(attn_history.unsqueeze(1), encoder_output) # B,1,D - p_gen = self.sigmoid(self.w_gen(torch.cat([w, hidden.transpose(0, 1), context], -1))) # B,1 + p_gen = self.sigmoid( + self.w_gen(torch.cat([w, hidden.transpose(0, 1), context], -1)) + ) # B,1 p_gen = p_gen.squeeze(-1) p_context_ptr = torch.zeros_like(attn_vocab).to(x.device) diff --git a/modeling_bert.py b/modeling_bert.py new file mode 100644 index 0000000..6c60bf8 --- /dev/null +++ b/modeling_bert.py @@ -0,0 +1,1281 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +# from pytorch_transformers.modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, +# prune_linear_layer, add_start_docstrings) +from transformers.modeling_utils import PretrainedConfig, PreTrainedModel, prune_linear_layer +from transformers.file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", +} + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", +} + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model. + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m", "global_step"] for n in name): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif l[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, l[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +class BertConfig(PretrainedConfig): + r""" + :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a + `BertModel`. + + + Arguments: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + """ + pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs): + super(BertConfig, self).__init__(**kwargs) + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") + class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + +# for graph fix BertEmbeddings +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + def __init__(self, config, type_vocab_size=None): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + if type_vocab_size is not None: + config.type_vocab_size = type_vocab_size + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, head_mask=None, history_states=None): + if history_states is None: + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + else: + x_states = torch.cat((history_states, hidden_states), dim=1) + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(x_states) + mixed_value_layer = self.value(x_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + + try: + attention_scores = attention_scores + attention_mask + except RuntimeError: + print("---RuntimeError---") + print("attention_scores", attention_scores.shape) + print("attention_mask", attention_mask.shape) + exit() + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + # Update hyper params + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + + def forward(self, input_tensor, attention_mask, head_mask=None, history_states=None): + self_outputs = self.self(input_tensor, attention_mask, head_mask, history_states=history_states) + attention_output = self.output(self_outputs[0], input_tensor) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask, head_mask=None, history_states=None): + attention_outputs = self.attention(hidden_states, attention_mask, head_mask, history_states=history_states) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + + # self.output_attentions = config.output_attentions + # self.output_hidden_states = config.output_hidden_states + + self.output_attentions = False + self.output_hidden_states = True + + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask, head_mask=None, prev_embedding=None, prev_encoded_layers=None): + assert (prev_embedding is None) == (prev_encoded_layers is None) + + all_hidden_states = () + + if (prev_embedding is not None) and (prev_encoded_layers is not None): + history_states = prev_embedding + for i, layer_module in enumerate(self.layer): + layer_outputs = layer_module( + hidden_states, attention_mask, head_mask[i], + history_states=history_states) + hidden_states = layer_outputs[0] + + all_hidden_states = all_hidden_states + (hidden_states,) + + history_states = prev_encoded_layers[i] + + else: + for i, layer_module in enumerate(self.layer): + + layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i]) + hidden_states = layer_outputs[0] + + all_hidden_states = all_hidden_states + (hidden_states,) + + # all_hidden_states: exclude embedding + + return hidden_states, all_hidden_states + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights=None): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + if bert_model_embedding_weights is None: + self.decoder = nn.Linear(config.hidden_size, + config.vocab_size, + bias=False) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + else: + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + self.decoder.weight = bert_model_embedding_weights + self.bias = nn.Parameter(torch.zeros( + bert_model_embedding_weights.size(0))) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = BertConfig + pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = "bert" + + def __init__(self, *inputs, **kwargs): + super(BertPreTrainedModel, self).__init__(*inputs, **kwargs) + + def init_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +BERT_START_DOCSTRING = r""" The BERT model was proposed in + `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ + by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer + pre-trained using a combination of masked language modeling objective and next sentence prediction + on a large corpus comprising the Toronto Book Corpus and Wikipedia. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`: + https://arxiv.org/abs/1810.04805 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. +""" + +BERT_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertModel(BertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, type_vocab_size=None): + super(BertModel, self).__init__(config) + + self.embeddings = BertEmbeddings(config, type_vocab_size=type_vocab_size) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + + self.apply(self.init_weights) + + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.embeddings.word_embeddings + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.embeddings.word_embeddings = new_embeddings + return self.embeddings.word_embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None, prev_embedding=None, prev_encoded_layers=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + if attention_mask.dim() == 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + elif attention_mask.dim() == 3: + extended_attention_mask = attention_mask.unsqueeze(1) + else: + raise NotImplementedError + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + # extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: # TODO: This + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) + + sequence_output, all_hidden_states = self.encoder(embedding_output, + extended_attention_mask, + head_mask=head_mask, prev_embedding=prev_embedding, prev_encoded_layers=prev_encoded_layers) + + pooled_output = self.pooler(sequence_output) + + return sequence_output, pooled_output, embedding_output, all_hidden_states + + +@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training: + a `masked language modeling` head and a `next sentence prediction (classification)` head. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForPreTraining(BertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForPreTraining.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + prediction_scores, seq_relationship_scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForPreTraining, self).__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + self.apply(self.init_weights) + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.cls.predictions.decoder, + self.bert.embeddings.word_embeddings) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + next_sentence_label=None, position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + outputs = (total_loss,) + outputs + + return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMaskedLM.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForMaskedLM, self).__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config) + + self.apply(self.init_weights) + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.cls.predictions.decoder, + self.bert.embeddings.word_embeddings) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + outputs = (masked_lm_loss,) + outputs + + return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForNextSentencePrediction(BertPreTrainedModel): + r""" + **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Next sequence prediction (classification) loss. + **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + seq_relationship_scores = outputs[0] + + """ + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + self.apply(self.init_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, + position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + pooled_output = outputs[1] + + seq_relationship_score = self.cls(pooled_output) + + outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + outputs = (next_sentence_loss,) + outputs + + return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForSequenceClassification(BertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + self.apply(self.init_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, + position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING) +class BertForMultipleChoice(BertPreTrainedModel): + r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + The second dimension of the input (`num_choices`) indicates the number of choices to score. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + The second dimension of the input (`num_choices`) indicates the number of choices to score. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: + Mask to avoid performing attention on padding token indices. + The second dimension of the input (`num_choices`) indicates the number of choices to score. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above). + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMultipleChoice.from_pretrained('bert-base-uncased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices + labels = torch.tensor(1).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, classification_scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForMultipleChoice, self).__init__(config) + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.apply(self.init_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, + position_ids=None, head_mask=None): + num_choices = input_ids.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, head_mask=head_mask) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + outputs = (loss,) + outputs + + return outputs # (loss), reshaped_logits, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForTokenClassification(BertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForTokenClassification.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.apply(self.init_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, + position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertForQuestionAnswering(BertPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForQuestionAnswering.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config): + super(BertForQuestionAnswering, self).__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.apply(self.init_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, + end_positions=None, position_ids=None, head_mask=None): + outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, + attention_mask=attention_mask, head_mask=head_mask) + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/requirements.txt b/requirements.txt index 4c538d6..85ed469 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,6 @@ -pytorch-transformers==1.0.0 -torch==1.3.0a0+24ae9b5 +# pip install torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html +torch==1.7.1+cu110 +transformers==3.0.2 wget==3.2 +jsonlines +tqdm \ No newline at end of file diff --git a/run_eval.sh b/run_eval.sh new file mode 100644 index 0000000..8c80f49 --- /dev/null +++ b/run_eval.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +MODEL_PATH='./model_2.1/model_best.bin' +DATASET_DIR='./data/mw_2.1/' + +python evaluation.py\ + --model_ckpt_path $MODEL_PATH\ + --data_root $DATASET_DIR\ diff --git a/run_train.sh b/run_train.sh new file mode 100644 index 0000000..41a86ab --- /dev/null +++ b/run_train.sh @@ -0,0 +1,8 @@ +DATASET_DIR='./data/mw_2.1/' +SAVE_DIR='./model_2.1/' + +python train.py\ + --data_root ${DATASET_DIR}\ + --save_dir ${SAVE_DIR}\ + --bert_ckpt_path 'bert-base-uncased-pytorch_model.bin'\ + --op_code '4' \ No newline at end of file diff --git a/train.py b/train.py index fc0c021..4adc3f1 100644 --- a/train.py +++ b/train.py @@ -3,11 +3,24 @@ Copyright (c) 2020-present NAVER Corp. MIT license """ - +import sys from model import SomDST -from pytorch_transformers import BertTokenizer, AdamW, WarmupLinearSchedule, BertConfig + +# from pytorch_transformers import BertTokenizer, AdamW, WarmupLinearSchedule, BertConfig +from transformers import ( + BertTokenizer, + AdamW, + get_linear_schedule_with_warmup, + BertConfig, +) from utils.data_utils import prepare_dataset, MultiWozDataset -from utils.data_utils import make_slot_meta, domain2id, OP_SET, make_turn_label, postprocessing +from utils.data_utils import ( + make_slot_meta, + domain2id, + OP_SET, + make_turn_label, + postprocessing, +) from utils.eval_utils import compute_prf, compute_acc, per_domain_join_accuracy from utils.ckpt_utils import download_ckpt, convert_ckpt_compatible from evaluation import model_evaluation @@ -23,7 +36,7 @@ import time -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def masked_cross_entropy_for_value(logits, target, pad_idx=0): @@ -64,38 +77,46 @@ def worker_init_fn(worker_id): print(op2id) tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True) - train_data_raw = prepare_dataset(data_path=args.train_data_path, - tokenizer=tokenizer, - slot_meta=slot_meta, - n_history=args.n_history, - max_seq_length=args.max_seq_length, - op_code=args.op_code) - - train_data = MultiWozDataset(train_data_raw, - tokenizer, - slot_meta, - args.max_seq_length, - rng, - ontology, - args.word_dropout, - args.shuffle_state, - args.shuffle_p) + train_data_raw = prepare_dataset( + data_path=args.train_data_path, + tokenizer=tokenizer, + slot_meta=slot_meta, + n_history=args.n_history, + max_seq_length=args.max_seq_length, + op_code=args.op_code, + ) + + train_data = MultiWozDataset( + train_data_raw, + tokenizer, + slot_meta, + args.max_seq_length, + rng, + ontology, + args.word_dropout, + args.shuffle_state, + args.shuffle_p, + ) print("# train examples %d" % len(train_data_raw)) - dev_data_raw = prepare_dataset(data_path=args.dev_data_path, - tokenizer=tokenizer, - slot_meta=slot_meta, - n_history=args.n_history, - max_seq_length=args.max_seq_length, - op_code=args.op_code) + dev_data_raw = prepare_dataset( + data_path=args.dev_data_path, + tokenizer=tokenizer, + slot_meta=slot_meta, + n_history=args.n_history, + max_seq_length=args.max_seq_length, + op_code=args.op_code, + ) print("# dev examples %d" % len(dev_data_raw)) - test_data_raw = prepare_dataset(data_path=args.test_data_path, - tokenizer=tokenizer, - slot_meta=slot_meta, - n_history=args.n_history, - max_seq_length=args.max_seq_length, - op_code=args.op_code) + test_data_raw = prepare_dataset( + data_path=args.test_data_path, + tokenizer=tokenizer, + slot_meta=slot_meta, + n_history=args.n_history, + max_seq_length=args.max_seq_length, + op_code=args.op_code, + ) print("# test examples %d" % len(test_data_raw)) model_config = BertConfig.from_json_file(args.bert_config_path) @@ -103,80 +124,149 @@ def worker_init_fn(worker_id): model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob model_config.hidden_dropout_prob = args.hidden_dropout_prob - model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain) + type_vocab_size = 2 # token typy id == 2개 + model = SomDST( + model_config, len(op2id), len(domain2id), op2id["update"], args.exclude_domain + ) if not os.path.exists(args.bert_ckpt_path): - args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets') + args.bert_ckpt_path = download_ckpt( + args.bert_ckpt_path, args.bert_config_path, "assets" + ) + + state_dict = torch.load(args.bert_ckpt_path, map_location="cpu") + _k = "bert.embeddings.token_type_embeddings.weight" + print( + "config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format( + type_vocab_size, state_dict[_k].shape[0] + ) + ) + + keys = list(state_dict.keys()) + for key in keys: + if "LayerNorm" in key: + if "gamma" in key: + state_dict[key.replace("gamma", "weight")] = state_dict.pop(key) + else: + state_dict[key.replace("beta", "bias")] = state_dict.pop(key) + + from collections import OrderedDict - ckpt = torch.load(args.bert_ckpt_path, map_location='cpu') - model.encoder.bert.load_state_dict(ckpt) + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[5:] if k[:5] == "bert." else k # remove `bert.` + new_state_dict[name] = v + + # load params + model.encoder.bert.load_state_dict(new_state_dict, strict=False) + + print("\n### Done Load BERT") + sys.stdout.flush() # re-initialize added special tokens ([SLOT], [NULL], [EOS]) - model.encoder.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02) - model.encoder.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) - model.encoder.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) + model.encoder.bert.embeddings.word_embeddings.weight.data[1].normal_( + mean=0.0, std=0.02 + ) + model.encoder.bert.embeddings.word_embeddings.weight.data[2].normal_( + mean=0.0, std=0.02 + ) + model.encoder.bert.embeddings.word_embeddings.weight.data[3].normal_( + mean=0.0, std=0.02 + ) + model.to(device) num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] enc_param_optimizer = list(model.encoder.named_parameters()) enc_optimizer_grouped_parameters = [ - {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [ + p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay) + ], + "weight_decay": 0.01, + }, + { + "params": [ + p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr) - enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup), - t_total=num_train_steps) + enc_scheduler = get_linear_schedule_with_warmup( + enc_optimizer, + int(num_train_steps * args.enc_warmup), + num_training_steps=num_train_steps, + ) dec_param_optimizer = list(model.decoder.parameters()) dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr) - dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup), - t_total=num_train_steps) + dec_scheduler = get_linear_schedule_with_warmup( + dec_optimizer, + int(num_train_steps * args.dec_warmup), + num_training_steps=num_train_steps, + ) if n_gpu > 1: model = torch.nn.DataParallel(model) train_sampler = RandomSampler(train_data) - train_dataloader = DataLoader(train_data, - sampler=train_sampler, - batch_size=args.batch_size, - collate_fn=train_data.collate_fn, - num_workers=args.num_workers, - worker_init_fn=worker_init_fn) + train_dataloader = DataLoader( + train_data, + sampler=train_sampler, + batch_size=args.batch_size, + collate_fn=train_data.collate_fn, + num_workers=args.num_workers, + worker_init_fn=worker_init_fn, + ) loss_fnc = nn.CrossEntropyLoss() - best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0} + best_score = {"epoch": 0, "joint_acc": 0, "op_acc": 0, "final_slot_f1": 0} for epoch in range(args.n_epochs): batch_loss = [] model.train() for step, batch in enumerate(train_dataloader): batch = [b.to(device) if not isinstance(b, int) else b for b in batch] - input_ids, input_mask, segment_ids, state_position_ids, op_ids,\ - domain_ids, gen_ids, max_value, max_update = batch + ( + input_ids, + input_mask, + segment_ids, + state_position_ids, + op_ids, + domain_ids, + gen_ids, + max_value, + max_update, + ) = batch if rng.random() < args.decoder_teacher_forcing: # teacher forcing teacher = gen_ids else: teacher = None - domain_scores, state_scores, gen_scores = model(input_ids=input_ids, - token_type_ids=segment_ids, - state_positions=state_position_ids, - attention_mask=input_mask, - max_value=max_value, - op_ids=op_ids, - max_update=max_update, - teacher=teacher) + domain_scores, state_scores, gen_scores = model( + input_ids=input_ids, + token_type_ids=segment_ids, + state_positions=state_position_ids, + attention_mask=input_mask, + max_value=max_value, + op_ids=op_ids, + max_update=max_update, + teacher=teacher, + ) loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1)) - loss_g = masked_cross_entropy_for_value(gen_scores.contiguous(), - gen_ids.contiguous(), - tokenizer.vocab['[PAD]']) + loss_g = masked_cross_entropy_for_value( + gen_scores.contiguous(), gen_ids.contiguous(), tokenizer.vocab["[PAD]"] + ) loss = loss_s + loss_g if args.exclude_domain is not True: - loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1)) + loss_d = loss_fnc( + domain_scores.view(-1, len(domain2id)), domain_ids.view(-1) + ) loss = loss + loss_d batch_loss.append(loss.item()) @@ -189,66 +279,165 @@ def worker_init_fn(worker_id): if step % 100 == 0: if args.exclude_domain is not True: - print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \ - % (epoch+1, args.n_epochs, step, - len(train_dataloader), np.mean(batch_loss), - loss_s.item(), loss_g.item(), loss_d.item())) + print( + "[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" + % ( + epoch + 1, + args.n_epochs, + step, + len(train_dataloader), + np.mean(batch_loss), + loss_s.item(), + loss_g.item(), + loss_d.item(), + ) + ) else: - print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \ - % (epoch+1, args.n_epochs, step, - len(train_dataloader), np.mean(batch_loss), - loss_s.item(), loss_g.item())) + print( + "[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" + % ( + epoch + 1, + args.n_epochs, + step, + len(train_dataloader), + np.mean(batch_loss), + loss_s.item(), + loss_g.item(), + ) + ) batch_loss = [] - if (epoch+1) % args.eval_epoch == 0: - eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code) - if eval_res['joint_acc'] > best_score['joint_acc']: + if (epoch + 1) % args.eval_epoch == 0: + eval_res = model_evaluation( + model, dev_data_raw, tokenizer, slot_meta, epoch + 1, args.op_code + ) + if eval_res["joint_acc"] > best_score["joint_acc"]: best_score = eval_res - model_to_save = model.module if hasattr(model, 'module') else model - save_path = os.path.join(args.save_dir, 'model_best.bin') + model_to_save = model.module if hasattr(model, "module") else model + save_path = os.path.join(args.save_dir, "model_best.bin") torch.save(model_to_save.state_dict(), save_path) print("Best Score : ", best_score) print("\n") print("Test using best model...") - best_epoch = best_score['epoch'] - ckpt_path = os.path.join(args.save_dir, 'model_best.bin') - model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain) - ckpt = torch.load(ckpt_path, map_location='cpu') + best_epoch = best_score["epoch"] + ckpt_path = os.path.join(args.save_dir, "model_best.bin") + model = SomDST( + model_config, len(op2id), len(domain2id), op2id["update"], args.exclude_domain + ) + ckpt = torch.load(ckpt_path, map_location="cpu") model.load_state_dict(ckpt) model.to(device) - model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, - is_gt_op=False, is_gt_p_state=False, is_gt_gen=False) - model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, - is_gt_op=False, is_gt_p_state=False, is_gt_gen=True) - model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, - is_gt_op=False, is_gt_p_state=True, is_gt_gen=False) - model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, - is_gt_op=False, is_gt_p_state=True, is_gt_gen=True) - model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, - is_gt_op=True, is_gt_p_state=False, is_gt_gen=False) - model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, - is_gt_op=True, is_gt_p_state=True, is_gt_gen=False) - model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, - is_gt_op=True, is_gt_p_state=False, is_gt_gen=True) - model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, - is_gt_op=True, is_gt_p_state=True, is_gt_gen=True) + model_evaluation( + model, + test_data_raw, + tokenizer, + slot_meta, + best_epoch, + args.op_code, + is_gt_op=False, + is_gt_p_state=False, + is_gt_gen=False, + ) + model_evaluation( + model, + test_data_raw, + tokenizer, + slot_meta, + best_epoch, + args.op_code, + is_gt_op=False, + is_gt_p_state=False, + is_gt_gen=True, + ) + model_evaluation( + model, + test_data_raw, + tokenizer, + slot_meta, + best_epoch, + args.op_code, + is_gt_op=False, + is_gt_p_state=True, + is_gt_gen=False, + ) + model_evaluation( + model, + test_data_raw, + tokenizer, + slot_meta, + best_epoch, + args.op_code, + is_gt_op=False, + is_gt_p_state=True, + is_gt_gen=True, + ) + model_evaluation( + model, + test_data_raw, + tokenizer, + slot_meta, + best_epoch, + args.op_code, + is_gt_op=True, + is_gt_p_state=False, + is_gt_gen=False, + ) + model_evaluation( + model, + test_data_raw, + tokenizer, + slot_meta, + best_epoch, + args.op_code, + is_gt_op=True, + is_gt_p_state=True, + is_gt_gen=False, + ) + model_evaluation( + model, + test_data_raw, + tokenizer, + slot_meta, + best_epoch, + args.op_code, + is_gt_op=True, + is_gt_p_state=False, + is_gt_gen=True, + ) + model_evaluation( + model, + test_data_raw, + tokenizer, + slot_meta, + best_epoch, + args.op_code, + is_gt_op=True, + is_gt_p_state=True, + is_gt_gen=True, + ) if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters - parser.add_argument("--data_root", default='data/mwz2.1', type=str) - parser.add_argument("--train_data", default='train_dials.json', type=str) - parser.add_argument("--dev_data", default='dev_dials.json', type=str) - parser.add_argument("--test_data", default='test_dials.json', type=str) - parser.add_argument("--ontology_data", default='ontology.json', type=str) - parser.add_argument("--vocab_path", default='assets/vocab.txt', type=str) - parser.add_argument("--bert_config_path", default='assets/bert_config_base_uncased.json', type=str) - parser.add_argument("--bert_ckpt_path", default='assets/bert-base-uncased-pytorch_model.bin', type=str) - parser.add_argument("--save_dir", default='outputs', type=str) + parser.add_argument("--data_root", default="data/mwz2.1", type=str) + parser.add_argument("--train_data", default="train_dials.json", type=str) + parser.add_argument("--dev_data", default="dev_dials.json", type=str) + parser.add_argument("--test_data", default="test_dials.json", type=str) + parser.add_argument("--ontology_data", default="ontology.json", type=str) + parser.add_argument("--vocab_path", default="assets/vocab.txt", type=str) + parser.add_argument( + "--bert_config_path", default="./assets/bert_config_base_uncased.json", type=str + ) + parser.add_argument( + "--bert_ckpt_path", + default="./assets/bert-base-uncased-pytorch_model.bin", + type=str, + ) + parser.add_argument("--save_dir", default="outputs", type=str) parser.add_argument("--random_seed", default=42, type=int) parser.add_argument("--num_workers", default=4, type=int) @@ -267,13 +456,13 @@ def worker_init_fn(worker_id): parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float) parser.add_argument("--decoder_teacher_forcing", default=0.5, type=float) parser.add_argument("--word_dropout", default=0.1, type=float) - parser.add_argument("--not_shuffle_state", default=False, action='store_true') + parser.add_argument("--not_shuffle_state", default=False, action="store_true") parser.add_argument("--shuffle_p", default=0.5, type=float) parser.add_argument("--n_history", default=1, type=int) parser.add_argument("--max_seq_length", default=256, type=int) parser.add_argument("--msg", default=None, type=str) - parser.add_argument("--exclude_domain", default=False, action='store_true') + parser.add_argument("--exclude_domain", default=False, action="store_true") args = parser.parse_args() args.train_data_path = os.path.join(args.data_root, args.train_data) @@ -281,6 +470,6 @@ def worker_init_fn(worker_id): args.test_data_path = os.path.join(args.data_root, args.test_data) args.ontology_data = os.path.join(args.data_root, args.ontology_data) args.shuffle_state = False if args.not_shuffle_state else True - print('pytorch version: ', torch.__version__) + print("pytorch version: ", torch.__version__) print(args) main(args) diff --git a/utils/ckpt_utils.py b/utils/ckpt_utils.py index 6902d38..0a4d8b0 100644 --- a/utils/ckpt_utils.py +++ b/utils/ckpt_utils.py @@ -1,27 +1,29 @@ import wget import os import torch -from pytorch_transformers import BertForPreTraining, BertConfig + +# from pytorch_transformers import BertForPreTraining, BertConfig +from transformers import BertForPreTraining, BertConfig BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", } -def download_ckpt(ckpt_path, config_path, target_path='assets'): +def download_ckpt(ckpt_path, config_path, target_path="assets"): key = None - if 'base' in ckpt_path.lower(): - key = 'bert-base-uncased' - if 'large' in ckpt_path.lower(): - key = 'bert-large-uncased' + if "base" in ckpt_path.lower(): + key = "bert-base-uncased" + if "large" in ckpt_path.lower(): + key = "bert-large-uncased" assert key in BERT_PRETRAINED_MODEL_ARCHIVE_MAP url_path = BERT_PRETRAINED_MODEL_ARCHIVE_MAP[key] - print('start download %s from huggingface' % key) + print("start download %s from huggingface" % key) wget.download(url_path, out=target_path) - ckpt_path = os.path.join(target_path, key + '-pytorch_model.bin') + ckpt_path = os.path.join(target_path, key + "-pytorch_model.bin") ckpt = convert_ckpt_compatible(ckpt_path, config_path) torch.save(ckpt, ckpt_path) @@ -29,14 +31,14 @@ def download_ckpt(ckpt_path, config_path, target_path='assets'): def convert_ckpt_compatible(ckpt_path, config_path): - ckpt = torch.load(ckpt_path, map_location='cpu') + ckpt = torch.load(ckpt_path, map_location="cpu") keys = list(ckpt.keys()) for key in keys: - if 'LayerNorm' in key: - if 'gamma' in key: - ckpt[key.replace('gamma', 'weight')] = ckpt.pop(key) + if "LayerNorm" in key: + if "gamma" in key: + ckpt[key.replace("gamma", "weight")] = ckpt.pop(key) else: - ckpt[key.replace('beta', 'bias')] = ckpt.pop(key) + ckpt[key.replace("beta", "bias")] = ckpt.pop(key) model_config = BertConfig.from_json_file(config_path) model = BertForPreTraining(model_config)