Update ROBERTAmodel.py

ROBERTAmodel.py  CHANGED  (+15 -2)
@@ -6,6 +6,8 @@ from transformers import (
     RobertaForMaskedLM, RobertaForSequenceClassification
 )
 import os
+import time
+
 
 CACHE_DIR = "/data/hf_cache"
 
@@ -149,6 +151,7 @@ class RoBERTaVisualizer(TransformerVisualizer):
     def get_all_grad_attn_matrix(self, task, sentence, hypothesis='', maskID = None):
         print(task, sentence, hypothesis)
         print('Tokenize')
+        start = time.time()
         if task == 'mnli':
             inputs = self.tokenizer(sentence, hypothesis, return_tensors='pt', padding=False, truncation=True)
         elif task == 'mlm':
@@ -160,12 +163,16 @@ class RoBERTaVisualizer(TransformerVisualizer):
         tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
         print(tokens)
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        print(1,time.time()-start)
+        start = time.time()
 
         print('Input embeddings with grad')
         embedding_layer = self.model.roberta.embeddings.word_embeddings
         inputs_embeds = embedding_layer(inputs["input_ids"])
         inputs_embeds.requires_grad_()
-
+
+        print(2,time.time()-start)
+        start = time.time()
         print('Forward pass')
         outputs = self.model.roberta(
             inputs_embeds=inputs_embeds,
@@ -173,9 +180,13 @@ class RoBERTaVisualizer(TransformerVisualizer):
             output_attentions=True
         )
         attentions = outputs.attentions  # list of [1, heads, seq, seq]
-
+
+        print(3,time.time()-start)
+        start = time.time()
         print('Average attentions per layer')
         mean_attns = [a.squeeze(0).mean(dim=0).detach().cpu() for a in attentions]
+        print(4,time.time()-start)
+        start = time.time()
 
         attn_matrices_all = []
         grad_matrices_all = []
@@ -183,6 +194,8 @@ class RoBERTaVisualizer(TransformerVisualizer):
             grad_matrix, attn_matrix = self.get_grad_attn_matrix(inputs_embeds, attentions, mean_attns, target_layer)
             grad_matrices_all.append(grad_matrix.tolist())
             attn_matrices_all.append(attn_matrix.tolist())
+
+        print(5,time.time()-start)
         return grad_matrices_all, attn_matrices_all
 
     def get_grad_attn_matrix(self,inputs_embeds, attentions, mean_attns, target_layer):
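The change instruments get_all_grad_attn_matrix with simple wall-clock timers: time is imported, start = time.time() is reset before each stage (tokenization, embedding lookup, forward pass, attention averaging, and the per-layer gradient/attention loop), and print(n, time.time()-start) reports the elapsed seconds for stages 1 through 5. Below is a minimal sketch of the same measurement pattern as a reusable helper; the stage_timer name, its labels, and the sleep stand-ins are illustrative assumptions, not code from this repository.

# Illustrative sketch only (not part of this commit): the stage-timing
# pattern above rewritten as a reusable helper, standard library only.
import time
from contextlib import contextmanager

@contextmanager
def stage_timer(label):
    # Measure wall-clock time around the enclosed block and print the result.
    start = time.time()
    try:
        yield
    finally:
        print(f"{label}: {time.time() - start:.3f}s")

# Hypothetical usage mirroring the stages timed in get_all_grad_attn_matrix:
if __name__ == "__main__":
    with stage_timer("tokenize"):
        time.sleep(0.01)   # stand-in for self.tokenizer(...)
    with stage_timer("forward pass"):
        time.sleep(0.02)   # stand-in for self.model.roberta(...)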