yu-val-weiss committed
Commit: b8756a1 · Parent(s): 0a5e4ab

remove numpy, switch to torch (avoid to/from cpu as much)

Files changed:
- blimp.py (+17 -17)
- requirements.txt (+0 -1)
blimp.py CHANGED

@@ -18,7 +18,6 @@ from typing import Optional
 
 import datasets
 import evaluate
-import numpy as np
 import torch
 from evaluate import logging
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -175,7 +174,8 @@ class Blimp(evaluate.Metric):
             else ("mps" if torch.mps.is_available() else "cpu")
         )
 
-        if samples_per_set is None:
+        samples_per_set = 1000 if samples_per_set is None else samples_per_set
+        if samples_per_set <= 0 or samples_per_set > 1000:
             samples_per_set = 1000
 
         model = AutoModelForCausalLM.from_pretrained(
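Aside (not part of the commit): a minimal runnable sketch of the clamping logic the two added lines implement, with a hypothetical helper name. BLiMP provides 1000 minimal pairs per paradigm, so None or any out-of-range request falls back to the full set.

def normalize_samples_per_set(samples_per_set: int | None) -> int:
    # 1000 is the ceiling: each BLiMP paradigm contains 1000 minimal pairs.
    samples_per_set = 1000 if samples_per_set is None else samples_per_set
    if samples_per_set <= 0 or samples_per_set > 1000:
        samples_per_set = 1000
    return samples_per_set

assert normalize_samples_per_set(None) == 1000
assert normalize_samples_per_set(250) == 250
assert normalize_samples_per_set(-5) == 1000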
@@ -226,13 +226,11 @@ class Blimp(evaluate.Metric):
 
         # Prepare batches of good and bad sentences
 
-        phenom = dataset[0]["linguistics_term"]
-
         sents = [(x["sentence_good"], x["sentence_bad"]) for x in dataset]
-        good_sents, bad_sents = zip(*sents[:
+        good_sents, bad_sents = zip(*sents[:samples_per_set])
 
         # Get probabilities in batches
-        good_probs = get_batch_probabilities(
+        good_probs = _get_batch_probabilities(
             model,
             tokenizer,
             good_sents,
@@ -241,7 +239,7 @@ class Blimp(evaluate.Metric):
             category,
             sent_type="good",
         )
-        bad_probs = get_batch_probabilities(
+        bad_probs = _get_batch_probabilities(
             model,
             tokenizer,
             bad_sents,
@@ -251,22 +249,24 @@ class Blimp(evaluate.Metric):
             sent_type="bad",
         )
 
-        # compute accuracy (mean of instances where good prob > bad prob)
-
+        # compute accuracy (mean of instances where good prob > bad prob) for this UID
+        sub_acc = (good_probs > bad_probs).float().mean().item()
+
+        phenom = dataset[0]["linguistics_term"]
 
-        results[category] =
-        phenom_results[phenom].append(
+        results[category] = sub_acc
+        phenom_results[phenom].append(sub_acc)
 
         return {
             "by_uid": results,
-            "accuracy":
+            "accuracy": sum(results.values()) / len(results),  # overall accuracy
             "by_phenomenon": {
-                term:
+                term: sum(acc) / len(acc) for term, acc in phenom_results.items()
             },
         }
 
 
-def get_batch_probabilities(
+def _get_batch_probabilities(
     model,
     tokenizer,
     sentences: list[str],
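Aside (not part of the commit): the aggregation above is a macro-average, each UID contributes one accuracy to results and "accuracy" is the plain mean over UIDs. A toy sketch of the same operations, with hypothetical values and BLiMP term names used purely for illustration:

import torch

# Toy log-probabilities for three minimal pairs of one paradigm (UID).
good_probs = torch.tensor([-12.3, -8.1, -20.0])
bad_probs = torch.tensor([-14.0, -8.0, -25.5])

# Per-UID accuracy: fraction of pairs where the good sentence scores higher.
sub_acc = (good_probs > bad_probs).float().mean().item()  # 2/3 here

# Overall and per-phenomenon accuracy are plain means over UID accuracies.
results = {"anaphor_gender_agreement": sub_acc, "drop_argument": 0.9}
phenom_results = {"anaphor_agreement": [sub_acc], "argument_structure": [0.9]}
overall = sum(results.values()) / len(results)
by_phenomenon = {t: sum(a) / len(a) for t, a in phenom_results.items()}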
@@ -276,7 +276,7 @@ def get_batch_probabilities(
     sent_type: str = "good",
 ):
     """Compute log probabilities for a batch of sentences"""
-    probs =
+    probs = torch.zeros(len(sentences))
 
     for i in logging.tqdm(
         range(0, len(sentences), batch_size),
@@ -307,6 +307,6 @@ def get_batch_probabilities(
         # sum log probabilities
         sequence_log_probs = token_log_probs.sum(dim=1)
 
-        probs
+        probs[i : i + batch_size] = sequence_log_probs
 
-    return
+    return probs
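Aside (not part of the commit): the heart of the numpy-to-torch switch is the accumulation pattern, preallocate one tensor and slice-assign each batch instead of appending per-batch numpy arrays. A self-contained sketch (names are hypothetical; torch.randn stands in for real model log-probabilities):

import torch

def score_in_batches(n_sentences: int, batch_size: int) -> torch.Tensor:
    # Preallocate once; each batch result is written in place, so no
    # per-batch numpy conversion is needed and downstream comparisons
    # (good_probs > bad_probs) stay in torch.
    probs = torch.zeros(n_sentences)
    for i in range(0, n_sentences, batch_size):
        this_batch = min(batch_size, n_sentences - i)
        batch_log_probs = torch.randn(this_batch)  # stand-in for model scores
        probs[i : i + batch_size] = batch_log_probs
    return probs

print(score_in_batches(10, 4).shape)  # torch.Size([10])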
requirements.txt CHANGED

@@ -1,4 +1,3 @@
 git+https://github.com/huggingface/evaluate@5aa3982a9a8c86e506860e381d428a64b0cce73b
 torch
 transformers
-numpy