#!/usr/bin/env python3
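"""Fine-tune a sequence-classification model to predict the time period
a text was written in, then push the result to the Hugging Face Hub."""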
import os
from dataclasses import dataclass
from typing import Dict, List

import numpy as np
from datasets import load_dataset
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

# ======================
# LABEL SCHEMA
# ======================

LABELS: List[str] = [
    "pre-1900",
    "1900-1945",
    "1946-1979",
    "1980-1999",
    "2000-2015",
    "2016-present",
]

id2label: Dict[int, str] = {i: l for i, l in enumerate(LABELS)}
label2id: Dict[str, int] = {l: i for i, l in enumerate(LABELS)}
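# Example mapping (follows directly from LABELS above):
#   id2label[0] == "pre-1900"; label2id["2016-present"] == 5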

# Base model to fine-tune
BASE_MODEL = os.environ.get("BASE_MODEL", "distilroberta-base")

# Hugging Face hub repo where the fine-tuned model will be pushed
HUB_MODEL_ID = "DelaliScratchwerk/time-period-classifier-bert"

# ======================
# LOAD DATA
# ======================

# Expect CSVs at data/train.csv and data/val.csv
dataset = load_dataset(
    "csv",
    data_files={
        "train": "data/train.csv",
        "validation": "data/val.csv",
    },
)
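# A minimal sketch of the expected CSV layout (an assumption inferred
# from the columns used in encode_batch below; adjust if your files differ):
#
#   text,label
#   "The carriage clattered over the cobblestones.",pre-1900
#   "She refreshed the feed on her phone.",2016-present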

print("Raw dataset:", dataset)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)


def encode_batch(batch):
    # tokenize texts
    enc = tokenizer(batch["text"], truncation=True)
    # map string labels -> integer ids
    # strip helps if there are trailing spaces in the CSV
    enc["labels"] = [label2id[l.strip()] for l in batch["label"]]
    return enc


# IMPORTANT: remove original 'text' and 'label' columns so Trainer only sees tensors
encoded = dataset.map(
    encode_batch,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

print(encoded)
print("Encoded train sample keys:", encoded["train"][0].keys())
# should be: dict_keys(['input_ids', 'attention_mask', 'labels'])

# ======================
# MODEL
# ======================

model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id,
)

# ======================
# METRICS
# ======================

accuracy = evaluate.load("accuracy")
f1_macro = evaluate.load("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1_macro.compute(
            predictions=preds, references=labels, average="macro"
        )["f1"],
    }


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
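# DataCollatorWithPadding pads each batch dynamically to the longest
# sequence in that batch, which is cheaper than padding every example
# to the model's maximum length up front.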

# ======================
# TRAINING ARGS
# ======================

training_args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    logging_steps=50,
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    hub_private_repo=False,
)

# ======================
# TRAINER
# ======================

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

if __name__ == "__main__":
    trainer.train()
    # push the final model + tokenizer to the Hub
    # (save_strategy="no" and load_best_model_at_end=False, so this is
    # the last-epoch model, not a best checkpoint)
    trainer.push_to_hub()
    tokenizer.push_to_hub(HUB_MODEL_ID)
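
# ----------------------------------------------------------------------
# Example inference (a minimal sketch; assumes the push above succeeded
# and HUB_MODEL_ID is readable from your machine):
#
#   from transformers import pipeline
#   clf = pipeline("text-classification", model=HUB_MODEL_ID)
#   print(clf("The telegram arrived the morning the armistice was signed."))
#   # -> a list with the predicted period label and its score
# ----------------------------------------------------------------------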