Upload 4 files
app.py
CHANGED
@@ -172,33 +172,12 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
         greater_is_better=False,
     )
 
-    # Custom data collator: pad the batch and build labels,
-    # replacing padding token ids with -100
-
-    @dataclass
-    class CustomDataCollator:
-
-        tokenizer: Any
-
-        def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
-            import torch
-
-            # Use tokenizer's pad method for proper padding
-            batch = self.tokenizer.pad(
-                features,
-                padding=True,
-                return_tensors="pt"
-            )
-
-            # Create labels from input_ids
-            # Replace padding token id with -100 so it's ignored in loss
-            labels = batch["input_ids"].clone()
-            labels[labels == self.tokenizer.pad_token_id] = -100
-            batch["labels"] = labels
-
-            return batch
-
-    data_collator = CustomDataCollator(tokenizer=tokenizer)
+    # Use standard data collator for causal language modeling
+    # This properly handles CoDA's internal sequence modifications
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False  # Causal LM, not masked LM
+    )
 
     # Initialize trainer with custom loss
     trainer = CoDATrainer(
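
For context on what the swap does: with mlm=False, DataCollatorForLanguageModeling pads the batch and builds labels as a clone of input_ids with padding positions set to -100, which is exactly what the removed CustomDataCollator did by hand. A standalone sketch, not part of the commit; "gpt2" here is a stand-in checkpoint, since the Space's actual model and tokenizer are not shown in this hunk:

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# "gpt2" is a stand-in; substitute the tokenizer the app actually loads
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

features = [tokenizer("short example"), tokenizer("a noticeably longer example sentence")]
batch = collator(features)

# input_ids are padded to a common length; labels are a clone of input_ids
# with padding positions replaced by -100 so the loss ignores them
print(batch["input_ids"].shape)
print(batch["labels"])

Besides matching the old behavior, the stock collator drops the per-call import torch and the hand-rolled padding logic, leaving one less piece of code to keep in sync with transformers.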
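
The hunk ends inside the CoDATrainer(...) call, and the class itself is not part of this diff. As a rough sketch only: a "trainer with custom loss" in transformers is typically a Trainer subclass that overrides compute_loss. The loss below is a plain shifted cross-entropy placeholder, not CoDA's actual objective:

import torch.nn.functional as F
from transformers import Trainer

class CoDATrainer(Trainer):
    # Placeholder loss: the real CoDATrainer's objective is not in this diff
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Shift so each position predicts the next token (causal LM)
        shift_logits = outputs.logits[:, :-1, :].contiguous()
        shift_labels = labels[:, 1:].contiguous()
        loss = F.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            ignore_index=-100,  # matches the -100 padding mask from the collator
        )
        return (loss, outputs) if return_outputs else loss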