llaa33219 committed
Commit f284dcb · verified · 1 Parent(s): 98871c7

Upload 4 files

Files changed (1)
  1. app.py +6 -27
app.py CHANGED
@@ -172,33 +172,12 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
         greater_is_better=False,
     )
 
-    # Custom data collator that handles labels properly
-    from dataclasses import dataclass
-    from typing import Any, Dict, List
-
-    @dataclass
-    class CustomDataCollator:
-        tokenizer: Any
-
-        def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
-            import torch
-
-            # Use tokenizer's pad method for proper padding
-            batch = self.tokenizer.pad(
-                features,
-                padding=True,
-                return_tensors="pt"
-            )
-
-            # Create labels from input_ids
-            # Replace padding token id with -100 so it's ignored in loss
-            labels = batch["input_ids"].clone()
-            labels[labels == self.tokenizer.pad_token_id] = -100
-            batch["labels"] = labels
-
-            return batch
-
-    data_collator = CustomDataCollator(tokenizer=tokenizer)
+    # Use standard data collator for causal language modeling
+    # This properly handles CoDA's internal sequence modifications
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False  # Causal LM, not masked LM
+    )
 
     # Initialize trainer with custom loss
     trainer = CoDATrainer(
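
For reference, a minimal sketch (not part of this commit) of why the swap preserves the label handling: with mlm=False, DataCollatorForLanguageModeling pads the batch and builds labels as a copy of input_ids with padding positions set to -100, which is what the removed CustomDataCollator did by hand. The tokenizer checkpoint and variable names below are illustrative assumptions, not taken from app.py.

    from transformers import AutoTokenizer, DataCollatorForLanguageModeling

    # Illustrative tokenizer only; app.py uses the model's own tokenizer.
    tok = AutoTokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # GPT-2 ships without a pad token

    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

    # Two sequences of different lengths, as produced by a tokenizer call
    features = [tok("short example"), tok("a somewhat longer example sentence")]
    batch = collator(features)

    # Both sequences are padded to the same length; labels mirror input_ids
    # except at padded positions, which become -100 and are ignored by the loss.
    print(batch["input_ids"].shape)
    print(batch["labels"])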