llaa33219 committed on
Commit
8900cbf
·
verified ·
1 Parent(s): 6d15327

Upload 4 files

Browse files
Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -63,7 +63,10 @@ def preprocess_conversations(examples, tokenizer):
63
  text += f"<|assistant|>\n{content}\n"
64
  texts.append(text)
65
 
66
- return tokenizer(texts, truncation=True, max_length=2048, padding=False)
 
 
 
67
 
68
  # Persistent storage paths
69
  CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")
@@ -171,10 +174,11 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
171
  greater_is_better=False,
172
  )
173
 
174
- # Data collator
175
  data_collator = DataCollatorForLanguageModeling(
176
  tokenizer=tokenizer,
177
- mlm=False
 
178
  )
179
 
180
  # Initialize trainer with custom loss
 
63
  text += f"<|assistant|>\n{content}\n"
64
  texts.append(text)
65
 
66
+ # Return tokenized data with labels for language modeling
67
+ tokenized = tokenizer(texts, truncation=True, max_length=2048, padding=False)
68
+ tokenized["labels"] = tokenized["input_ids"].copy()
69
+ return tokenized
70
 
71
  # Persistent storage paths
72
  CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")
 
174
  greater_is_better=False,
175
  )
176
 
177
+ # Data collator with padding
178
  data_collator = DataCollatorForLanguageModeling(
179
  tokenizer=tokenizer,
180
+ mlm=False,
181
+ pad_to_multiple_of=8 # Pad to multiple of 8 for efficiency
182
  )
183
 
184
  # Initialize trainer with custom loss