Upload 4 files
app.py CHANGED

@@ -63,7 +63,10 @@ def preprocess_conversations(examples, tokenizer):
             text += f"<|assistant|>\n{content}\n"
         texts.append(text)
 
-
+    # Return tokenized data with labels for language modeling
+    tokenized = tokenizer(texts, truncation=True, max_length=2048, padding=False)
+    tokenized["labels"] = tokenized["input_ids"].copy()
+    return tokenized
 
 # Persistent storage paths
 CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")
@@ -171,10 +174,11 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
         greater_is_better=False,
     )
 
-    # Data collator
+    # Data collator with padding
     data_collator = DataCollatorForLanguageModeling(
         tokenizer=tokenizer,
-        mlm=False
+        mlm=False,
+        pad_to_multiple_of=8  # Pad to multiple of 8 for efficiency
     )
 
     # Initialize trainer with custom loss