Upload 4 files
app.py CHANGED
@@ -63,10 +63,7 @@ def preprocess_conversations(examples, tokenizer):
             text += f"<|assistant|>\n{content}\n"
         texts.append(text)
 
-
-    tokenized = tokenizer(texts, truncation=True, max_length=2048, padding=False)
-    tokenized["labels"] = tokenized["input_ids"].copy()
-    return tokenized
+    return tokenizer(texts, truncation=True, max_length=2048, padding=False)
 
 # Persistent storage paths
 CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")

@@ -135,8 +132,16 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
 
     # Preprocess dataset
     progress(0.3, desc="Preprocessing dataset...")
+
+    def tokenize_function(examples):
+        # Process conversations
+        processed = preprocess_conversations(examples, tokenizer)
+        # Add labels (copy of input_ids for language modeling)
+        processed["labels"] = [ids[:] for ids in processed["input_ids"]]
+        return processed
+
     tokenized_dataset = dataset.map(
-
+        tokenize_function,
         batched=True,
         remove_columns=dataset.column_names
     )
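
For context, a minimal self-contained sketch of how the changed tokenization path fits together. It is an assumption-laden illustration, not the app's full code: a toy "conversations" column of {"role", "content"} dicts and the gpt2 tokenizer stand in for the real dataset and base model, and the loop body of preprocess_conversations is reconstructed from the lines visible in this diff.

# Sketch only: toy data and gpt2 are assumptions, not the app's actual setup.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def preprocess_conversations(examples, tokenizer):
    # Flatten each conversation into one chat-formatted string.
    texts = []
    for conversation in examples["conversations"]:
        text = ""
        for turn in conversation:
            text += f"<|{turn['role']}|>\n{turn['content']}\n"
        texts.append(text)
    # After this change the function only tokenizes; labels are added later.
    return tokenizer(texts, truncation=True, max_length=2048, padding=False)

def tokenize_function(examples):
    processed = preprocess_conversations(examples, tokenizer)
    # Causal-LM training reuses the inputs as labels, one copy per example.
    processed["labels"] = [ids[:] for ids in processed["input_ids"]]
    return processed

dataset = Dataset.from_dict({
    "conversations": [
        [{"role": "user", "content": "Hi"},
         {"role": "assistant", "content": "Hello! How can I help?"}],
    ]
})

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,  # keep only tokenizer outputs + labels
)
print(tokenized_dataset.column_names)  # e.g. ['input_ids', 'attention_mask', 'labels']

Copying input_ids into labels is the usual setup for causal-LM fine-tuning with transformers, where the model shifts the labels internally; after this change the label copy lives in the local tokenize_function passed to dataset.map rather than inside preprocess_conversations itself.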
|