llaa33219 committed on
Commit cb69d8f · verified · 1 Parent(s): 8900cbf

Upload 4 files

Files changed (1): app.py +10 -5
app.py CHANGED
@@ -63,10 +63,7 @@ def preprocess_conversations(examples, tokenizer):
             text += f"<|assistant|>\n{content}\n"
         texts.append(text)
 
-    # Return tokenized data with labels for language modeling
-    tokenized = tokenizer(texts, truncation=True, max_length=2048, padding=False)
-    tokenized["labels"] = tokenized["input_ids"].copy()
-    return tokenized
+    return tokenizer(texts, truncation=True, max_length=2048, padding=False)
 
 # Persistent storage paths
 CHECKPOINT_DIR = Path("/data/checkpoints") if Path("/data").exists() else Path("./checkpoints")
@@ -135,8 +132,16 @@ def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Pro
 
     # Preprocess dataset
     progress(0.3, desc="Preprocessing dataset...")
+
+    def tokenize_function(examples):
+        # Process conversations
+        processed = preprocess_conversations(examples, tokenizer)
+        # Add labels (copy of input_ids for language modeling)
+        processed["labels"] = [ids[:] for ids in processed["input_ids"]]
+        return processed
+
    tokenized_dataset = dataset.map(
-        lambda x: preprocess_conversations(x, tokenizer),
+        tokenize_function,
         batched=True,
         remove_columns=dataset.column_names
     )
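
For context, here is a minimal self-contained sketch of how the refactored pieces fit together after this commit. The tokenizer choice ("gpt2"), the "conversations" column layout, and the body of preprocess_conversations beyond the lines visible in the diff are assumptions made for illustration, not the repository's actual code:

from datasets import Dataset
from transformers import AutoTokenizer

# Hypothetical tokenizer and toy dataset, chosen only to make the sketch runnable.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

dataset = Dataset.from_dict({
    "conversations": [[
        {"role": "user", "content": "Hi"},
        {"role": "assistant", "content": "Hello!"},
    ]]
})

def preprocess_conversations(examples, tokenizer):
    # Build one prompt string per conversation; the message format here is
    # assumed from the "<|assistant|>" line visible in the diff.
    texts = []
    for conv in examples["conversations"]:
        text = ""
        for msg in conv:
            if msg["role"] == "assistant":
                text += f"<|assistant|>\n{msg['content']}\n"
            else:
                text += f"<|user|>\n{msg['content']}\n"
        texts.append(text)
    # As in the new version: return raw tokenizer output, no labels here
    return tokenizer(texts, truncation=True, max_length=2048, padding=False)

def tokenize_function(examples):
    processed = preprocess_conversations(examples, tokenizer)
    # ids[:] clones each inner list, so labels and input_ids do not alias
    processed["labels"] = [ids[:] for ids in processed["input_ids"]]
    return processed

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)
print(tokenized_dataset.column_names)  # ['input_ids', 'attention_mask', 'labels']

The per-example copy (ids[:]) is the substantive change over the old tokenized["input_ids"].copy(): with batched tokenization, input_ids is a list of lists, and .copy() clones only the outer list, leaving each inner list shared between input_ids and labels. The list comprehension keeps the two independent, which matters if labels are later modified in place (for example, masking positions to -100).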