# Source: Hugging Face Space llaa33219/train3 (status: Paused) - file size: 14,843 bytes

import gradio as gr
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from huggingface_hub import HfApi, login, whoami
import os
from datetime import datetime
import json
import pickle
from pathlib import Path

# Custom Trainer for CoDA model
class CoDATrainer(Trainer):
    """Trainer that pulls a scalar loss out of the model's outputs.

    The original author notes that CoDA returns a dict with a ``'loss'`` key
    instead of a scalar; this subclass also accepts output objects exposing a
    ``.loss`` attribute and, as a last resort, computes a cross-entropy loss
    from the logits and the batch labels.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss for one batch.

        Args:
            model: Callable model; invoked as ``model(**inputs)``.
            inputs: Mapping of batch tensors passed to the model as keywords.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            A zero-dimensional loss tensor, or ``(loss, outputs)`` when
            ``return_outputs`` is true.

        Raises:
            ValueError: If the model reports no loss and the fallback cannot
                be computed because labels or logits are missing.
        """
        outputs = model(**inputs)

        # Prefer the loss reported by the model itself.
        if isinstance(outputs, dict):
            loss = outputs.get('loss')
        else:
            loss = getattr(outputs, 'loss', None)

        if loss is None:
            # Fallback: cross-entropy between the logits and the labels.
            # A missing or None loss lands here instead of failing later on
            # ``loss.dim()`` with an unhelpful AttributeError.
            labels = inputs.get('labels')
            if labels is None:
                raise ValueError(
                    "Model returned no loss and the batch has no 'labels' to compute one from"
                )
            logits = outputs.get('logits') if isinstance(outputs, dict) else outputs[0]
            if logits is None:
                raise ValueError("Model returned neither a loss nor logits")
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Reduce per-element / per-device losses to a scalar.
        if loss.dim() > 0:
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss

def preprocess_conversations(examples, tokenizer):
    """Render a batch of conversations as tagged text and tokenize it.

    Each conversation is a list of message dicts. A message is read as
    ``{"role": ..., "content": ...}``; when the ``role`` / ``content`` key is
    absent, the ShareGPT-style ``from`` / ``value`` key is used instead.
    Roles ``user``/``human`` are rendered under ``<|user|>`` and
    ``assistant``/``gpt`` under ``<|assistant|>``. Messages with any other
    role are skipped.

    Args:
        examples: Batch mapping with a ``'conversations'`` entry holding one
            list of message dicts per example.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, max_length=2048, padding=False)``.

    Returns:
        Whatever ``tokenizer`` returns for the rendered texts.

    Raises:
        ValueError: If a conversation is not a list or a message is not a dict.
    """
    user_roles = ('user', 'human')
    assistant_roles = ('assistant', 'gpt')

    texts = []
    for conv in examples['conversations']:
        if not isinstance(conv, list):
            raise ValueError(f"Expected conversation to be a list, got {type(conv)}")

        # Collect the pieces and join once instead of repeated ``+=``.
        parts = []
        for message in conv:
            if not isinstance(message, dict):
                raise ValueError(f"Expected message to be a dict, got {type(message)}")

            # Key-presence checks keep the original behavior whenever the
            # role/content keys exist, even if their values are empty.
            role = message['role'] if 'role' in message else message.get('from', '')
            content = message['content'] if 'content' in message else message.get('value', '')

            if role in user_roles:
                parts.append(f"<|user|>\n{content}\n")
            elif role in assistant_roles:
                parts.append(f"<|assistant|>\n{content}\n")
        texts.append("".join(parts))

    return tokenizer(texts, truncation=True, max_length=2048, padding=False)

# Storage locations: use the /data volume when it is present, otherwise
# fall back to directories relative to the current working directory.
CHECKPOINT_DIR, MODEL_DIR = (
    (Path("/data/checkpoints"), Path("/data/models"))
    if Path("/data").exists()
    else (Path("./checkpoints"), Path("./models"))
)

# Create both directories up front so later writes cannot fail on a missing parent.
CHECKPOINT_DIR.mkdir(exist_ok=True, parents=True)
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# Pickled training-progress snapshot, kept alongside the checkpoints.
STATE_FILE = CHECKPOINT_DIR.joinpath("training_state.pkl")

def save_training_state(state):
    """Pickle ``state`` to ``STATE_FILE``.

    The data is written to a temporary sibling file and then renamed over
    ``STATE_FILE``, so an interruption in the middle of the write does not
    leave a truncated state file behind (the rename is atomic on POSIX).

    Args:
        state: Picklable object describing the training progress.

    Raises:
        OSError: If the file cannot be written or renamed.
        pickle.PicklingError: If ``state`` cannot be pickled.
    """
    tmp_file = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    try:
        with open(tmp_file, 'wb') as f:
            pickle.dump(state, f)
        tmp_file.replace(STATE_FILE)
    except BaseException:
        # Do not leave a half-written temporary file lying around.
        tmp_file.unlink(missing_ok=True)
        raise

def load_training_state():
    """Load the pickled training state from ``STATE_FILE``.

    Returns:
        The unpickled object, or ``None`` when ``STATE_FILE`` does not exist
        or its contents cannot be unpickled (for example a truncated file).
    """
    # NOTE(security): pickle.load can execute arbitrary code. This is only
    # acceptable because STATE_FILE is expected to be written by this
    # application itself; never point it at untrusted data.
    try:
        with open(STATE_FILE, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return None
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError):
        # A corrupt state file is treated as "no saved state" so that a
        # resume attempt is not aborted by an unreadable snapshot.
        return None

def _find_latest_checkpoint(output_dir):
    """Return the highest-numbered ``checkpoint-<step>`` directory in ``output_dir``, or ``None``."""
    numbered = []
    for candidate in Path(output_dir).glob("checkpoint-*"):
        step = candidate.name.rpartition("-")[2]
        # Skip stray entries such as "checkpoint-tmp" instead of crashing on int().
        if candidate.is_dir() and step.isdigit():
            numbered.append((int(step), candidate))
    if not numbered:
        return None
    return max(numbered, key=lambda pair: pair[0])[1]


def _last_logged_loss(log_history):
    """Return the most recent ``'loss'`` value in ``log_history``, or ``'N/A'`` if there is none."""
    for entry in reversed(log_history):
        if 'loss' in entry:
            return entry['loss']
    return 'N/A'


def train_model(epochs, batch_size, learning_rate, resume=False, progress=gr.Progress()):
    """Fine-tune ``Salesforce/CoDA-v0-Instruct`` and return a status message.

    Loads the model and the
    ``baseten-admin/gpt-oss120b-generated-perfectblend`` dataset, tokenizes
    its ``conversations`` column, holds out 10% of the examples for
    evaluation, trains with ``CoDATrainer`` and saves the model and tokenizer
    under ``MODEL_DIR / "coda-finetuned"``.

    Args:
        epochs: Number of training epochs.
        batch_size: Per-device batch size used for training and evaluation.
        learning_rate: Optimizer learning rate.
        resume: When true, continue from the newest ``checkpoint-<step>``
            directory in the output directory, if one exists.
        progress: Gradio progress tracker used to report status.

    Returns:
        A human-readable status string. Failures are reported in the returned
        string rather than raised, because the result is displayed in the UI.
    """
    try:
        # Check for existing training state
        if resume:
            saved_state = load_training_state()
            if saved_state:
                progress(0, desc=f"Resuming from step {saved_state.get('step', 0)}...")

        progress(0, desc="Initializing training...")

        # Training is refused without a GPU, so everything below runs on CUDA.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cpu":
            return "⚠️ Warning: Training on CPU will be very slow. Please upgrade Space to GPU."

        progress(0.1, desc="Loading model and tokenizer...")

        # Load model and tokenizer
        # Note: Using Instruct version which is better for fine-tuning
        model_name = "Salesforce/CoDA-v0-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        # Keep the weights in float32: fp16 mixed precision (fp16=True below)
        # needs full-precision master weights. With float16 weights the
        # gradient scaler fails with "Attempting to unscale FP16 gradients".
        # This costs more GPU memory than loading the weights in half precision.
        model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float32
        )

        # Move model to device (CoDA doesn't support device_map='auto')
        model = model.to(device)

        # Set pad token if not exists
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.eos_token_id

        progress(0.2, desc="Loading dataset...")

        # Load dataset
        dataset = load_dataset("baseten-admin/gpt-oss120b-generated-perfectblend", split="train")

        # Verify dataset has conversations column
        if 'conversations' not in dataset.column_names:
            return f"❌ Error: Dataset does not have 'conversations' column. Found columns: {dataset.column_names}"

        # Preprocess dataset
        progress(0.3, desc="Preprocessing dataset...")

        tokenized_dataset = dataset.map(
            lambda x: preprocess_conversations(x, tokenizer),
            batched=True,
            remove_columns=dataset.column_names
        )

        # Split into train/eval
        train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
        train_dataset = train_test_split['train']
        eval_dataset = train_test_split['test']

        progress(0.4, desc="Setting up training configuration...")

        # UI sliders may deliver numbers as floats; batch sizes must be ints.
        batch_size = int(batch_size)

        # Training arguments - use persistent storage
        output_dir = str(MODEL_DIR / "coda-finetuned")
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            warmup_steps=100,
            logging_steps=5,  # More frequent logging
            logging_first_step=True,
            eval_strategy="steps",
            eval_steps=100,
            save_strategy="steps",
            save_steps=500,
            save_total_limit=2,
            fp16=(device == "cuda"),
            gradient_accumulation_steps=4,
            gradient_checkpointing=False,  # CoDA doesn't support gradient checkpointing
            optim="adamw_torch",
            report_to="none",
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
        )

        # Collator with mlm=False: labels are a copy of the input ids.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        )

        # Initialize trainer with custom loss
        trainer = CoDATrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
        )

        progress(0.5, desc=f"Training for {epochs} epochs...")

        from transformers import TrainerCallback

        class GradioProgressCallback(TrainerCallback):
            """Forward Trainer log events to the Gradio progress bar."""

            def __init__(self, progress_fn):
                self.progress_fn = progress_fn

            def on_log(self, args, state, control, logs=None, **kwargs):
                if logs and state.max_steps > 0:
                    log_str = f"Step {state.global_step}/{state.max_steps}: "
                    if 'loss' in logs:
                        log_str += f"loss={logs['loss']:.4f} "
                    if 'learning_rate' in logs:
                        log_str += f"lr={logs['learning_rate']:.2e}"
                    # Training occupies the 0.5-0.9 band of the progress bar.
                    fraction = 0.5 + (0.4 * state.global_step / state.max_steps)
                    self.progress_fn(fraction, desc=log_str)

        class StateSavingCallback(TrainerCallback):
            """Record step/epoch/best metric whenever a checkpoint is saved."""

            def on_save(self, args, state, control, **kwargs):
                save_training_state({
                    'step': state.global_step,
                    'epoch': state.epoch,
                    'best_metric': state.best_metric
                })

        trainer.add_callback(GradioProgressCallback(progress))
        trainer.add_callback(StateSavingCallback())

        # Resume from checkpoint if exists
        resume_from_checkpoint = None
        if resume:
            latest_checkpoint = _find_latest_checkpoint(output_dir)
            if latest_checkpoint is not None:
                resume_from_checkpoint = str(latest_checkpoint)
                progress(0, desc=f"Resuming from {latest_checkpoint.name}...")

        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

        progress(0.9, desc="Saving model...")

        # Save final model
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        progress(1.0, desc="Training complete!")

        # The last log entry is the end-of-training summary, which has no
        # 'loss' key, so look back for the most recent step that logged one.
        final_loss = _last_logged_loss(trainer.state.log_history)
        return f"✅ Training completed successfully!\nModel saved to: {output_dir}\n\nFinal training loss: {final_loss}"

    except Exception as e:
        # UI boundary: surface the failure as text instead of a stack trace.
        return f"❌ Error during training: {str(e)}"

def upload_to_hub(repo_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
    """Upload the fine-tuned model directory to the logged-in user's Hub account.

    Args:
        repo_name: Name of the target model repository (without the user
            prefix). When empty, a timestamped name is generated.
        oauth_token: OAuth token of the logged-in user, or ``None`` when
            nobody is logged in.
        progress: Gradio progress tracker used to report status.

    Returns:
        A human-readable status string. Failures are reported in the returned
        string rather than raised, because the result is displayed in the UI.
    """
    try:
        if oauth_token is None:
            return "❌ Please login first to upload the model!"

        # Look in the directory train_model saves to, and do so before any
        # repository is created so a missing model leaves nothing behind.
        model_dir = MODEL_DIR / "coda-finetuned"
        if not model_dir.is_dir():
            return "❌ No trained model found! Please train a model first."

        progress(0, desc="Authenticating...")

        # The token is passed explicitly to every Hub call. A process-wide
        # login() is deliberately avoided: it would store one visitor's
        # credentials for everything else running in this process.
        token = oauth_token.token
        user_info = whoami(token=token)
        username = user_info['name']

        progress(0.2, desc="Preparing model for upload...")

        # Full repo ID
        repo_name = (repo_name or "").strip()
        if not repo_name:
            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            repo_name = f"coda-finetuned-{timestamp}"

        repo_id = f"{username}/{repo_name}"

        progress(0.3, desc=f"Creating repository {repo_id}...")

        # Create repo
        api = HfApi()
        api.create_repo(repo_id=repo_id, exist_ok=True, token=token, repo_type="model")

        progress(0.5, desc="Uploading model files...")

        # Upload the final model only; intermediate checkpoint-<step>
        # directories written during training are left out.
        api.upload_folder(
            folder_path=str(model_dir),
            repo_id=repo_id,
            repo_type="model",
            token=token,
            ignore_patterns=["checkpoint-*/*"],
        )

        progress(1.0, desc="Upload complete!")

        return f"✅ Model successfully uploaded to: https://huggingface.co/{repo_id}"

    except Exception as e:
        # UI boundary: surface the failure as text instead of a stack trace.
        return f"❌ Error during upload: {str(e)}"

# Gradio UI: login, training configuration, training trigger and model upload.
with gr.Blocks(title="CoDA Fine-tuning Space") as demo:
    gr.Markdown("""
    # 🚀 CoDA Model Fine-tuning Space
    
    This Space fine-tunes the **Salesforce/CoDA-v0-Instruct** diffusion model on the **baseten-admin/gpt-oss120b-generated-perfectblend** dataset.
    
    ### Steps:
    1. **Login** with your Hugging Face account (required for upload)
    2. **Configure** training parameters
    3. **Train** the model (requires GPU - upgrade Space if needed)
    4. **Upload** the trained model to your account
    
    ⚠️ **Note**: 
    - Full fine-tuning requires significant GPU resources. Training may take several hours.
    - **Checkpoints are saved every 500 steps** - you can resume if interrupted.
    - For Docker: Mount `/data` volume for full persistence across container restarts.
    - On Spaces: Checkpoints persist in the same session and across rebuilds with persistent storage.
    """)
    
    with gr.Row():
        login_button = gr.LoginButton()
    
    gr.Markdown("## Training Configuration")
    
    with gr.Row():
        with gr.Column():
            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Epochs")
            batch_size = gr.Slider(minimum=1, maximum=8, value=2, step=1, label="Batch Size per Device")
            learning_rate = gr.Slider(minimum=1e-6, maximum=1e-4, value=2e-5, step=1e-6, label="Learning Rate", info="Default: 2e-5")
            resume_training = gr.Checkbox(label="Resume from last checkpoint", value=False, info="Check if training was interrupted")
    
    with gr.Row():
        train_button = gr.Button("🎯 Start Training", variant="primary", size="lg")
    
    training_output = gr.Textbox(label="Training Status", lines=5)
    
    gr.Markdown("## Upload Trained Model")
    
    with gr.Row():
        repo_name = gr.Textbox(label="Model Repository Name", placeholder="coda-finetuned-v1", info="Leave empty for auto-generated name")
    
    with gr.Row():
        upload_button = gr.Button("📤 Upload to Hugging Face Hub", variant="secondary", size="lg")
    
    upload_output = gr.Textbox(label="Upload Status", lines=3)
    
    gr.Markdown("""
    ---
    ### About
    
    **CoDA (Code Diffusion with Autoregressive)** is a 1.7B parameter bidirectional diffusion model for text generation.
    This Space performs full fine-tuning on conversational data in ChatML format.
    
    **Dataset**: The training uses the `conversations` column from the dataset, which contains question-answer pairs.
    
    **Hardware**: GPU (T4 or better) is strongly recommended. CPU training will be extremely slow.
    """)
    
    # Event handlers
    train_button.click(
        fn=train_model,
        inputs=[epochs, batch_size, learning_rate, resume_training],
        outputs=training_output
    )
    
    # Only the repository name is a real input: Gradio fills the
    # `oauth_token: gr.OAuthToken | None` parameter from the type annotation,
    # so listing the login button here would pass one argument too many.
    upload_button.click(
        fn=upload_to_hub,
        inputs=[repo_name],
        outputs=upload_output
    )

if __name__ == "__main__":
    demo.launch()