Upload app.py
app.py CHANGED
@@ -890,29 +890,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     progress(0.15, desc="Creating configuration...")
     config_path = create_config_file(model, work_dir)
 
-    # Run initial evaluation
-    # For small datasets (like AIME with 30 instances), use all available
-    # For larger datasets, use 50 samples for quick baseline
+    # Run initial evaluation with 50 samples
     # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-
-    # Load dataset to check size
-    try:
-        temp_dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
-    except ValueError as e:
-        if "config" in str(e).lower():
-            default_config = "main"
-            if dataset_name.lower() == "glue":
-                default_config = "sst2"
-            temp_dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
-        else:
-            raise
-
-    dataset_size = len(temp_dataset)
-    eval_samples = min(dataset_size, 50)  # Use all if dataset has ≤50, else use 50
-
-    progress(0.2, desc=f"Running initial evaluation on {eval_samples} samples (dataset has {dataset_size} total)...")
+    progress(0.2, desc="Running initial evaluation on 50 samples...")
     initial_eval = evaluate_prompt(
-        initial_prompt, dataset_name, dataset_split,
+        initial_prompt, dataset_name, dataset_split, 50,
         model, input_field, target_field
     )
 
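The hunk above fixes the initial evaluation budget at 50 samples and notes that the sampled indices are saved for reuse. A minimal sketch of how a reproducible index set could be drawn (the helper name `select_eval_indices` and the seed value are assumptions, not code from app.py):

```python
import random

def select_eval_indices(dataset_size: int, n_samples: int = 50, seed: int = 42) -> list[int]:
    """Pick a reproducible subset of dataset indices to evaluate on."""
    rng = random.Random(seed)
    return sorted(rng.sample(range(dataset_size), min(n_samples, dataset_size)))

# Drawing the indices once and passing them to both the initial and the final
# evaluation keeps the before/after accuracy numbers directly comparable.
eval_indices = select_eval_indices(dataset_size=25_000, n_samples=50)
```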
@@ -1007,66 +989,60 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     else:
         best_prompt = initial_prompt
 
-    #
-    #
-
-    progress(0.85, desc=f"Evaluating best prompt on {eval_samples} samples...")
+    # Two-stage final evaluation: 50 → 200 samples
+    # Stage 1: Evaluate on same 50 samples as initial eval
+    progress(0.85, desc="Stage 1: Evaluating on 50 samples...")
     final_eval = evaluate_prompt(
-        best_prompt, dataset_name, dataset_split,
+        best_prompt, dataset_name, dataset_split, 50,
         model, input_field, target_field,
         fixed_indices=eval_indices  # Use same samples as initial eval!
     )
 
-    # Stage 2:
-
-    progress(0.90, desc="Stage 2: Accuracy >80%! Evaluating 50 more samples...")
+    # Stage 2: Continue to 200 total samples (add 150 more)
+    progress(0.90, desc="Stage 2: Evaluating 150 more samples (200 total)...")
 
-        # Get 50 additional indices (different from initial 50)
-        import random
-        random.seed(42)
-        all_indices = list(range(len(dataset)))
-        remaining_indices = [i for i in all_indices if i not in eval_indices]
-
-        if len(remaining_indices) >= 50:
-            additional_indices = random.sample(remaining_indices, 50)
-
-            # Evaluate on additional 50 samples
-            additional_eval = evaluate_prompt(
-                best_prompt, dataset_name, dataset_split, 50,
-                model, input_field, target_field,
-                fixed_indices=additional_indices
-            )
-
-            # Combine results from both stages
-            combined_correct = final_eval['correct'] + additional_eval['correct']
-            combined_total = final_eval['total'] + additional_eval['total']
-            combined_accuracy = (combined_correct / combined_total * 100) if combined_total > 0 else 0
-
-            final_eval = {
-                'accuracy': combined_accuracy,
-                'correct': combined_correct,
-                'total': combined_total,
-                'results': final_eval['results'] + additional_eval['results']
-            }
-            progress(0.95, desc=f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
+    # Load dataset to get additional samples
+    try:
+        dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
+    except ValueError as e:
+        if "config" in str(e).lower():
+            default_config = "main"
+            if dataset_name.lower() == "glue":
+                default_config = "sst2"
+            dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
+        else:
+            raise
+
+    # Get 150 additional indices (different from initial 50)
+    import random
+    random.seed(42)
+    all_indices = list(range(len(dataset)))
+    remaining_indices = [i for i in all_indices if i not in eval_indices]
+
+    if len(remaining_indices) >= 150:
+        additional_indices = random.sample(remaining_indices, 150)
+
+        # Evaluate on additional 150 samples
+        additional_eval = evaluate_prompt(
+            best_prompt, dataset_name, dataset_split, 150,
+            model, input_field, target_field,
+            fixed_indices=additional_indices
+        )
+
+        # Combine results from both stages
+        combined_correct = final_eval['correct'] + additional_eval['correct']
+        combined_total = final_eval['total'] + additional_eval['total']
+        combined_accuracy = (combined_correct / combined_total * 100) if combined_total > 0 else 0
+
+        final_eval = {
+            'accuracy': combined_accuracy,
+            'correct': combined_correct,
+            'total': combined_total,
+            'results': final_eval['results'] + additional_eval['results']
+        }
+        progress(0.95, desc=f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
     else:
-        progress(0.90, desc=f"Stage 1
+        progress(0.90, desc=f"Not enough samples for Stage 2, using Stage 1 results ({final_eval['correct']}/{final_eval['total']})")
 
     final_results = f"""
 ### Evolved Prompt Evaluation
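To make the two-stage arithmetic in the hunk above concrete, here is the combine step run on illustrative numbers (42/50 and 120/150 are invented, not results from the app):

```python
# Hypothetical stage results, shaped like the evaluate_prompt result dicts above.
stage1 = {"correct": 42, "total": 50}     # Stage 1: the 50 fixed samples
stage2 = {"correct": 120, "total": 150}   # Stage 2: the 150 additional samples

combined_correct = stage1["correct"] + stage2["correct"]      # 162
combined_total = stage1["total"] + stage2["total"]            # 200
combined_accuracy = combined_correct / combined_total * 100   # 81.0

print(f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
```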
@@ -1094,10 +1070,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
-- **Dataset Size**: {dataset_size} instances
 - **Model**: {model}
-- **
-- **
+- **Initial Eval**: 50 samples
+- **Final Eval**: 50 samples → 200 samples (two-stage)
+- **Evolution**: Staged (50 → 200 if score ≥ 0.5)
 - **Iterations**: 10
 
 ### Results
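The "Staged (50 → 200 if score ≥ 0.5)" summary line describes a gate: spend the larger evaluation budget only on candidates that clear a threshold on the cheap pass. A hedged sketch of that idea, with `evaluate_on` standing in for whatever scoring callable the evolution loop uses:

```python
def staged_score(prompt: str, evaluate_on, threshold: float = 0.5) -> float:
    """Two-budget evaluation: a cheap 50-sample screen, then 200 samples if promising."""
    quick = evaluate_on(prompt, n_samples=50)    # cheap screening pass
    if quick < threshold:
        return quick                             # weak candidate: stop early, save API calls
    return evaluate_on(prompt, n_samples=200)    # promising candidate: full-budget score
```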
@@ -1133,12 +1109,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
 ## How it works:
 1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-2. Default dataset is **
-3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `
+2. Default dataset is **IMDB** (movie review sentiment classification) - great for showing prompt improvement!
+3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `dair-ai/emotion`)
 4. Choose a free model from OpenRouter
 5. Click "Optimize Prompt" - the system will validate everything first!
 6. Watch the evolution progress in real-time
-7. Compare initial vs. best prompt side-by-side (
+7. Compare initial vs. best prompt side-by-side (50 samples → 200 samples for final evaluation)!
 
 **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
 """)
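Step 1's `{input}` placeholder is a literal substring that gets swapped for each dataset row's input field. A small sketch (the review text is invented; using `str.replace` rather than `str.format` is one safe choice, since real reviews may contain braces):

```python
prompt_template = (
    "Classify the sentiment of this review as positive or negative.\n\n"
    "Review: {input}\n\nSentiment:"
)
row = {"text": "A slow start, but the last act is genuinely moving.", "label": 1}

# str.replace sidesteps str.format's KeyError when a review itself contains braces.
filled_prompt = prompt_template.replace("{input}", row["text"])
print(filled_prompt)
```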
@@ -1156,47 +1132,36 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
-                value="
-                placeholder="e.g.,
-                info="Dataset name from HuggingFace Hub. Default:
+                value="stanfordnlp/imdb",
+                placeholder="e.g., stanfordnlp/imdb, gsm8k, MathArena/aime_2025",
+                info="Dataset name from HuggingFace Hub. Default: IMDB (sentiment classification)"
             )
 
             dataset_split = gr.Textbox(
                 label="Dataset Split",
-                value="
+                value="test",
                 placeholder="e.g., train, test, validation"
             )
 
             input_field = gr.Textbox(
                 label="Input Field Name",
-                value="
-                placeholder="e.g.,
+                value="text",
+                placeholder="e.g., text, question, sentence",
                 info="The field containing inputs to process"
             )
 
             target_field = gr.Textbox(
                 label="Target Field Name",
-                value="
-                placeholder="e.g.,
+                value="label",
+                placeholder="e.g., label, answer, target",
                 info="The field containing expected outputs"
             )
 
             initial_prompt = gr.TextArea(
                 label="Initial Prompt",
-                value="
-
-
-
-Instructions:
-1. Read the problem carefully
-2. Identify what is being asked
-3. Break down the solution into steps
-4. Show your work clearly
-5. Provide the final numerical answer after ####
-
-Solution:""",
-                lines=10,
-                info="Use {input} as placeholder for dataset inputs. Chain-of-thought prompting works best for AIME!"
+                value="Classify the sentiment of this review as positive or negative.\n\nReview: {input}\n\nSentiment:",
+                lines=5,
+                info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
             )
 
             # Button outside the column for better visibility
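Because the split and field names above are free-form text, a pre-flight check along these lines is worth running before any API calls. A minimal sketch assuming the `datasets` library and a public dataset (the helper name is hypothetical):

```python
from datasets import load_dataset

def validate_dataset_fields(dataset_name: str, split: str, input_field: str, target_field: str) -> None:
    """Fail fast if the requested split or field names do not exist."""
    ds = load_dataset(dataset_name, split=split, streaming=True)
    first_row = next(iter(ds))
    missing = [field for field in (input_field, target_field) if field not in first_row]
    if missing:
        raise ValueError(f"{dataset_name}[{split}] is missing fields: {missing}")

validate_dataset_fields("stanfordnlp/imdb", "test", "text", "label")
```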
@@ -1230,11 +1195,11 @@ Solution:""",
 
 | Dataset | Split | Input Field | Target Field | Task | Size |
 |---------|-------|-------------|--------------|------|------|
-| **
-| gsm8k | train | question | answer | Grade School Math | 7,473 |
-| stanfordnlp/imdb | test | text | label | Sentiment Analysis | 25,000 |
+| **stanfordnlp/imdb** | test | text | label | Sentiment Analysis | 25,000 |
 | dair-ai/emotion | test | text | label | Emotion Classification | 2,000 |
+| gsm8k | train | question | answer | Grade School Math | 7,473 |
 | fancyzhx/ag_news | test | text | label | News Classification | 7,600 |
+| MathArena/aime_2025 | train | problem | answer | Hard Math (AIME) | 30 |
 
 ### About This Demo Space:
 
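The table above pairs each suggested dataset with the field names the form expects. Expressed as data, the same mapping might look like this (values copied from the table; the dictionary itself is illustrative, not part of app.py):

```python
# Field mapping for the suggested datasets (values taken from the table above).
SUGGESTED_DATASETS = {
    "stanfordnlp/imdb":    {"split": "test",  "input_field": "text",     "target_field": "label"},
    "dair-ai/emotion":     {"split": "test",  "input_field": "text",     "target_field": "label"},
    "gsm8k":               {"split": "train", "input_field": "question", "target_field": "answer"},
    "fancyzhx/ag_news":    {"split": "test",  "input_field": "text",     "target_field": "label"},
    "MathArena/aime_2025": {"split": "train", "input_field": "problem",  "target_field": "answer"},
}
```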
@@ -1253,12 +1218,11 @@ Solution:""",
    - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
 4. **Run & Monitor**:
    - All inputs are validated before starting
-   - **
-   - Compare initial vs best prompt side-by-side
+   - **Evaluation strategy**:
+     - Initial evaluation: 50 samples (quick baseline)
+     - Final evaluation: 50 → 200 samples (two-stage for accuracy)
+     - Evolution: Staged (50 → 200 if score ≥ 0.5 to save API calls)
+   - Compare initial vs best prompt side-by-side with full results
 
 ### About OpenEvolve:
 OpenEvolve is an open-source evolutionary optimization framework. Learn more at: