codelion committed
Commit e51517c · verified
1 Parent(s): 3c1041b

Upload app.py

Files changed (1)
  1. app.py +281 -81
app.py CHANGED
@@ -3,12 +3,14 @@ import os
3
  import yaml
4
  import json
5
  import random
6
- from datasets import load_dataset
7
  from openai import OpenAI
8
  from openevolve import run_evolution
9
- from typing import Dict, List, Tuple
10
  import tempfile
11
  import shutil
12
 
13
  # Free models from OpenRouter (as of 2025)
14
  FREE_MODELS = [
@@ -22,34 +24,118 @@ FREE_MODELS = [
22
  "mistralai/mistral-7b-instruct:free",
23
  ]
24
 
25
- # Popular HuggingFace datasets for different tasks
26
- SAMPLE_DATASETS = {
27
- "Question Answering": [
28
- "hotpot_qa",
29
- "squad",
30
- "trivia_qa",
31
- ],
32
- "Sentiment Analysis": [
33
- "imdb",
34
- "yelp_review_full",
35
- "emotion",
36
- ],
37
- "Text Classification": [
38
- "ag_news",
39
- "dbpedia_14",
40
- "SetFit/sst5",
41
- ],
42
- "Math Reasoning": [
43
- "gsm8k",
44
- "math_qa",
45
- ],
46
- }
47
 
48
 
49
  def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
50
- api_key: str, model: str, input_field: str, target_field: str) -> Dict:
51
  """Evaluate a prompt on a dataset using the selected model."""
52
  try:
53
  # Load dataset
54
  dataset = load_dataset(dataset_name, split=split, streaming=False)
55
 
@@ -104,9 +190,9 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
104
  total += 1
105
 
106
  results.append({
107
- "input": str(input_text)[:100] + "...",
108
  "target": str(target),
109
- "prediction": prediction[:100] + "...",
110
  "correct": is_correct
111
  })
112
 
@@ -133,6 +219,96 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
133
  }
134
 
135
 
136
  def create_evaluator_file(dataset_name: str, split: str, model: str,
137
  input_field: str, target_field: str, work_dir: str):
138
  """Create an evaluator.py file for OpenEvolve."""
@@ -248,17 +424,21 @@ def create_config_file(model: str, work_dir: str):
248
 
249
 
250
  def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
251
- model: str, api_key: str, input_field: str, target_field: str,
252
- progress=gr.Progress()) -> Tuple[str, str, str]:
253
  """Run OpenEvolve to optimize the prompt."""
254
 
255
- if not api_key:
256
- return "Error: OpenAI API Key is required", "", ""
257
 
258
- # Set API key as environment variable
259
- os.environ["OPENAI_API_KEY"] = api_key
260
 
261
- progress(0, desc="Setting up...")
262
 
263
  # Create temporary working directory
264
  work_dir = tempfile.mkdtemp(prefix="openevolve_")
@@ -275,16 +455,19 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
275
  input_field, target_field, work_dir)
276
 
277
  # Create config
278
- progress(0.2, desc="Creating configuration...")
279
  config_path = create_config_file(model, work_dir)
280
 
281
  # Run initial evaluation
282
- progress(0.3, desc="Running initial evaluation...")
283
  initial_eval = evaluate_prompt(
284
  initial_prompt, dataset_name, dataset_split, 100,
285
- api_key, model, input_field, target_field
286
  )
287
 
288
  initial_results = f"""
289
  ### Initial Prompt Evaluation
290
 
@@ -306,7 +489,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
306
  initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
307
 
308
  # Run OpenEvolve
309
- progress(0.4, desc="Running OpenEvolve (this may take several minutes)...")
310
 
311
  output_dir = os.path.join(work_dir, "output")
312
  os.makedirs(output_dir, exist_ok=True)
@@ -321,7 +504,12 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
321
  verbose=True
322
  )
323
 
324
- progress(0.8, desc="Evaluating best prompt...")
325
 
326
  # Get the best prompt
327
  best_prompt_path = os.path.join(output_dir, "best_program.txt")
@@ -334,7 +522,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
334
  # Evaluate best prompt
335
  final_eval = evaluate_prompt(
336
  best_prompt, dataset_name, dataset_split, 100,
337
- api_key, model, input_field, target_field
338
  )
339
 
340
  final_results = f"""
@@ -348,7 +536,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
348
  **Results:**
349
  - Accuracy: {final_eval['accuracy']:.2f}%
350
  - Correct: {final_eval['correct']}/{final_eval['total']}
351
- - Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%
352
 
353
  **Sample Results:**
354
  """
@@ -359,24 +547,28 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
359
  final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
360
 
361
  summary = f"""
362
- ## Optimization Complete!
363
 
364
  ### Summary
365
- - Initial Accuracy: {initial_eval['accuracy']:.2f}%
366
- - Final Accuracy: {final_eval['accuracy']:.2f}%
367
- - Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%
368
- - Dataset: {dataset_name}
369
- - Model: {model}
370
- - Samples Evaluated: 100
371
- - Iterations: 10
372
  """
373
 
374
  progress(1.0, desc="Complete!")
375
 
376
- return summary, initial_results, final_results
377
 
378
  except Exception as e:
379
- return f"Error during evolution: {str(e)}", initial_results, ""
380
 
381
  finally:
382
  # Clean up
@@ -393,28 +585,25 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
393
 
394
  Automatically evolve and optimize your prompts using evolutionary algorithms!
395
 
396
- This space uses [OpenEvolve](https://github.com/codelion/openevolve) to iteratively improve prompts
397
  by testing them on real datasets and evolving better versions.
398
 
399
  ## How it works:
400
  1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
401
- 2. Select a HuggingFace dataset to test on
402
- 3. Choose a free model from OpenRouter
403
- 4. Click "Optimize Prompt" to evolve better versions
404
- 5. Compare initial vs. evolved performance!
405
  """)
406
 
407
  with gr.Row():
408
  with gr.Column():
409
  gr.Markdown("### Configuration")
410
 
411
- api_key = gr.Textbox(
412
- label="OpenAI API Key (for OpenRouter)",
413
- type="password",
414
- placeholder="sk-or-v1-...",
415
- info="Get your free key at https://openrouter.ai/keys"
416
- )
417
-
418
  model = gr.Dropdown(
419
  choices=FREE_MODELS,
420
  value=FREE_MODELS[0],
@@ -423,10 +612,10 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
423
  )
424
 
425
  dataset_name = gr.Textbox(
426
- label="HuggingFace Dataset",
427
- value="imdb",
428
- placeholder="e.g., imdb, hotpot_qa, gsm8k",
429
- info="Any dataset from HuggingFace Hub"
430
  )
431
 
432
  dataset_split = gr.Textbox(
@@ -456,7 +645,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
456
  info="Use {input} as placeholder for dataset inputs"
457
  )
458
 
459
- optimize_btn = gr.Button("🚀 Optimize Prompt", variant="primary", size="lg")
460
 
461
  with gr.Row():
462
  with gr.Column():
@@ -468,29 +657,40 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
468
  with gr.Column():
469
  final_results = gr.Markdown(label="Evolved Results")
470
 
 
471
  gr.Markdown("""
472
  ### Example Datasets & Fields:
473
 
474
  | Dataset | Split | Input Field | Target Field | Task |
475
  |---------|-------|-------------|--------------|------|
476
- | imdb | test | text | label | Sentiment Analysis |
477
- | hotpot_qa | validation | question | answer | Question Answering |
478
- | emotion | test | text | label | Emotion Classification |
479
- | gsm8k | test | question | answer | Math Reasoning |
480
- | ag_news | test | text | label | News Classification |
481
-
482
- ### Notes:
483
- - Evolution runs for 10 iterations with 1 island
484
- - Each evaluation uses 100 random samples from the dataset
485
- - The process may take 5-15 minutes depending on the dataset and model
486
- - Make sure your API key has sufficient credits for the requests
487
  """)
488
 
489
  optimize_btn.click(
490
  fn=optimize_prompt,
491
- inputs=[initial_prompt, dataset_name, dataset_split, model, api_key,
492
  input_field, target_field],
493
- outputs=[summary, initial_results, final_results]
494
  )
495
 
496
  if __name__ == "__main__":
 
3
  import yaml
4
  import json
5
  import random
6
+ from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
7
  from openai import OpenAI
8
  from openevolve import run_evolution
9
+ from typing import Dict, List, Tuple, Optional
10
  import tempfile
11
  import shutil
12
+ import requests
13
+ import glob
14
 
15
  # Free models from OpenRouter (as of 2025)
16
  FREE_MODELS = [
 
24
  "mistralai/mistral-7b-instruct:free",
25
  ]
26
 
27
+
28
+ def validate_dataset(dataset_name: str, split: str, input_field: str, target_field: str) -> Tuple[bool, str]:
29
+ """
30
+ Validate that the dataset exists and has the required fields.
31
+
32
+ Returns:
33
+ Tuple of (is_valid, error_message)
34
+ """
35
+ try:
36
+ # Check if dataset name has correct format (should be org/name or just name)
37
+ if not dataset_name or dataset_name.strip() == "":
38
+ return False, "❌ Dataset name cannot be empty"
39
+
40
+ dataset_name = dataset_name.strip()
41
+
42
+ # Try to get dataset info from HuggingFace API
43
+ hf_token = os.environ.get("HF_TOKEN", None)
44
+ headers = {}
45
+ if hf_token:
46
+ headers["Authorization"] = f"Bearer {hf_token}"
47
+
48
+ # Check if dataset exists on HuggingFace Hub
49
+ api_url = f"https://huggingface.co/api/datasets/{dataset_name}"
50
+ response = requests.get(api_url, headers=headers, timeout=10)
51
+
52
+ if response.status_code == 404:
53
+ return False, f"❌ Dataset '{dataset_name}' not found on HuggingFace Hub. Please use the full dataset name (e.g., 'stanfordnlp/imdb' or 'gsm8k')"
54
+ elif response.status_code != 200:
55
+ # Try to load anyway - might be a private dataset or API issue
56
+ print(f"Warning: Could not verify dataset via API (status {response.status_code}), attempting to load...")
57
+
58
+ # Try to load a small sample to verify it works and check fields
59
+ print(f"Loading dataset {dataset_name} with split {split}...")
60
+
61
+ # First, check if the split exists
62
+ try:
63
+ available_splits = get_dataset_split_names(dataset_name)
64
+ if split not in available_splits:
65
+ return False, f"❌ Split '{split}' not found. Available splits: {', '.join(available_splits)}"
66
+ except Exception as e:
67
+ print(f"Could not get split names: {e}. Will try to load anyway...")
68
+
69
+ # Load a small sample to check fields
70
+ dataset = load_dataset(dataset_name, split=split, streaming=True)
71
+
72
+ # Get first example to check fields
73
+ first_example = next(iter(dataset))
74
+ available_fields = list(first_example.keys())
75
+
76
+ # Check if input field exists
77
+ if input_field not in available_fields:
78
+ return False, f"❌ Input field '{input_field}' not found. Available fields: {', '.join(available_fields)}"
79
+
80
+ # Check if target field exists
81
+ if target_field not in available_fields:
82
+ return False, f"❌ Target field '{target_field}' not found. Available fields: {', '.join(available_fields)}"
83
+
84
+ # All validations passed
85
+ return True, f"✅ Dataset validated successfully! Fields '{input_field}' and '{target_field}' found."
86
+
87
+ except Exception as e:
88
+ error_msg = str(e)
89
+ if "404" in error_msg or "not found" in error_msg.lower():
90
+ return False, f"❌ Dataset '{dataset_name}' not found. Please check the dataset name (use format: org/dataset-name)"
91
+ return False, f"❌ Error validating dataset: {error_msg}"
92
+
93
+
94
+ def validate_inputs(dataset_name: str, split: str, input_field: str, target_field: str,
95
+ initial_prompt: str) -> Tuple[bool, str]:
96
+ """
97
+ Validate all inputs before starting optimization.
98
+
99
+ Returns:
100
+ Tuple of (is_valid, message)
101
+ """
102
+ # Check API key
103
+ api_key = os.environ.get("OPENAI_API_KEY")
104
+ if not api_key:
105
+ return False, "❌ OPENAI_API_KEY environment variable not set. Please set it in the Space secrets."
106
+
107
+ # Check prompt contains {input} placeholder
108
+ if "{input}" not in initial_prompt:
109
+ return False, "❌ Prompt must contain '{input}' placeholder for dataset inputs"
110
+
111
+ # Check dataset name format
112
+ dataset_name = dataset_name.strip()
113
+ if not dataset_name:
114
+ return False, "❌ Dataset name cannot be empty"
115
+
116
+ # Validate dataset and fields
117
+ is_valid, message = validate_dataset(dataset_name, split, input_field, target_field)
118
+ if not is_valid:
119
+ return False, message
120
+
121
+ return True, message
122
 
123
 
124
  def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
125
+ model: str, input_field: str, target_field: str) -> Dict:
126
  """Evaluate a prompt on a dataset using the selected model."""
127
  try:
128
+ # Get API key from environment
129
+ api_key = os.environ.get("OPENAI_API_KEY")
130
+ if not api_key:
131
+ return {
132
+ "error": "OPENAI_API_KEY not set in environment",
133
+ "accuracy": 0,
134
+ "correct": 0,
135
+ "total": 0,
136
+ "results": []
137
+ }
138
+
139
  # Load dataset
140
  dataset = load_dataset(dataset_name, split=split, streaming=False)
141
 
 
190
  total += 1
191
 
192
  results.append({
193
+ "input": str(input_text)[:100] + "..." if len(str(input_text)) > 100 else str(input_text),
194
  "target": str(target),
195
+ "prediction": prediction[:100] + "..." if len(prediction) > 100 else prediction,
196
  "correct": is_correct
197
  })
198
 
 
219
  }
220
 
221
 
222
+ def parse_evolution_history(output_dir: str) -> str:
223
+ """
224
+ Parse evolution history from OpenEvolve output directory.
225
+
226
+ Returns a markdown string with visualization of the evolution process.
227
+ """
228
+ try:
229
+ evolution_viz = "## 🧬 Evolution Progress\n\n"
230
+
231
+ # Look for generation files or logs
232
+ generation_files = sorted(glob.glob(os.path.join(output_dir, "generation_*.txt")))
233
+ log_file = os.path.join(output_dir, "evolution.log")
234
+
235
+ # Try to parse generation files if they exist
236
+ if generation_files:
237
+ evolution_viz += "### Generation-by-Generation Progress\n\n"
238
+ for gen_file in generation_files:
239
+ gen_num = os.path.basename(gen_file).replace("generation_", "").replace(".txt", "")
240
+ try:
241
+ with open(gen_file, 'r') as f:
242
+ content = f.read()
243
+ evolution_viz += f"**Generation {gen_num}:**\n```\n{content[:200]}{'...' if len(content) > 200 else ''}\n```\n\n"
244
+ except:
245
+ pass
246
+
247
+ # Try to parse log file
248
+ elif os.path.exists(log_file):
249
+ evolution_viz += "### Evolution Log\n\n"
250
+ try:
251
+ with open(log_file, 'r') as f:
252
+ log_content = f.read()
253
+ evolution_viz += f"```\n{log_content[-1000:]}\n```\n\n"
254
+ except:
255
+ pass
256
+
257
+ # Look for scores or history file
258
+ scores_file = os.path.join(output_dir, "scores.json")
259
+ if os.path.exists(scores_file):
260
+ try:
261
+ with open(scores_file, 'r') as f:
262
+ scores = json.load(f)
263
+
264
+ evolution_viz += "### Score Progression\n\n"
265
+ evolution_viz += "| Generation | Best Score | Avg Score | Population |\n"
266
+ evolution_viz += "|------------|-----------|-----------|------------|\n"
267
+
268
+ for gen in scores:
269
+ evolution_viz += f"| {gen['generation']} | {gen['best']:.3f} | {gen['avg']:.3f} | {gen['population']} |\n"
270
+
271
+ evolution_viz += "\n"
272
+ except:
273
+ pass
274
+
275
+ # Look for all program variants
276
+ program_files = sorted(glob.glob(os.path.join(output_dir, "program_*.txt")))
277
+ if program_files:
278
+ evolution_viz += f"### Explored Variants\n\n"
279
+ evolution_viz += f"OpenEvolve explored {len(program_files)} different prompt variants during evolution.\n\n"
280
+
281
+ # Show a few intermediate prompts
282
+ if len(program_files) > 3:
283
+ sample_files = [program_files[0], program_files[len(program_files)//2], program_files[-2]]
284
+ evolution_viz += "**Sample Intermediate Prompts:**\n\n"
285
+ for idx, pfile in enumerate(sample_files, 1):
286
+ try:
287
+ with open(pfile, 'r') as f:
288
+ prompt_content = f.read()
289
+ evolution_viz += f"**Variant {idx}:**\n```\n{prompt_content[:150]}{'...' if len(prompt_content) > 150 else ''}\n```\n\n"
290
+ except:
291
+ pass
292
+
293
+ # If no specific files found, show directory contents
294
+ if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file):
295
+ evolution_viz += "### Evolution Complete\n\n"
296
+ evolution_viz += "OpenEvolve ran 10 iterations of evolutionary optimization using:\n"
297
+ evolution_viz += "- **Population Size**: 10 prompts per generation\n"
298
+ evolution_viz += "- **Selection Strategy**: 10% elite, 30% explore, 60% exploit\n"
299
+ evolution_viz += "- **Islands**: 1 population with mutation and crossover\n"
300
+ evolution_viz += "- **Evaluation**: 100 samples per prompt variant\n\n"
301
+
302
+ # Count files in output directory
303
+ all_files = os.listdir(output_dir)
304
+ evolution_viz += f"Generated {len(all_files)} files during evolution process.\n\n"
305
+
306
+ return evolution_viz
307
+
308
+ except Exception as e:
309
+ return f"## 🧬 Evolution Progress\n\nEvolution completed successfully. Unable to parse detailed history: {str(e)}\n\n"
310
+
311
+
312
  def create_evaluator_file(dataset_name: str, split: str, model: str,
313
  input_field: str, target_field: str, work_dir: str):
314
  """Create an evaluator.py file for OpenEvolve."""
 
424
 
425
 
426
  def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
427
+ model: str, input_field: str, target_field: str,
428
+ progress=gr.Progress()) -> Tuple[str, str, str, str]:
429
  """Run OpenEvolve to optimize the prompt."""
430
 
431
+ progress(0, desc="Validating inputs...")
 
432
 
433
+ # Validate all inputs
434
+ is_valid, validation_message = validate_inputs(
435
+ dataset_name, dataset_split, input_field, target_field, initial_prompt
436
+ )
437
 
438
+ if not is_valid:
439
+ return f"## Validation Failed\n\n{validation_message}", "", "", ""
440
+
441
+ progress(0.05, desc=f"Validation passed: {validation_message}")
442
 
443
  # Create temporary working directory
444
  work_dir = tempfile.mkdtemp(prefix="openevolve_")
 
455
  input_field, target_field, work_dir)
456
 
457
  # Create config
458
+ progress(0.15, desc="Creating configuration...")
459
  config_path = create_config_file(model, work_dir)
460
 
461
  # Run initial evaluation
462
+ progress(0.2, desc="Running initial evaluation on 100 samples...")
463
  initial_eval = evaluate_prompt(
464
  initial_prompt, dataset_name, dataset_split, 100,
465
+ model, input_field, target_field
466
  )
467
 
468
+ if "error" in initial_eval and initial_eval["total"] == 0:
469
+ return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", "", ""
470
+
471
  initial_results = f"""
472
  ### Initial Prompt Evaluation
473
 
 
489
  initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
490
 
491
  # Run OpenEvolve
492
+ progress(0.3, desc="Starting OpenEvolve optimization (10 iterations, ~5-15 minutes)...")
493
 
494
  output_dir = os.path.join(work_dir, "output")
495
  os.makedirs(output_dir, exist_ok=True)
 
504
  verbose=True
505
  )
506
 
507
+ progress(0.80, desc="Parsing evolution history...")
508
+
509
+ # Parse evolution history for visualization
510
+ evolution_viz = parse_evolution_history(output_dir)
511
+
512
+ progress(0.85, desc="Evaluating best evolved prompt...")
513
 
514
  # Get the best prompt
515
  best_prompt_path = os.path.join(output_dir, "best_program.txt")
 
522
  # Evaluate best prompt
523
  final_eval = evaluate_prompt(
524
  best_prompt, dataset_name, dataset_split, 100,
525
+ model, input_field, target_field
526
  )
527
 
528
  final_results = f"""
 
536
  **Results:**
537
  - Accuracy: {final_eval['accuracy']:.2f}%
538
  - Correct: {final_eval['correct']}/{final_eval['total']}
539
+ - Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
540
 
541
  **Sample Results:**
542
  """
 
547
  final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
548
 
549
  summary = f"""
550
+ ## 🎉 Optimization Complete!
551
 
552
  ### Summary
553
+ - **Dataset**: {dataset_name} ({dataset_split} split)
554
+ - **Model**: {model}
555
+ - **Samples**: 100 per evaluation
556
+ - **Iterations**: 10
557
+
558
+ ### Results
559
+ - **Initial Accuracy**: {initial_eval['accuracy']:.2f}%
560
+ - **Final Accuracy**: {final_eval['accuracy']:.2f}%
561
+ - **Improvement**: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
562
+
563
+ {validation_message}
564
  """
565
 
566
  progress(1.0, desc="Complete!")
567
 
568
+ return summary, initial_results, evolution_viz, final_results
569
 
570
  except Exception as e:
571
+ return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", ""
572
 
573
  finally:
574
  # Clean up
 
585
 
586
  Automatically evolve and optimize your prompts using evolutionary algorithms!
587
 
588
+ This space uses [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) to iteratively improve prompts
589
  by testing them on real datasets and evolving better versions.
590
 
591
  ## How it works:
592
  1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
593
+ 2. Enter the full HuggingFace dataset name (e.g., `stanfordnlp/imdb`, `gsm8k`)
594
+ 3. Specify the dataset split and field names
595
+ 4. Choose a free model from OpenRouter
596
+ 5. Click "Optimize Prompt" - the system will validate everything first!
597
+ 6. Watch the evolution progress in real-time
598
+ 7. Compare initial vs. evolved performance!
599
+
600
+ **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
601
  """)
602
 
603
  with gr.Row():
604
  with gr.Column():
605
  gr.Markdown("### Configuration")
606
 
607
  model = gr.Dropdown(
608
  choices=FREE_MODELS,
609
  value=FREE_MODELS[0],
 
612
  )
613
 
614
  dataset_name = gr.Textbox(
615
+ label="HuggingFace Dataset (Full Name)",
616
+ value="stanfordnlp/imdb",
617
+ placeholder="e.g., stanfordnlp/imdb, openai/gsm8k, SetFit/sst5",
618
+ info="Full dataset name from HuggingFace Hub (org/dataset-name or dataset-name)"
619
  )
620
 
621
  dataset_split = gr.Textbox(
 
645
  info="Use {input} as placeholder for dataset inputs"
646
  )
647
 
648
+ optimize_btn = gr.Button("🚀 Validate & Optimize Prompt", variant="primary", size="lg")
649
 
650
  with gr.Row():
651
  with gr.Column():
 
657
  with gr.Column():
658
  final_results = gr.Markdown(label="Evolved Results")
659
 
660
+ with gr.Row():
661
+ with gr.Column():
662
+ evolution_progress = gr.Markdown(label="Evolution Progress", value="Evolution progress will appear here...")
663
+
664
  gr.Markdown("""
665
  ### Example Datasets & Fields:
666
 
667
  | Dataset | Split | Input Field | Target Field | Task |
668
  |---------|-------|-------------|--------------|------|
669
+ | stanfordnlp/imdb | test | text | label | Sentiment Analysis |
670
+ | rajpurkar/squad | validation | question | answers | Question Answering |
671
+ | dair-ai/emotion | test | text | label | Emotion Classification |
672
+ | openai/gsm8k | test | question | answer | Math Reasoning |
673
+ | fancyzhx/ag_news | test | text | label | News Classification |
674
+
675
+ ### Important Notes:
676
+ - **API Key**: Must be set as `OPENAI_API_KEY` environment variable in Space secrets
677
+ - **HF Token**: Optional `HF_TOKEN` environment variable for private datasets
678
+ - **Dataset Name**: Use full name (org/dataset or dataset-name)
679
+ - **Validation**: All inputs are validated before starting optimization
680
+ - **Time**: Evolution takes 5-15 minutes (10 iterations)
681
+ - **Samples**: 100 random samples per evaluation
682
+
683
+ ### About OpenEvolve:
684
+ OpenEvolve is an open-source evolutionary optimization framework. Learn more at:
685
+ - [GitHub Repository](https://github.com/algorithmicsuperintelligence/openevolve)
686
+ - [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme)
687
  """)
688
 
689
  optimize_btn.click(
690
  fn=optimize_prompt,
691
+ inputs=[initial_prompt, dataset_name, dataset_split, model,
692
  input_field, target_field],
693
+ outputs=[summary, initial_results, evolution_progress, final_results]
694
  )
695
 
696
  if __name__ == "__main__":