Upload app.py
app.py CHANGED
@@ -182,8 +182,21 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
 
             prediction = response.choices[0].message.content.strip()
 
-            #
-
+            # Smart evaluation - handle both exact match and semantic match
+            target_str = str(target).lower().strip()
+            pred_lower = prediction.lower()
+
+            # Check exact match first
+            is_correct = target_str in pred_lower
+
+            # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
+            if not is_correct:
+                # Common sentiment mappings
+                if target_str in ["1", "positive", "pos"]:
+                    is_correct = any(word in pred_lower for word in ["positive", "good", "great"])
+                elif target_str in ["0", "negative", "neg"]:
+                    is_correct = any(word in pred_lower for word in ["negative", "bad", "poor"])
+
             if is_correct:
                 correct += 1
             total += 1
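
A quick sanity check of the new matching rules, restated as a standalone helper (illustrative sketch only; `smart_match` is not a name used in app.py):

```python
# Hypothetical helper mirroring the inline logic added in the hunk above.
def smart_match(target, prediction) -> bool:
    target_str = str(target).lower().strip()
    pred_lower = prediction.lower()
    # Exact/substring match first
    if target_str in pred_lower:
        return True
    # Fall back to common sentiment equivalents
    if target_str in ["1", "positive", "pos"]:
        return any(word in pred_lower for word in ["positive", "good", "great"])
    if target_str in ["0", "negative", "neg"]:
        return any(word in pred_lower for word in ["negative", "bad", "poor"])
    return False

assert smart_match("1", "Sentiment: positive")       # semantic equivalent
assert smart_match("positive", "This is positive.")  # substring match
assert not smart_match("1", "The review is bad.")
```

Note that substring matching can produce false positives for short numeric targets (for example, target "1" matches any prediction containing a 1, such as "top-10").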
@@ -325,7 +338,7 @@ def parse_evolution_history(output_dir: str) -> str:
 
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                           input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve."""
+    """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
     evaluator_code = f'''
 import os
 import random
@@ -333,19 +346,22 @@ from datasets import load_dataset
 from openai import OpenAI
 
 def evaluate(prompt: str) -> float:
-    """
+    """
+    Evaluate a prompt using 2-stage cascading evaluation to save API calls.
+
+    Stage 1: Evaluate with 20 samples
+    - If accuracy >= 0.5, proceed to Stage 2
+    - If accuracy < 0.5, return early (no point wasting 80 more samples)
+
+    Stage 2: Evaluate with 80 more samples (total 100)
+    - Combine results for final score
+
+    Returns score between 0 and 1.
+    """
     try:
         # Load dataset
         dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
 
-        # Sample 100 random examples
-        num_samples = min(100, len(dataset))
-        if len(dataset) > num_samples:
-            indices = random.sample(range(len(dataset)), num_samples)
-            samples = [dataset[i] for i in indices]
-        else:
-            samples = list(dataset)[:num_samples]
-
         # Initialize OpenAI client
         api_key = os.environ.get("OPENAI_API_KEY")
         client = OpenAI(
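
The docstring above describes the cascade; as a back-of-envelope check (illustrative arithmetic, not code from app.py), the expected API cost per run is:

```python
# A prompt that fails Stage 1 costs 20 calls; one that passes costs 20 + 80 = 100.
def expected_calls(p_pass_stage1: float, iterations: int = 10) -> float:
    calls_per_eval = 20 + p_pass_stage1 * 80
    return calls_per_eval * iterations

print(expected_calls(0.0))  # 200.0  - every candidate culled after Stage 1
print(expected_calls(0.5))  # 600.0
print(expected_calls(1.0))  # 1000.0 - no savings vs. a flat 100-sample evaluation
```

The cascade therefore saves the most when evolution produces many weak candidates; a pool of strong candidates pays the full 100-sample price. This assumes one candidate evaluation per iteration, which is an assumption about OpenEvolve internals rather than something this diff shows.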
@@ -353,48 +369,107 @@ def evaluate(prompt: str) -> float:
             api_key=api_key,
         )
 
-
-
 
-
-
-
-
-
-
 
-
-
-
 
-
-                formatted_prompt = prompt.replace("{{input}}", str(input_text))
 
-
-
-
-
-                        {{"role": "system", "content": "You are a helpful assistant."}},
-                        {{"role": "user", "content": formatted_prompt}}
-                    ],
-                    temperature=0.1,
-                    max_tokens=500,
-                )
 
-
 
-
-
-
-                    correct += 1
-                total += 1
 
-
-
-
 
-
-
 
+        def evaluate_samples(samples, correct_so_far=0, total_so_far=0):
+            """Helper function to evaluate a batch of samples."""
+            correct = correct_so_far
+            total = total_so_far
+
+            for sample in samples:
+                try:
+                    # Get input and target
+                    input_text = sample.get("{input_field}", "")
+                    if isinstance(input_text, dict):
+                        input_text = str(input_text)
+
+                    target = sample.get("{target_field}", "")
+                    if isinstance(target, dict):
+                        target = str(target)
+
+                    # Format the prompt
+                    formatted_prompt = prompt.replace("{{input}}", str(input_text))
+
+                    # Call the model
+                    response = client.chat.completions.create(
+                        model="{model}",
+                        messages=[
+                            {{"role": "system", "content": "You are a helpful assistant."}},
+                            {{"role": "user", "content": formatted_prompt}}
+                        ],
+                        temperature=0.1,
+                        max_tokens=500,
+                    )
+
+                    prediction = response.choices[0].message.content.strip()
+
+                    # Smart evaluation - handle both exact match and semantic match
+                    target_str = str(target).lower().strip()
+                    pred_lower = prediction.lower()
+
+                    # Check exact match first
+                    is_correct = target_str in pred_lower
+
+                    # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
+                    if not is_correct:
+                        # Common sentiment mappings
+                        if target_str in ["1", "positive", "pos"]:
+                            is_correct = any(word in pred_lower for word in ["positive", "good", "great"])
+                        elif target_str in ["0", "negative", "neg"]:
+                            is_correct = any(word in pred_lower for word in ["negative", "bad", "poor"])
+
+                    if is_correct:
+                        correct += 1
+                    total += 1
+
+                except Exception as e:
+                    print(f"Error evaluating sample: {{e}}")
+                    continue
+
+            return correct, total
+
+        # STAGE 1: Evaluate with 20 samples first
+        stage1_size = 20
+        stage1_samples_count = min(stage1_size, len(dataset))
+
+        if len(dataset) > stage1_samples_count:
+            stage1_indices = random.sample(range(len(dataset)), stage1_samples_count)
+            stage1_samples = [dataset[i] for i in stage1_indices]
+        else:
+            stage1_samples = list(dataset)[:stage1_samples_count]
 
+        print(f"[Stage 1/2] Evaluating with {{len(stage1_samples)}} samples...")
+        correct, total = evaluate_samples(stage1_samples)
+        stage1_score = (correct / total) if total > 0 else 0.0
 
+        print(f"[Stage 1/2] Score: {{stage1_score:.3f}} ({{correct}}/{{total}})")
 
+        # Early exit if Stage 1 score is below threshold
+        if stage1_score < 0.5:
+            print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 80 API calls)")
+            return stage1_score
 
+        # STAGE 2: Continue with 80 more samples
+        print(f"[Stage 2/2] Score >= 0.5 - proceeding with 80 more samples...")
+        stage2_size = 80
+        stage2_samples_count = min(stage2_size, max(0, len(dataset) - stage1_samples_count))
 
+        if stage2_samples_count > 0:
+            # Get different samples from Stage 1
+            remaining_indices = list(set(range(len(dataset))) - set(stage1_indices if 'stage1_indices' in locals() else []))
 
+            if len(remaining_indices) >= stage2_samples_count:
+                stage2_indices = random.sample(remaining_indices, stage2_samples_count)
+                stage2_samples = [dataset[i] for i in stage2_indices]
+            else:
+                stage2_samples = [dataset[i] for i in remaining_indices[:stage2_samples_count]]
 
+            correct, total = evaluate_samples(stage2_samples, correct, total)
+            final_score = (correct / total) if total > 0 else stage1_score
+
+            print(f"[Stage 2/2] Final score: {{final_score:.3f}} ({{correct}}/{{total}})")
+            return final_score
+        else:
+            print(f"[Stage 2/2] Not enough samples in dataset for Stage 2")
+            return stage1_score
 
     except Exception as e:
         print(f"Error in evaluation: {{e}}")
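
A minimal smoke test for a generated evaluator might look like this (hypothetical: the file path, module name, and prompt are placeholders; it assumes `create_evaluator_file` has written the file and `OPENAI_API_KEY` is set, and it makes real API calls, 20 or 100 depending on the Stage 1 score):

```python
import importlib.util

# Load the generated evaluator.py as a module (path is a placeholder).
spec = importlib.util.spec_from_file_location("evaluator", "work_dir/evaluator.py")
evaluator = importlib.util.module_from_spec(spec)
spec.loader.exec_module(evaluator)

# Returns a float in [0, 1]; weak prompts exit after Stage 1.
score = evaluator.evaluate("Classify the sentiment of this review: {input}")
print(f"score = {score:.3f}")
```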
@@ -412,10 +487,8 @@ def create_config_file(model: str, work_dir: str):
     """Create a config.yaml file for OpenEvolve."""
     config = {
         "llm": {
-            "
-            "model": model,
+            "primary_model": model,
             "temperature": 0.7,
-            "max_tokens": 4096,
         },
         "evolution": {
             "max_iterations": 10,
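
For reference, the resulting config.yaml would look roughly like this, restricted to the keys visible in this hunk (a sketch assuming the dict is serialized with PyYAML's `safe_dump`; the model name is a placeholder):

```python
import yaml

config = {
    "llm": {"primary_model": "some-free-model", "temperature": 0.7},
    "evolution": {"max_iterations": 10},
}
print(yaml.safe_dump(config, sort_keys=False))
# llm:
#   primary_model: some-free-model
#   temperature: 0.7
# evolution:
#   max_iterations: 10
```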
@@ -506,7 +579,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
 
     # Run OpenEvolve
-    progress(0.3, desc="Starting OpenEvolve optimization (10 iterations)...")
+    progress(0.3, desc="Starting OpenEvolve optimization (10 iterations with staged evaluation)...")
 
     output_dir = os.path.join(work_dir, "output")
     os.makedirs(output_dir, exist_ok=True)
@@ -568,7 +641,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
 - **Model**: {model}
-- **
+- **Evaluation**: Staged (20 samples → 100 if score ≥ 0.5)
 - **Iterations**: 10
 
 ### Results
@@ -715,7 +788,8 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
    - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
 4. **Run & Monitor**:
    - All inputs are validated before starting
-   - Evolution
+   - Evolution uses staged evaluation (20 samples first, then 80 more if promising)
+   - Saves API calls by early-stopping poor prompts (< 50% accuracy)
    - Watch evolution progress visualization in real-time
 
 ### About OpenEvolve: