Upload app.py
app.py
CHANGED
@@ -494,7 +494,7 @@ def parse_evolution_history(output_dir: str) -> str:
 
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                           input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve that uses
+    """Create an evaluator.py file for OpenEvolve that uses 150 samples for better signal."""
     evaluator_code = f'''
 import os
 import random
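For orientation: the function above builds the evaluator source as an f-string and presumably writes it into `work_dir` for OpenEvolve to pick up. A minimal sketch of that write step, assuming the `evaluator.py` filename and the helper name (neither is shown in this hunk):

```python
import os

def write_generated_evaluator(evaluator_code: str, work_dir: str) -> str:
    # Assumed behavior: persist the templated code where OpenEvolve expects it.
    path = os.path.join(work_dir, "evaluator.py")
    with open(path, "w") as f:
        f.write(evaluator_code)
    return path
```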
@@ -503,10 +503,10 @@ from openai import OpenAI
 
 def evaluate(prompt: str) -> dict:
     """
-    Evaluate a prompt using
+    Evaluate a prompt using 150 fixed samples for stronger evolution signal.
 
-
-
+    Using more samples (150 vs 50) gives evolution better signal to distinguish
+    good prompts from bad ones. Final comparison still uses the same 50 samples.
     """
     try:
         # IMPORTANT: Use fixed seed for consistent sampling across all evaluations
@@ -524,8 +524,8 @@ def evaluate(prompt: str) -> dict:
         else:
             raise
 
-    # Sample
-    num_samples = 50
+    # Sample 150 samples with seed 42 for stronger signal during evolution
+    num_samples = 150
     if len(dataset) > num_samples:
         # Use SAME sampling logic as initial/final eval
         indices = random.sample(range(len(dataset)), num_samples)
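The comments above carry the key invariant: every evaluation re-seeds the RNG before sampling, so the index draw is deterministic and the evolution, initial, and final evals all score against fixed examples. A minimal sketch of that pattern, assuming seed 42 from the comment (the helper name is illustrative, not from app.py):

```python
import random

def sample_indices(dataset_len: int, num_samples: int, seed: int = 42) -> list[int]:
    # Re-seed on every call so the draw is deterministic: the same
    # (dataset_len, num_samples, seed) triple always yields the same indices.
    rng = random.Random(seed)
    if dataset_len <= num_samples:
        return list(range(dataset_len))
    return rng.sample(range(dataset_len), num_samples)

# Evolution scores each variant on 150 fixed examples...
evo_idx = sample_indices(10_000, 150)
# ...while the initial/final comparison uses the same seeded draw of 50.
eval_idx = sample_indices(10_000, 50)
assert evo_idx == sample_indices(10_000, 150)  # deterministic across calls
```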
@@ -726,7 +726,7 @@ Your improved prompt here
     "llm": {
         "primary_model": model,
         "api_base": "https://openrouter.ai/api/v1", # Use OpenRouter endpoint
-        "temperature": 0
+        "temperature": 1.0, # Higher temperature for more diverse prompt variations
     },
     "max_iterations": 5,
     "checkpoint_interval": 1, # Save checkpoints every iteration to preserve prompt history
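For context, the `api_base`/`primary_model`/`temperature` trio above maps onto a standard OpenAI-client call against OpenRouter. A hedged sketch of that mapping; the env-var name, model string, and message content are assumptions, not taken from app.py:

```python
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",   # the "api_base" above
    api_key=os.environ["OPENROUTER_API_KEY"],  # assumed credential source
)
response = client.chat.completions.create(
    model="openai/gpt-4o-mini",  # stand-in for the configured primary_model
    temperature=1.0,             # higher temperature -> more diverse prompt rewrites
    messages=[{"role": "user", "content": "Propose a variation of this prompt: ..."}],
)
print(response.choices[0].message.content)
```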
@@ -738,11 +738,11 @@ Your improved prompt here
         "template_dir": templates_dir, # Use our custom prompt engineering templates
     },
     "evolution": {
-        "population_size": 10,
+        "population_size": 15, # Increased from 10 for more exploration
         "num_islands": 1, # Single island for simpler evolution
-        "elite_ratio": 0.1,
-        "explore_ratio": 0.3,
-        "exploit_ratio": 0.6,
+        "elite_ratio": 0.1, # Keep top 10% (1-2 best prompts)
+        "explore_ratio": 0.4, # Increased exploration (was 0.3)
+        "exploit_ratio": 0.5, # Reduced exploitation (was 0.6)
     },
     "database": {
         "log_prompts": True, # Save prompts used to generate each program
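As a sanity check on the new settings: the three ratios still sum to 1.0, and with a population of 15 they work out to roughly 1-2 elite survivors, 6 exploratory variants, and 7-8 exploitative refinements per generation (how OpenEvolve allocates remainders internally is not shown here):

```python
population_size = 15
ratios = {"elite": 0.1, "explore": 0.4, "exploit": 0.5}
assert abs(sum(ratios.values()) - 1.0) < 1e-9

for name, r in ratios.items():
    # Rounded share of the population per role; the exact split is
    # up to OpenEvolve's internals, this is just the expected count.
    print(name, round(population_size * r))  # elite 2, explore 6, exploit 8
```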
@@ -940,7 +940,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 - **Model**: {model}
 - **Initial Eval**: 50 samples
 - **Final Eval**: 50 samples (same samples for fair comparison)
-- **Evolution**:
+- **Evolution**: 150 samples per variant (more data for stronger signal)
 - **Iterations**: 5
 
 ### Results