Upload app.py
app.py
CHANGED
@@ -249,13 +249,13 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int

 def collect_prompt_history(output_dir: str) -> List[Dict]:
     """
-    Collect
-
+    Collect only the prompts that were "best" at some point during evolution.
+    Returns: initial prompt + any program that improved the best score (deduplicated).

     Returns a list of dicts with: {prompt, score, iteration, id}
     """
     try:
-
+        all_programs = []
         seen_prompts = set()  # Track unique prompts

         # OpenEvolve saves programs in checkpoint directories as JSON files
@@ -268,6 +268,7 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
         # Find all checkpoint directories
         checkpoint_dirs = sorted(glob.glob(os.path.join(checkpoints_dir, "checkpoint_*")))

+        # Collect all programs from all checkpoints
         for checkpoint_dir in checkpoint_dirs:
             programs_dir = os.path.join(checkpoint_dir, "programs")
             if not os.path.exists(programs_dir):
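For context, the loop above walks OpenEvolve's checkpoint output. A minimal standalone sketch of the same directory walk, pulled together in one place (the "checkpoints" folder name, the per-program `*.json` glob, and the inner file loop are assumptions inferred from this diff, not confirmed OpenEvolve API):

import glob
import os

def iter_program_files(output_dir: str):
    # Assumed layout: <output_dir>/checkpoints/checkpoint_*/programs/*.json
    checkpoints_dir = os.path.join(output_dir, "checkpoints")
    for checkpoint_dir in sorted(glob.glob(os.path.join(checkpoints_dir, "checkpoint_*"))):
        programs_dir = os.path.join(checkpoint_dir, "programs")
        if not os.path.exists(programs_dir):
            continue
        for pfile in sorted(glob.glob(os.path.join(programs_dir, "*.json"))):
            yield pfile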
@@ -290,16 +291,7 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
                     # Get combined score for comparison
                     combined_score = metrics.get("combined_score", 0.0)

-
-                    normalized_prompt = " ".join(prompt_content.split())
-
-                    # Skip duplicates
-                    if normalized_prompt in seen_prompts:
-                        continue
-
-                    seen_prompts.add(normalized_prompt)
-
-                    prompts.append({
+                    all_programs.append({
                         "prompt": prompt_content,
                         "id": prog_id,
                         "file": pfile,
@@ -311,10 +303,33 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
                     print(f"Error reading program file {pfile}: {e}")
                     continue

-        # Sort by
-
+        # Sort all programs by iteration (chronological order)
+        all_programs.sort(key=lambda x: x.get("iteration", 0))
+
+        # Filter to keep only programs that improved the best score
+        best_programs = []
+        current_best_score = -float('inf')
+
+        for program in all_programs:
+            prompt_content = program["prompt"]
+            score = program["score"]
+
+            # Create a normalized version for duplicate detection (ignore whitespace differences)
+            normalized_prompt = " ".join(prompt_content.split())
+
+            # Skip duplicates
+            if normalized_prompt in seen_prompts:
+                continue
+
+            # Only keep if this program improved the best score
+            if score > current_best_score:
+                seen_prompts.add(normalized_prompt)
+                best_programs.append(program)
+                current_best_score = score
+                print(f" Best program at iteration {program['iteration']}: score={score:.2%}")
+
+        return best_programs

-        return prompts
     except Exception as e:
         print(f"Error collecting prompt history: {e}")
         return []
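The net effect of the new collect_prompt_history is a monotone "best so far" trace: programs are sorted by iteration, whitespace-normalized duplicates are dropped, and a program is kept only when it beats the running best score. A tiny standalone sketch of that rule on invented records (prompts, scores, and iterations are illustrative only, not real run data):

programs = [
    {"prompt": "Answer briefly.", "score": 0.40, "iteration": 1},
    {"prompt": "Answer  briefly.", "score": 0.40, "iteration": 2},                   # duplicate after normalization
    {"prompt": "Think step by step.", "score": 0.35, "iteration": 3},                # no improvement, dropped
    {"prompt": "Think step by step, then answer.", "score": 0.55, "iteration": 4},   # new best, kept
]

best, seen, best_score = [], set(), float("-inf")
for program in sorted(programs, key=lambda x: x.get("iteration", 0)):
    normalized = " ".join(program["prompt"].split())
    if normalized in seen:
        continue
    if program["score"] > best_score:
        seen.add(normalized)
        best.append(program)
        best_score = program["score"]

print([p["iteration"] for p in best])  # [1, 4]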
@@ -851,23 +866,30 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,

     progress(1.0, desc="Complete!")

-    # Collect
+    # Collect only the "best" prompts - ones that improved the score during evolution
     all_prompts = []

     # Add initial prompt
     all_prompts.append({
         "prompt": initial_prompt,
         "score": initial_eval['accuracy'] / 100.0, # Convert to 0-1 scale
-        "label": "Initial Prompt"
+        "label": "Initial Prompt",
+        "iteration": 0
     })

-    # Add evolved prompts (
+    # Add evolved prompts (only programs that were "best" at some point)
+    # These are already filtered to show progression: initial → better → best
     prompt_history = collect_prompt_history(output_dir)
     for i, p in enumerate(prompt_history):
+        # Skip if it's the same as initial (shouldn't happen, but just in case)
+        if i == 0 and p.get("iteration", -1) == 0:
+            continue
+
         all_prompts.append({
             "prompt": p["prompt"],
             "score": p.get("score", 0.0),
-            "label": f"
+            "label": f"Best at Iteration {p.get('iteration', i+1)}",
+            "iteration": p.get("iteration", i+1)
         })

     return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts)
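With the added "iteration" field, every entry handed to the history browser now has the same shape. A hypothetical example of the all_prompts list after a run (prompt texts, scores, and iteration numbers are invented for illustration):

all_prompts = [
    {"prompt": "You are a helpful assistant. Answer the question.",
     "score": 0.62, "label": "Initial Prompt", "iteration": 0},
    {"prompt": "You are a careful assistant. Reason step by step, then answer.",
     "score": 0.71, "label": "Best at Iteration 12", "iteration": 12},
    {"prompt": "Reason step by step, verify your answer, then respond concisely.",
     "score": 0.78, "label": "Best at Iteration 37", "iteration": 37},
]
# Returned alongside it: the starting index 0 and the total count len(all_prompts).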
@@ -976,7 +998,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         # Prompt History Browser
         gr.Markdown("---")
         gr.Markdown("## π Prompt History Browser")
-        gr.Markdown("Browse through
+        gr.Markdown("Browse through the progression of **best** prompts found during evolution. Only shows prompts that improved the score (no duplicates or intermediate programs).")

         with gr.Row():
             with gr.Column(scale=8):
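The browser widgets themselves are outside this diff. For illustration only, a minimal Gradio sketch of previous/next navigation over such a prompt list (component and function names here are hypothetical, not the app's actual ones):

import gradio as gr

def render(prompts, idx):
    # Clamp the index and format one entry for display.
    if not prompts:
        return "No prompts yet - run an optimization first.", 0
    idx = max(0, min(idx, len(prompts) - 1))
    p = prompts[idx]
    return f"{p['label']} (score {p['score']:.2%})\n\n{p['prompt']}", idx

with gr.Blocks() as demo:
    prompts_state = gr.State([])   # would be filled with all_prompts after a run
    index_state = gr.State(0)
    viewer = gr.Textbox(label="Prompt", lines=10)
    with gr.Row():
        prev_btn = gr.Button("Previous")
        next_btn = gr.Button("Next")
    prev_btn.click(lambda prompts, i: render(prompts, i - 1),
                   inputs=[prompts_state, index_state],
                   outputs=[viewer, index_state])
    next_btn.click(lambda prompts, i: render(prompts, i + 1),
                   inputs=[prompts_state, index_state],
                   outputs=[viewer, index_state])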