Upload app.py
app.py
CHANGED
@@ -249,13 +249,13 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int

 def collect_prompt_history(output_dir: str) -> List[Dict]:
     """
-    Collect
-
+    Collect only the prompts that were "best" at some point during evolution.
+    Returns: initial prompt + any program that improved the best score (deduplicated).

     Returns a list of dicts with: {prompt, score, iteration, id}
     """
     try:
-
+        all_programs = []
         seen_prompts = set()  # Track unique prompts

         # OpenEvolve saves programs in checkpoint directories as JSON files
@@ -268,6 +268,7 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
         # Find all checkpoint directories
         checkpoint_dirs = sorted(glob.glob(os.path.join(checkpoints_dir, "checkpoint_*")))

+        # Collect all programs from all checkpoints
         for checkpoint_dir in checkpoint_dirs:
             programs_dir = os.path.join(checkpoint_dir, "programs")
             if not os.path.exists(programs_dir):
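For context, the loop above walks OpenEvolve's checkpoint output. A minimal standalone sketch of the same directory walk, pulled together in one place (the "checkpoints" folder name, the per-program `*.json` glob, and the inner file loop are assumptions inferred from this diff, not confirmed OpenEvolve API):

import glob
import os

def iter_program_files(output_dir: str):
    # Assumed layout: <output_dir>/checkpoints/checkpoint_*/programs/*.json
    checkpoints_dir = os.path.join(output_dir, "checkpoints")
    for checkpoint_dir in sorted(glob.glob(os.path.join(checkpoints_dir, "checkpoint_*"))):
        programs_dir = os.path.join(checkpoint_dir, "programs")
        if not os.path.exists(programs_dir):
            continue
        for pfile in sorted(glob.glob(os.path.join(programs_dir, "*.json"))):
            yield pfile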
@@ -290,16 +291,7 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
                     # Get combined score for comparison
                     combined_score = metrics.get("combined_score", 0.0)

-
-                    normalized_prompt = " ".join(prompt_content.split())
-
-                    # Skip duplicates
-                    if normalized_prompt in seen_prompts:
-                        continue
-
-                    seen_prompts.add(normalized_prompt)
-
-                    prompts.append({
+                    all_programs.append({
                         "prompt": prompt_content,
                         "id": prog_id,
                         "file": pfile,
@@ -311,10 +303,33 @@ def collect_prompt_history(output_dir: str) -> List[Dict]:
                     print(f"Error reading program file {pfile}: {e}")
                     continue

-        # Sort by
-
+        # Sort all programs by iteration (chronological order)
+        all_programs.sort(key=lambda x: x.get("iteration", 0))
+
+        # Filter to keep only programs that improved the best score
+        best_programs = []
+        current_best_score = -float('inf')
+
+        for program in all_programs:
+            prompt_content = program["prompt"]
+            score = program["score"]
+
+            # Create a normalized version for duplicate detection (ignore whitespace differences)
+            normalized_prompt = " ".join(prompt_content.split())
+
+            # Skip duplicates
+            if normalized_prompt in seen_prompts:
+                continue
+
+            # Only keep if this program improved the best score
+            if score > current_best_score:
+                seen_prompts.add(normalized_prompt)
+                best_programs.append(program)
+                current_best_score = score
+                print(f" Best program at iteration {program['iteration']}: score={score:.2%}")
+
+        return best_programs

-        return prompts
     except Exception as e:
         print(f"Error collecting prompt history: {e}")
         return []
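The net effect of the new collect_prompt_history is a monotone "best so far" trace: programs are sorted by iteration, whitespace-normalized duplicates are dropped, and a program is kept only when it beats the running best score. A tiny standalone sketch of that rule on invented records (prompts, scores, and iterations are illustrative only, not real run data):

programs = [
    {"prompt": "Answer briefly.", "score": 0.40, "iteration": 1},
    {"prompt": "Answer  briefly.", "score": 0.40, "iteration": 2},                   # duplicate after normalization
    {"prompt": "Think step by step.", "score": 0.35, "iteration": 3},                # no improvement, dropped
    {"prompt": "Think step by step, then answer.", "score": 0.55, "iteration": 4},   # new best, kept
]

best, seen, best_score = [], set(), float("-inf")
for program in sorted(programs, key=lambda x: x.get("iteration", 0)):
    normalized = " ".join(program["prompt"].split())
    if normalized in seen:
        continue
    if program["score"] > best_score:
        seen.add(normalized)
        best.append(program)
        best_score = program["score"]

print([p["iteration"] for p in best])  # [1, 4]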
@@ -851,23 +866,30 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,

     progress(1.0, desc="Complete!")

-    # Collect
+    # Collect only the "best" prompts - ones that improved the score during evolution
     all_prompts = []

     # Add initial prompt
     all_prompts.append({
         "prompt": initial_prompt,
         "score": initial_eval['accuracy'] / 100.0, # Convert to 0-1 scale
-        "label": "Initial Prompt"
+        "label": "Initial Prompt",
+        "iteration": 0
     })

-    # Add evolved prompts (
+    # Add evolved prompts (only programs that were "best" at some point)
+    # These are already filtered to show progression: initial → better → best
     prompt_history = collect_prompt_history(output_dir)
     for i, p in enumerate(prompt_history):
+        # Skip if it's the same as initial (shouldn't happen, but just in case)
+        if i == 0 and p.get("iteration", -1) == 0:
+            continue
+
         all_prompts.append({
             "prompt": p["prompt"],
             "score": p.get("score", 0.0),
-            "label": f"
+            "label": f"Best at Iteration {p.get('iteration', i+1)}",
+            "iteration": p.get("iteration", i+1)
         })

     return summary, initial_results, evolution_viz, final_results, all_prompts, 0, len(all_prompts)
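With the added "iteration" field, every entry handed to the history browser now has the same shape. A hypothetical example of the all_prompts list after a run (prompt texts, scores, and iteration numbers are invented for illustration):

all_prompts = [
    {"prompt": "You are a helpful assistant. Answer the question.",
     "score": 0.62, "label": "Initial Prompt", "iteration": 0},
    {"prompt": "You are a careful assistant. Reason step by step, then answer.",
     "score": 0.71, "label": "Best at Iteration 12", "iteration": 12},
    {"prompt": "Reason step by step, verify your answer, then respond concisely.",
     "score": 0.78, "label": "Best at Iteration 37", "iteration": 37},
]
# Returned alongside it: the starting index 0 and the total count len(all_prompts).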
@@ -976,7 +998,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         # Prompt History Browser
         gr.Markdown("---")
         gr.Markdown("## π Prompt History Browser")
-        gr.Markdown("Browse through
+        gr.Markdown("Browse through the progression of **best** prompts found during evolution. Only shows prompts that improved the score (no duplicates or intermediate programs).")

         with gr.Row():
             with gr.Column(scale=8):
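The browser widgets themselves are outside this diff. For illustration only, a minimal Gradio sketch of previous/next navigation over such a prompt list (component and function names here are hypothetical, not the app's actual ones):

import gradio as gr

def render(prompts, idx):
    # Clamp the index and format one entry for display.
    if not prompts:
        return "No prompts yet - run an optimization first.", 0
    idx = max(0, min(idx, len(prompts) - 1))
    p = prompts[idx]
    return f"{p['label']} (score {p['score']:.2%})\n\n{p['prompt']}", idx

with gr.Blocks() as demo:
    prompts_state = gr.State([])   # would be filled with all_prompts after a run
    index_state = gr.State(0)
    viewer = gr.Textbox(label="Prompt", lines=10)
    with gr.Row():
        prev_btn = gr.Button("Previous")
        next_btn = gr.Button("Next")
    prev_btn.click(lambda prompts, i: render(prompts, i - 1),
                   inputs=[prompts_state, index_state],
                   outputs=[viewer, index_state])
    next_btn.click(lambda prompts, i: render(prompts, i + 1),
                   inputs=[prompts_state, index_state],
                   outputs=[viewer, index_state])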