codelion committed
Commit e51517c · verified
1 Parent(s): 3c1041b

Upload app.py

Files changed (1)
  1. app.py +281 -81
app.py CHANGED
@@ -3,12 +3,14 @@ import os
3
  import yaml
4
  import json
5
  import random
6
- from datasets import load_dataset
7
  from openai import OpenAI
8
  from openevolve import run_evolution
9
- from typing import Dict, List, Tuple
10
  import tempfile
11
  import shutil
12
 
13
  # Free models from OpenRouter (as of 2025)
14
  FREE_MODELS = [
@@ -22,34 +24,118 @@ FREE_MODELS = [
22
  "mistralai/mistral-7b-instruct:free",
23
  ]
24
 
25
- # Popular HuggingFace datasets for different tasks
26
- SAMPLE_DATASETS = {
27
- "Question Answering": [
28
- "hotpot_qa",
29
- "squad",
30
- "trivia_qa",
31
- ],
32
- "Sentiment Analysis": [
33
- "imdb",
34
- "yelp_review_full",
35
- "emotion",
36
- ],
37
- "Text Classification": [
38
- "ag_news",
39
- "dbpedia_14",
40
- "SetFit/sst5",
41
- ],
42
- "Math Reasoning": [
43
- "gsm8k",
44
- "math_qa",
45
- ],
46
- }
47
 
48
 
49
  def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
50
- api_key: str, model: str, input_field: str, target_field: str) -> Dict:
51
  """Evaluate a prompt on a dataset using the selected model."""
52
  try:
53
  # Load dataset
54
  dataset = load_dataset(dataset_name, split=split, streaming=False)
55
 
@@ -104,9 +190,9 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
104
  total += 1
105
 
106
  results.append({
107
- "input": str(input_text)[:100] + "...",
108
  "target": str(target),
109
- "prediction": prediction[:100] + "...",
110
  "correct": is_correct
111
  })
112
 
@@ -133,6 +219,96 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
133
  }
134
 
135
 
136
  def create_evaluator_file(dataset_name: str, split: str, model: str,
137
  input_field: str, target_field: str, work_dir: str):
138
  """Create an evaluator.py file for OpenEvolve."""
@@ -248,17 +424,21 @@ def create_config_file(model: str, work_dir: str):
248
 
249
 
250
  def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
251
- model: str, api_key: str, input_field: str, target_field: str,
252
- progress=gr.Progress()) -> Tuple[str, str, str]:
253
  """Run OpenEvolve to optimize the prompt."""
254
 
255
- if not api_key:
256
- return "Error: OpenAI API Key is required", "", ""
257
 
258
- # Set API key as environment variable
259
- os.environ["OPENAI_API_KEY"] = api_key
260
 
261
- progress(0, desc="Setting up...")
262
 
263
  # Create temporary working directory
264
  work_dir = tempfile.mkdtemp(prefix="openevolve_")
@@ -275,16 +455,19 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
275
  input_field, target_field, work_dir)
276
 
277
  # Create config
278
- progress(0.2, desc="Creating configuration...")
279
  config_path = create_config_file(model, work_dir)
280
 
281
  # Run initial evaluation
282
- progress(0.3, desc="Running initial evaluation...")
283
  initial_eval = evaluate_prompt(
284
  initial_prompt, dataset_name, dataset_split, 100,
285
- api_key, model, input_field, target_field
286
  )
287
 
288
  initial_results = f"""
289
  ### Initial Prompt Evaluation
290
 
@@ -306,7 +489,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
306
  initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
307
 
308
  # Run OpenEvolve
309
- progress(0.4, desc="Running OpenEvolve (this may take several minutes)...")
310
 
311
  output_dir = os.path.join(work_dir, "output")
312
  os.makedirs(output_dir, exist_ok=True)
@@ -321,7 +504,12 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
321
  verbose=True
322
  )
323
 
324
- progress(0.8, desc="Evaluating best prompt...")
325
 
326
  # Get the best prompt
327
  best_prompt_path = os.path.join(output_dir, "best_program.txt")
@@ -334,7 +522,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
334
  # Evaluate best prompt
335
  final_eval = evaluate_prompt(
336
  best_prompt, dataset_name, dataset_split, 100,
337
- api_key, model, input_field, target_field
338
  )
339
 
340
  final_results = f"""
@@ -348,7 +536,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
348
  **Results:**
349
  - Accuracy: {final_eval['accuracy']:.2f}%
350
  - Correct: {final_eval['correct']}/{final_eval['total']}
351
- - Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%
352
 
353
  **Sample Results:**
354
  """
@@ -359,24 +547,28 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
359
  final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
360
 
361
  summary = f"""
362
- ## Optimization Complete!
363
 
364
  ### Summary
365
- - Initial Accuracy: {initial_eval['accuracy']:.2f}%
366
- - Final Accuracy: {final_eval['accuracy']:.2f}%
367
- - Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:.2f}%
368
- - Dataset: {dataset_name}
369
- - Model: {model}
370
- - Samples Evaluated: 100
371
- - Iterations: 10
372
  """
373
 
374
  progress(1.0, desc="Complete!")
375
 
376
- return summary, initial_results, final_results
377
 
378
  except Exception as e:
379
- return f"Error during evolution: {str(e)}", initial_results, ""
380
 
381
  finally:
382
  # Clean up
@@ -393,28 +585,25 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
393
 
394
  Automatically evolve and optimize your prompts using evolutionary algorithms!
395
 
396
- This space uses [OpenEvolve](https://github.com/codelion/openevolve) to iteratively improve prompts
397
  by testing them on real datasets and evolving better versions.
398
 
399
  ## How it works:
400
  1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
401
- 2. Select a HuggingFace dataset to test on
402
- 3. Choose a free model from OpenRouter
403
- 4. Click "Optimize Prompt" to evolve better versions
404
- 5. Compare initial vs. evolved performance!
405
  """)
406
 
407
  with gr.Row():
408
  with gr.Column():
409
  gr.Markdown("### Configuration")
410
 
411
- api_key = gr.Textbox(
412
- label="OpenAI API Key (for OpenRouter)",
413
- type="password",
414
- placeholder="sk-or-v1-...",
415
- info="Get your free key at https://openrouter.ai/keys"
416
- )
417
-
418
  model = gr.Dropdown(
419
  choices=FREE_MODELS,
420
  value=FREE_MODELS[0],
@@ -423,10 +612,10 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
423
  )
424
 
425
  dataset_name = gr.Textbox(
426
- label="HuggingFace Dataset",
427
- value="imdb",
428
- placeholder="e.g., imdb, hotpot_qa, gsm8k",
429
- info="Any dataset from HuggingFace Hub"
430
  )
431
 
432
  dataset_split = gr.Textbox(
@@ -456,7 +645,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
456
  info="Use {input} as placeholder for dataset inputs"
457
  )
458
 
459
- optimize_btn = gr.Button("🚀 Optimize Prompt", variant="primary", size="lg")
460
 
461
  with gr.Row():
462
  with gr.Column():
@@ -468,29 +657,40 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
468
  with gr.Column():
469
  final_results = gr.Markdown(label="Evolved Results")
470
 
 
471
  gr.Markdown("""
472
  ### Example Datasets & Fields:
473
 
474
  | Dataset | Split | Input Field | Target Field | Task |
475
  |---------|-------|-------------|--------------|------|
476
- | imdb | test | text | label | Sentiment Analysis |
477
- | hotpot_qa | validation | question | answer | Question Answering |
478
- | emotion | test | text | label | Emotion Classification |
479
- | gsm8k | test | question | answer | Math Reasoning |
480
- | ag_news | test | text | label | News Classification |
481
-
482
- ### Notes:
483
- - Evolution runs for 10 iterations with 1 island
484
- - Each evaluation uses 100 random samples from the dataset
485
- - The process may take 5-15 minutes depending on the dataset and model
486
- - Make sure your API key has sufficient credits for the requests
487
  """)
488
 
489
  optimize_btn.click(
490
  fn=optimize_prompt,
491
- inputs=[initial_prompt, dataset_name, dataset_split, model, api_key,
492
  input_field, target_field],
493
- outputs=[summary, initial_results, final_results]
494
  )
495
 
496
  if __name__ == "__main__":
 
3
  import yaml
4
  import json
5
  import random
6
+ from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
7
  from openai import OpenAI
8
  from openevolve import run_evolution
9
+ from typing import Dict, List, Tuple, Optional
10
  import tempfile
11
  import shutil
12
+ import requests
13
+ import glob
14
 
15
  # Free models from OpenRouter (as of 2025)
16
  FREE_MODELS = [
 
24
  "mistralai/mistral-7b-instruct:free",
25
  ]
26
 
27
+
28
+ def validate_dataset(dataset_name: str, split: str, input_field: str, target_field: str) -> Tuple[bool, str]:
29
+ """
30
+ Validate that the dataset exists and has the required fields.
31
+
32
+ Returns:
33
+ Tuple of (is_valid, error_message)
34
+ """
35
+ try:
36
+ # Check if dataset name has correct format (should be org/name or just name)
37
+ if not dataset_name or dataset_name.strip() == "":
38
+ return False, "❌ Dataset name cannot be empty"
39
+
40
+ dataset_name = dataset_name.strip()
41
+
42
+ # Try to get dataset info from HuggingFace API
43
+ hf_token = os.environ.get("HF_TOKEN", None)
44
+ headers = {}
45
+ if hf_token:
46
+ headers["Authorization"] = f"Bearer {hf_token}"
47
+
48
+ # Check if dataset exists on HuggingFace Hub
49
+ api_url = f"https://huggingface.co/api/datasets/{dataset_name}"
50
+ response = requests.get(api_url, headers=headers, timeout=10)
51
+
52
+ if response.status_code == 404:
53
+ return False, f"❌ Dataset '{dataset_name}' not found on HuggingFace Hub. Please use the full dataset name (e.g., 'stanfordnlp/imdb' or 'gsm8k')"
54
+ elif response.status_code != 200:
55
+ # Try to load anyway - might be a private dataset or API issue
56
+ print(f"Warning: Could not verify dataset via API (status {response.status_code}), attempting to load...")
57
+
58
+ # Try to load a small sample to verify it works and check fields
59
+ print(f"Loading dataset {dataset_name} with split {split}...")
60
+
61
+ # First, check if the split exists
62
+ try:
63
+ available_splits = get_dataset_split_names(dataset_name)
64
+ if split not in available_splits:
65
+ return False, f"❌ Split '{split}' not found. Available splits: {', '.join(available_splits)}"
66
+ except Exception as e:
67
+ print(f"Could not get split names: {e}. Will try to load anyway...")
68
+
69
+ # Load a small sample to check fields
70
+ dataset = load_dataset(dataset_name, split=split, streaming=True)
71
+
72
+ # Get first example to check fields
73
+ first_example = next(iter(dataset))
74
+ available_fields = list(first_example.keys())
75
+
76
+ # Check if input field exists
77
+ if input_field not in available_fields:
78
+ return False, f"❌ Input field '{input_field}' not found. Available fields: {', '.join(available_fields)}"
79
+
80
+ # Check if target field exists
81
+ if target_field not in available_fields:
82
+ return False, f"❌ Target field '{target_field}' not found. Available fields: {', '.join(available_fields)}"
83
+
84
+ # All validations passed
85
+ return True, f"✅ Dataset validated successfully! Fields '{input_field}' and '{target_field}' found."
86
+
87
+ except Exception as e:
88
+ error_msg = str(e)
89
+ if "404" in error_msg or "not found" in error_msg.lower():
90
+ return False, f"❌ Dataset '{dataset_name}' not found. Please check the dataset name (use format: org/dataset-name)"
91
+ return False, f"❌ Error validating dataset: {error_msg}"
92
+
93
+
94
+ def validate_inputs(dataset_name: str, split: str, input_field: str, target_field: str,
95
+ initial_prompt: str) -> Tuple[bool, str]:
96
+ """
97
+ Validate all inputs before starting optimization.
98
+
99
+ Returns:
100
+ Tuple of (is_valid, message)
101
+ """
102
+ # Check API key
103
+ api_key = os.environ.get("OPENAI_API_KEY")
104
+ if not api_key:
105
+ return False, "❌ OPENAI_API_KEY environment variable not set. Please set it in the Space secrets."
106
+
107
+ # Check prompt contains {input} placeholder
108
+ if "{input}" not in initial_prompt:
109
+ return False, "❌ Prompt must contain '{input}' placeholder for dataset inputs"
110
+
111
+ # Check dataset name format
112
+ dataset_name = dataset_name.strip()
113
+ if not dataset_name:
114
+ return False, "❌ Dataset name cannot be empty"
115
+
116
+ # Validate dataset and fields
117
+ is_valid, message = validate_dataset(dataset_name, split, input_field, target_field)
118
+ if not is_valid:
119
+ return False, message
120
+
121
+ return True, message
122
 
123
 
124
  def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int,
125
+ model: str, input_field: str, target_field: str) -> Dict:
126
  """Evaluate a prompt on a dataset using the selected model."""
127
  try:
128
+ # Get API key from environment
129
+ api_key = os.environ.get("OPENAI_API_KEY")
130
+ if not api_key:
131
+ return {
132
+ "error": "OPENAI_API_KEY not set in environment",
133
+ "accuracy": 0,
134
+ "correct": 0,
135
+ "total": 0,
136
+ "results": []
137
+ }
138
+
139
  # Load dataset
140
  dataset = load_dataset(dataset_name, split=split, streaming=False)
141
 
 
190
  total += 1
191
 
192
  results.append({
193
+ "input": str(input_text)[:100] + "..." if len(str(input_text)) > 100 else str(input_text),
194
  "target": str(target),
195
+ "prediction": prediction[:100] + "..." if len(prediction) > 100 else prediction,
196
  "correct": is_correct
197
  })
198
 
 
219
  }
220
 
221
 
222
+ def parse_evolution_history(output_dir: str) -> str:
223
+ """
224
+ Parse evolution history from OpenEvolve output directory.
225
+
226
+ Returns a markdown string with visualization of the evolution process.
227
+ """
228
+ try:
229
+ evolution_viz = "## 🧬 Evolution Progress\n\n"
230
+
231
+ # Look for generation files or logs
232
+ generation_files = sorted(glob.glob(os.path.join(output_dir, "generation_*.txt")))
233
+ log_file = os.path.join(output_dir, "evolution.log")
234
+
235
+ # Try to parse generation files if they exist
236
+ if generation_files:
237
+ evolution_viz += "### Generation-by-Generation Progress\n\n"
238
+ for gen_file in generation_files:
239
+ gen_num = os.path.basename(gen_file).replace("generation_", "").replace(".txt", "")
240
+ try:
241
+ with open(gen_file, 'r') as f:
242
+ content = f.read()
243
+ evolution_viz += f"**Generation {gen_num}:**\n```\n{content[:200]}{'...' if len(content) > 200 else ''}\n```\n\n"
244
+ except:
245
+ pass
246
+
247
+ # Try to parse log file
248
+ elif os.path.exists(log_file):
249
+ evolution_viz += "### Evolution Log\n\n"
250
+ try:
251
+ with open(log_file, 'r') as f:
252
+ log_content = f.read()
253
+ evolution_viz += f"```\n{log_content[-1000:]}\n```\n\n"
254
+ except:
255
+ pass
256
+
257
+ # Look for scores or history file
258
+ scores_file = os.path.join(output_dir, "scores.json")
259
+ if os.path.exists(scores_file):
260
+ try:
261
+ with open(scores_file, 'r') as f:
262
+ scores = json.load(f)
263
+
264
+ evolution_viz += "### Score Progression\n\n"
265
+ evolution_viz += "| Generation | Best Score | Avg Score | Population |\n"
266
+ evolution_viz += "|------------|-----------|-----------|------------|\n"
267
+
268
+ for gen in scores:
269
+ evolution_viz += f"| {gen['generation']} | {gen['best']:.3f} | {gen['avg']:.3f} | {gen['population']} |\n"
270
+
271
+ evolution_viz += "\n"
272
+ except:
273
+ pass
274
+
275
+ # Look for all program variants
276
+ program_files = sorted(glob.glob(os.path.join(output_dir, "program_*.txt")))
277
+ if program_files:
278
+ evolution_viz += f"### Explored Variants\n\n"
279
+ evolution_viz += f"OpenEvolve explored {len(program_files)} different prompt variants during evolution.\n\n"
280
+
281
+ # Show a few intermediate prompts
282
+ if len(program_files) > 3:
283
+ sample_files = [program_files[0], program_files[len(program_files)//2], program_files[-2]]
284
+ evolution_viz += "**Sample Intermediate Prompts:**\n\n"
285
+ for idx, pfile in enumerate(sample_files, 1):
286
+ try:
287
+ with open(pfile, 'r') as f:
288
+ prompt_content = f.read()
289
+ evolution_viz += f"**Variant {idx}:**\n```\n{prompt_content[:150]}{'...' if len(prompt_content) > 150 else ''}\n```\n\n"
290
+ except:
291
+ pass
292
+
293
+ # If no specific files found, show directory contents
294
+ if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file):
295
+ evolution_viz += "### Evolution Complete\n\n"
296
+ evolution_viz += "OpenEvolve ran 10 iterations of evolutionary optimization using:\n"
297
+ evolution_viz += "- **Population Size**: 10 prompts per generation\n"
298
+ evolution_viz += "- **Selection Strategy**: 10% elite, 30% explore, 60% exploit\n"
299
+ evolution_viz += "- **Islands**: 1 population with mutation and crossover\n"
300
+ evolution_viz += "- **Evaluation**: 100 samples per prompt variant\n\n"
301
+
302
+ # Count files in output directory
303
+ all_files = os.listdir(output_dir)
304
+ evolution_viz += f"Generated {len(all_files)} files during evolution process.\n\n"
305
+
306
+ return evolution_viz
307
+
308
+ except Exception as e:
309
+ return f"## 🧬 Evolution Progress\n\nEvolution completed successfully. Unable to parse detailed history: {str(e)}\n\n"
310
+
311
+
312
  def create_evaluator_file(dataset_name: str, split: str, model: str,
313
  input_field: str, target_field: str, work_dir: str):
314
  """Create an evaluator.py file for OpenEvolve."""
 
424
 
425
 
426
  def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
427
+ model: str, input_field: str, target_field: str,
428
+ progress=gr.Progress()) -> Tuple[str, str, str, str]:
429
  """Run OpenEvolve to optimize the prompt."""
430
 
431
+ progress(0, desc="Validating inputs...")
 
432
 
433
+ # Validate all inputs
434
+ is_valid, validation_message = validate_inputs(
435
+ dataset_name, dataset_split, input_field, target_field, initial_prompt
436
+ )
437
 
438
+ if not is_valid:
439
+ return f"## Validation Failed\n\n{validation_message}", "", "", ""
440
+
441
+ progress(0.05, desc=f"Validation passed: {validation_message}")
442
 
443
  # Create temporary working directory
444
  work_dir = tempfile.mkdtemp(prefix="openevolve_")
 
455
  input_field, target_field, work_dir)
456
 
457
  # Create config
458
+ progress(0.15, desc="Creating configuration...")
459
  config_path = create_config_file(model, work_dir)
460
 
461
  # Run initial evaluation
462
+ progress(0.2, desc="Running initial evaluation on 100 samples...")
463
  initial_eval = evaluate_prompt(
464
  initial_prompt, dataset_name, dataset_split, 100,
465
+ model, input_field, target_field
466
  )
467
 
468
+ if "error" in initial_eval and initial_eval["total"] == 0:
469
+ return f"## Error\n\n❌ Initial evaluation failed: {initial_eval['error']}", "", "", ""
470
+
471
  initial_results = f"""
472
  ### Initial Prompt Evaluation
473
 
 
489
  initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
490
 
491
  # Run OpenEvolve
492
+ progress(0.3, desc="Starting OpenEvolve optimization (10 iterations, ~5-15 minutes)...")
493
 
494
  output_dir = os.path.join(work_dir, "output")
495
  os.makedirs(output_dir, exist_ok=True)
 
504
  verbose=True
505
  )
506
 
507
+ progress(0.80, desc="Parsing evolution history...")
508
+
509
+ # Parse evolution history for visualization
510
+ evolution_viz = parse_evolution_history(output_dir)
511
+
512
+ progress(0.85, desc="Evaluating best evolved prompt...")
513
 
514
  # Get the best prompt
515
  best_prompt_path = os.path.join(output_dir, "best_program.txt")
 
522
  # Evaluate best prompt
523
  final_eval = evaluate_prompt(
524
  best_prompt, dataset_name, dataset_split, 100,
525
+ model, input_field, target_field
526
  )
527
 
528
  final_results = f"""
 
536
  **Results:**
537
  - Accuracy: {final_eval['accuracy']:.2f}%
538
  - Correct: {final_eval['correct']}/{final_eval['total']}
539
+ - Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
540
 
541
  **Sample Results:**
542
  """
 
547
  final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
548
 
549
  summary = f"""
550
+ ## 🎉 Optimization Complete!
551
 
552
  ### Summary
553
+ - **Dataset**: {dataset_name} ({dataset_split} split)
554
+ - **Model**: {model}
555
+ - **Samples**: 100 per evaluation
556
+ - **Iterations**: 10
557
+
558
+ ### Results
559
+ - **Initial Accuracy**: {initial_eval['accuracy']:.2f}%
560
+ - **Final Accuracy**: {final_eval['accuracy']:.2f}%
561
+ - **Improvement**: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
562
+
563
+ {validation_message}
564
  """
565
 
566
  progress(1.0, desc="Complete!")
567
 
568
+ return summary, initial_results, evolution_viz, final_results
569
 
570
  except Exception as e:
571
+ return f"## Error During Evolution\n\n❌ {str(e)}", initial_results, "", ""
572
 
573
  finally:
574
  # Clean up
 
585
 
586
  Automatically evolve and optimize your prompts using evolutionary algorithms!
587
 
588
+ This space uses [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) to iteratively improve prompts
589
  by testing them on real datasets and evolving better versions.
590
 
591
  ## How it works:
592
  1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
593
+ 2. Enter the full HuggingFace dataset name (e.g., `stanfordnlp/imdb`, `gsm8k`)
594
+ 3. Specify the dataset split and field names
595
+ 4. Choose a free model from OpenRouter
596
+ 5. Click "Optimize Prompt" - the system will validate everything first!
597
+ 6. Watch the evolution progress in real-time
598
+ 7. Compare initial vs. evolved performance!
599
+
600
+ **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
601
  """)
602
 
603
  with gr.Row():
604
  with gr.Column():
605
  gr.Markdown("### Configuration")
606
 
607
  model = gr.Dropdown(
608
  choices=FREE_MODELS,
609
  value=FREE_MODELS[0],
 
612
  )
613
 
614
  dataset_name = gr.Textbox(
615
+ label="HuggingFace Dataset (Full Name)",
616
+ value="stanfordnlp/imdb",
617
+ placeholder="e.g., stanfordnlp/imdb, openai/gsm8k, SetFit/sst5",
618
+ info="Full dataset name from HuggingFace Hub (org/dataset-name or dataset-name)"
619
  )
620
 
621
  dataset_split = gr.Textbox(
 
645
  info="Use {input} as placeholder for dataset inputs"
646
  )
647
 
648
+ optimize_btn = gr.Button("🚀 Validate & Optimize Prompt", variant="primary", size="lg")
649
 
650
  with gr.Row():
651
  with gr.Column():
 
657
  with gr.Column():
658
  final_results = gr.Markdown(label="Evolved Results")
659
 
660
+ with gr.Row():
661
+ with gr.Column():
662
+ evolution_progress = gr.Markdown(label="Evolution Progress", value="Evolution progress will appear here...")
663
+
664
  gr.Markdown("""
665
  ### Example Datasets & Fields:
666
 
667
  | Dataset | Split | Input Field | Target Field | Task |
668
  |---------|-------|-------------|--------------|------|
669
+ | stanfordnlp/imdb | test | text | label | Sentiment Analysis |
670
+ | rajpurkar/squad | validation | question | answers | Question Answering |
671
+ | dair-ai/emotion | test | text | label | Emotion Classification |
672
+ | openai/gsm8k | test | question | answer | Math Reasoning |
673
+ | fancyzhx/ag_news | test | text | label | News Classification |
674
+
675
+ ### Important Notes:
676
+ - **API Key**: Must be set as `OPENAI_API_KEY` environment variable in Space secrets
677
+ - **HF Token**: Optional `HF_TOKEN` environment variable for private datasets
678
+ - **Dataset Name**: Use full name (org/dataset or dataset-name)
679
+ - **Validation**: All inputs are validated before starting optimization
680
+ - **Time**: Evolution takes 5-15 minutes (10 iterations)
681
+ - **Samples**: 100 random samples per evaluation
682
+
683
+ ### About OpenEvolve:
684
+ OpenEvolve is an open-source evolutionary optimization framework. Learn more at:
685
+ - [GitHub Repository](https://github.com/algorithmicsuperintelligence/openevolve)
686
+ - [Documentation](https://github.com/algorithmicsuperintelligence/openevolve#readme)
687
  """)
688
 
689
  optimize_btn.click(
690
  fn=optimize_prompt,
691
+ inputs=[initial_prompt, dataset_name, dataset_split, model,
692
  input_field, target_field],
693
+ outputs=[summary, initial_results, evolution_progress, final_results]
694
  )
695
 
696
  if __name__ == "__main__":