Upload app.py
app.py CHANGED
@@ -182,8 +182,21 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
 
             prediction = response.choices[0].message.content.strip()
 
-            #
-
+            # Smart evaluation - handle both exact match and semantic match
+            target_str = str(target).lower().strip()
+            pred_lower = prediction.lower()
+
+            # Check exact match first
+            is_correct = target_str in pred_lower
+
+            # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
+            if not is_correct:
+                # Common sentiment mappings
+                if target_str in ["1", "positive", "pos"]:
+                    is_correct = any(word in pred_lower for word in ["positive", "good", "great"])
+                elif target_str in ["0", "negative", "neg"]:
+                    is_correct = any(word in pred_lower for word in ["negative", "bad", "poor"])
+
             if is_correct:
                 correct += 1
             total += 1
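
A quick sanity check of the new matching rules, restated as a standalone helper (illustrative sketch only; `smart_match` is not a name used in app.py):

```python
# Hypothetical helper mirroring the inline logic added in the hunk above.
def smart_match(target, prediction) -> bool:
    target_str = str(target).lower().strip()
    pred_lower = prediction.lower()
    # Exact/substring match first
    if target_str in pred_lower:
        return True
    # Fall back to common sentiment equivalents
    if target_str in ["1", "positive", "pos"]:
        return any(word in pred_lower for word in ["positive", "good", "great"])
    if target_str in ["0", "negative", "neg"]:
        return any(word in pred_lower for word in ["negative", "bad", "poor"])
    return False

assert smart_match("1", "Sentiment: positive")       # semantic equivalent
assert smart_match("positive", "This is positive.")  # substring match
assert not smart_match("1", "The review is bad.")
```

Note that substring matching can produce false positives for short numeric targets (for example, target "1" matches any prediction containing a 1, such as "top-10").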
@@ -325,7 +338,7 @@ def parse_evolution_history(output_dir: str) -> str:
 
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                           input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve."""
+    """Create an evaluator.py file for OpenEvolve with staged/cascading evaluation."""
     evaluator_code = f'''
 import os
 import random
@@ -333,19 +346,22 @@ from datasets import load_dataset
 from openai import OpenAI
 
 def evaluate(prompt: str) -> float:
-    """
+    """
+    Evaluate a prompt using 2-stage cascading evaluation to save API calls.
+
+    Stage 1: Evaluate with 20 samples
+    - If accuracy >= 0.5, proceed to Stage 2
+    - If accuracy < 0.5, return early (no point wasting 80 more samples)
+
+    Stage 2: Evaluate with 80 more samples (total 100)
+    - Combine results for final score
+
+    Returns score between 0 and 1.
+    """
     try:
         # Load dataset
         dataset = load_dataset("{dataset_name}", split="{split}", streaming=False)
 
-        # Sample 100 random examples
-        num_samples = min(100, len(dataset))
-        if len(dataset) > num_samples:
-            indices = random.sample(range(len(dataset)), num_samples)
-            samples = [dataset[i] for i in indices]
-        else:
-            samples = list(dataset)[:num_samples]
-
         # Initialize OpenAI client
         api_key = os.environ.get("OPENAI_API_KEY")
         client = OpenAI(
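
The docstring above describes the cascade; as a back-of-envelope check (illustrative arithmetic, not code from app.py), the expected API cost per run is:

```python
# A prompt that fails Stage 1 costs 20 calls; one that passes costs 20 + 80 = 100.
def expected_calls(p_pass_stage1: float, iterations: int = 10) -> float:
    calls_per_eval = 20 + p_pass_stage1 * 80
    return calls_per_eval * iterations

print(expected_calls(0.0))  # 200.0  - every candidate culled after Stage 1
print(expected_calls(0.5))  # 600.0
print(expected_calls(1.0))  # 1000.0 - no savings vs. a flat 100-sample evaluation
```

The cascade therefore saves the most when evolution produces many weak candidates; a pool of strong candidates pays the full 100-sample price. This assumes one candidate evaluation per iteration, which is an assumption about OpenEvolve internals rather than something this diff shows.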
@@ -353,48 +369,107 @@ def evaluate(prompt: str) -> float:
             api_key=api_key,
         )
 
-
-
 
-
-
-
-
-
-
 
-
-
-
 
-
-                formatted_prompt = prompt.replace("{{input}}", str(input_text))
 
-
-
-
-
-                        {{"role": "system", "content": "You are a helpful assistant."}},
-                        {{"role": "user", "content": formatted_prompt}}
-                    ],
-                    temperature=0.1,
-                    max_tokens=500,
-                )
 
-
 
-
-
-
-                    correct += 1
-                total += 1
 
-
-
-
 
-
-
 
+        def evaluate_samples(samples, correct_so_far=0, total_so_far=0):
+            """Helper function to evaluate a batch of samples."""
+            correct = correct_so_far
+            total = total_so_far
+
+            for sample in samples:
+                try:
+                    # Get input and target
+                    input_text = sample.get("{input_field}", "")
+                    if isinstance(input_text, dict):
+                        input_text = str(input_text)
+
+                    target = sample.get("{target_field}", "")
+                    if isinstance(target, dict):
+                        target = str(target)
+
+                    # Format the prompt
+                    formatted_prompt = prompt.replace("{{input}}", str(input_text))
+
+                    # Call the model
+                    response = client.chat.completions.create(
+                        model="{model}",
+                        messages=[
+                            {{"role": "system", "content": "You are a helpful assistant."}},
+                            {{"role": "user", "content": formatted_prompt}}
+                        ],
+                        temperature=0.1,
+                        max_tokens=500,
+                    )
+
+                    prediction = response.choices[0].message.content.strip()
+
+                    # Smart evaluation - handle both exact match and semantic match
+                    target_str = str(target).lower().strip()
+                    pred_lower = prediction.lower()
+
+                    # Check exact match first
+                    is_correct = target_str in pred_lower
+
+                    # If not exact match, check for semantic equivalents (e.g., "1" = "positive")
+                    if not is_correct:
+                        # Common sentiment mappings
+                        if target_str in ["1", "positive", "pos"]:
+                            is_correct = any(word in pred_lower for word in ["positive", "good", "great"])
+                        elif target_str in ["0", "negative", "neg"]:
+                            is_correct = any(word in pred_lower for word in ["negative", "bad", "poor"])
+
+                    if is_correct:
+                        correct += 1
+                    total += 1
+
+                except Exception as e:
+                    print(f"Error evaluating sample: {{e}}")
+                    continue
+
+            return correct, total
+
+        # STAGE 1: Evaluate with 20 samples first
+        stage1_size = 20
+        stage1_samples_count = min(stage1_size, len(dataset))
+
+        if len(dataset) > stage1_samples_count:
+            stage1_indices = random.sample(range(len(dataset)), stage1_samples_count)
+            stage1_samples = [dataset[i] for i in stage1_indices]
+        else:
+            stage1_samples = list(dataset)[:stage1_samples_count]
 
+        print(f"[Stage 1/2] Evaluating with {{len(stage1_samples)}} samples...")
+        correct, total = evaluate_samples(stage1_samples)
+        stage1_score = (correct / total) if total > 0 else 0.0
 
+        print(f"[Stage 1/2] Score: {{stage1_score:.3f}} ({{correct}}/{{total}})")
 
+        # Early exit if Stage 1 score is below threshold
+        if stage1_score < 0.5:
+            print(f"[Stage 1/2] Score below 0.5 threshold - skipping Stage 2 (saved 80 API calls)")
+            return stage1_score
 
+        # STAGE 2: Continue with 80 more samples
+        print(f"[Stage 2/2] Score >= 0.5 - proceeding with 80 more samples...")
+        stage2_size = 80
+        stage2_samples_count = min(stage2_size, max(0, len(dataset) - stage1_samples_count))
 
+        if stage2_samples_count > 0:
+            # Get different samples from Stage 1
+            remaining_indices = list(set(range(len(dataset))) - set(stage1_indices if 'stage1_indices' in locals() else []))
 
+            if len(remaining_indices) >= stage2_samples_count:
+                stage2_indices = random.sample(remaining_indices, stage2_samples_count)
+                stage2_samples = [dataset[i] for i in stage2_indices]
+            else:
+                stage2_samples = [dataset[i] for i in remaining_indices[:stage2_samples_count]]
 
+            correct, total = evaluate_samples(stage2_samples, correct, total)
+            final_score = (correct / total) if total > 0 else stage1_score
+
+            print(f"[Stage 2/2] Final score: {{final_score:.3f}} ({{correct}}/{{total}})")
+            return final_score
+        else:
+            print(f"[Stage 2/2] Not enough samples in dataset for Stage 2")
+            return stage1_score
 
     except Exception as e:
         print(f"Error in evaluation: {{e}}")
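
A minimal smoke test for a generated evaluator might look like this (hypothetical: the file path, module name, and prompt are placeholders; it assumes `create_evaluator_file` has written the file and `OPENAI_API_KEY` is set, and it makes real API calls, 20 or 100 depending on the Stage 1 score):

```python
import importlib.util

# Load the generated evaluator.py as a module (path is a placeholder).
spec = importlib.util.spec_from_file_location("evaluator", "work_dir/evaluator.py")
evaluator = importlib.util.module_from_spec(spec)
spec.loader.exec_module(evaluator)

# Returns a float in [0, 1]; weak prompts exit after Stage 1.
score = evaluator.evaluate("Classify the sentiment of this review: {input}")
print(f"score = {score:.3f}")
```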
@@ -412,10 +487,8 @@ def create_config_file(model: str, work_dir: str):
     """Create a config.yaml file for OpenEvolve."""
     config = {
         "llm": {
-            "
-            "model": model,
+            "primary_model": model,
             "temperature": 0.7,
-            "max_tokens": 4096,
         },
         "evolution": {
             "max_iterations": 10,
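
For reference, the resulting config.yaml would look roughly like this, restricted to the keys visible in this hunk (a sketch assuming the dict is serialized with PyYAML's `safe_dump`; the model name is a placeholder):

```python
import yaml

config = {
    "llm": {"primary_model": "some-free-model", "temperature": 0.7},
    "evolution": {"max_iterations": 10},
}
print(yaml.safe_dump(config, sort_keys=False))
# llm:
#   primary_model: some-free-model
#   temperature: 0.7
# evolution:
#   max_iterations: 10
```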
@@ -506,7 +579,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
 
     # Run OpenEvolve
-    progress(0.3, desc="Starting OpenEvolve optimization (10 iterations)...")
+    progress(0.3, desc="Starting OpenEvolve optimization (10 iterations with staged evaluation)...")
 
     output_dir = os.path.join(work_dir, "output")
     os.makedirs(output_dir, exist_ok=True)
@@ -568,7 +641,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
 - **Model**: {model}
-- **
+- **Evaluation**: Staged (20 samples → 100 if score ≥ 0.5)
 - **Iterations**: 10
 
 ### Results
@@ -715,7 +788,8 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
    - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
 4. **Run & Monitor**:
    - All inputs are validated before starting
-   - Evolution
+   - Evolution uses staged evaluation (20 samples first, then 80 more if promising)
+   - Saves API calls by early-stopping poor prompts (< 50% accuracy)
    - Watch evolution progress visualization in real-time
 
 ### About OpenEvolve: