codelion committed · Commit 07d6b9d · verified · Parent: 96e88bc

Upload app.py

Files changed (1):
  1. app.py +103 -28
app.py CHANGED
@@ -890,11 +890,29 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     progress(0.15, desc="Creating configuration...")
     config_path = create_config_file(model, work_dir)
 
-    # Run initial evaluation (using 200 samples for accurate baseline)
+    # Run initial evaluation
+    # For small datasets (like AIME with 30 instances), use all available
+    # For larger datasets, use 50 samples for quick baseline
     # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-    progress(0.2, desc="Running initial evaluation on 200 samples...")
+
+    # Load dataset to check size
+    try:
+        temp_dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
+    except ValueError as e:
+        if "config" in str(e).lower():
+            default_config = "main"
+            if dataset_name.lower() == "glue":
+                default_config = "sst2"
+            temp_dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
+        else:
+            raise
+
+    dataset_size = len(temp_dataset)
+    eval_samples = min(dataset_size, 50)  # Use all if dataset has ≤50, else use 50
+
+    progress(0.2, desc=f"Running initial evaluation on {eval_samples} samples (dataset has {dataset_size} total)...")
     initial_eval = evaluate_prompt(
-        initial_prompt, dataset_name, dataset_split, 200,
+        initial_prompt, dataset_name, dataset_split, eval_samples,
         model, input_field, target_field
     )
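Reviewer note: the adaptive baseline above caps the evaluation at 50 samples and keeps the chosen indices so the final evaluation can reuse exactly the same items. A minimal standalone sketch of that sampling rule; `pick_eval_indices` is a hypothetical helper (only the 50-sample cap and the fixed seed mirror the diff, the function itself is not part of app.py):

```python
import random

def pick_eval_indices(dataset_size: int, cap: int = 50, seed: int = 42) -> list:
    """Choose baseline evaluation indices.

    Small datasets (<= cap) are evaluated in full; larger ones get a fixed
    random subset so the initial and final evaluations can reuse exactly
    the same samples. Hypothetical helper, not taken from app.py.
    """
    eval_samples = min(dataset_size, cap)
    rng = random.Random(seed)
    return sorted(rng.sample(range(dataset_size), eval_samples))

# AIME 2025 has 30 problems -> all 30 are used; larger datasets get 50.
print(len(pick_eval_indices(30)))    # 30
print(len(pick_eval_indices(7473)))  # 50
```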
 
@@ -973,7 +991,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     # Parse evolution history for visualization
     evolution_viz = parse_evolution_history(output_dir)
 
-    progress(0.85, desc="Evaluating best evolved prompt on 200 samples...")
+    progress(0.85, desc="Evaluating best evolved prompt...")
 
     # Get the best prompt (OpenEvolve saves to output_dir/best/best_program.txt)
     best_prompt_path = os.path.join(output_dir, "best", "best_program.txt")
@@ -989,13 +1007,67 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     else:
         best_prompt = initial_prompt
 
-    # Evaluate best prompt on THE SAME samples as initial eval (fair comparison)
+    # Final evaluation: Use same samples as initial eval
+    # For small datasets (≤50), we already used all samples
+    # For large datasets, we used 50 samples with potential for Stage 2
+    progress(0.85, desc=f"Evaluating best prompt on {eval_samples} samples...")
     final_eval = evaluate_prompt(
-        best_prompt, dataset_name, dataset_split, 200,
+        best_prompt, dataset_name, dataset_split, eval_samples,
         model, input_field, target_field,
         fixed_indices=eval_indices  # Use same samples as initial eval!
     )
 
+    # Stage 2: Only for large datasets (>50) with high accuracy
+    if dataset_size > 50 and final_eval.get('accuracy', 0) > 80.0:
+        progress(0.90, desc="Stage 2: Accuracy >80%! Evaluating 50 more samples...")
+
+        # Load dataset to get additional samples
+        try:
+            dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
+        except ValueError as e:
+            if "config" in str(e).lower():
+                default_config = "main"
+                if dataset_name.lower() == "glue":
+                    default_config = "sst2"
+                dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
+            else:
+                raise
+
+        # Get 50 additional indices (different from initial 50)
+        import random
+        random.seed(42)
+        all_indices = list(range(len(dataset)))
+        remaining_indices = [i for i in all_indices if i not in eval_indices]
+
+        if len(remaining_indices) >= 50:
+            additional_indices = random.sample(remaining_indices, 50)
+
+            # Evaluate on additional 50 samples
+            additional_eval = evaluate_prompt(
+                best_prompt, dataset_name, dataset_split, 50,
+                model, input_field, target_field,
+                fixed_indices=additional_indices
+            )
+
+            # Combine results from both stages
+            combined_correct = final_eval['correct'] + additional_eval['correct']
+            combined_total = final_eval['total'] + additional_eval['total']
+            combined_accuracy = (combined_correct / combined_total * 100) if combined_total > 0 else 0
+
+            final_eval = {
+                'accuracy': combined_accuracy,
+                'correct': combined_correct,
+                'total': combined_total,
+                'results': final_eval['results'] + additional_eval['results']
+            }
+            progress(0.95, desc=f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
+        else:
+            progress(0.90, desc="Not enough samples for Stage 2, using Stage 1 results")
+    elif dataset_size <= 50:
+        progress(0.90, desc=f"Complete: Evaluated on all {dataset_size} instances")
+    else:
+        progress(0.90, desc=f"Stage 1 complete: {final_eval['accuracy']:.1f}% (below 80%, skipping Stage 2)")
+
     final_results = f"""
     ### Evolved Prompt Evaluation
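Reviewer note: the Stage 2 branch above pools raw correct/total counts rather than averaging the two accuracies. A self-contained sketch of that bookkeeping; the result-dict keys follow how the diff uses `evaluate_prompt`'s output, while the helper name is illustrative:

```python
def combine_stages(stage1: dict, stage2: dict) -> dict:
    """Pool two evaluation batches into one result.

    Accuracy is recomputed from the pooled counts instead of averaging the
    two percentages, matching combined_correct / combined_total in the diff.
    """
    correct = stage1['correct'] + stage2['correct']
    total = stage1['total'] + stage2['total']
    return {
        'accuracy': (correct / total * 100) if total > 0 else 0,
        'correct': correct,
        'total': total,
        'results': stage1['results'] + stage2['results'],
    }

# 40/50 in Stage 1 plus 45/50 in Stage 2 -> 85/100 = 85.0%
merged = combine_stages(
    {'accuracy': 80.0, 'correct': 40, 'total': 50, 'results': ['s1'] * 50},
    {'accuracy': 90.0, 'correct': 45, 'total': 50, 'results': ['s2'] * 50},
)
print(merged['accuracy'], merged['total'])  # 85.0 100
```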
 
@@ -1022,14 +1094,15 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
     ### Summary
     - **Dataset**: {dataset_name} ({dataset_split} split)
+    - **Dataset Size**: {dataset_size} instances
     - **Model**: {model}
-    - **Initial/Final Eval**: 200 samples each (same samples for fair comparison)
+    - **Evaluation**: {eval_samples} samples {'(all instances)' if eval_samples == dataset_size else ''}
     - **Evolution Eval**: Staged (50 → 200 if score ≥ 0.5)
     - **Iterations**: 10
 
     ### Results
-    - **Initial Accuracy**: {initial_eval['accuracy']:.2f}%
-    - **Final Accuracy**: {final_eval['accuracy']:.2f}%
+    - **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})
+    - **Final Accuracy**: {final_eval['accuracy']:.2f}% ({final_eval['correct']}/{final_eval['total']})
     - **Improvement**: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
 
     {validation_message}
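Reviewer note: the summary's "Staged (50 → 200 if score ≥ 0.5)" line describes the evaluator used during evolution, which this commit does not modify. A hedged sketch of what such a cascade could look like, with a hypothetical `score_prompt` callback standing in for the real evaluator:

```python
from typing import Callable

def staged_score(score_prompt: Callable[[int], float],
                 stage1_samples: int = 50,
                 stage2_samples: int = 200,
                 cutoff: float = 0.5) -> float:
    """Score on a small batch first; only spend the large batch on prompts
    that clear the cutoff. Sketch only; the real evaluator lives elsewhere
    in app.py and is not shown in this commit."""
    stage1 = score_prompt(stage1_samples)
    if stage1 < cutoff:
        return stage1  # early stop: poor prompts skip the expensive batch
    return score_prompt(stage2_samples)

# Usage (score_prompt is a hypothetical callback that evaluates n samples):
# best = staged_score(lambda n: run_eval(candidate_prompt, n))
```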
@@ -1060,12 +1133,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     ## How it works:
     1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-    2. Default dataset is **GSM8K** (grade school math) - great for showing prompt improvement!
-    3. Specify the dataset split and field names (or use other datasets like `glue`, `stanfordnlp/imdb`)
+    2. Default dataset is **AIME 2025** (30 hard math competition problems) - tests on challenging problems!
+    3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `stanfordnlp/imdb`)
     4. Choose a free model from OpenRouter
     5. Click "Optimize Prompt" - the system will validate everything first!
     6. Watch the evolution progress in real-time
-    7. Compare initial vs. best prompt side-by-side with full 200-sample accuracy!
+    7. Compare initial vs. best prompt side-by-side (uses all instances for small datasets ≤50)!
 
     **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
     """)
@@ -1083,9 +1156,9 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     dataset_name = gr.Textbox(
         label="HuggingFace Dataset (Full Name)",
-        value="gsm8k",
-        placeholder="e.g., gsm8k, glue, stanfordnlp/imdb",
-        info="Dataset name from HuggingFace Hub. Configs auto-detected (e.g., 'glue' → 'glue:sst2')"
+        value="MathArena/aime_2025",
+        placeholder="e.g., MathArena/aime_2025, gsm8k, stanfordnlp/imdb",
+        info="Dataset name from HuggingFace Hub. Default: AIME 2025 (30 hard math problems)"
     )
 
     dataset_split = gr.Textbox(
@@ -1096,8 +1169,8 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     input_field = gr.Textbox(
         label="Input Field Name",
-        value="question",
-        placeholder="e.g., question, sentence, text",
+        value="problem",
+        placeholder="e.g., problem, question, sentence, text",
         info="The field containing inputs to process"
     )
 
@@ -1110,7 +1183,7 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
     initial_prompt = gr.TextArea(
         label="Initial Prompt",
-        value="{input}\n\nAnswer:",
+        value="Solve this math problem step by step.\n\nProblem: {input}\n\nProvide your final answer as a number after ####.",
         lines=6,
         info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
     )
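Reviewer note: the new default prompt asks the model to put its final answer after `####` (GSM8K-style). How app.py parses model outputs is outside this diff; a minimal hypothetical extractor for that format might look like:

```python
import re

def extract_final_answer(completion: str):
    """Return the last number that follows a '####' marker, or None.

    Mirrors the answer format requested by the new default prompt; the
    regex and helper name are illustrative, not taken from app.py.
    """
    matches = re.findall(r"####\s*(-?[\d,\.]+)", completion)
    if not matches:
        return None
    return matches[-1].rstrip(".").replace(",", "")

print(extract_final_answer("Working mod 1000... #### 204"))  # 204
print(extract_final_answer("No marker in this output"))      # None
```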
@@ -1144,13 +1217,13 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
     gr.Markdown("""
     ### Example Datasets & Fields:
 
-    | Dataset | Split | Input Field | Target Field | Task |
-    |---------|-------|-------------|--------------|------|
-    | stanfordnlp/imdb | test | text | label | Sentiment Analysis |
-    | rajpurkar/squad | validation | question | answers | Question Answering |
-    | dair-ai/emotion | test | text | label | Emotion Classification |
-    | openai/gsm8k | test | question | answer | Math Reasoning |
-    | fancyzhx/ag_news | test | text | label | News Classification |
+    | Dataset | Split | Input Field | Target Field | Task | Size |
+    |---------|-------|-------------|--------------|------|------|
+    | **MathArena/aime_2025** | train | problem | answer | Hard Math (AIME) | 30 |
+    | gsm8k | train | question | answer | Grade School Math | 7,473 |
+    | stanfordnlp/imdb | test | text | label | Sentiment Analysis | 25,000 |
+    | dair-ai/emotion | test | text | label | Emotion Classification | 2,000 |
+    | fancyzhx/ag_news | test | text | label | News Classification | 7,600 |
 
     ### About This Demo Space:
 
@@ -1169,9 +1242,11 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
       - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
     4. **Run & Monitor**:
       - All inputs are validated before starting
-      - Initial and final evaluations use 200 samples each for accurate comparison
-      - Evolution uses staged evaluation (50 samples first, then 200 more if promising)
-      - Saves API calls by early-stopping poor prompts (< 50% accuracy)
+      - **Adaptive evaluation**:
+        - Small datasets (≤50): Uses all instances for accurate results
+        - Large datasets (>50): Uses 50 samples → 100 if accuracy >80%
+      - Evolution uses staged evaluation (50 → 200 if score ≥ 0.5)
+      - Saves API calls and time with smart staged evaluation
       - Compare initial vs best prompt side-by-side
 
     ### About OpenEvolve:
 