codelion committed on
Commit 93d757f · verified · 1 Parent(s): 81a72ce

Upload app.py

Files changed (1)
  1. app.py +74 -110
app.py CHANGED
@@ -890,29 +890,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
890
  progress(0.15, desc="Creating configuration...")
891
  config_path = create_config_file(model, work_dir)
892
 
893
- # Run initial evaluation
894
- # For small datasets (like AIME with 30 instances), use all available
895
- # For larger datasets, use 50 samples for quick baseline
896
  # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
897
-
898
- # Load dataset to check size
899
- try:
900
- temp_dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
901
- except ValueError as e:
902
- if "config" in str(e).lower():
903
- default_config = "main"
904
- if dataset_name.lower() == "glue":
905
- default_config = "sst2"
906
- temp_dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
907
- else:
908
- raise
909
-
910
- dataset_size = len(temp_dataset)
911
- eval_samples = min(dataset_size, 50) # Use all if dataset has ≤50, else use 50
912
-
913
- progress(0.2, desc=f"Running initial evaluation on {eval_samples} samples (dataset has {dataset_size} total)...")
914
  initial_eval = evaluate_prompt(
915
- initial_prompt, dataset_name, dataset_split, eval_samples,
916
  model, input_field, target_field
917
  )
918
 
@@ -1007,66 +989,60 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
1007
  else:
1008
  best_prompt = initial_prompt
1009
 
1010
- # Final evaluation: Use same samples as initial eval
1011
- # For small datasets (≤50), we already used all samples
1012
- # For large datasets, we used 50 samples with potential for Stage 2
1013
- progress(0.85, desc=f"Evaluating best prompt on {eval_samples} samples...")
1014
  final_eval = evaluate_prompt(
1015
- best_prompt, dataset_name, dataset_split, eval_samples,
1016
  model, input_field, target_field,
1017
  fixed_indices=eval_indices # Use same samples as initial eval!
1018
  )
1019
 
1020
- # Stage 2: Only for large datasets (>50) with high accuracy
1021
- if dataset_size > 50 and final_eval.get('accuracy', 0) > 80.0:
1022
- progress(0.90, desc="Stage 2: Accuracy >80%! Evaluating 50 more samples...")
1023
 
1024
- # Load dataset to get additional samples
1025
- try:
1026
- dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
1027
- except ValueError as e:
1028
- if "config" in str(e).lower():
1029
- default_config = "main"
1030
- if dataset_name.lower() == "glue":
1031
- default_config = "sst2"
1032
- dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
1033
- else:
1034
- raise
1035
-
1036
- # Get 50 additional indices (different from initial 50)
1037
- import random
1038
- random.seed(42)
1039
- all_indices = list(range(len(dataset)))
1040
- remaining_indices = [i for i in all_indices if i not in eval_indices]
1041
-
1042
- if len(remaining_indices) >= 50:
1043
- additional_indices = random.sample(remaining_indices, 50)
1044
-
1045
- # Evaluate on additional 50 samples
1046
- additional_eval = evaluate_prompt(
1047
- best_prompt, dataset_name, dataset_split, 50,
1048
- model, input_field, target_field,
1049
- fixed_indices=additional_indices
1050
- )
1051
-
1052
- # Combine results from both stages
1053
- combined_correct = final_eval['correct'] + additional_eval['correct']
1054
- combined_total = final_eval['total'] + additional_eval['total']
1055
- combined_accuracy = (combined_correct / combined_total * 100) if combined_total > 0 else 0
1056
-
1057
- final_eval = {
1058
- 'accuracy': combined_accuracy,
1059
- 'correct': combined_correct,
1060
- 'total': combined_total,
1061
- 'results': final_eval['results'] + additional_eval['results']
1062
- }
1063
- progress(0.95, desc=f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
1064
  else:
1065
- progress(0.90, desc="Not enough samples for Stage 2, using Stage 1 results")
1066
- elif dataset_size <= 50:
1067
- progress(0.90, desc=f"Complete: Evaluated on all {dataset_size} instances")
1068
  else:
1069
- progress(0.90, desc=f"Stage 1 complete: {final_eval['accuracy']:.1f}% (below 80%, skipping Stage 2)")
1070
 
1071
  final_results = f"""
1072
  ### Evolved Prompt Evaluation
@@ -1094,10 +1070,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
1094
 
1095
  ### Summary
1096
  - **Dataset**: {dataset_name} ({dataset_split} split)
1097
- - **Dataset Size**: {dataset_size} instances
1098
  - **Model**: {model}
1099
- - **Evaluation**: {eval_samples} samples {'(all instances)' if eval_samples == dataset_size else ''}
1100
- - **Evolution Eval**: Staged (50 → 200 if score ≥ 0.5)
 
1101
  - **Iterations**: 10
1102
 
1103
  ### Results
@@ -1133,12 +1109,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
1133
 
1134
  ## How it works:
1135
  1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
1136
- 2. Default dataset is **AIME 2025** (30 hard math competition problems) - tests on challenging problems!
1137
- 3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `stanfordnlp/imdb`)
1138
  4. Choose a free model from OpenRouter
1139
  5. Click "Optimize Prompt" - the system will validate everything first!
1140
  6. Watch the evolution progress in real-time
1141
- 7. Compare initial vs. best prompt side-by-side (uses all instances for small datasets ≤50)!
1142
 
1143
  **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
1144
  """)
@@ -1156,47 +1132,36 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
1156
 
1157
  dataset_name = gr.Textbox(
1158
  label="HuggingFace Dataset (Full Name)",
1159
- value="MathArena/aime_2025",
1160
- placeholder="e.g., MathArena/aime_2025, gsm8k, stanfordnlp/imdb",
1161
- info="Dataset name from HuggingFace Hub. Default: AIME 2025 (30 hard math problems)"
1162
  )
1163
 
1164
  dataset_split = gr.Textbox(
1165
  label="Dataset Split",
1166
- value="train",
1167
  placeholder="e.g., train, test, validation"
1168
  )
1169
 
1170
  input_field = gr.Textbox(
1171
  label="Input Field Name",
1172
- value="problem",
1173
- placeholder="e.g., problem, question, sentence, text",
1174
  info="The field containing inputs to process"
1175
  )
1176
 
1177
  target_field = gr.Textbox(
1178
  label="Target Field Name",
1179
- value="answer",
1180
- placeholder="e.g., answer, label, target",
1181
  info="The field containing expected outputs"
1182
  )
1183
 
1184
  initial_prompt = gr.TextArea(
1185
  label="Initial Prompt",
1186
- value="""You are a math competition expert. Solve this AIME problem using clear reasoning.
1187
-
1188
- Problem: {input}
1189
-
1190
- Instructions:
1191
- 1. Read the problem carefully
1192
- 2. Identify what is being asked
1193
- 3. Break down the solution into steps
1194
- 4. Show your work clearly
1195
- 5. Provide the final numerical answer after ####
1196
-
1197
- Solution:""",
1198
- lines=10,
1199
- info="Use {input} as placeholder for dataset inputs. Chain-of-thought prompting works best for AIME!"
1200
  )
1201
 
1202
  # Button outside the column for better visibility
@@ -1230,11 +1195,11 @@ Solution:""",
1230
 
1231
  | Dataset | Split | Input Field | Target Field | Task | Size |
1232
  |---------|-------|-------------|--------------|------|------|
1233
- | **MathArena/aime_2025** | train | problem | answer | Hard Math (AIME) | 30 |
1234
- | gsm8k | train | question | answer | Grade School Math | 7,473 |
1235
- | stanfordnlp/imdb | test | text | label | Sentiment Analysis | 25,000 |
1236
  | dair-ai/emotion | test | text | label | Emotion Classification | 2,000 |
 
1237
  | fancyzhx/ag_news | test | text | label | News Classification | 7,600 |
 
1238
 
1239
  ### About This Demo Space:
1240
 
@@ -1253,12 +1218,11 @@ Solution:""",
1253
  - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
1254
  4. **Run & Monitor**:
1255
  - All inputs are validated before starting
1256
- - **Adaptive evaluation**:
1257
- - Small datasets (≤50): Uses all instances for accurate results
1258
- - Large datasets (>50): Uses 50 samples → 100 if accuracy >80%
1259
- - Evolution uses staged evaluation (50 → 200 if score ≥ 0.5)
1260
- - Saves API calls and time with smart staged evaluation
1261
- - Compare initial vs best prompt side-by-side
1262
 
1263
  ### About OpenEvolve:
1264
  OpenEvolve is an open-source evolutionary optimization framework. Learn more at:
 
890
  progress(0.15, desc="Creating configuration...")
891
  config_path = create_config_file(model, work_dir)
892
 
893
+ # Run initial evaluation with 50 samples
894
  # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
895
+ progress(0.2, desc="Running initial evaluation on 50 samples...")
896
  initial_eval = evaluate_prompt(
897
+ initial_prompt, dataset_name, dataset_split, 50,
898
  model, input_field, target_field
899
  )
900
 
 
989
  else:
990
  best_prompt = initial_prompt
991
 
992
+ # Two-stage final evaluation: 50 → 200 samples
993
+ # Stage 1: Evaluate on same 50 samples as initial eval
994
+ progress(0.85, desc="Stage 1: Evaluating on 50 samples...")
 
995
  final_eval = evaluate_prompt(
996
+ best_prompt, dataset_name, dataset_split, 50,
997
  model, input_field, target_field,
998
  fixed_indices=eval_indices # Use same samples as initial eval!
999
  )
1000
 
1001
+ # Stage 2: Continue to 200 total samples (add 150 more)
1002
+ progress(0.90, desc="Stage 2: Evaluating 150 more samples (200 total)...")
 
1003
 
1004
+ # Load dataset to get additional samples
1005
+ try:
1006
+ dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
1007
+ except ValueError as e:
1008
+ if "config" in str(e).lower():
1009
+ default_config = "main"
1010
+ if dataset_name.lower() == "glue":
1011
+ default_config = "sst2"
1012
+ dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
1013
  else:
1014
+ raise
1015
+
1016
+ # Get 150 additional indices (different from initial 50)
1017
+ import random
1018
+ random.seed(42)
1019
+ all_indices = list(range(len(dataset)))
1020
+ remaining_indices = [i for i in all_indices if i not in eval_indices]
1021
+
1022
+ if len(remaining_indices) >= 150:
1023
+ additional_indices = random.sample(remaining_indices, 150)
1024
+
1025
+ # Evaluate on additional 150 samples
1026
+ additional_eval = evaluate_prompt(
1027
+ best_prompt, dataset_name, dataset_split, 150,
1028
+ model, input_field, target_field,
1029
+ fixed_indices=additional_indices
1030
+ )
1031
+
1032
+ # Combine results from both stages
1033
+ combined_correct = final_eval['correct'] + additional_eval['correct']
1034
+ combined_total = final_eval['total'] + additional_eval['total']
1035
+ combined_accuracy = (combined_correct / combined_total * 100) if combined_total > 0 else 0
1036
+
1037
+ final_eval = {
1038
+ 'accuracy': combined_accuracy,
1039
+ 'correct': combined_correct,
1040
+ 'total': combined_total,
1041
+ 'results': final_eval['results'] + additional_eval['results']
1042
+ }
1043
+ progress(0.95, desc=f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
1044
  else:
1045
+ progress(0.90, desc=f"Not enough samples for Stage 2, using Stage 1 results ({final_eval['correct']}/{final_eval['total']})")
1046
 
1047
  final_results = f"""
1048
  ### Evolved Prompt Evaluation
 
1070
 
1071
  ### Summary
1072
  - **Dataset**: {dataset_name} ({dataset_split} split)
 
1073
  - **Model**: {model}
1074
+ - **Initial Eval**: 50 samples
1075
+ - **Final Eval**: 50 samples → 200 samples (two-stage)
1076
+ - **Evolution**: Staged (50 → 200 if score ≥ 0.5)
1077
  - **Iterations**: 10
1078
 
1079
  ### Results
 
1109
 
1110
  ## How it works:
1111
  1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
1112
+ 2. Default dataset is **IMDB** (movie review sentiment classification) - great for showing prompt improvement!
1113
+ 3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `dair-ai/emotion`)
1114
  4. Choose a free model from OpenRouter
1115
  5. Click "Optimize Prompt" - the system will validate everything first!
1116
  6. Watch the evolution progress in real-time
1117
+ 7. Compare initial vs. best prompt side-by-side (50 samples → 200 samples for final evaluation)!
1118
 
1119
  **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
1120
  """)
 
1132
 
1133
  dataset_name = gr.Textbox(
1134
  label="HuggingFace Dataset (Full Name)",
1135
+ value="stanfordnlp/imdb",
1136
+ placeholder="e.g., stanfordnlp/imdb, gsm8k, MathArena/aime_2025",
1137
+ info="Dataset name from HuggingFace Hub. Default: IMDB (sentiment classification)"
1138
  )
1139
 
1140
  dataset_split = gr.Textbox(
1141
  label="Dataset Split",
1142
+ value="test",
1143
  placeholder="e.g., train, test, validation"
1144
  )
1145
 
1146
  input_field = gr.Textbox(
1147
  label="Input Field Name",
1148
+ value="text",
1149
+ placeholder="e.g., text, question, sentence",
1150
  info="The field containing inputs to process"
1151
  )
1152
 
1153
  target_field = gr.Textbox(
1154
  label="Target Field Name",
1155
+ value="label",
1156
+ placeholder="e.g., label, answer, target",
1157
  info="The field containing expected outputs"
1158
  )
1159
 
1160
  initial_prompt = gr.TextArea(
1161
  label="Initial Prompt",
1162
+ value="Classify the sentiment of this review as positive or negative.\n\nReview: {input}\n\nSentiment:",
1163
+ lines=5,
1164
+ info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
1165
  )
1166
 
1167
  # Button outside the column for better visibility
 
1195
 
1196
  | Dataset | Split | Input Field | Target Field | Task | Size |
1197
  |---------|-------|-------------|--------------|------|------|
1198
+ | **stanfordnlp/imdb** | test | text | label | Sentiment Analysis | 25,000 |
1199
  | dair-ai/emotion | test | text | label | Emotion Classification | 2,000 |
1200
+ | gsm8k | train | question | answer | Grade School Math | 7,473 |
1201
  | fancyzhx/ag_news | test | text | label | News Classification | 7,600 |
1202
+ | MathArena/aime_2025 | train | problem | answer | Hard Math (AIME) | 30 |
1203
 
1204
  ### About This Demo Space:
1205
 
 
1218
  - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
1219
  4. **Run & Monitor**:
1220
  - All inputs are validated before starting
1221
+ - **Evaluation strategy**:
1222
+ - Initial evaluation: 50 samples (quick baseline)
1223
+ - Final evaluation: 50 → 200 samples (two-stage for accuracy)
1224
+ - Evolution: Staged (50 → 200 if score ≥ 0.5 to save API calls)
1225
+ - Compare initial vs best prompt side-by-side with full results
 
1226
 
1227
  ### About OpenEvolve:
1228
  OpenEvolve is an open-source evolutionary optimization framework. Learn more at:
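
For reference, a minimal sketch of the two-stage final evaluation this commit wires into `optimize_prompt`: Stage 1 re-scores the best prompt on the same 50 baseline indices, and Stage 2 adds 150 disjoint samples before merging the counts. The `two_stage_eval` helper and the exact `evaluate_prompt` signature shown here are illustrative assumptions, not code lifted from app.py:

```python
import random


def two_stage_eval(evaluate_prompt, best_prompt, dataset, eval_indices,
                   dataset_name, dataset_split, model, input_field, target_field):
    """Illustrative only: mirrors the 50 -> 200 sample flow in the diff above."""
    # Stage 1: score the evolved prompt on the same 50 indices as the baseline
    stage1 = evaluate_prompt(best_prompt, dataset_name, dataset_split, 50,
                             model, input_field, target_field,
                             fixed_indices=eval_indices)

    # Stage 2: draw 150 additional indices that do not overlap Stage 1
    random.seed(42)
    remaining = [i for i in range(len(dataset)) if i not in eval_indices]
    if len(remaining) < 150:
        return stage1  # not enough data for Stage 2; keep Stage 1 results

    extra_indices = random.sample(remaining, 150)
    stage2 = evaluate_prompt(best_prompt, dataset_name, dataset_split, 150,
                             model, input_field, target_field,
                             fixed_indices=extra_indices)

    # Merge the two stages into one 200-sample result
    correct = stage1['correct'] + stage2['correct']
    total = stage1['total'] + stage2['total']
    return {
        'accuracy': (correct / total * 100) if total else 0,
        'correct': correct,
        'total': total,
        'results': stage1['results'] + stage2['results'],
    }
```

Seeding the sampler with 42 keeps the extra 150 indices reproducible across runs, matching the `random.seed(42)` call in the committed code.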