Upload app.py
app.py CHANGED
@@ -890,29 +890,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     progress(0.15, desc="Creating configuration...")
     config_path = create_config_file(model, work_dir)
 
-    # Run initial evaluation
-    # For small datasets (like AIME with 30 instances), use all available
-    # For larger datasets, use 50 samples for quick baseline
+    # Run initial evaluation with 50 samples
     # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-
-    # Load dataset to check size
-    try:
-        temp_dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
-    except ValueError as e:
-        if "config" in str(e).lower():
-            default_config = "main"
-            if dataset_name.lower() == "glue":
-                default_config = "sst2"
-            temp_dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
-        else:
-            raise
-
-    dataset_size = len(temp_dataset)
-    eval_samples = min(dataset_size, 50)  # Use all if dataset has ≤50, else use 50
-
-    progress(0.2, desc=f"Running initial evaluation on {eval_samples} samples (dataset has {dataset_size} total)...")
+    progress(0.2, desc="Running initial evaluation on 50 samples...")
     initial_eval = evaluate_prompt(
-        initial_prompt, dataset_name, dataset_split,
+        initial_prompt, dataset_name, dataset_split, 50,
         model, input_field, target_field
     )
 
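The hunk above fixes the initial evaluation budget at 50 samples and notes that the sampled indices are saved for reuse. A minimal sketch of how a reproducible index set could be drawn (the helper name `select_eval_indices` and the seed value are assumptions, not code from app.py):

```python
import random

def select_eval_indices(dataset_size: int, n_samples: int = 50, seed: int = 42) -> list[int]:
    """Pick a reproducible subset of dataset indices to evaluate on."""
    rng = random.Random(seed)
    return sorted(rng.sample(range(dataset_size), min(n_samples, dataset_size)))

# Drawing the indices once and passing them to both the initial and the final
# evaluation keeps the before/after accuracy numbers directly comparable.
eval_indices = select_eval_indices(dataset_size=25_000, n_samples=50)
```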
@@ -1007,66 +989,60 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
     else:
         best_prompt = initial_prompt
 
-    #
-    #
-
-    progress(0.85, desc=f"Evaluating best prompt on {eval_samples} samples...")
+    # Two-stage final evaluation: 50 → 200 samples
+    # Stage 1: Evaluate on same 50 samples as initial eval
+    progress(0.85, desc="Stage 1: Evaluating on 50 samples...")
     final_eval = evaluate_prompt(
-        best_prompt, dataset_name, dataset_split,
+        best_prompt, dataset_name, dataset_split, 50,
         model, input_field, target_field,
         fixed_indices=eval_indices  # Use same samples as initial eval!
     )
 
-    # Stage 2:
-
-    progress(0.90, desc="Stage 2: Accuracy >80%! Evaluating 50 more samples...")
+    # Stage 2: Continue to 200 total samples (add 150 more)
+    progress(0.90, desc="Stage 2: Evaluating 150 more samples (200 total)...")
 
-        # Get 50 additional indices (different from initial 50)
-        import random
-        random.seed(42)
-        all_indices = list(range(len(dataset)))
-        remaining_indices = [i for i in all_indices if i not in eval_indices]
-
-        if len(remaining_indices) >= 50:
-            additional_indices = random.sample(remaining_indices, 50)
-
-            # Evaluate on additional 50 samples
-            additional_eval = evaluate_prompt(
-                best_prompt, dataset_name, dataset_split, 50,
-                model, input_field, target_field,
-                fixed_indices=additional_indices
-            )
-
-            # Combine results from both stages
-            combined_correct = final_eval['correct'] + additional_eval['correct']
-            combined_total = final_eval['total'] + additional_eval['total']
-            combined_accuracy = (combined_correct / combined_total * 100) if combined_total > 0 else 0
-
-            final_eval = {
-                'accuracy': combined_accuracy,
-                'correct': combined_correct,
-                'total': combined_total,
-                'results': final_eval['results'] + additional_eval['results']
-            }
-            progress(0.95, desc=f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
+    # Load dataset to get additional samples
+    try:
+        dataset = load_dataset(dataset_name, split=dataset_split, streaming=False)
+    except ValueError as e:
+        if "config" in str(e).lower():
+            default_config = "main"
+            if dataset_name.lower() == "glue":
+                default_config = "sst2"
+            dataset = load_dataset(dataset_name, default_config, split=dataset_split, streaming=False)
+        else:
+            raise
+
+    # Get 150 additional indices (different from initial 50)
+    import random
+    random.seed(42)
+    all_indices = list(range(len(dataset)))
+    remaining_indices = [i for i in all_indices if i not in eval_indices]
+
+    if len(remaining_indices) >= 150:
+        additional_indices = random.sample(remaining_indices, 150)
+
+        # Evaluate on additional 150 samples
+        additional_eval = evaluate_prompt(
+            best_prompt, dataset_name, dataset_split, 150,
+            model, input_field, target_field,
+            fixed_indices=additional_indices
+        )
+
+        # Combine results from both stages
+        combined_correct = final_eval['correct'] + additional_eval['correct']
+        combined_total = final_eval['total'] + additional_eval['total']
+        combined_accuracy = (combined_correct / combined_total * 100) if combined_total > 0 else 0
+
+        final_eval = {
+            'accuracy': combined_accuracy,
+            'correct': combined_correct,
+            'total': combined_total,
+            'results': final_eval['results'] + additional_eval['results']
+        }
+        progress(0.95, desc=f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
     else:
-        progress(0.90, desc=f"Stage 1
+        progress(0.90, desc=f"Not enough samples for Stage 2, using Stage 1 results ({final_eval['correct']}/{final_eval['total']})")
 
     final_results = f"""
 ### Evolved Prompt Evaluation
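To make the two-stage arithmetic in the hunk above concrete, here is the combine step run on illustrative numbers (42/50 and 120/150 are invented, not results from the app):

```python
# Hypothetical stage results, shaped like the evaluate_prompt result dicts above.
stage1 = {"correct": 42, "total": 50}     # Stage 1: the 50 fixed samples
stage2 = {"correct": 120, "total": 150}   # Stage 2: the 150 additional samples

combined_correct = stage1["correct"] + stage2["correct"]      # 162
combined_total = stage1["total"] + stage2["total"]            # 200
combined_accuracy = combined_correct / combined_total * 100   # 81.0

print(f"Stage 2 complete: {combined_correct}/{combined_total} = {combined_accuracy:.1f}%")
```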
@@ -1094,10 +1070,10 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
 
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
-- **Dataset Size**: {dataset_size} instances
 - **Model**: {model}
-- **
-- **
+- **Initial Eval**: 50 samples
+- **Final Eval**: 50 samples → 200 samples (two-stage)
+- **Evolution**: Staged (50 → 200 if score ≥ 0.5)
 - **Iterations**: 10
 
 ### Results
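The "Staged (50 → 200 if score ≥ 0.5)" summary line describes a gate: spend the larger evaluation budget only on candidates that clear a threshold on the cheap pass. A hedged sketch of that idea, with `evaluate_on` standing in for whatever scoring callable the evolution loop uses:

```python
def staged_score(prompt: str, evaluate_on, threshold: float = 0.5) -> float:
    """Two-budget evaluation: a cheap 50-sample screen, then 200 samples if promising."""
    quick = evaluate_on(prompt, n_samples=50)    # cheap screening pass
    if quick < threshold:
        return quick                             # weak candidate: stop early, save API calls
    return evaluate_on(prompt, n_samples=200)    # promising candidate: full-budget score
```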
@@ -1133,12 +1109,12 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
 ## How it works:
 1. Enter an initial prompt (use `{input}` as a placeholder for dataset inputs)
-2. Default dataset is **
-3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `
+2. Default dataset is **IMDB** (movie review sentiment classification) - great for showing prompt improvement!
+3. Specify the dataset split and field names (or use other datasets like `gsm8k`, `dair-ai/emotion`)
 4. Choose a free model from OpenRouter
 5. Click "Optimize Prompt" - the system will validate everything first!
 6. Watch the evolution progress in real-time
-7. Compare initial vs. best prompt side-by-side (
+7. Compare initial vs. best prompt side-by-side (50 samples → 200 samples for final evaluation)!
 
 **Note**: API key is read from `OPENAI_API_KEY` environment variable (set in Space secrets)
 """)
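Step 1's `{input}` placeholder is a literal substring that gets swapped for each dataset row's input field. A small sketch (the review text is invented; using `str.replace` rather than `str.format` is one safe choice, since real reviews may contain braces):

```python
prompt_template = (
    "Classify the sentiment of this review as positive or negative.\n\n"
    "Review: {input}\n\nSentiment:"
)
row = {"text": "A slow start, but the last act is genuinely moving.", "label": 1}

# str.replace sidesteps str.format's KeyError when a review itself contains braces.
filled_prompt = prompt_template.replace("{input}", row["text"])
print(filled_prompt)
```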
@@ -1156,47 +1132,36 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
 
             dataset_name = gr.Textbox(
                 label="HuggingFace Dataset (Full Name)",
-                value="
-                placeholder="e.g.,
-                info="Dataset name from HuggingFace Hub. Default:
+                value="stanfordnlp/imdb",
+                placeholder="e.g., stanfordnlp/imdb, gsm8k, MathArena/aime_2025",
+                info="Dataset name from HuggingFace Hub. Default: IMDB (sentiment classification)"
             )
 
             dataset_split = gr.Textbox(
                 label="Dataset Split",
-                value="
+                value="test",
                 placeholder="e.g., train, test, validation"
             )
 
             input_field = gr.Textbox(
                 label="Input Field Name",
-                value="
-                placeholder="e.g.,
+                value="text",
+                placeholder="e.g., text, question, sentence",
                 info="The field containing inputs to process"
             )
 
             target_field = gr.Textbox(
                 label="Target Field Name",
-                value="
-                placeholder="e.g.,
+                value="label",
+                placeholder="e.g., label, answer, target",
                 info="The field containing expected outputs"
             )
 
             initial_prompt = gr.TextArea(
                 label="Initial Prompt",
-                value="
-
-
-
-Instructions:
-1. Read the problem carefully
-2. Identify what is being asked
-3. Break down the solution into steps
-4. Show your work clearly
-5. Provide the final numerical answer after ####
-
-Solution:""",
-                lines=10,
-                info="Use {input} as placeholder for dataset inputs. Chain-of-thought prompting works best for AIME!"
+                value="Classify the sentiment of this review as positive or negative.\n\nReview: {input}\n\nSentiment:",
+                lines=5,
+                info="Use {input} as placeholder for dataset inputs. Start simple - evolution will improve it!"
             )
 
             # Button outside the column for better visibility
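Because the split and field names above are free-form text, a pre-flight check along these lines is worth running before any API calls. A minimal sketch assuming the `datasets` library and a public dataset (the helper name is hypothetical):

```python
from datasets import load_dataset

def validate_dataset_fields(dataset_name: str, split: str, input_field: str, target_field: str) -> None:
    """Fail fast if the requested split or field names do not exist."""
    ds = load_dataset(dataset_name, split=split, streaming=True)
    first_row = next(iter(ds))
    missing = [field for field in (input_field, target_field) if field not in first_row]
    if missing:
        raise ValueError(f"{dataset_name}[{split}] is missing fields: {missing}")

validate_dataset_fields("stanfordnlp/imdb", "test", "text", "label")
```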
@@ -1230,11 +1195,11 @@ Solution:""",
 
 | Dataset | Split | Input Field | Target Field | Task | Size |
 |---------|-------|-------------|--------------|------|------|
-| **
-| gsm8k | train | question | answer | Grade School Math | 7,473 |
-| stanfordnlp/imdb | test | text | label | Sentiment Analysis | 25,000 |
+| **stanfordnlp/imdb** | test | text | label | Sentiment Analysis | 25,000 |
 | dair-ai/emotion | test | text | label | Emotion Classification | 2,000 |
+| gsm8k | train | question | answer | Grade School Math | 7,473 |
 | fancyzhx/ag_news | test | text | label | News Classification | 7,600 |
+| MathArena/aime_2025 | train | problem | answer | Hard Math (AIME) | 30 |
 
 ### About This Demo Space:
 
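The table above pairs each suggested dataset with the field names the form expects. Expressed as data, the same mapping might look like this (values copied from the table; the dictionary itself is illustrative, not part of app.py):

```python
# Field mapping for the suggested datasets (values taken from the table above).
SUGGESTED_DATASETS = {
    "stanfordnlp/imdb":    {"split": "test",  "input_field": "text",     "target_field": "label"},
    "dair-ai/emotion":     {"split": "test",  "input_field": "text",     "target_field": "label"},
    "gsm8k":               {"split": "train", "input_field": "question", "target_field": "answer"},
    "fancyzhx/ag_news":    {"split": "test",  "input_field": "text",     "target_field": "label"},
    "MathArena/aime_2025": {"split": "train", "input_field": "problem",  "target_field": "answer"},
}
```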
@@ -1253,12 +1218,11 @@ Solution:""",
    - Model: Choose from 5 curated free models (larger models = better results but slower/rate-limited)
 4. **Run & Monitor**:
    - All inputs are validated before starting
-   - **
-   - Compare initial vs best prompt side-by-side
+   - **Evaluation strategy**:
+     - Initial evaluation: 50 samples (quick baseline)
+     - Final evaluation: 50 → 200 samples (two-stage for accuracy)
+     - Evolution: Staged (50 → 200 if score ≥ 0.5 to save API calls)
+   - Compare initial vs best prompt side-by-side with full results
 
 ### About OpenEvolve:
 OpenEvolve is an open-source evolutionary optimization framework. Learn more at: