Commit eb4ec23
Parent(s): 1afb9ca

Parse judgments with structured output prompting, one response model, one judge model at a time.

Files changed:
- app.py +124 -66
- judging_dataclasses.py +3 -3
- prompts.py +8 -5
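For context, the core pattern this commit switches to is structured-output parsing: each judge's free-text assessment is passed to a second LLM call whose response_format is the Pydantic model DirectAssessmentCriteriaScores, so the reply comes back already validated and parsed. A minimal standalone sketch of that pattern, assuming the app uses the OpenAI Python SDK's beta.chat.completions.parse helper (the field types, model name, and system message below are assumptions; only the field names appear in this diff):

from typing import List
from pydantic import BaseModel
from openai import OpenAI

class DirectAssessmentCriterionScore(BaseModel):
    criterion: str
    score: int          # assumed type; could be a Likert option label instead
    explanation: str

class DirectAssessmentCriteriaScores(BaseModel):
    criteria_scores: List[DirectAssessmentCriterionScore]

client = OpenAI()

def parse_one_judging_response(prompt: str) -> DirectAssessmentCriteriaScores:
    # The SDK constrains the model's output to JSON matching the Pydantic
    # schema and returns it already parsed into the model instance.
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",  # hypothetical; the diff does not show which parsing model is used
        messages=[
            {"role": "system", "content": "Extract the judge's per-criterion scores."},
            {"role": "user", "content": prompt},
        ],
        response_format=DirectAssessmentCriteriaScores,
    )
    return completion.choices[0].message.parsed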
app.py
CHANGED
@@ -17,7 +17,7 @@ from constants import (
 )
 from prompts import *
 from judging_dataclasses import (
-    DirectAssessmentJudgingResponse,
+    # DirectAssessmentJudgingResponse,
     DirectAssessmentCriterionScore,
     DirectAssessmentCriteriaScores,
 )
@@ -191,24 +191,24 @@ def get_llm_response_stream(model_identifier, prompt):
 
 
 def create_dataframe_for_direct_assessment_judging_response(
-    response:
-):
+    response: DirectAssessmentCriteriaScores, judging_model: str
+) -> pd.DataFrame:
     # Initialize empty list to collect data
     data = []
 
     # Loop through models
-    for judging_model in response.judging_models:
-
-
-
-
-
-
-
-
-
-
-
+    # for judging_model in response.judging_models:
+    #     model_name = judging_model.model
+    # Loop through criteria_scores
+    for criteria_score in response.criteria_scores:
+        data.append(
+            {
+                "judging_model": judging_model,  # Gets passed in.
+                "criteria": criteria_score.criterion,
+                "score": criteria_score.score,
+                "explanation": criteria_score.explanation,
+            }
+        )
 
     # Create DataFrame
    return pd.DataFrame(data)
@@ -295,26 +295,29 @@ def get_default_aggregator_prompt(user_prompt, llms):
 
 
 def get_parse_judging_response_for_direct_assessment_prompt(
-
+    judging_response: str,
     criteria_list,
     options,
-):
-    formatted_judging_responses = "\n\n".join(
-
-
-
-
+) -> str:
+    # formatted_judging_responses = "\n\n\n".join(
+    #     [
+    #         f"----- {get_ui_friendly_name(model)} START -----\n\n\n{judging_responses[model]}\n\n\n-----{get_ui_friendly_name(model)} END-----\n\n\n"
+    #         for model in judging_responses.keys()
+    #     ]
+    # )
+    formatted_judging_response = (
+        f"----- START -----\n\n\n{judging_response}\n\n\n----- END -----\n\n\n"
     )
     return PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT.format(
-
+        judging_response=formatted_judging_response,
         criteria_list=format_criteria_list(criteria_list),
         options=format_likert_comparison_options(options),
     )
 
 
-def
-    prompt: str,
-) ->
+def get_parsed_judging_response_obj_using_llm(
+    prompt: str,
+) -> DirectAssessmentCriteriaScores:
     # if os.getenv("DEBUG_MODE") == "True":
     #     return DirectAssessmentJudgingResponse(
     #         judging_models=[
@@ -358,7 +361,7 @@ def parse_judging_responses(
             },
             {"role": "user", "content": prompt},
         ],
-        response_format=
+        response_format=DirectAssessmentCriteriaScores,
     )
     # Track token usage.
     st.session_state["input_token_usage"][
@@ -443,7 +446,7 @@ def plot_overall_scores(overall_scores_df):
         y="mean_score",
         hue="ui_friendly_name",
         data=summary,
-        palette="
+        palette="rainbow",
         capsize=0.1,
         legend=False,
     )
@@ -663,29 +666,76 @@ def st_direct_assessment_results(user_prompt, direct_assessment_prompt, criteria
                 judging_stream
             )
 
-
-
-
+            # Parse the judging response. If parsing results are already cached, then
+            # skip.
+            # Use Structured Output to parse the judging response.
+            parse_judging_response_prompt = get_parse_judging_response_for_direct_assessment_prompt(
+                judging_response=st.session_state.direct_assessment_judging_responses[
+                    response_model
+                ][
+                    judging_model
+                ],
+                criteria_list=criteria_list,
+                options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+            )
+
+            st.write("Parse judging response prompt:")
+            st.write(parse_judging_response_prompt)
+
+            if (
+                response_model
+                not in st.session_state.direct_assessment_judging_by_response_and_judging_model_df
+                or judging_model
+                not in st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                    response_model
+                ]
+            ):
+                parsed_judging_response_obj = (
+                    get_parsed_judging_response_obj_using_llm(
+                        parse_judging_response_prompt
+                    )
+                )
+                st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                    response_model
+                ][
+                    judging_model
+                ] = create_dataframe_for_direct_assessment_judging_response(
+                    parsed_judging_response_obj, judging_model
+                )
+
+            # with st.expander("Structured output parsing response"):
+            st.write("Structured output parsing response:")
+            st.write(
+                st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                    response_model
+                ][
+                    judging_model
+                ]
+            )
+
+        # Combined the dataframes for each judging model into a single dataframe for each
+        # response model.
         if response_model not in st.session_state.direct_assessment_judging_df:
-
-
-
-
-
-
-
-
+            # Combine the dataframes for each judging model into a single dataframe.
+            combined_judging_df = pd.DataFrame()
+            for judging_model in st.session_state.selected_models:
+                combined_judging_df = pd.concat(
+                    [
+                        combined_judging_df,
+                        st.session_state.direct_assessment_judging_by_response_and_judging_model_df[
+                            response_model
+                        ][
+                            judging_model
+                        ],
+                    ]
                )
-            )
-            parsed_judging_responses = parse_judging_responses(
-                parse_judging_response_prompt, judging_responses
-            )
            st.session_state.direct_assessment_judging_df[response_model] = (
-
-                parsed_judging_responses
-            )
+                combined_judging_df
            )
 
+        with st.expander("Judging results from all judges"):
+            st.write(st.session_state.direct_assessment_judging_df[response_model])
+
         # Uses the session state to plot the criteria scores and graphs for a given response
         # model.
         plot_criteria_scores(
@@ -706,13 +756,11 @@ def st_direct_assessment_results(user_prompt, direct_assessment_prompt, criteria
 
     # Save the overall scores to the session state if it's not already there.
     for record in grouped.to_dict(orient="records"):
-
-            response_model
-
-
-
-            record["judging_model"]
-        ] = record["overall_score"]
+        st.session_state.direct_assessment_overall_scores[
+            get_ui_friendly_name(response_model)
+        ][get_ui_friendly_name(record["judging_model"])] = record[
+            "overall_score"
+        ]
 
     overall_score = grouped["overall_score"].mean()
     controversy = grouped["overall_score"].std()
@@ -796,7 +844,14 @@ def main():
     if "direct_assessment_overall_score" not in st.session_state:
         st.session_state.direct_assessment_overall_score = {}
     if "direct_assessment_judging_df" not in st.session_state:
-        st.session_state.direct_assessment_judging_df =
+        st.session_state.direct_assessment_judging_df = {}
+    if (
+        "direct_assessment_judging_by_response_and_judging_model_df"
+        not in st.session_state
+    ):
+        st.session_state.direct_assessment_judging_by_response_and_judging_model_df = defaultdict(
+            dict
+        )
     if "direct_assessment_judging_responses" not in st.session_state:
         st.session_state.direct_assessment_judging_responses = defaultdict(dict)
     if "direct_assessment_overall_scores" not in st.session_state:
@@ -940,19 +995,22 @@
             overall_scores_df["response_model"] = overall_scores_df[
                 "response_model"
             ].apply(get_ui_friendly_name)
-            overall_scores_df["judging_model"] = overall_scores_df[
-
-            ].apply(get_ui_friendly_name)
+            # overall_scores_df["judging_model"] = overall_scores_df[
+            #     "judging_model"
+            # ].apply(get_ui_friendly_name)
 
             with st.expander("Overall scores from all judges"):
+                st.write(st.session_state.direct_assessment_overall_scores)
+                st.dataframe(overall_scores_df_raw)
                 st.dataframe(overall_scores_df)
 
         # All criteria scores.
         with right_column:
             all_scores_df = pd.DataFrame()
-            for
-
-
+            for (
+                response_model,
+                score_df,
+            ) in st.session_state.direct_assessment_judging_df.items():
                 score_df["response_model"] = response_model
                 all_scores_df = pd.concat([all_scores_df, score_df])
             all_scores_df = all_scores_df.reset_index()
@@ -968,12 +1026,12 @@
                     "explanation",
                 ]
             ]
-            all_scores_df["response_model"] = all_scores_df[
-
-            ].apply(get_ui_friendly_name)
-            all_scores_df["judging_model"] = all_scores_df[
-
-            ].apply(get_ui_friendly_name)
+            # all_scores_df["response_model"] = all_scores_df[
+            #     "response_model"
+            # ].apply(get_ui_friendly_name)
+            # all_scores_df["judging_model"] = all_scores_df[
+            #     "judging_model"
+            # ].apply(get_ui_friendly_name)
 
             with st.expander(
                 "Criteria-specific scores and explanations from all judges"
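The app.py changes above cache one parsed DataFrame per (response model, judging model) pair in session state, then concatenate the per-judge frames into a single frame per response model. A simplified sketch of that bookkeeping outside Streamlit (the function names here are illustrative, not taken from the app):

from collections import defaultdict
import pandas as pd

# Outer key: response model; inner key: judging model.
judging_df_cache = defaultdict(dict)

def cache_judging_df(response_model: str, judging_model: str, df: pd.DataFrame) -> None:
    # Only store once per (response_model, judging_model) pair, mirroring the
    # "skip if already cached" check in the diff.
    if judging_model not in judging_df_cache[response_model]:
        judging_df_cache[response_model][judging_model] = df

def combine_for_response(response_model: str, judging_models: list) -> pd.DataFrame:
    # Mirrors the per-response concat over all judges in the diff.
    return pd.concat(
        [judging_df_cache[response_model][jm] for jm in judging_models],
        ignore_index=True,
    )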
judging_dataclasses.py
CHANGED
@@ -35,9 +35,9 @@ class DirectAssessmentCriterionScore(BaseModel):
 
 
 class DirectAssessmentCriteriaScores(BaseModel):
-    model: str
+    # model: str
     criteria_scores: List[DirectAssessmentCriterionScore]
 
 
-class DirectAssessmentJudgingResponse(BaseModel):
-    judging_models: List[DirectAssessmentCriteriaScores]
+# class DirectAssessmentJudgingResponse(BaseModel):
+#     judging_models: List[DirectAssessmentCriteriaScores]
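With the model field dropped and the wrapper class commented out, one parse now yields a single DirectAssessmentCriteriaScores per judge, and the judge's name travels alongside it. A small sketch of how the trimmed model feeds the DataFrame builder in app.py (field types are assumed; only the field names appear in the diff):

from typing import List
from pydantic import BaseModel
import pandas as pd

class DirectAssessmentCriterionScore(BaseModel):
    criterion: str
    score: int        # assumed type
    explanation: str

class DirectAssessmentCriteriaScores(BaseModel):
    criteria_scores: List[DirectAssessmentCriterionScore]

def to_dataframe(response: DirectAssessmentCriteriaScores, judging_model: str) -> pd.DataFrame:
    # Same row shape as create_dataframe_for_direct_assessment_judging_response.
    return pd.DataFrame(
        [
            {
                "judging_model": judging_model,
                "criteria": cs.criterion,
                "score": cs.score,
                "explanation": cs.explanation,
            }
            for cs in response.criteria_scores
        ]
    )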
prompts.py
CHANGED
@@ -1,18 +1,21 @@
 from judging_dataclasses import Criteria
 
 
-PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the
+PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the response from a judge for a direct assessment.
+
+The judge was asked to give a rating for each of the following criteria, along with an explanation:
 
-Each judge was asked to give a rating for each of the following criteria, along with an explanation:
 {criteria_list}
 
 The possible options for each criterion are as follows:
+
 {options}
 
-
-
+Here is the response from the judge:
+
+{judging_response}
 
-Please provide a JSON object
+Please provide a JSON object the scores for each of the criteria, along with any explanation the judge provided.
 """
 
 
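A quick sketch of how the reworked template gets filled in: the helper formatters live in app.py and their exact output is not shown in this diff, so plain strings stand in for them here.

# Hypothetical inputs standing in for format_criteria_list / format_likert_comparison_options output.
criteria_block = "- relevance: How relevant is the response?\n- conciseness: Is the response concise?"
options_block = "1 (strongly disagree) ... 7 (strongly agree)"
judge_text = "Relevance: 6 - mostly on topic.\nConciseness: 5 - a bit wordy."

prompt = PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT.format(
    criteria_list=criteria_block,
    options=options_block,
    judging_response=f"----- START -----\n\n\n{judge_text}\n\n\n----- END -----\n\n\n",
)
print(prompt)  # The filled-in prompt is then sent to the structured-output parsing call.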