Commit eb4ec23: Parse judgments with structured output prompting, one response model, one judge model at a time.

from judging_dataclasses import Criteria

PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the response from a judge for a direct assessment.
The judge was asked to give a rating for each of the following criteria, along with an explanation:
{criteria_list}
The possible options for each criterion are as follows:
{options}
Here is the response from the judge:
{judging_response}
Please provide a JSON object with the scores for each of the criteria, along with any explanation the judge provided.
"""

DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
Responses from other LLMs:
{responses_from_other_llms}
Consider how you would combine the best aspects of the responses above into a single response.
Directly provide your response to the user's query as if you were the original LLM. Do not mention that you are synthesizing the responses from other LLMs.
"""

DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
The response is as follows:
[RESPONSE START]
{response}
[RESPONSE END]
Please evaluate the quality of the response based on the following criteria:
{criteria_list}
Options:
{options}
For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""

DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
    Criteria(
        name="helpfulness",
        description="Provides meaningful information and clear solutions that address the query.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="relevance",
        description="Stays on topic and directly relates to the query without unnecessary details.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="conciseness",
        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
        min_score=1,
        max_score=7,
    ),
]

# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Neither Agree Nor Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]

# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]

# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
    "Strongly Agree",
]

# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Agree",
    "Strongly Agree",
]

# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
]

# 2-point Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Agree",
]

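
# Illustrative sketch, not part of the original file: rendering
# DEFAULT_DIRECT_ASSESSMENT_PROMPT with the default criteria and the 7-point
# scale. The bullet-list formatting of criteria and options is an assumption.
def example_render_direct_assessment_prompt(user_prompt, response):
    criteria_list = "\n".join(
        f"- {c.name}: {c.description}"
        for c in DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
    )
    options = ", ".join(SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS)
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt=user_prompt,
        response=response,
        criteria_list=criteria_list,
        options=options,
    )
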
DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
[USER PROMPT START]
{prompt}
[USER PROMPT END]
[RESPONSE A START]
{first_completion}
[RESPONSE A END]
[RESPONSE B START]
{second_completion}
[RESPONSE B END]
Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
After providing your explanation, output your final verdict as one of the following options:
{pairwise_comparison_options}
"""

DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
    (
        "helpfulness",
        "Provides meaningful information and clear solutions that address the query.",
    ),
    (
        "relevance",
        "Stays on topic and directly relates to the query without unnecessary details.",
    ),
    (
        "conciseness",
        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
    ),
]

# Coarse-grained verdict options, with tie.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
    ("A>B", "Response A is better than Response B"),
    ("B>A", "Response B is better than Response A"),
    ("A=B", "Both responses are equally good"),
]
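
# Illustrative sketch, not part of the original file: rendering
# DEFAULT_PAIRWISE_COMPARISON_PROMPT with the default themes and verdict
# options. The exact theme/option formatting is an assumption.
def example_render_pairwise_comparison_prompt(prompt, first_completion, second_completion):
    themes_to_consider = "; ".join(
        f"{name} ({description})"
        for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
    )
    pairwise_comparison_options = "\n".join(
        f'"{label}": {description}'
        for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
    )
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt=prompt,
        first_completion=first_completion,
        second_completion=second_completion,
        themes_to_consider=themes_to_consider,
        pairwise_comparison_options=pairwise_comparison_options,
    )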