Spaces:

llm-council
/

sandbox

Running

App Files Files Community

justinxzhao committed on Oct 1, 2024

Commit

6fae7e2

1 Parent(s): 16d72cb

Added general rendering of chats so that they don't disappear during app saving.

Browse files

Files changed (4) hide show

.gitignore +2 -1
app.py +455 -340
constants.py +50 -18
img/qwen.webp +0 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 env/
 client_secret.json
-__pycache__

 env/
 client_secret.json
+__pycache__
+.env

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import anthropic
 from together import Together
 import google.generativeai as genai
 import time
 from typing import List, Optional, Literal, Union, Dict
 from constants import (
     LLM_COUNCIL_MEMBERS,
@@ -51,7 +52,7 @@ anthropic_client = anthropic.Anthropic()
 client = OpenAI()
-def anthropic_streamlit_streamer(stream):
     """
     Process the Anthropic streaming response and yield content from the deltas.
@@ -67,6 +68,18 @@ def anthropic_streamlit_streamer(stream):
                 if text_delta:
                     yield text_delta
             # Handle message completion events (optional if needed)
             elif event.type == "message_stop":
                 break  # End of message, stop streaming
@@ -83,22 +96,34 @@ def get_ui_friendly_name(llm):
 def google_streamlit_streamer(stream):
     for chunk in stream:
         yield chunk.text
-def together_streamlit_streamer(stream):
     for chunk in stream:
         yield chunk.choices[0].delta.content
 def llm_streamlit_streamer(stream, llm):
     if llm.startswith("anthropic"):
-        return anthropic_streamlit_streamer(stream)
     elif llm.startswith("vertex"):
         return google_streamlit_streamer(stream)
     elif llm.startswith("together"):
-        return together_streamlit_streamer(stream)
 # Helper functions for LLM council and aggregator selection
@@ -152,9 +177,13 @@ def get_llm_response_stream(model_identifier, prompt):
     if provider == "openai":
         return get_openai_response(model_name, prompt)
     elif provider == "anthropic":
-        return anthropic_streamlit_streamer(get_anthropic_response(model_name, prompt))
     elif provider == "together":
-        return together_streamlit_streamer(get_together_response(model_name, prompt))
     elif provider == "vertex":
         return google_streamlit_streamer(get_google_response(model_name, prompt))
     else:
@@ -174,7 +203,7 @@ def create_dataframe_for_direct_assessment_judging_response(
         for criteria_score in judging_model.criteria_scores:
             data.append(
                 {
-                    "llm_judge_model": model_name,
                     "criteria": criteria_score.criterion,
                     "score": criteria_score.score,
                     "explanation": criteria_score.explanation,
@@ -283,58 +312,62 @@ def get_parse_judging_response_for_direct_assessment_prompt(
     )
-DEBUG_MODE = True
 def parse_judging_responses(
     prompt: str, judging_responses: dict[str, str]
 ) -> DirectAssessmentJudgingResponse:
-    if DEBUG_MODE:
-        return DirectAssessmentJudgingResponse(
-            judging_models=[
-                DirectAssessmentCriteriaScores(
-                    model="together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
-                    criteria_scores=[
-                        DirectAssessmentCriterionScore(
-                            criterion="helpfulness", score=3, explanation="explanation1"
-                        ),
-                        DirectAssessmentCriterionScore(
-                            criterion="conciseness", score=4, explanation="explanation2"
-                        ),
-                        DirectAssessmentCriterionScore(
-                            criterion="relevance", score=5, explanation="explanation3"
-                        ),
-                    ],
-                ),
-                DirectAssessmentCriteriaScores(
-                    model="together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
-                    criteria_scores=[
-                        DirectAssessmentCriterionScore(
-                            criterion="helpfulness", score=1, explanation="explanation1"
-                        ),
-                        DirectAssessmentCriterionScore(
-                            criterion="conciseness", score=2, explanation="explanation2"
-                        ),
-                        DirectAssessmentCriterionScore(
-                            criterion="relevance", score=3, explanation="explanation3"
-                        ),
-                    ],
-                ),
-            ]
-        )
     else:
-        completion = client.beta.chat.completions.parse(
-            model="gpt-4o-mini",
-            messages=[
-                {
-                    "role": "system",
-                    "content": "Parse the judging responses into structured data.",
-                },
-                {"role": "user", "content": prompt},
-            ],
-            response_format=DirectAssessmentJudgingResponse,
-        )
-        return completion.choices[0].message.parsed
 def plot_criteria_scores(df):
@@ -401,11 +434,11 @@ def plot_overall_scores(overall_scores_df):
     ax = sns.barplot(
         x="ui_friendly_name",
         y="mean_score",
-        hue="ui_friendly_name",  # Add this line
         data=summary,
         palette="prism",
         capsize=0.1,
-        legend=False,  # Add this line
     )
     # Add error bars manually
@@ -420,15 +453,20 @@ def plot_overall_scores(overall_scores_df):
         zorder=10,  # Ensure error bars are on top
     )
-    # Add text annotations
-    for i, row in summary.iterrows():
         ax.text(
-            i,
-            row["mean_score"],
-            f"{row['mean_score']:.2f}",
             ha="center",
             va="bottom",
-            fontweight="bold",
             color="black",
             bbox=dict(facecolor="white", edgecolor="none", alpha=0.7, pad=0.5),
         )
@@ -446,23 +484,24 @@ def plot_overall_scores(overall_scores_df):
 def plot_per_judge_overall_scores(df):
     # Find the overall score by finding the overall score for each judge, and then averaging
     # over all judges.
-    grouped = df.groupby(["llm_judge_model"]).agg({"score": ["mean"]}).reset_index()
-    grouped.columns = ["llm_judge_model", "overall_score"]
     # Create the horizontal bar plot
     plt.figure(figsize=(10, 6))
     ax = sns.barplot(
         data=grouped,
-        y="llm_judge_model",
-        x="overall_score",
-        hue="llm_judge_model",
-        orient="h",
     )
     # Customize the plot
-    plt.title("Overall Scores by LLM Judge Model")
     plt.xlabel("Overall Score")
-    plt.ylabel("LLM Judge Model")
     # Adjust layout and display the plot
     plt.tight_layout()
@@ -510,41 +549,63 @@ def main():
     cols = st.columns([2, 1, 2])
     if not st.session_state.authenticated:
         with cols[1]:
-            password = st.text_input("Password", type="password")
-            if st.button("Login", use_container_width=True):
-                if password == PASSWORD:
-                    st.session_state.authenticated = True
-                else:
-                    st.error("Invalid credentials")
     if st.session_state.authenticated:
-        # cols[1].success("Logged in successfully!")
-        st.markdown("#### LLM Council Member Selection")
-        # Council and aggregator selection
-        selected_models = llm_council_selector()
-        # st.write("Selected Models:", selected_models)
-        selected_aggregator = aggregator_selector()
         # Initialize session state for collecting responses.
         if "responses" not in st.session_state:
-            st.session_state.responses = {}
-        # if "aggregator_response" not in st.session_state:
-        # st.session_state.aggregator_response = {}
-        # Prompt input
-        st.markdown("#### Enter your prompt")
-        _, center_column, _ = st.columns([3, 5, 3])
-        with center_column:
-            user_prompt = st.text_area(
-                "Enter your prompt", value="Say 'Hello World'", key="user_prompt"
-            )
-        if center_column.button("Submit", use_container_width=True):
             st.markdown("#### Responses")
             response_columns = st.columns(3)
             selected_models_to_streamlit_column_map = {
@@ -552,7 +613,7 @@ def main():
             }
             # Fetching and streaming responses from each selected model
-            for selected_model in selected_models:
                 with selected_models_to_streamlit_column_map[selected_model]:
                     st.write(get_ui_friendly_name(selected_model))
                     with st.chat_message(
@@ -571,11 +632,8 @@ def main():
                 user_prompt=user_prompt, llms=selected_models
             )
-            with st.expander("Aggregator Prompt"):
-                st.code(aggregator_prompt)
             # Fetching and streaming response from the aggregator
-            st.write(f"Mixture-of-Agents ({get_ui_friendly_name(selected_aggregator)})")
             with st.chat_message(
                 selected_aggregator,
                 avatar="img/council_icon.png",
@@ -589,272 +647,329 @@ def main():
                         message_placeholder.write_stream(aggregator_stream)
                     )
-        # st.write("Responses (in session state):")
-        # st.write(st.session_state["responses"])
         # Judging.
-        st.markdown("#### Judging Configuration")
-        # Choose the type of assessment
-        assessment_type = st.radio(
-            "Select the type of assessment",
-            options=["Direct Assessment", "Pairwise Comparison"],
-        )
-        _, center_column, _ = st.columns([3, 5, 3])
-        # Depending on the assessment type, render different forms
-        if assessment_type == "Direct Assessment":
-            # Initialize session state for direct assessment.
-            if "direct_assessment_overall_score" not in st.session_state:
-                st.session_state["direct_assessment_overall_score"] = {}
-            if "direct_assessment_judging_df" not in st.session_state:
-                st.session_state["direct_assessment_judging_df"] = {}
-                for response_model in selected_models:
                     st.session_state["direct_assessment_judging_df"][
-                        response_model
                     ] = {}
-                # aggregator model
-                st.session_state["direct_assessment_judging_df"][
-                    "agg__" + selected_aggregator
-                ] = {}
-            if "direct_assessment_judging_responses" not in st.session_state:
-                st.session_state["direct_assessment_judging_responses"] = {}
-                for response_model in selected_models:
                     st.session_state["direct_assessment_judging_responses"][
-                        response_model
                     ] = {}
-                # aggregator model
-                st.session_state["direct_assessment_judging_responses"][
-                    "agg__" + selected_aggregator
-                ] = {}
-            if "direct_assessment_overall_scores" not in st.session_state:
-                st.session_state["direct_assessment_overall_scores"] = {}
-                for response_model in selected_models:
                     st.session_state["direct_assessment_overall_scores"][
-                        response_model
                     ] = {}
-                st.session_state["direct_assessment_overall_scores"][
-                    "agg__" + selected_aggregator
-                ] = {}
-            if "judging_status" not in st.session_state:
-                st.session_state["judging_status"] = "incomplete"
-            # Direct assessment prompt.
-            with center_column.expander("Direct Assessment Prompt"):
-                direct_assessment_prompt = st.text_area(
-                    "Prompt for the Direct Assessment",
-                    value=get_default_direct_assessment_prompt(user_prompt=user_prompt),
-                    height=500,
-                    key="direct_assessment_prompt",
-                )
-            # TODO: Add option to edit criteria list with a basic text field.
-            criteria_list = DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
-            # Create DirectAssessment object when form is submitted
-            if center_column.button(
-                "Submit Direct Assessment", use_container_width=True
-            ):
-                # Submit direct asssessment.
-                responses_for_judging = st.session_state["responses"]
-                # st.write("Responses for judging (in session state):")
-                # st.write(responses_for_judging)
-                response_judging_columns = st.columns(3)
-                responses_for_judging_to_streamlit_column_map = {
-                    model: response_judging_columns[i % 3]
-                    for i, model in enumerate(responses_for_judging.keys())
-                }
-                # Get judging responses.
-                for response_model, response in responses_for_judging.items():
-                    st_column = responses_for_judging_to_streamlit_column_map[
-                        response_model
-                    ]
-                    with st_column:
-                        if "agg__" in response_model:
-                            judging_model_header = "Mixture-of-Agents Response"
-                        else:
-                            judging_model_header = get_ui_friendly_name(response_model)
-                        st.write(f"Judging for {judging_model_header}")
-                        # st.write("Response being judged: ")
-                        # st.write(response)
-                        judging_prompt = get_direct_assessment_prompt(
-                            direct_assessment_prompt=direct_assessment_prompt,
-                            user_prompt=user_prompt,
-                            response=response,
-                            criteria_list=criteria_list,
-                            options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
-                        )
-                        with st.expander("Final Judging Prompt"):
-                            st.code(judging_prompt)
-                        for judging_model in selected_models:
-                            with st.expander(
-                                get_ui_friendly_name(judging_model), expanded=False
-                            ):
-                                with st.chat_message(
-                                    judging_model,
-                                    avatar=PROVIDER_TO_AVATAR_MAP[judging_model],
                                 ):
-                                    message_placeholder = st.empty()
-                                    judging_stream = get_llm_response_stream(
-                                        judging_model, judging_prompt
-                                    )
-                                    # if judging_stream:
-                                    st.session_state[
-                                        "direct_assessment_judging_responses"
-                                    ][response_model][
-                                        judging_model
-                                    ] = message_placeholder.write_stream(
-                                        judging_stream
-                                    )
-                        # When all of the judging is finished for the given response, get the actual
-                        # values, parsed (use gpt-4o-mini for now) with json mode.
-                        # TODO.
-                        judging_responses = st.session_state[
-                            "direct_assessment_judging_responses"
-                        ][response_model]
-                        # st.write("Judging responses (in session state):")
-                        # st.write(judging_responses)
-                        if not judging_responses:
-                            st.error(f"No judging responses for {response_model}")
-                            quit()
-                        parse_judging_response_prompt = (
-                            get_parse_judging_response_for_direct_assessment_prompt(
-                                judging_responses,
-                                criteria_list,
-                                SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
                             )
-                        )
-                        with st.expander("Parse Judging Response Prompt"):
-                            st.code(parse_judging_response_prompt)
-                        # Issue the prompt to openai mini with structured outputs
-                        parsed_judging_responses = parse_judging_responses(
-                            parse_judging_response_prompt, judging_responses
-                        )
-                        st.session_state["direct_assessment_judging_df"][
-                            response_model
-                        ] = create_dataframe_for_direct_assessment_judging_response(
-                            parsed_judging_responses
-                        )
-                        st.write(
-                            st.session_state["direct_assessment_judging_df"][
-                                response_model
-                            ]
-                        )
-                        plot_criteria_scores(
                             st.session_state["direct_assessment_judging_df"][
                                 response_model
-                            ]
-                        )
-                        # Find the overall score by finding the overall score for each judge, and then averaging
-                        # over all judges.
-                        plot_per_judge_overall_scores(
-                            st.session_state["direct_assessment_judging_df"][
-                                response_model
-                            ]
-                        )
-                        grouped = (
-                            st.session_state["direct_assessment_judging_df"][
-                                response_model
-                            ]
-                            .groupby(["llm_judge_model"])
-                            .agg({"score": ["mean"]})
-                            .reset_index()
-                        )
-                        grouped.columns = ["llm_judge_model", "overall_score"]
-                        # st.write(
-                        #     "Extracting overall scores from this grouped dataframe:"
-                        # )
-                        # st.write(grouped)
-                        # Save the overall scores to the session state.
-                        for record in grouped.to_dict(orient="records"):
-                            st.session_state["direct_assessment_overall_scores"][
-                                response_model
-                            ][record["llm_judge_model"]] = record["overall_score"]
-                        overall_score = grouped["overall_score"].mean()
-                        controversy = grouped["overall_score"].std()
-                        st.write(f"Overall Score: {overall_score:.2f}")
-                        st.write(f"Controversy: {controversy:.2f}")
-                st.session_state["judging_status"] = "complete"
-            # Judging is complete.
-            st.write("#### Results")
-            # The session state now contains the overall scores for each response from each judge.
-            if st.session_state["judging_status"] == "complete":
-                overall_scores_df_raw = pd.DataFrame(
-                    st.session_state["direct_assessment_overall_scores"]
-                ).reset_index()
-                overall_scores_df = pd.melt(
-                    overall_scores_df_raw,
-                    id_vars=["index"],
-                    var_name="response_model",
-                    value_name="score",
-                ).rename(columns={"index": "judging_model"})
-                # Print the overall winner.
-                overall_winner = overall_scores_df.loc[
-                    overall_scores_df["score"].idxmax()
-                ]
-                st.write(
-                    f"**Overall Winner:** {get_ui_friendly_name(overall_winner['response_model'])}"
-                )
-                # Find how much the standard deviation overlaps with other models.
-                # Calculate separability.
-                # TODO.
-                st.write(f"**Confidence:** {overall_winner['score']:.2f}")
-                left_column, right_column = st.columns([1, 1])
-                with left_column:
-                    plot_overall_scores(overall_scores_df)
-                with right_column:
-                    st.dataframe(overall_scores_df)
-        elif assessment_type == "Pairwise Comparison":
-            pass
-            # pairwise_comparison_prompt = st.text_area(
-            #     "Prompt for the Pairwise Comparison"
-            # )
-            # granularity = st.selectbox("Granularity", ["coarse", "fine", "super fine"])
-            # ties_allowed = st.checkbox("Are ties allowed?")
-            # position_swapping = st.checkbox("Enable position swapping?")
-            # reference_model = st.text_input("Reference Model")
-            # # Create PairwiseComparison object when form is submitted
-            # if st.button("Submit Pairwise Comparison"):
-            #     pairwise_comparison_config = PairwiseComparison(
-            #         type="pairwise_comparison",
-            #         granularity=granularity,
-            #         ties_allowed=ties_allowed,
-            #         position_swapping=position_swapping,
-            #         reference_model=reference_model,
-            #         prompt=prompt,
-            #     )
-            #     st.success(f"Pairwise Comparison Created: {pairwise_comparison_config}")
-            #     # Submit pairwise comparison.
-            #     responses_for_judging = st.session_state["responses"]
     else:
         with cols[1]:

 from together import Together
 import google.generativeai as genai
 import time
+from collections import defaultdict
 from typing import List, Optional, Literal, Union, Dict
 from constants import (
     LLM_COUNCIL_MEMBERS,
 client = OpenAI()
+def anthropic_streamlit_streamer(stream, llm):
     """
     Process the Anthropic streaming response and yield content from the deltas.
                 if text_delta:
                     yield text_delta
+            # Count input token usage.
+            if event.type == "message_start":
+                input_token_usage = event["usage"]["input_tokens"]
+                output_token_usage = event["usage"]["output_tokens"]
+                st.session_state["input_token_usage"][llm] += input_token_usage
+                st.session_state["output_token_usage"][llm] += output_token_usage
+            # Count output token usage.
+            if event.type == "message_delta":
+                output_token_usage = event["usage"]["output_tokens"]
+                st.session_state["output_token_usage"][llm] += output_token_usage
             # Handle message completion events (optional if needed)
             elif event.type == "message_stop":
                 break  # End of message, stop streaming
 def google_streamlit_streamer(stream):
+    # TODO: Count token usage.
     for chunk in stream:
         yield chunk.text
+def together_streamlit_streamer(stream, llm):
+    # https://docs.together.ai/docs/chat-overview#streaming-responses
     for chunk in stream:
+        if chunk.usage:
+            st.session_state["input_token_usage"][llm] += chunk.usage.prompt_tokens
+        if chunk.usage:
+            st.session_state["output_token_usage"][llm] += chunk.usage.completion_tokens
         yield chunk.choices[0].delta.content
 def llm_streamlit_streamer(stream, llm):
     if llm.startswith("anthropic"):
+        print(f"Using Anthropic streamer for llm: {llm}")
+        return anthropic_streamlit_streamer(stream, llm)
     elif llm.startswith("vertex"):
+        print(f"Using Vertex streamer for llm: {llm}")
         return google_streamlit_streamer(stream)
     elif llm.startswith("together"):
+        print(f"Using Together streamer for llm: {llm}")
+        return together_streamlit_streamer(stream, llm)
+    else:
+        print(f"Using OpenAI streamer for llm: {llm}")
+        return openai_streamlit_streamer(stream, llm)
 # Helper functions for LLM council and aggregator selection
     if provider == "openai":
         return get_openai_response(model_name, prompt)
     elif provider == "anthropic":
+        return anthropic_streamlit_streamer(
+            get_anthropic_response(model_name, prompt), model_identifier
+        )
     elif provider == "together":
+        return together_streamlit_streamer(
+            get_together_response(model_name, prompt), model_identifier
+        )
     elif provider == "vertex":
         return google_streamlit_streamer(get_google_response(model_name, prompt))
     else:
         for criteria_score in judging_model.criteria_scores:
             data.append(
                 {
+                    "judging_model": model_name,
                     "criteria": criteria_score.criterion,
                     "score": criteria_score.score,
                     "explanation": criteria_score.explanation,
     )
 def parse_judging_responses(
     prompt: str, judging_responses: dict[str, str]
 ) -> DirectAssessmentJudgingResponse:
+    # if os.getenv("DEBUG_MODE") == "True":
+    #     return DirectAssessmentJudgingResponse(
+    #         judging_models=[
+    #             DirectAssessmentCriteriaScores(
+    #                 model="together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    #                 criteria_scores=[
+    #                     DirectAssessmentCriterionScore(
+    #                         criterion="helpfulness", score=3, explanation="explanation1"
+    #                     ),
+    #                     DirectAssessmentCriterionScore(
+    #                         criterion="conciseness", score=4, explanation="explanation2"
+    #                     ),
+    #                     DirectAssessmentCriterionScore(
+    #                         criterion="relevance", score=5, explanation="explanation3"
+    #                     ),
+    #                 ],
+    #             ),
+    #             DirectAssessmentCriteriaScores(
+    #                 model="together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
+    #                 criteria_scores=[
+    #                     DirectAssessmentCriterionScore(
+    #                         criterion="helpfulness", score=1, explanation="explanation1"
+    #                     ),
+    #                     DirectAssessmentCriterionScore(
+    #                         criterion="conciseness", score=2, explanation="explanation2"
+    #                     ),
+    #                     DirectAssessmentCriterionScore(
+    #                         criterion="relevance", score=3, explanation="explanation3"
+    #                     ),
+    #                 ],
+    #             ),
+    #         ]
+    #     )
+    # else:
+    completion = client.beta.chat.completions.parse(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "system",
+                "content": "Parse the judging responses into structured data.",
+            },
+            {"role": "user", "content": prompt},
+        ],
+        response_format=DirectAssessmentJudgingResponse,
+    )
+    return completion.choices[0].message.parsed
+def get_llm_avatar(model_identifier):
+    if "agg__" in model_identifier:
+        return "img/council_icon.png"
     else:
+        return PROVIDER_TO_AVATAR_MAP[model_identifier]
 def plot_criteria_scores(df):
     ax = sns.barplot(
         x="ui_friendly_name",
         y="mean_score",
+        hue="ui_friendly_name",
         data=summary,
         palette="prism",
         capsize=0.1,
+        legend=False,
     )
     # Add error bars manually
         zorder=10,  # Ensure error bars are on top
     )
+    # Add text annotations using the actual positions of the bars
+    for patch, row in zip(ax.patches, summary.itertuples()):
+        # Get the center of each bar (x position)
+        x = patch.get_x() + patch.get_width() / 2
+        y = patch.get_height()
+        # Add the text annotation
         ax.text(
+            x,
+            y,
+            f"{row.mean_score:.2f}",
             ha="center",
             va="bottom",
+            # fontweight="bold",
             color="black",
             bbox=dict(facecolor="white", edgecolor="none", alpha=0.7, pad=0.5),
         )
 def plot_per_judge_overall_scores(df):
     # Find the overall score by finding the overall score for each judge, and then averaging
     # over all judges.
+    grouped = df.groupby(["judging_model"]).agg({"score": ["mean"]}).reset_index()
+    grouped.columns = ["judging_model", "overall_score"]
     # Create the vertical bar plot
     plt.figure(figsize=(10, 6))
     ax = sns.barplot(
         data=grouped,
+        x="judging_model",
+        y="overall_score",
+        hue="judging_model",
+        orient="v",
+        palette="rainbow",
     )
     # Customize the plot
+    plt.title("Overall Score from each LLM Judge")
     plt.xlabel("Overall Score")
+    plt.ylabel("LLM Judge")
     # Adjust layout and display the plot
     plt.tight_layout()
     cols = st.columns([2, 1, 2])
     if not st.session_state.authenticated:
         with cols[1]:
+            with st.form("login_form"):
+                password = st.text_input("Password", type="password")
+                submit_button = st.form_submit_button("Login", use_container_width=True)
+                if submit_button:
+                    if password == PASSWORD:
+                        st.session_state.authenticated = True
+                        st.success("Logged in successfully!")
+                        st.rerun()
+                    else:
+                        st.error("Invalid credentials")
     if st.session_state.authenticated:
+        if "responses_collected" not in st.session_state:
+            st.session_state["responses_collected"] = False
         # Initialize session state for collecting responses.
         if "responses" not in st.session_state:
+            st.session_state.responses = defaultdict(str)
+        # Initialize session state for token usage.
+        if "input_token_usage" not in st.session_state:
+            st.session_state["input_token_usage"] = defaultdict(int)
+        if "output_token_usage" not in st.session_state:
+            st.session_state["output_token_usage"] = defaultdict(int)
+        if "selected_models" not in st.session_state:
+            st.session_state["selected_models"] = []
+        if "selected_aggregator" not in st.session_state:
+            st.session_state["selected_aggregator"] = None
+        with st.form(key="prompt_form"):
+            st.markdown("#### LLM Council Member Selection")
+            # Council and aggregator selection
+            selected_models = llm_council_selector()
+            selected_aggregator = aggregator_selector()
+            # Prompt input and submission form
+            st.markdown("#### Enter your prompt")
+            _, center_column, _ = st.columns([3, 5, 3])
+            with center_column:
+                user_prompt = st.text_area(
+                    "Enter your prompt",
+                    value="Say 'Hello World'",
+                    key="user_prompt",
+                    label_visibility="hidden",
+                )
+                submit_button = st.form_submit_button(
+                    "Submit", use_container_width=True
+                )
+        if submit_button:
             st.markdown("#### Responses")
+            # Update state.
+            st.session_state.selected_models = selected_models
+            st.session_state.selected_aggregator = selected_aggregator
+            # Render the chats.
             response_columns = st.columns(3)
             selected_models_to_streamlit_column_map = {
             }
             # Fetching and streaming responses from each selected model
+            for selected_model in st.session_state.selected_models:
                 with selected_models_to_streamlit_column_map[selected_model]:
                     st.write(get_ui_friendly_name(selected_model))
                     with st.chat_message(
                 user_prompt=user_prompt, llms=selected_models
             )
             # Fetching and streaming response from the aggregator
+            st.write(f"{get_ui_friendly_name(selected_aggregator)}")
             with st.chat_message(
                 selected_aggregator,
                 avatar="img/council_icon.png",
                         message_placeholder.write_stream(aggregator_stream)
                     )
+            st.session_state.responses_collected = True
+        # Render chats generally?
+        if st.session_state.responses and not submit_button:
+            st.markdown("#### Responses")
+            response_columns = st.columns(3)
+            selected_models_to_streamlit_column_map = {
+                model: response_columns[i]
+                for i, model in enumerate(st.session_state.selected_models)
+            }
+            for response_model, response in st.session_state.responses.items():
+                st_column = selected_models_to_streamlit_column_map.get(
+                    response_model, response_columns[0]
+                )
+                with st_column.chat_message(
+                    response_model,
+                    avatar=get_llm_avatar(response_model),
+                ):
+                    st.write(get_ui_friendly_name(response_model))
+                    st.write(response)
         # Judging.
+        if st.session_state.responses_collected:
+            st.markdown("#### Judging Configuration")
+            # Choose the type of assessment
+            assessment_type = st.radio(
+                "Select the type of assessment",
+                options=["Direct Assessment", "Pairwise Comparison"],
+            )
+            _, center_column, _ = st.columns([3, 5, 3])
+            # Depending on the assessment type, render different forms
+            if assessment_type == "Direct Assessment":
+                # Initialize session state for direct assessment.
+                if "direct_assessment_overall_score" not in st.session_state:
+                    st.session_state["direct_assessment_overall_score"] = {}
+                if "direct_assessment_judging_df" not in st.session_state:
+                    st.session_state["direct_assessment_judging_df"] = {}
+                    for response_model in selected_models:
+                        st.session_state["direct_assessment_judging_df"][
+                            response_model
+                        ] = {}
+                    # aggregator model
                     st.session_state["direct_assessment_judging_df"][
+                        "agg__" + selected_aggregator
                     ] = {}
+                if "direct_assessment_judging_responses" not in st.session_state:
+                    st.session_state["direct_assessment_judging_responses"] = {}
+                    for response_model in selected_models:
+                        st.session_state["direct_assessment_judging_responses"][
+                            response_model
+                        ] = {}
+                    # aggregator model
                     st.session_state["direct_assessment_judging_responses"][
+                        "agg__" + selected_aggregator
                     ] = {}
+                if "direct_assessment_overall_scores" not in st.session_state:
+                    st.session_state["direct_assessment_overall_scores"] = {}
+                    for response_model in selected_models:
+                        st.session_state["direct_assessment_overall_scores"][
+                            response_model
+                        ] = {}
                     st.session_state["direct_assessment_overall_scores"][
+                        "agg__" + selected_aggregator
                     ] = {}
+                if "judging_status" not in st.session_state:
+                    st.session_state["judging_status"] = "incomplete"
+                # Direct assessment prompt.
+                with center_column.expander("Direct Assessment Prompt"):
+                    direct_assessment_prompt = st.text_area(
+                        "Prompt for the Direct Assessment",
+                        value=get_default_direct_assessment_prompt(
+                            user_prompt=user_prompt
+                        ),
+                        height=500,
+                        key="direct_assessment_prompt",
+                    )
+                # TODO: Add option to edit criteria list with a basic text field.
+                criteria_list = DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
+                # Create DirectAssessment object when form is submitted
+                if center_column.button(
+                    "Submit Direct Assessment", use_container_width=True
+                ):
+                    # Render the chats.
+                    response_columns = st.columns(3)
+                    selected_models_to_streamlit_column_map = {
+                        model: response_columns[i]
+                        for i, model in enumerate(selected_models)
+                    }
+                    for response_model, response in st.session_state[
+                        "responses"
+                    ].items():
+                        st_column = selected_models_to_streamlit_column_map.get(
+                            response_model, response_columns[0]
+                        )
+                        with st_column:
+                            with st.chat_message(
+                                get_ui_friendly_name(response_model),
+                                avatar=get_llm_avatar(response_model),
+                            ):
+                                st.write(get_ui_friendly_name(response_model))
+                                st.write(response)
+                    # Submit direct assessment.
+                    responses_for_judging = st.session_state["responses"]
+                    response_judging_columns = st.columns(3)
+                    responses_for_judging_to_streamlit_column_map = {
+                        model: response_judging_columns[i % 3]
+                        for i, model in enumerate(responses_for_judging.keys())
+                    }
+                    # Get judging responses.
+                    for response_model, response in responses_for_judging.items():
+                        st_column = responses_for_judging_to_streamlit_column_map[
+                            response_model
+                        ]
+                        with st_column:
+                            st.write(
+                                f"Judging for {get_ui_friendly_name(response_model)}"
+                            )
+                            judging_prompt = get_direct_assessment_prompt(
+                                direct_assessment_prompt=direct_assessment_prompt,
+                                user_prompt=user_prompt,
+                                response=response,
+                                criteria_list=criteria_list,
+                                options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+                            )
+                            with st.expander("Final Judging Prompt"):
+                                st.code(judging_prompt)
+                            for judging_model in selected_models:
+                                with st.expander(
+                                    get_ui_friendly_name(judging_model), expanded=False
                                 ):
+                                    with st.chat_message(
+                                        judging_model,
+                                        avatar=PROVIDER_TO_AVATAR_MAP[judging_model],
+                                    ):
+                                        message_placeholder = st.empty()
+                                        judging_stream = get_llm_response_stream(
+                                            judging_model, judging_prompt
+                                        )
+                                        st.session_state[
+                                            "direct_assessment_judging_responses"
+                                        ][response_model][
+                                            judging_model
+                                        ] = message_placeholder.write_stream(
+                                            judging_stream
+                                        )
+                            # When all of the judging is finished for the given response, get the actual
+                            # values, parsed.
+                            # TODO.
+                            judging_responses = st.session_state[
+                                "direct_assessment_judging_responses"
+                            ][response_model]
+                            if not judging_responses:
+                                st.error(f"No judging responses for {response_model}")
+                                quit()
+                            parse_judging_response_prompt = (
+                                get_parse_judging_response_for_direct_assessment_prompt(
+                                    judging_responses,
+                                    criteria_list,
+                                    SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+                                )
+                            )
+                            # Issue the prompt to openai mini with structured outputs
+                            parsed_judging_responses = parse_judging_responses(
+                                parse_judging_response_prompt, judging_responses
                             )
                             st.session_state["direct_assessment_judging_df"][
                                 response_model
+                            ] = create_dataframe_for_direct_assessment_judging_response(
+                                parsed_judging_responses
+                            )
+                            plot_criteria_scores(
+                                st.session_state["direct_assessment_judging_df"][
+                                    response_model
+                                ]
+                            )
+                            # Find the overall score by finding the overall score for each judge, and then averaging
+                            # over all judges.
+                            plot_per_judge_overall_scores(
+                                st.session_state["direct_assessment_judging_df"][
+                                    response_model
+                                ]
+                            )
+                            grouped = (
+                                st.session_state["direct_assessment_judging_df"][
+                                    response_model
+                                ]
+                                .groupby(["judging_model"])
+                                .agg({"score": ["mean"]})
+                                .reset_index()
+                            )
+                            grouped.columns = ["judging_model", "overall_score"]
+                            # Save the overall scores to the session state.
+                            for record in grouped.to_dict(orient="records"):
+                                st.session_state["direct_assessment_overall_scores"][
+                                    response_model
+                                ][record["judging_model"]] = record["overall_score"]
+                            overall_score = grouped["overall_score"].mean()
+                            controversy = grouped["overall_score"].std()
+                            st.write(f"Overall Score: {overall_score:.2f}")
+                            st.write(f"Controversy: {controversy:.2f}")
+                    st.session_state["judging_status"] = "complete"
+                # Judging is complete.
+                # The session state now contains the overall scores for each response from each judge.
+                if st.session_state["judging_status"] == "complete":
+                    st.write("#### Results")
+                    overall_scores_df_raw = pd.DataFrame(
+                        st.session_state["direct_assessment_overall_scores"]
+                    ).reset_index()
+                    overall_scores_df = pd.melt(
+                        overall_scores_df_raw,
+                        id_vars=["index"],
+                        var_name="response_model",
+                        value_name="score",
+                    ).rename(columns={"index": "judging_model"})
+                    # Print the overall winner.
+                    overall_winner = overall_scores_df.loc[
+                        overall_scores_df["score"].idxmax()
+                    ]
+                    st.write(
+                        f"**Overall Winner:** {get_ui_friendly_name(overall_winner['response_model'])}"
+                    )
+                    # Find how much the standard deviation overlaps with other models.
+                    # Calculate separability.
+                    # TODO.
+                    st.write(f"**Confidence:** {overall_winner['score']:.2f}")
+                    left_column, right_column = st.columns([1, 1])
+                    with left_column:
+                        plot_overall_scores(overall_scores_df)
+                    with right_column:
+                        # All overall scores.
+                        overall_scores_df = overall_scores_df[
+                            ["response_model", "judging_model", "score"]
+                        ]
+                        overall_scores_df["response_model"] = overall_scores_df[
+                            "response_model"
+                        ].apply(get_ui_friendly_name)
+                        overall_scores_df["judging_model"] = overall_scores_df[
+                            "judging_model"
+                        ].apply(get_ui_friendly_name)
+                        with st.expander("Overall scores from all judges"):
+                            st.dataframe(overall_scores_df)
+                    # All criteria scores.
+                    with right_column:
+                        all_scores_df = pd.DataFrame()
+                        for response_model, score_df in st.session_state[
+                            "direct_assessment_judging_df"
+                        ].items():
+                            score_df["response_model"] = response_model
+                            all_scores_df = pd.concat([all_scores_df, score_df])
+                        all_scores_df = all_scores_df.reset_index()
+                        all_scores_df = all_scores_df.drop(columns="index")
+                        # Reorder the columns
+                        all_scores_df = all_scores_df[
+                            [
+                                "response_model",
+                                "judging_model",
+                                "criteria",
+                                "score",
+                                "explanation",
+                            ]
+                        ]
+                        all_scores_df["response_model"] = all_scores_df[
+                            "response_model"
+                        ].apply(get_ui_friendly_name)
+                        all_scores_df["judging_model"] = all_scores_df[
+                            "judging_model"
+                        ].apply(get_ui_friendly_name)
+                        with st.expander(
+                            "Criteria-specific scores and explanations from all judges"
+                        ):
+                            st.dataframe(all_scores_df)
+            elif assessment_type == "Pairwise Comparison":
+                pass
+        # Token usage.
+        with st.expander("Token Usage"):
+            st.write("Input tokens used.")
+            st.write(st.session_state.input_token_usage)
+            st.write(
+                f"Input Tokens Total: {sum(st.session_state.input_token_usage.values())}"
+            )
+            st.write("Output tokens used.")
+            st.write(st.session_state.output_token_usage)
+            st.write(
+                f"Output Tokens Total: {sum(st.session_state.output_token_usage.values())}"
+            )
     else:
         with cols[1]:

constants.py CHANGED Viewed

@@ -1,18 +1,42 @@
-LLM_COUNCIL_MEMBERS = {
-    "Smalls": [
-        # "openai://gpt-4o-mini",
-        "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
-        "together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
-        # "vertex://gemini-1.5-flash-001",
-        # "anthropic://claude-3-haiku-20240307",
-    ],
-    "Flagships": [
-        "openai://gpt-4o",
-        "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
-        "vertex://gemini-1.5-pro-001",
-        "anthropic://claude-3-5-sonnet",
-    ],
-}
 PROVIDER_TO_AVATAR_MAP = {
     "openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1
Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
@@ -34,9 +58,17 @@ LLM_TO_UI_NAME_MAP = {
     "anthropic://claude-3-haiku-20240307": "Claude 3 Haiku",
 }
-# AGGREGATORS = ["openai://gpt-4o-mini", "openai://gpt-4o"]
-AGGREGATORS = ["together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"]
 # Fix the aggregator step.
 # Add a judging step.

+import os
+import dotenv
+dotenv.load_dotenv()
+if os.getenv("DEBUG_MODE") == "True":
+    LLM_COUNCIL_MEMBERS = {
+        "Smalls": [
+            "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+            "together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
+            # "anthropic://claude-3-haiku-20240307",
+        ],
+        "Flagships": [
+            "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+            "together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
+            "anthropic://claude-3-haiku-20240307",
+        ],
+    }
+else:
+    LLM_COUNCIL_MEMBERS = {
+        "Smalls": [
+            "openai://gpt-4o-mini",
+            "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+            "together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
+            "vertex://gemini-1.5-flash-001",
+            "anthropic://claude-3-haiku-20240307",
+        ],
+        "Flagships": [
+            "openai://gpt-4o",
+            "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+            "vertex://gemini-1.5-pro-002",
+            "anthropic://claude-3-5-sonnet",
+        ],
+        "OpenAI": [
+            "openai://gpt-4o",
+            "openai://gpt-4o-mini",
+        ],
+    }
 PROVIDER_TO_AVATAR_MAP = {
     "openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1
Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
     "anthropic://claude-3-haiku-20240307": "Claude 3 Haiku",
 }
+if os.getenv("DEBUG_MODE") == "True":
+    AGGREGATORS = ["together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"]
+else:
+    AGGREGATORS = [
+        "anthropic://claude-3-haiku-20240307",
+        "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        "together://meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+        "together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
+        "openai://gpt-4o",
+        "openai://gpt-4o-mini",
+    ]
 # Fix the aggregator step.
 # Add a judging step.

img/qwen.webp ADDED Viewed