import gradio as gr

import src.app_util as app_util
from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets

links_markdown = """
[📄 Paper](https://arxiv.org/abs/2502.04313) |
[☯ Homepage](https://model-similarity.github.io/) |
[🐱 Code](https://github.com/model-similarity/lm-similarity) |
[🐍 pip install lm-sim](https://pypi.org/project/lm-sim/) |
[🤗 Data](https://huggingface.co/datasets/bethgelab/lm-similarity)
"""

model_init = ["HuggingFaceTB/SmolLM2-1.7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct", "microsoft/phi-4", "Qwen/Qwen2.5-14B-Instruct-1M", "meta-llama/Llama-3.3-70B-Instruct"]
dataset_init = "mmlu_pro"
metric_init = "CAPA"

# Create Gradio interface
with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo:
    gr.Markdown("# Model Similarity Comparison Tool")
    gr.Markdown(links_markdown)
    gr.Markdown('This is an interactive demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)." You can compare the functional similarity of hundreds of Language Models on the Open LLM Leaderboard v2 benchmark datasets.')

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=get_leaderboard_datasets(model_init),
            label="Select Dataset",
            value=dataset_init,
            filterable=True,
            interactive=True,
            allow_custom_value=False,
            info="Open LLM Leaderboard v2 benchmark datasets"
        )
        metric_dropdown = gr.Dropdown(
            choices=["CAPA", "CAPA (det.)", "Error Consistency"],
            label="Select Metric",
            value=metric_init,
            info="Select a similarity metric to compute"
        )
        model_dropdown = gr.Dropdown(
            choices=get_leaderboard_models_cached(),
            label="Select Models",
            value=model_init,
            multiselect=True,
            filterable=True,
            allow_custom_value=False,
            info="Search and select multiple models"
        )
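
    # Refresh the available dataset choices whenever the model selection changes.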
    model_dropdown.change(
        fn=app_util.update_datasets_based_on_models,
        inputs=[model_dropdown, dataset_dropdown],
        outputs=dataset_dropdown
    )

    generate_btn = gr.Button("Generate Heatmap", variant="primary")
    heatmap = gr.Image(value=app_util.create_heatmap(model_init, dataset_init, metric_init), label="Similarity Heatmap", elem_classes="image_container", visible=True)
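
    # Validate the selected models and dataset first, then compute and display the heatmap.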
    generate_btn.click(
        fn=app_util.validate_inputs,
        inputs=[model_dropdown, dataset_dropdown],
        queue=False
    ).then(
        fn=app_util.create_heatmap,
        inputs=[model_dropdown, dataset_dropdown, metric_dropdown],
        outputs=heatmap
    )

    gr.Markdown("\* Self-similarity is only 1.0 for CAPA if the model predicts a single option with 100% confidence for each question. If the model is uncertain, the self-similarity will be lower.")

    clear_btn = gr.Button("Clear Selection")
    clear_btn.click(
        lambda: [[], None, None],
        outputs=[model_dropdown, dataset_dropdown, heatmap]
    )

    gr.Markdown("## Information")
    metric_info_markdown = r"""
We propose Chance Adjusted Probabilistic Agreement (CAPA, or κ_p), a novel metric for model similarity which adjusts for chance agreement due to accuracy.
Using CAPA, we find:
1. LLM-as-a-judge scores are biased towards more similar models, controlling for the model's capability.
2. The gain from training strong models on annotations of weak supervisors (weak-to-strong generalization) is higher when the two models are more different.
3. Concerningly, model errors are getting more correlated as capabilities increase.
"""
    gr.Markdown(metric_info_markdown)
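    # Sketch of the chance-adjusted structure behind CAPA (a simplified reading of the description
    # above, not necessarily the exact formula implemented in app_util or the lm-sim package):
    #   kappa_p = (observed_agreement - expected_agreement) / (1 - expected_agreement)
    # where expected_agreement is the agreement two independent models with the given accuracies
    # would reach by chance.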

    with gr.Row():
        gr.Image(value="data/table_capa.png", label="Comparison of different similarity metrics for multiple-choice questions", elem_classes="image_container", interactive=False)
        gr.Markdown("""
- **Datasets**: [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/) benchmark datasets \n
    - Some datasets are not multiple-choice; for these, the metrics are not applicable. \n
- **Models**: Open LLM Leaderboard models \n
    - Every model evaluation is gated on Hugging Face and access has to be requested. \n
    - We requested access for the most popular models, but some may be missing. \n
    - Notably, loading data is not possible for some meta-llama and gemma models.
- **Metrics**: CAPA (probabilistic), CAPA (deterministic), Error Consistency""")

if __name__ == "__main__":
    demo.launch(ssr_mode=False)