import json
import os
import re
from typing import Dict, List, Optional

import gradio as gr
import pandas as pd

# Verified badge appended to model names whose HLE runs mention some form of
# search-result/contamination filtering (see the motivation section).
# NOTE(review): the original badge markup was most likely a styled HTML <span>
# (the CSS below restores span badge styling "if inline styles are stripped");
# only the check character survived extraction -- confirm the exact markup
# against the deployed app.
VERIFIED_BADGE = (
    '✓'
)

# Unified data structure - each model defined once with both scores.
# Format: list of dictionaries with model info and both Full Set and
# Text-Only scores.
# NOTE(review): missing scores are recorded inconsistently as either the
# string "-" or None; get_data() keeps "-" rows (they render as "-") but
# drops None rows. Preserved as-is to avoid changing what is displayed.
LEADERBOARD_DATA = [
    {
        "Model": f"Zoom Federated AI {VERIFIED_BADGE}",
        "Organization": "Zoom",
        "Open Source": "No",
        "Publish Date": "2025-12-29",
        "Full Set Score": 53.0,
        "Full Set Reference": "https://www.zoom.com/en/blog/zoom-ai-redefining-agentic-federated-intelligence/",
        "Text-Only Score": 55.2,
        "Text-Only Reference": "https://tinyurl.com/sileixu-hle-linkedin-53",
    },
    {
        "Model": f"GPT-5.2 Pro {VERIFIED_BADGE}",
        "Organization": "OpenAI",
        "Open Source": "No",
        "Publish Date": "2025-12-11",
        "Full Set Score": 50.0,
        "Full Set Reference": "https://openai.com/index/introducing-gpt-5-2/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": f"GPT-5.2 Thinking {VERIFIED_BADGE}",
        "Organization": "OpenAI",
        "Open Source": "No",
        "Publish Date": "2025-12-11",
        "Full Set Score": 45.5,
        "Full Set Reference": "https://openai.com/index/introducing-gpt-5-2/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": f"GPT-5 Pro {VERIFIED_BADGE}",
        "Organization": "OpenAI",
        "Open Source": "No",
        "Publish Date": "2025-12-11",
        "Full Set Score": 42.0,
        "Full Set Reference": "https://openai.com/index/introducing-gpt-5/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": f"GPT-5 {VERIFIED_BADGE}",
        "Organization": "OpenAI",
        "Open Source": "No",
        "Publish Date": "2025-12-11",
        "Full Set Score": 35.2,
        "Full Set Reference": "https://openai.com/index/introducing-gpt-5/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": f"ChatGPT Agent {VERIFIED_BADGE}",
        "Organization": "OpenAI",
        "Open Source": "No",
        "Publish Date": "2025-07-17",
        "Full Set Score": 41.6,
        "Full Set Reference": "https://openai.com/index/introducing-chatgpt-agent/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": "OpenAI Deep Research",
        "Organization": "OpenAI",
        "Open Source": "No",
        "Publish Date": "2025-02-02",
        "Full Set Score": 26.6,
        "Full Set Reference": "https://openai.com/index/introducing-deep-research/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": "Gemini Deep Research",
        "Organization": "Google",
        "Open Source": "No",
        "Publish Date": "2025-12-11",
        "Full Set Score": 46.4,
        "Full Set Reference": "https://blog.google/technology/developers/deep-research-agent-gemini-api/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": "Gemini 3 Pro",
        "Organization": "Google",
        "Open Source": "No",
        "Publish Date": "2025-11-18",
        "Full Set Score": 45.8,
        "Full Set Reference": "https://blog.google/products/gemini/gemini-3/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": "Gemini 3 Flash",
        "Organization": "Google",
        "Open Source": "No",
        "Publish Date": "2025-12-17",
        "Full Set Score": 43.5,
        "Full Set Reference": "https://blog.google/products/gemini/gemini-3-flash/",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": f"Kimi K2 Thinking {VERIFIED_BADGE}",
        "Organization": "Moonshot AI",
        "Open Source": "Yes",
        "Publish Date": "2025-11-06",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 44.9,
        "Text-Only Reference": "https://huggingface.co/moonshotai/Kimi-K2-Thinking",
    },
    {
        "Model": f"Kimi K2 Thinking (Heavy) {VERIFIED_BADGE}",
        "Organization": "Moonshot AI",
        "Open Source": "Yes",
        "Publish Date": "2025-11-06",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 51.0,
        "Text-Only Reference": "https://huggingface.co/moonshotai/Kimi-K2-Thinking",
    },
    {
        "Model": "Grok 4",
        "Organization": "xAI",
        "Open Source": "No",
        "Publish Date": "2025-07-09",
        "Full Set Score": 38.6,
        "Full Set Reference": "https://x.ai/news/grok-4",
        "Text-Only Score": "-",
        "Text-Only Reference": None,
    },
    {
        "Model": "Grok 4 (Heavy)",
        "Organization": "xAI",
        "Open Source": "No",
        "Publish Date": "2025-07-09",
        "Full Set Score": 44.4,
        "Full Set Reference": "https://x.ai/news/grok-4",
        "Text-Only Score": 50.7,
        "Text-Only Reference": "https://x.ai/news/grok-4",
    },
    {
        "Model": "GLM 4.7",
        "Organization": "Z.ai",
        "Open Source": "Yes",
        "Publish Date": "2025-12-22",
        "Full Set Score": 42.8,
        "Full Set Reference": "https://z.ai/blog/glm-4.7",
        "Text-Only Score": None,
        "Text-Only Reference": None,
    },
    {
        "Model": f"Claude Opus 4.5 {VERIFIED_BADGE}",
        "Organization": "Anthropic",
        "Open Source": "No",
        "Publish Date": "2025-12-22",
        "Full Set Score": 43.2,
        "Full Set Reference": "https://assets.anthropic.com/m/64823ba7485345a7/Claude-Opus-4-5-System-Card.pdf",
        "Text-Only Score": None,
        "Text-Only Reference": None,
    },
    {
        "Model": f"Claude Sonnet 4.5 {VERIFIED_BADGE}",
        "Organization": "Anthropic",
        "Open Source": "No",
        "Publish Date": "2025-09-29",
        "Full Set Score": 28.4,
        "Full Set Reference": "https://assets.anthropic.com/m/64823ba7485345a7/Claude-Opus-4-5-System-Card.pdf",
        "Text-Only Score": None,
        "Text-Only Reference": None,
    },
    {
        "Model": "DeepWriter",
        "Organization": "Deepwriter AI",
        "Open Source": "No",
        "Publish Date": "2025-11-26",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 50.9,
        "Text-Only Reference": "https://deepwriter.com/blog/small-team-beats-worlds-top-ai-labs-at-hle/",
    },
    {
        "Model": "Seed1.8",
        "Organization": "ByteDance",
        "Open Source": "No",
        "Publish Date": "2025-12-18",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 41.7,
        "Text-Only Reference": "https://lf3-static.bytednsdoc.com/obj/eden-cn/lapzild-tss/ljhwZthlaukjlkulzlp/research/Seed-1.8-Modelcard.pdf",
    },
    {
        "Model": f"MiroThinker-v1.5-235B {VERIFIED_BADGE}",
        "Organization": "MiroMind AI",
        "Open Source": "Yes",
        "Publish Date": "2026-01-04",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 39.2,
        "Text-Only Reference": "https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B",
    },
    {
        "Model": f"MiroThinker-v1.5-30B {VERIFIED_BADGE}",
        "Organization": "MiroMind AI",
        "Open Source": "Yes",
        "Publish Date": "2026-01-04",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 31.0,
        "Text-Only Reference": "https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B",
    },
    {
        "Model": f"MiroThinker-v1.0-72B {VERIFIED_BADGE}",
        "Organization": "MiroMind AI",
        "Open Source": "Yes",
        "Publish Date": "2025-11-14",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 37.7,
        "Text-Only Reference": "https://arxiv.org/pdf/2511.11793",
    },
    {
        "Model": f"MiroThinker-v1.0-30B {VERIFIED_BADGE}",
        "Organization": "MiroMind AI",
        "Open Source": "Yes",
        "Publish Date": "2025-11-14",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 33.4,
        "Text-Only Reference": "https://arxiv.org/pdf/2511.11793",
    },
    {
        "Model": f"MiroThinker-v1.0-8B {VERIFIED_BADGE}",
        "Organization": "MiroMind AI",
        "Open Source": "Yes",
        "Publish Date": "2025-11-14",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 21.5,
        "Text-Only Reference": "https://arxiv.org/pdf/2511.11793",
    },
    {
        "Model": "Tongyi-DeepResearch-30B-A3B",
        "Organization": "Alibaba",
        "Open Source": "Yes",
        "Publish Date": "2025-11-04",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 32.9,
        "Text-Only Reference": "https://arxiv.org/pdf/2510.24701",
    },
    {
        "Model": "Tongyi-DeepResearch-30B-A3B (Heavy)",
        "Organization": "Alibaba",
        "Open Source": "Yes",
        "Publish Date": "2025-11-04",
        "Full Set Score": "-",
        "Full Set Reference": None,
        "Text-Only Score": 38.3,
        "Text-Only Reference": "https://arxiv.org/pdf/2510.24701",
    },
    {
        "Model": "Perplexity Deep Research",
        "Organization": "Perplexity",
        "Open Source": "No",
        "Publish Date": "2025-02-14",
        "Full Set Score": 21.1,
        "Full Set Reference": "https://www.perplexity.ai/hub/blog/introducing-perplexity-deep-research",
        "Text-Only Score": None,
        "Text-Only Reference": None,
    },
    {
        "Model": "MiniMax-M2",
        "Organization": "MiniMax AI",
        "Open Source": "Yes",
        "Publish Date": "2025-10-27",
        "Full Set Score": None,
        "Full Set Reference": None,
        "Text-Only Score": 31.8,
        "Text-Only Reference": "https://huggingface.co/MiniMaxAI/MiniMax-M2",
    },
    {
        "Model": "ToolOrchestra",
        "Organization": "NVIDIA",
        "Open Source": "Yes",
        "Publish Date": "2025-11-26",
        "Full Set Score": None,
        "Full Set Reference": None,
        "Text-Only Score": 37.1,
        "Text-Only Reference": "https://arxiv.org/pdf/2511.21689",
    },
]

# Legacy support - kept for backward compatibility with file loading.
# FULL_SET_DATA appears unused within this file; TEXT_ONLY_DATA is the
# last-resort fallback in get_data().
FULL_SET_DATA = []
TEXT_ONLY_DATA = []


def load_data_from_file(filepath: str, data_type: str = "csv") -> Optional[List[Dict]]:
    """Load leaderboard rows from a CSV or JSON file.

    Args:
        filepath: Path to the data file.
        data_type: Either "csv" or "json".

    Returns:
        A list of row dictionaries, or None when the file does not exist,
        cannot be parsed, or ``data_type`` is unrecognized.
    """
    if not os.path.exists(filepath):
        return None
    try:
        if data_type == "csv":
            df = pd.read_csv(filepath)
            return df.to_dict("records")
        elif data_type == "json":
            with open(filepath, "r") as f:
                return json.load(f)
    except Exception as e:
        # Best-effort loader: a broken data file falls back to the built-in
        # defaults instead of crashing the app at startup.
        print(f"Error loading data from {filepath}: {e}")
        return None
    return None


def _split_unified(entries: List[Dict]):
    """Split unified rows into ``(full_set, text_only)`` row lists.

    A row joins a split when that split's score is not None. The string "-"
    (score not reported) is deliberately kept so the model still renders as
    a row; a Reference key is added only when a reference URL is present.
    """
    full_set = []
    text_only = []
    for entry in entries:
        # Fields shared by both splits.
        model_info = {
            "Model": entry.get("Model", ""),
            "Organization": entry.get("Organization", ""),
            "Open Source": entry.get("Open Source", ""),
            "Publish Date": entry.get("Publish Date", ""),
        }
        if entry.get("Full Set Score") is not None:
            full_set_entry = model_info.copy()
            full_set_entry["Score"] = entry.get("Full Set Score")
            if entry.get("Full Set Reference"):
                full_set_entry["Reference"] = entry.get("Full Set Reference")
            full_set.append(full_set_entry)
        if entry.get("Text-Only Score") is not None:
            text_only_entry = model_info.copy()
            text_only_entry["Score"] = entry.get("Text-Only Score")
            if entry.get("Text-Only Reference"):
                text_only_entry["Reference"] = entry.get("Text-Only Reference")
            text_only.append(text_only_entry)
    return full_set, text_only


def get_data():
    """Return ``(full_set, text_only)`` rows, preferring on-disk data files.

    Lookup order:
      1. data/leaderboard.json, then data/leaderboard.csv (unified format);
      2. data/full_set.* and data/text_only.* (legacy split files);
      3. the built-in LEADERBOARD_DATA.
    """
    # Try to load unified data from file first.
    unified_data = load_data_from_file("data/leaderboard.json", "json")
    if unified_data is None:
        unified_data = load_data_from_file("data/leaderboard.csv", "csv")
    if unified_data:
        return _split_unified(unified_data)

    # Fallback: separate per-split files (backward compatibility).
    full_set = load_data_from_file("data/full_set.csv", "csv")
    if full_set is None:
        full_set = load_data_from_file("data/full_set.json", "json")
    if full_set is None:
        # No data files at all: split the built-in unified data.
        return _split_unified(LEADERBOARD_DATA)

    text_only = load_data_from_file("data/text_only.csv", "csv")
    if text_only is None:
        text_only = load_data_from_file("data/text_only.json", "json")
    if text_only is None:
        text_only = TEXT_ONLY_DATA
    return full_set, text_only


def create_leaderboard_table(data: List[Dict], title: str) -> pd.DataFrame:
    """Convert a row list to a pandas DataFrame for display.

    ``title`` is accepted for interface compatibility but is not used in
    the visible body.
    """
    df = pd.DataFrame(data)
    # Remove Rank and Tools columns if they exist.
    for col in ("Rank", "Tools"):
        if col in df.columns:
            df = df.drop(columns=[col])
    # Sort by Score descending if a Score column exists.
    if "Score" in df.columns:
        df = df.sort_values("Score", ascending=False).reset_index(drop=True)
    return df
def dataframe_to_html(df: pd.DataFrame) -> str:
    """Convert DataFrame to HTML table with proper styling.

    NOTE(review): every HTML tag literal in this function was destroyed when
    the file was extracted (only the interpolation fragments survived). This
    is a minimal reconstruction; the original likely also emitted link,
    tooltip, and <strong> markup for score cells (the page CSS styles
    ``table a`` and ``table td strong``) -- confirm against the live app.
    """
    html = "<table>\n<thead>\n<tr>"
    for col in df.columns:
        html += f"<th>{col}</th>"
    html += "</tr>\n</thead>\n<tbody>"
    for _, row in df.iterrows():
        html += "<tr>"
        for col in df.columns:
            cell_value = row[col]
            html += f"<td>{cell_value}</td>"
        html += "</tr>"
    html += "</tbody>\n</table>"
    return html


def format_leaderboard_table(data: List[Dict], score_label: str) -> pd.DataFrame:
    """Build the display table for one leaderboard split.

    NOTE(review): the original definition was lost to extraction (it is
    called from build_leaderboard as ``format_leaderboard_table(rows,
    "Full Set Score")``); reconstructed here as a thin wrapper over
    create_leaderboard_table -- verify against the original source.
    """
    return create_leaderboard_table(data, score_label)


def process_inline_markdown(text: str) -> str:
    """Convert inline markdown -- [text](url) links and **bold** -- to HTML.

    NOTE(review): reconstructed; the original definition was lost to
    extraction but is called with header/list/paragraph text below.
    """
    # Links first so the bold pass cannot corrupt URLs containing '*'.
    text = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r'<a href="\2">\1</a>', text)
    text = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", text)
    return text


def markdown_to_html(markdown_text: str) -> str:
    """Translate a small markdown subset (###/#### headers, "- " lists,
    paragraphs) into HTML.

    NOTE(review): the HTML tag literals were destroyed during extraction.
    The control flow below follows the surviving fragments (the
    paragraph_lines / in_paragraph / in_list bookkeeping and the
    ``stripped[5:]`` / ``stripped[4:]`` header slices), but the exact tags
    are reconstructed -- confirm against the live app.
    """
    html_lines = []
    paragraph_lines = []
    in_paragraph = False
    in_list = False
    for line in markdown_text.split("\n"):
        stripped = line.strip()
        if not stripped:
            # Blank line terminates the current paragraph and/or list.
            if in_paragraph:
                html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
                paragraph_lines = []
                in_paragraph = False
            if in_list:
                html_lines.append("</ul>")
                in_list = False
        elif stripped.startswith("#### "):
            if in_paragraph:
                html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
                paragraph_lines = []
                in_paragraph = False
            header_text = process_inline_markdown(stripped[5:])
            html_lines.append(f"<h4>{header_text}</h4>")
        elif stripped.startswith("### "):
            if in_paragraph:
                html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
                paragraph_lines = []
                in_paragraph = False
            header_text = process_inline_markdown(stripped[4:])
            html_lines.append(f"<h3>{header_text}</h3>")
        elif stripped.startswith("- "):
            if in_paragraph:
                html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
                paragraph_lines = []
                in_paragraph = False
            if not in_list:
                html_lines.append("<ul>")
                in_list = True
            html_lines.append(f"<li>{process_inline_markdown(stripped[2:])}</li>")
        else:
            # Plain text accumulates into the current paragraph.
            paragraph_lines.append(process_inline_markdown(stripped))
            in_paragraph = True
    # Flush anything still open at end of input.
    if in_paragraph:
        html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
    if in_list:
        html_lines.append("</ul>")
    return "\n".join(html_lines)


def create_motivation_section():
    """Create the motivation section content as HTML."""
    # NOTE(review): paragraph breaks in this markdown string were collapsed
    # during extraction; the blank-line placement below is reconstructed.
    motivation = f"""### About Humanity's Last Exam (HLE)

[Humanity's Last Exam (HLE)](https://agi.safe.ai/) is a rigorous, multi-modal AI benchmark created by the Center for AI Safety in collaboration with Scale AI, designed to push large language models beyond saturated tests by evaluating reasoning and expert-level knowledge across thousands of challenging questions spanning mathematics, natural sciences, and the humanities.

### Why another leaderboard?

While leaderboards existed for HLE, they fall short in several ways, leading to widespread confusion about true state-of-the-art results. In fact, if you ask ChatGPT, Gemini, and Claude "What's the SOTA for Humanity's Last Exam" today, they will all get the answer wrong.

- **Tool Exclusion**: The [official leaderboard by Scale AI](https://scale.com/leaderboard/humanitys_last_exam) and many other leaderboards focus on models without tool use.
- **Lack of separation**: Scores for the full HLE benchmark and the text-only subset are often mixed together, leading to unfair comparisons - score on full set is generally lower than the text only subset.
- **Data Contamination**: Since copies of HLE, blogs and papers discussing HLE have been indexed by search engines, scores might be artificially inflated for agents without filtering. In this leaderboard, we add a{VERIFIED_BADGE} badge to indicate that some form of filtering is mentioned for the agent or its previous versions.
"""
    return markdown_to_html(motivation)
""" return markdown_to_html(motivation) def build_leaderboard(): """Build the Gradio interface""" # Get data (from files or defaults) full_set_data, text_only_data = get_data() # Create separate tables for Full Set and Text-Only full_set_df = format_leaderboard_table(full_set_data, "Full Set Score") text_only_df = format_leaderboard_table(text_only_data, "Text-Only Score") custom_css = """ .gradio-container { max-width: 1200px; width: 100%; margin: 0 auto; padding: 0 10px; overflow: visible !important; } .gradio-row { justify-content: center; overflow: visible !important; } .gradio-column { text-align: left; max-width: 100%; overflow: visible !important; } h1, h2, h3 { text-align: center !important; } .markdown-text h1, .markdown-text h2, .markdown-text h3 { text-align: center !important; } hr { border-color: #95d5b2 !important; border-width: 2px !important; } .markdown-text, .markdown-text p, .markdown-text ul, .markdown-text ol, .markdown-text li { text-align: left !important; } /* Style verified badge in markdown - restore badge styling for all spans if inline styles are stripped */ .markdown-text span { display: inline-block !important; width: 12px !important; height: 12px !important; background-color: #00bfa5 !important; border-radius: 50% !important; color: white !important; text-align: center !important; line-height: 12px !important; font-size: 8px !important; font-weight: bold !important; margin-left: 4px !important; vertical-align: middle !important; } table { border: 2px solid #95d5b2 !important; border-collapse: collapse; margin: 0 auto; } table th { background-color: #95d5b2 !important; color: #1a1a1a !important; text-align: center; } table td { text-align: center; } table tr:hover { background-color: rgba(149, 213, 178, 0.1) !important; } table a { color: inherit !important; text-decoration: none !important; } table a:hover { color: #00bfa5 !important; } /* Make Agent / Model column bold */ table td:first-child, table th:first-child { font-weight: bold 
!important; } /* Make scores green */ table td strong { color: #00bfa5 !important; } table td a strong { color: #00bfa5 !important; } /* Custom tooltip styling */ .tooltip { position: relative; display: inline-block; } .tooltip .tooltiptext { visibility: hidden; opacity: 0; background-color: rgba(60, 60, 60, 0.95); color: #ffffff; text-align: center; border-radius: 0; padding: 14px 18px; position: absolute; z-index: 9999; bottom: 125%; left: 50%; transform: translateX(-50%); font-size: 13px; box-shadow: 0 6px 16px rgba(0, 0, 0, 0.3); transition: opacity 0.05s ease-in, visibility 0.05s; pointer-events: none; margin-bottom: 15px; word-wrap: break-word; white-space: nowrap; line-height: 1.5; border: none; } .tooltip .tooltiptext::after { content: ""; position: absolute; top: 100%; left: 50%; margin-left: -6px; border-width: 6px; border-style: solid; border-color: rgba(60, 60, 60, 0.95) transparent transparent transparent; } .tooltip:hover .tooltiptext { visibility: visible; opacity: 1; transition: opacity 0.05s ease-in, visibility 0s; } /* Mobile responsive styles */ @media screen and (max-width: 768px) { .gradio-container { max-width: 100%; padding: 0 5px; } table { font-size: 12px; } table td, table th { padding: 6px 4px !important; font-size: 11px; } h1 { font-size: 24px !important; } h2 { font-size: 20px !important; } h3 { font-size: 18px !important; } .markdown-text { font-size: 14px !important; } .tooltip .tooltiptext { font-size: 11px; padding: 10px 12px; white-space: normal; } } @media screen and (max-width: 480px) { .gradio-container { padding: 0 3px; } table { font-size: 10px; } table td, table th { padding: 4px 2px !important; font-size: 10px; } h1 { font-size: 20px !important; } h2 { font-size: 18px !important; } h3 { font-size: 16px !important; } .markdown-text { font-size: 13px !important; } } """ with gr.Blocks(title="HLE Leaderboard for Agents with Tools", theme="JohnSmith9982/small_and_pretty", css=custom_css) as demo: gr.Markdown( """ # 🏆 Humanity's 
Last Exam Leaderboard for Agents with Tools """ ) gr.Markdown("---") # Motivation Section with gr.Row(): with gr.Column(): motivation_text = create_motivation_section() gr.HTML(motivation_text) gr.Markdown("---") gr.Markdown("