import json import os import re from typing import Dict, List, Optional import gradio as gr import pandas as pd # Verified badge HTML for OpenAI models VERIFIED_BADGE = ( '' ) # Unified data structure - each model defined once with both scores # Format: List of dictionaries with model info and both Full Set and Text-Only scores LEADERBOARD_DATA = [ { "Model": f"Zoom Federated AI {VERIFIED_BADGE}", "Organization": "Zoom", "Open Source": "No", "Publish Date": "2025-12-29", "Full Set Score": 53.0, "Full Set Reference": "https://www.zoom.com/en/blog/zoom-ai-redefining-agentic-federated-intelligence/", "Text-Only Score": 55.2, "Text-Only Reference": "https://tinyurl.com/sileixu-hle-linkedin-53", }, { "Model": f"GPT-5.2 Pro {VERIFIED_BADGE}", "Organization": "OpenAI", "Open Source": "No", "Publish Date": "2025-12-11", "Full Set Score": 50.0, "Full Set Reference": "https://openai.com/index/introducing-gpt-5-2/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": f"GPT-5.2 Thinking {VERIFIED_BADGE}", "Organization": "OpenAI", "Open Source": "No", "Publish Date": "2025-12-11", "Full Set Score": 45.5, "Full Set Reference": "https://openai.com/index/introducing-gpt-5-2/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": f"GPT-5 Pro {VERIFIED_BADGE}", "Organization": "OpenAI", "Open Source": "No", "Publish Date": "2025-12-11", "Full Set Score": 42.0, "Full Set Reference": "https://openai.com/index/introducing-gpt-5/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": f"GPT-5 {VERIFIED_BADGE}", "Organization": "OpenAI", "Open Source": "No", "Publish Date": "2025-12-11", "Full Set Score": 35.2, "Full Set Reference": "https://openai.com/index/introducing-gpt-5/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": f"ChatGPT Agent {VERIFIED_BADGE}", "Organization": "OpenAI", "Open Source": "No", "Publish Date": "2025-07-17", "Full Set Score": 41.6, "Full Set Reference": "https://openai.com/index/introducing-chatgpt-agent/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": "OpenAI Deep Research", "Organization": "OpenAI", "Open Source": "No", "Publish Date": "2025-02-02", "Full Set Score": 26.6, "Full Set Reference": "https://openai.com/index/introducing-deep-research/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": "Gemini Deep Research", "Organization": "Google", "Open Source": "No", "Publish Date": "2025-12-11", "Full Set Score": 46.4, "Full Set Reference": "https://blog.google/technology/developers/deep-research-agent-gemini-api/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": "Gemini 3 Pro", "Organization": "Google", "Open Source": "No", "Publish Date": "2025-11-18", "Full Set Score": 45.8, "Full Set Reference": "https://blog.google/products/gemini/gemini-3/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": "Gemini 3 Flash", "Organization": "Google", "Open Source": "No", "Publish Date": "2025-12-17", "Full Set Score": 43.5, "Full Set Reference": "https://blog.google/products/gemini/gemini-3-flash/", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": f"Kimi K2 Thinking {VERIFIED_BADGE}", "Organization": "Moonshot AI", "Open Source": "Yes", "Publish Date": "2025-11-06", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 44.9, "Text-Only Reference": "https://huggingface.co/moonshotai/Kimi-K2-Thinking", }, { "Model": f"Kimi K2 Thinking (Heavy) {VERIFIED_BADGE}", "Organization": "Moonshot AI", "Open Source": "Yes", "Publish 
Date": "2025-11-06", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 51.0, "Text-Only Reference": "https://huggingface.co/moonshotai/Kimi-K2-Thinking", }, { "Model": "Grok 4", "Organization": "xAI", "Open Source": "No", "Publish Date": "2025-07-09", "Full Set Score": 38.6, "Full Set Reference": "https://x.ai/news/grok-4", "Text-Only Score": "-", "Text-Only Reference": None, }, { "Model": "Grok 4 (Heavy)", "Organization": "xAI", "Open Source": "No", "Publish Date": "2025-07-09", "Full Set Score": 44.4, "Full Set Reference": "https://x.ai/news/grok-4", "Text-Only Score": 50.7, "Text-Only Reference": "https://x.ai/news/grok-4", }, { "Model": "GLM 4.7", "Organization": "Z.ai", "Open Source": "Yes", "Publish Date": "2025-12-22", "Full Set Score": 42.8, "Full Set Reference": "https://z.ai/blog/glm-4.7", "Text-Only Score": None, "Text-Only Reference": None, }, { "Model": f"Claude Opus 4.5 {VERIFIED_BADGE}", "Organization": "Anthropic", "Open Source": "No", "Publish Date": "2025-12-22", "Full Set Score": 43.2, "Full Set Reference": "https://assets.anthropic.com/m/64823ba7485345a7/Claude-Opus-4-5-System-Card.pdf", "Text-Only Score": None, "Text-Only Reference": None, }, { "Model": f"Claude Sonnet 4.5 {VERIFIED_BADGE}", "Organization": "Anthropic", "Open Source": "No", "Publish Date": "2025-09-29", "Full Set Score": 28.4, "Full Set Reference": "https://assets.anthropic.com/m/64823ba7485345a7/Claude-Opus-4-5-System-Card.pdf", "Text-Only Score": None, "Text-Only Reference": None, }, { "Model": "DeepWriter", "Organization": "Deepwriter AI", "Open Source": "No", "Publish Date": "2025-11-26", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 50.9, "Text-Only Reference": "https://deepwriter.com/blog/small-team-beats-worlds-top-ai-labs-at-hle/", }, { "Model": "Seed1.8", "Organization": "ByteDance", "Open Source": "No", "Publish Date": "2025-12-18", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 41.7, "Text-Only Reference": "https://lf3-static.bytednsdoc.com/obj/eden-cn/lapzild-tss/ljhwZthlaukjlkulzlp/research/Seed-1.8-Modelcard.pdf", }, { "Model": f"MiroThinker-v1.5-235B {VERIFIED_BADGE}", "Organization": "MiroMind AI", "Open Source": "Yes", "Publish Date": "2026-01-04", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 39.2, "Text-Only Reference": "https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B", }, { "Model": f"MiroThinker-v1.5-30B {VERIFIED_BADGE}", "Organization": "MiroMind AI", "Open Source": "Yes", "Publish Date": "2026-01-04", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 31.0, "Text-Only Reference": "https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B", }, { "Model": f"MiroThinker-v1.0-72B {VERIFIED_BADGE}", "Organization": "MiroMind AI", "Open Source": "Yes", "Publish Date": "2025-11-14", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 37.7, "Text-Only Reference": "https://arxiv.org/pdf/2511.11793", }, { "Model": f"MiroThinker-v1.0-30B {VERIFIED_BADGE}", "Organization": "MiroMind AI", "Open Source": "Yes", "Publish Date": "2025-11-14", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 33.4, "Text-Only Reference": "https://arxiv.org/pdf/2511.11793", }, { "Model": f"MiroThinker-v1.0-8B {VERIFIED_BADGE}", "Organization": "MiroMind AI", "Open Source": "Yes", "Publish Date": "2025-11-14", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 21.5, "Text-Only Reference": "https://arxiv.org/pdf/2511.11793", }, { "Model": 
"Tongyi-DeepResearch-30B-A3B", "Organization": "Alibaba", "Open Source": "Yes", "Publish Date": "2025-11-04", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 32.9, "Text-Only Reference": "https://arxiv.org/pdf/2510.24701", }, { "Model": "Tongyi-DeepResearch-30B-A3B (Heavy)", "Organization": "Alibaba", "Open Source": "Yes", "Publish Date": "2025-11-04", "Full Set Score": "-", "Full Set Reference": None, "Text-Only Score": 38.3, "Text-Only Reference": "https://arxiv.org/pdf/2510.24701", }, { "Model": "Perplexity Deep Research", "Organization": "Perplexity", "Open Source": "No", "Publish Date": "2025-02-14", "Full Set Score": 21.1, "Full Set Reference": "https://www.perplexity.ai/hub/blog/introducing-perplexity-deep-research", "Text-Only Score": None, "Text-Only Reference": None, }, { "Model": "MiniMax-M2", "Organization": "MiniMax AI", "Open Source": "Yes", "Publish Date": "2025-10-27", "Full Set Score": None, "Full Set Reference": None, "Text-Only Score": 31.8, "Text-Only Reference": "https://huggingface.co/MiniMaxAI/MiniMax-M2", }, { "Model": "ToolOrchestra", "Organization": "NVIDIA", "Open Source": "Yes", "Publish Date": "2025-11-26", "Full Set Score": None, "Full Set Reference": None, "Text-Only Score": 37.1, "Text-Only Reference": "https://arxiv.org/pdf/2511.21689", }, ] # Legacy support - kept for backward compatibility with file loading FULL_SET_DATA = [] TEXT_ONLY_DATA = [] def load_data_from_file(filepath: str, data_type: str = "csv") -> Optional[List[Dict]]: """Load data from CSV or JSON file""" if not os.path.exists(filepath): return None try: if data_type == "csv": df = pd.read_csv(filepath) return df.to_dict("records") elif data_type == "json": with open(filepath, "r") as f: return json.load(f) except Exception as e: print(f"Error loading data from {filepath}: {e}") return None return None def get_data(): """Get data from files if they exist, otherwise use default unified data""" # Try to load unified data from file first unified_data = load_data_from_file("data/leaderboard.json", "json") if unified_data is None: unified_data = load_data_from_file("data/leaderboard.csv", "csv") # If unified data exists, split it into full_set and text_only format if unified_data: full_set = [] text_only = [] for entry in unified_data: # Extract common fields model_info = { "Model": entry.get("Model", ""), "Organization": entry.get("Organization", ""), "Open Source": entry.get("Open Source", ""), "Publish Date": entry.get("Publish Date", ""), } # Add full set data if available if entry.get("Full Set Score") is not None: full_set_entry = model_info.copy() full_set_entry["Score"] = entry.get("Full Set Score") if entry.get("Full Set Reference"): full_set_entry["Reference"] = entry.get("Full Set Reference") full_set.append(full_set_entry) # Add text-only data if available if entry.get("Text-Only Score") is not None: text_only_entry = model_info.copy() text_only_entry["Score"] = entry.get("Text-Only Score") if entry.get("Text-Only Reference"): text_only_entry["Reference"] = entry.get("Text-Only Reference") text_only.append(text_only_entry) return full_set, text_only # Fallback: Try to load separate files (backward compatibility) full_set = load_data_from_file("data/full_set.csv", "csv") if full_set is None: full_set = load_data_from_file("data/full_set.json", "json") if full_set is None: # Convert unified LEADERBOARD_DATA to separate formats full_set = [] text_only = [] for entry in LEADERBOARD_DATA: model_info = { "Model": entry.get("Model", ""), "Organization": 
entry.get("Organization", ""), "Open Source": entry.get("Open Source", ""), "Publish Date": entry.get("Publish Date", ""), } if entry.get("Full Set Score") is not None: full_set_entry = model_info.copy() full_set_entry["Score"] = entry.get("Full Set Score") if entry.get("Full Set Reference"): full_set_entry["Reference"] = entry.get("Full Set Reference") full_set.append(full_set_entry) if entry.get("Text-Only Score") is not None: text_only_entry = model_info.copy() text_only_entry["Score"] = entry.get("Text-Only Score") if entry.get("Text-Only Reference"): text_only_entry["Reference"] = entry.get("Text-Only Reference") text_only.append(text_only_entry) return full_set, text_only text_only = load_data_from_file("data/text_only.csv", "csv") if text_only is None: text_only = load_data_from_file("data/text_only.json", "json") if text_only is None: text_only = TEXT_ONLY_DATA return full_set, text_only def create_leaderboard_table(data: List[Dict], title: str) -> pd.DataFrame: """Convert data list to pandas DataFrame for display""" df = pd.DataFrame(data) # Remove Rank and Tools columns if they exist columns_to_remove = ["Rank", "Tools"] for col in columns_to_remove: if col in df.columns: df = df.drop(columns=[col]) # Sort by Score descending if Score column exists if "Score" in df.columns: df = df.sort_values("Score", ascending=False).reset_index(drop=True) return df def dataframe_to_html(df: pd.DataFrame) -> str: """Convert DataFrame to HTML table with proper styling""" html = "
" html += '' # Header html += "" for col in df.columns: html += f'' html += "" # Body html += "" for idx, row in df.iterrows(): html += "" for col in df.columns: cell_value = str(row[col]) if pd.notna(row[col]) else "" # Check if cell contains HTML link if "{cell_value}' else: html += f'' html += "" html += "
{col}
{cell_value}
" html += "
" return html def format_leaderboard_table(data: List[Dict], score_column_name: str) -> pd.DataFrame: """Format a single leaderboard table with proper score formatting and links""" if not data: return pd.DataFrame() # Filter out models without scores (None, empty string, or "-") filtered_data = [] for row in data: score = row.get("Score", "") # Skip if score is None, empty, or "-" if score is None or str(score).strip() == "" or str(score).strip() == "-": continue filtered_data.append(row) if not filtered_data: return pd.DataFrame() # Create DataFrame from filtered data df = pd.DataFrame(filtered_data) # Format scores with links and tooltips formatted_data = [] for _, row in df.iterrows(): formatted_row = { "Agent / Model": row.get("Model", ""), "Organization": row.get("Organization", ""), "Open Source": row.get("Open Source", ""), "Publish Date": row.get("Publish Date", ""), } score = row.get("Score", "") reference = row.get("Reference", "") # Format score with reference link if available if score and reference: # Format as HTML link with custom tooltip formatted_row[score_column_name] = ( f'' f'{score}{reference}' ) elif score: # Format score (numeric values) try: score_val = float(score) formatted_row[score_column_name] = f"{score_val:.2f}" except (ValueError, TypeError): formatted_row[score_column_name] = f"{score}" else: formatted_row[score_column_name] = "" formatted_data.append(formatted_row) result_df = pd.DataFrame(formatted_data) # Sort by score descending def extract_score_for_sorting(score_str): """Extract numeric score from string""" if not score_str or str(score_str).strip() == "" or str(score_str).strip() == "-": return None # Extract from HTML if present match = re.search(r"([\d.]+)", str(score_str)) if match: try: return float(match.group(1)) except (ValueError, TypeError): pass # Try direct conversion try: return float(score_str) except (ValueError, TypeError): return None if score_column_name in result_df.columns: result_df["_sort_score"] = result_df[score_column_name].apply(extract_score_for_sorting) result_df = result_df.sort_values("_sort_score", ascending=False, na_position="last") result_df = result_df.drop(columns=["_sort_score"]) # Reorder columns column_order = ["Agent / Model", "Organization", "Open Source", "Publish Date", score_column_name] existing_columns = [col for col in column_order if col in result_df.columns] other_columns = [col for col in result_df.columns if col not in column_order] result_df = result_df[existing_columns + other_columns] return result_df.reset_index(drop=True) def merge_leaderboard_tables(full_set_data: List[Dict], text_only_data: List[Dict]) -> pd.DataFrame: """Merge full set and text-only data into a single table""" # Create dictionaries keyed by Model name for easier merging full_set_dict = {} for row in full_set_data: model = row.get("Model", "") if model: full_set_dict[model] = row text_only_dict = {} for row in text_only_data: model = row.get("Model", "") if model: text_only_dict[model] = row # Get all unique models all_models = set(full_set_dict.keys()) | set(text_only_dict.keys()) # Build merged data merged_data = [] def get_sort_key(model): """Get sort key for initial model ordering""" full_score = full_set_dict.get(model, {}).get("Score", "") text_score = text_only_dict.get(model, {}).get("Score", "") # Try to get full set score if full_score and str(full_score).strip() != "-": try: return -float(full_score) except (ValueError, TypeError): pass # If no valid full score, use text-only score minus 3% if text_score and 
str(text_score).strip() != "-": try: return -float(text_score) + 3 except (ValueError, TypeError): pass # Default to 0 if no valid scores return 0 for model in sorted(all_models, key=get_sort_key): row = {"Model": model} # Add full set score with reference link if model in full_set_dict: full_score = full_set_dict[model].get("Score", "") full_ref = full_set_dict[model].get("Reference", "") if full_score and full_ref: # Format as HTML link with custom tooltip row["Full Set Score"] = ( f'' f'{full_score}{full_ref}' ) elif full_score: row["Full Set Score"] = f"{full_score}" else: row["Full Set Score"] = "" else: row["Full Set Score"] = "" # Add text-only score with reference link if model in text_only_dict: text_score = text_only_dict[model].get("Score", "") text_ref = text_only_dict[model].get("Reference", "") if text_score and text_ref: # Format as HTML link with custom tooltip row["Text-Only Score"] = ( f'' f'{text_score}{text_ref}' ) elif text_score: row["Text-Only Score"] = f"{text_score}" else: row["Text-Only Score"] = "" else: row["Text-Only Score"] = "" # Add Organization (from data if available, otherwise use placeholder) if model in full_set_dict and "Organization" in full_set_dict[model]: row["Organization"] = full_set_dict[model]["Organization"] elif model in text_only_dict and "Organization" in text_only_dict[model]: row["Organization"] = text_only_dict[model]["Organization"] else: row["Organization"] = "" # Will be filled from data if available # Add Open Source (from data if available, otherwise use placeholder) if model in full_set_dict and "Open Source" in full_set_dict[model]: row["Open Source"] = full_set_dict[model]["Open Source"] elif model in full_set_dict and "OpenSource" in full_set_dict[model]: row["Open Source"] = full_set_dict[model]["OpenSource"] elif model in text_only_dict and "Open Source" in text_only_dict[model]: row["Open Source"] = text_only_dict[model]["Open Source"] elif model in text_only_dict and "OpenSource" in text_only_dict[model]: row["Open Source"] = text_only_dict[model]["OpenSource"] else: row["Open Source"] = "" # Will be filled from data if available # Add Publish Date (from data if available) if model in full_set_dict and "Publish Date" in full_set_dict[model]: row["Publish Date"] = full_set_dict[model]["Publish Date"] elif model in text_only_dict and "Publish Date" in text_only_dict[model]: row["Publish Date"] = text_only_dict[model]["Publish Date"] else: row["Publish Date"] = "" merged_data.append(row) # Create DataFrame merged_df = pd.DataFrame(merged_data) # Extract numeric scores for sorting (before formatting as links) # We need to sort before converting to links, so let's extract the numeric values def extract_score_for_sorting(score_str): """Extract numeric score from string (handles both plain numbers and HTML links)""" if not score_str or str(score_str).strip() == "" or str(score_str).strip() == "-": return None # If it's an HTML link like 85.2, extract 85.2 match = re.search(r"]*>([\d.]+)", str(score_str)) if match: return float(match.group(1)) # Otherwise try to convert directly try: return float(score_str) except Exception: return None # Sort by Full Set Score descending (or Text-Only Score minus 3% if Full Set Score is None or "-") if "Full Set Score" in merged_df.columns and "Text-Only Score" in merged_df.columns: merged_df["_full_score"] = merged_df["Full Set Score"].apply(extract_score_for_sorting) merged_df["_text_score"] = merged_df["Text-Only Score"].apply(extract_score_for_sorting) # Calculate sort score: use Full Set Score 
if available, otherwise Text-Only Score * 0.97 def calculate_sort_score(row): full_score = row["_full_score"] text_score = row["_text_score"] if full_score is not None: return full_score elif text_score is not None: return text_score * 0.97 # Minus 3% else: return 0 merged_df["_sort_score"] = merged_df.apply(calculate_sort_score, axis=1) merged_df = merged_df.sort_values("_sort_score", ascending=False, na_position="last") merged_df = merged_df.drop(columns=["_sort_score", "_full_score", "_text_score"]) elif "Full Set Score" in merged_df.columns: merged_df["_sort_score"] = merged_df["Full Set Score"].apply(extract_score_for_sorting) merged_df = merged_df.sort_values("_sort_score", ascending=False, na_position="last") merged_df = merged_df.drop(columns=["_sort_score"]) # Format scores that aren't already links def format_score_with_ref(score_str): """Format score, ensuring it has 2 decimal places if it's a link""" if not score_str: return "" # If already an HTML link with tooltip and bold, format the number inside tooltip_pattern = ( r']*>([\d.]+)' r'([^<]+)' ) match = re.match(tooltip_pattern, str(score_str)) if match: url = match.group(1) score_val = float(match.group(2)) tooltip_text = match.group(3) return ( f'' f'{score_val:.2f}{tooltip_text}' ) # If already an HTML link with tooltip (without bold), format the number inside tooltip_pattern = r']*>([\d.]+)' r'([^<]+)' match = re.match(tooltip_pattern, str(score_str)) if match: url = match.group(1) score_val = float(match.group(2)) tooltip_text = match.group(3) return ( f'' f'{score_val:.2f}{tooltip_text}' ) # If it's a simple link without tooltip wrapper, add tooltip wrapper match = re.match(r']*>([\d.]+)', str(score_str)) if match: url = match.group(1) score_val = float(match.group(2)) return ( f'' f'{score_val:.2f}{url}' ) # If it's already bold but not a link match = re.match(r"([\d.]+)", str(score_str)) if match: score_val = float(match.group(1)) return f"{score_val:.2f}" # Otherwise format as plain number with bold try: score_val = float(score_str) return f"{score_val:.2f}" except Exception: # If it's already a string with special characters (like "-"), keep it as is but make bold return f"{score_str}" merged_df["Full Set Score"] = merged_df["Full Set Score"].apply(format_score_with_ref) if "Text-Only Score" in merged_df.columns: def format_score_with_ref(score_str): """Format score, ensuring it has 2 decimal places if it's a link""" if not score_str: return "" # If already an HTML link with tooltip, format the number inside tooltip_pattern = r']*>([\d.]+)' r'([^<]+)' match = re.match(tooltip_pattern, str(score_str)) if match: url = match.group(1) score_val = float(match.group(2)) tooltip_text = match.group(3) return ( f'' f'{score_val:.2f}{tooltip_text}' ) # If it's a simple link without tooltip wrapper, add tooltip wrapper match = re.match(r']*>([\d.]+)', str(score_str)) if match: url = match.group(1) score_val = float(match.group(2)) return ( f'' f'{score_val:.2f}{url}' ) # Otherwise format as plain number try: score_val = float(score_str) return f"{score_val:.2f}" except Exception: return str(score_str) merged_df["Text-Only Score"] = merged_df["Text-Only Score"].apply(format_score_with_ref) # Ensure Organization, Open Source, and Publish Date columns exist if "Organization" not in merged_df.columns: merged_df["Organization"] = "" if "Open Source" not in merged_df.columns: merged_df["Open Source"] = "" if "Publish Date" not in merged_df.columns: merged_df["Publish Date"] = "" # Rename Model column to "Agent / Model" if "Model" in 
merged_df.columns:
        merged_df = merged_df.rename(columns={"Model": "Agent / Model"})

    # Reorder columns: Agent / Model, Organization, Open Source, Publish Date, Full Set Score, Text-Only Score
    column_order = ["Agent / Model", "Organization", "Open Source", "Publish Date", "Full Set Score", "Text-Only Score"]
    existing_columns = [col for col in column_order if col in merged_df.columns]
    other_columns = [col for col in merged_df.columns if col not in column_order]
    merged_df = merged_df[existing_columns + other_columns]

    return merged_df.reset_index(drop=True)


def markdown_to_html(text: str) -> str:
    """Convert markdown text to HTML while preserving inline HTML badges"""
    lines = text.split("\n")
    html_lines = []
    in_list = False
    in_paragraph = False
    paragraph_lines = []

    def process_inline_markdown(line: str) -> str:
        """Process inline markdown (bold, links) in a line"""
        # Convert markdown links [text](url) to HTML links
        line = re.sub(r"\[([^\]]+)\]\(([^\)]+)\)", r'<a href="\2">\1</a>', line)
        # Convert markdown bold **text** to HTML bold
        line = re.sub(r"\*\*([^\*]+)\*\*", r"<strong>\1</strong>", line)
        return line

    for line in lines:
        stripped = line.strip()
        # Handle headers
        if stripped.startswith("#### "):
            if in_list:
                html_lines.append("</ul>")
                in_list = False
            if in_paragraph:
                html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
                paragraph_lines = []
                in_paragraph = False
            header_text = process_inline_markdown(stripped[5:])
            html_lines.append(f"<h4>{header_text}</h4>")
        elif stripped.startswith("### "):
            if in_list:
                html_lines.append("</ul>")
                in_list = False
            if in_paragraph:
                html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
                paragraph_lines = []
                in_paragraph = False
            header_text = process_inline_markdown(stripped[4:])
            html_lines.append(f"<h3>{header_text}</h3>")
        # Handle list items
        elif stripped.startswith("- "):
            if in_paragraph:
                html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
                paragraph_lines = []
                in_paragraph = False
            if not in_list:
                html_lines.append("<ul>")
                in_list = True
            item_text = process_inline_markdown(stripped[2:])
            html_lines.append(f"<li>{item_text}</li>")
        # Handle blank lines: close any open list or paragraph
        elif not stripped:
            if in_list:
                html_lines.append("</ul>")
                in_list = False
            if in_paragraph:
                html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>")
                paragraph_lines = []
                in_paragraph = False
        # Handle regular paragraphs
        else:
            if in_list:
                html_lines.append("</ul>")
                in_list = False
            processed_line = process_inline_markdown(stripped)
            paragraph_lines.append(processed_line)
            in_paragraph = True

    # Close any open structures
    if in_list:
        html_lines.append("</ul>")
    if in_paragraph:
        html_lines.append("<p>" + " ".join(paragraph_lines) + "</p>
") return "\n".join(html_lines) def create_motivation_section(): """Create the motivation section content as HTML""" motivation = f"""### About Humanity's Last Exam (HLE) [Humanity's Last Exam (HLE)](https://agi.safe.ai/) is a rigorous, multi-modal AI benchmark created by the Center for AI Safety in collaboration with Scale AI, designed to push large language models beyond saturated tests by evaluating reasoning and expert-level knowledge across thousands of challenging questions spanning mathematics, natural sciences, and the humanities. ### Why another leaderboard? While leaderboards existed for HLE, they fall short in several ways, leading to widespread confusion about true state-of-the-art results. In fact, if you ask ChatGPT, Gemini, and Claude "What's the SOTA for Humanity's Last Exam" today, they will all get the answer wrong. - **Tool Exclusion**: The [official leaderboard by Scale AI](https://scale.com/leaderboard/humanitys_last_exam) and many other leaderboards focus on models without tool use. - **Lack of separation**: Scores for the full HLE benchmark and the text-only subset are often mixed together, leading to unfair comparisons - score on full set is generally lower than the text only subset. - **Data Contamination**: Since copies of HLE, blogs and papers discussing HLE have been indexed by search engines, scores might be artificially inflated for agents without filtering. In this leaderboard, we add a{VERIFIED_BADGE} badge to indicate that some form of filtering is mentioned for the agent or its previous versions. """ return markdown_to_html(motivation) def build_leaderboard(): """Build the Gradio interface""" # Get data (from files or defaults) full_set_data, text_only_data = get_data() # Create separate tables for Full Set and Text-Only full_set_df = format_leaderboard_table(full_set_data, "Full Set Score") text_only_df = format_leaderboard_table(text_only_data, "Text-Only Score") custom_css = """ .gradio-container { max-width: 1200px; width: 100%; margin: 0 auto; padding: 0 10px; overflow: visible !important; } .gradio-row { justify-content: center; overflow: visible !important; } .gradio-column { text-align: left; max-width: 100%; overflow: visible !important; } h1, h2, h3 { text-align: center !important; } .markdown-text h1, .markdown-text h2, .markdown-text h3 { text-align: center !important; } hr { border-color: #95d5b2 !important; border-width: 2px !important; } .markdown-text, .markdown-text p, .markdown-text ul, .markdown-text ol, .markdown-text li { text-align: left !important; } /* Style verified badge in markdown - restore badge styling for all spans if inline styles are stripped */ .markdown-text span { display: inline-block !important; width: 12px !important; height: 12px !important; background-color: #00bfa5 !important; border-radius: 50% !important; color: white !important; text-align: center !important; line-height: 12px !important; font-size: 8px !important; font-weight: bold !important; margin-left: 4px !important; vertical-align: middle !important; } table { border: 2px solid #95d5b2 !important; border-collapse: collapse; margin: 0 auto; } table th { background-color: #95d5b2 !important; color: #1a1a1a !important; text-align: center; } table td { text-align: center; } table tr:hover { background-color: rgba(149, 213, 178, 0.1) !important; } table a { color: inherit !important; text-decoration: none !important; } table a:hover { color: #00bfa5 !important; } /* Make Agent / Model column bold */ table td:first-child, table th:first-child { font-weight: 
bold !important; } /* Make scores green */ table td strong { color: #00bfa5 !important; } table td a strong { color: #00bfa5 !important; } /* Custom tooltip styling */ .tooltip { position: relative; display: inline-block; } .tooltip .tooltiptext { visibility: hidden; opacity: 0; background-color: rgba(60, 60, 60, 0.95); color: #ffffff; text-align: center; border-radius: 0; padding: 14px 18px; position: absolute; z-index: 9999; bottom: 125%; left: 50%; transform: translateX(-50%); font-size: 13px; box-shadow: 0 6px 16px rgba(0, 0, 0, 0.3); transition: opacity 0.05s ease-in, visibility 0.05s; pointer-events: none; margin-bottom: 15px; word-wrap: break-word; white-space: nowrap; line-height: 1.5; border: none; } .tooltip .tooltiptext::after { content: ""; position: absolute; top: 100%; left: 50%; margin-left: -6px; border-width: 6px; border-style: solid; border-color: rgba(60, 60, 60, 0.95) transparent transparent transparent; } .tooltip:hover .tooltiptext { visibility: visible; opacity: 1; transition: opacity 0.05s ease-in, visibility 0s; } /* Mobile responsive styles */ @media screen and (max-width: 768px) { .gradio-container { max-width: 100%; padding: 0 5px; } table { font-size: 12px; } table td, table th { padding: 6px 4px !important; font-size: 11px; } h1 { font-size: 24px !important; } h2 { font-size: 20px !important; } h3 { font-size: 18px !important; } .markdown-text { font-size: 14px !important; } .tooltip .tooltiptext { font-size: 11px; padding: 10px 12px; white-space: normal; } } @media screen and (max-width: 480px) { .gradio-container { padding: 0 3px; } table { font-size: 10px; } table td, table th { padding: 4px 2px !important; font-size: 10px; } h1 { font-size: 20px !important; } h2 { font-size: 18px !important; } h3 { font-size: 16px !important; } .markdown-text { font-size: 13px !important; } } """ with gr.Blocks(title="HLE Leaderboard for Agents with Tools", theme="JohnSmith9982/small_and_pretty", css=custom_css) as demo: gr.Markdown( """ # 🏆 Humanity's Last Exam Leaderboard for Agents with Tools """ ) gr.Markdown("---") # Motivation Section with gr.Row(): with gr.Column(): motivation_text = create_motivation_section() gr.HTML(motivation_text) gr.Markdown("---") gr.Markdown("
") # Full Set Leaderboard gr.Markdown("## 📊 Full Set Leaderboard") gr.Markdown( """ Results on the complete set of tasks (2500 examples). """ ) # Convert DataFrame to HTML with clickable links full_set_html = dataframe_to_html(full_set_df) # Display as HTML to support clickable links gr.HTML(value=full_set_html) # Text-Only Leaderboard gr.Markdown("
") gr.Markdown("## 📊 Text-Only Leaderboard") gr.Markdown( """ Results on the text-only subset of tasks (2158 examples). """ ) # Convert DataFrame to HTML with clickable links text_only_html = dataframe_to_html(text_only_df) # Display as HTML to support clickable links gr.HTML(value=text_only_html) # Add JavaScript for tooltip positioning (injected after page load) gr.HTML( value="""""", visible=False, ) # Footer gr.Markdown("
") notes_text = f"""#### Notes - All models and agents listed have tool use capabilities. - All numbers are based on the official report of each agent. - Missing{VERIFIED_BADGE} badge does not necessarily mean the agent applied no filtering, it just means that no such mention is found. - Having{VERIFIED_BADGE} badge does not necessarily mean the agent applied perfect filtering. E.g., only blocking huggingface URLs won't be sufficient. - We exclude scores reported on non-official subsets as the results are not comparable. - Please contact us at hle@zoom.us if you find any errors or want to add a model or agent to the leaderboard. - Last updated: Jan 8, 2026. """ gr.HTML(markdown_to_html(notes_text)) return demo if __name__ == "__main__": demo = build_leaderboard() demo.launch(share=False, server_name="0.0.0.0", server_port=7860)