import pandas as pd
import gradio as gr
import csv
import json
import os
import requests
import io
import shutil
import pprint as pp
from huggingface_hub import Repository
from datasets import DATASETS

HF_TOKEN = os.environ.get("HF_TOKEN")

BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
COLUMN_NAMES = BASE_COLS + TASKS_V1

DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
    ['number'] * len(TASKS_V1)

LEADERBOARD_INTRODUCTION = """
# 📊 **MMEB LEADERBOARD (VLM2Vec)**

## Introduction
We introduce a novel benchmark, **MMEB-V1 (Massive Multimodal Embedding Benchmark)**, which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training and evaluating embedding models across various combinations of text and image modalities. All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text, or a combination of both. MMEB-V1 is divided into 20 in-distribution datasets, which can be used for training, and 16 out-of-distribution datasets, reserved for evaluation.

Building upon **MMEB-V1**, **MMEB-V2** expands the evaluation scope to include five new tasks: four video-based tasks (Video Retrieval, Moment Retrieval, Video Classification, and Video Question Answering) and one task focused on visual documents, Visual Document Retrieval. This comprehensive suite enables robust evaluation of multimodal embedding models across static, temporal, and structured visual data settings.

**IMPORTANT NOTES regarding the old MMEB-V1 leaderboard:**
MMEB-V1 is now part of the Image section of MMEB-V2, and its results have been merged into the MMEB-V2 Image leaderboard. We are no longer accepting submissions to the old V1 leaderboard, and it will be removed from this page soon. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support.

**On the V2 Image leaderboard, models carried over from the old V1 leaderboard are missing per-dataset scores. We encourage the authors of those models to rerun them with our updated V2 pipeline and provide us with the score sheet in the new format, so that their entries are consistent with the other models'.**

| [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**GitHub**](https://github.com/TIGER-AI-Lab/VLM2Vec) | [**📖MMEB-V2/VLM2Vec-V2 Paper**](https://arxiv.org/abs/2507.04590) | [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160) | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) | [**Discord**](https://discord.gg/njyKubdtry) |
"""

TABLE_INTRODUCTION = """***Important Notes:*** This is the old MMEB-V1 leaderboard, which is now deprecated and will be removed from this page soon. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into the MMEB-V2 Image section. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your collaboration and understanding! \n"""

LEADERBOARD_INFO = f"""
## Dataset Overview
This is the dictionary of all datasets used in our code. Please make sure all datasets' scores are included in your submission. \n
```python
{pp.pformat(DATASETS)}
```
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{jiang2024vlm2vec,
  title={VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
  author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.05160},
  year={2024}
}"""

SUBMIT_INTRODUCTION = """# Submit to the MMEB Leaderboard

## Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions on evaluating your model. \n
## ⚠️ Please note that your submission must be a JSON file in the following format:
```json
{
    "metadata": {
        "model_name": "",
        "url": "" or null,
        "model_size": or null,
        "data_source": "Self-Reported",
        ...
    },
    "metrics": {
        "image": {
            "ImageNet-1K": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ...
            },
            "N24News": {
                ...
            },
            ...
        },
        "visdoc": {
            "ViDoRe": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ...
            },
            ...
        },
        "video": {
            "DiDeMo": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ...
            },
            "MSR-VTT": {
                ...
            },
            ...
        }
    }
}
```
To submit, open a pull request that adds the generated JSON file to the ***scores*** folder, then notify us on [our Discord server](https://discord.gg/njyKubdtry) or by email at m7su@uwaterloo.ca, including your model's information. \n
We will review your submission and update the leaderboard accordingly. \n
We highly recommend joining our [Discord server](https://discord.gg/njyKubdtry); it is the most convenient way to stay informed about the latest updates and to share feedback for improving the leaderboard experience.
We appreciate your contributions to the MMEB community!
"""


def create_hyperlinked_names(df):
    def convert_url(url, model_name):
        # Render the model name as a hyperlink when a URL is provided;
        # the 'Models' column is declared as 'markdown' in DATA_TITLE_TYPE.
        return f'<a href="{url}">{model_name}</a>' if url else model_name

    def add_link_to_model_name(row):
        row['Models'] = convert_url(row['URL'], row['Models'])
        return row

    df = df.copy()
    df = df.apply(add_link_to_model_name, axis=1)
    return df


# def fetch_data(file: str) -> pd.DataFrame:
#     # fetch the leaderboard data from remote
#     if file is None:
#         raise ValueError("URL Not Provided")
#     url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}"
#     print(f"Fetching data from {url}")
#     response = requests.get(url)
#     if response.status_code != 200:
#         raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}")
#     return pd.read_json(io.StringIO(response.text), orient='records', lines=True)


def get_df(file="results.jsonl"):
    df = pd.read_json(file, orient='records', lines=True)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    # Show '-' for tasks a model has not been evaluated on.
    for task in TASKS_V1:
        if df[task].isnull().any():
            df[task] = df[task].apply(lambda score: '-' if pd.isna(score) else score)
    df = df.sort_values(by=['V1-Overall'], ascending=False)
    df = create_hyperlinked_names(df)
    df['Rank'] = range(1, len(df) + 1)
    return df


def refresh_data():
    df = get_df()
    return df[COLUMN_NAMES]


def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()
    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
    # Models with unknown size are kept only when the full size range is selected.
    size_mask = filtered_df['Model Size(B)'].apply(
        lambda x: (min_size <= 1000.0 <= max_size) if x == 'unknown' else (min_size <= x <= max_size))
    filtered_df = filtered_df[size_mask]
    return filtered_df[COLUMN_NAMES]


def search_models(df, query):
    if query:
        return df[df['Models'].str.contains(query, case=False, na=False)]
    return df


def get_size_range(df):
    sizes = df['Model Size(B)'].apply(lambda x: 0.0 if x == 'unknown' else x)
    if (sizes == 0.0).all():
        return 0.0, 1000.0
    return float(sizes.min()), float(sizes.max())


def process_model_size(size):
    if pd.isna(size) or size == 'unk':
        return 'unknown'
    try:
        val = float(size)
        return round(val, 3)
    except (ValueError, TypeError):
        return 'unknown'


def filter_columns_by_tasks(df, selected_tasks=None):
    if selected_tasks is None or len(selected_tasks) == 0:
        return df[COLUMN_NAMES]
    # The overall score column in this table is named 'V1-Overall'.
    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'V1-Overall']
    selected_columns = base_columns + selected_tasks
    available_columns = [col for col in selected_columns if col in df.columns]
    return df[available_columns]
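

# --- Usage sketch (illustrative only) ---
# A minimal example of how the helpers above could be wired into a Gradio tab.
# This is an assumption about how the Space consumes this module, not the actual
# app layout; it also assumes a local `results.jsonl` containing the columns in
# COLUMN_NAMES plus a 'URL' field. The tab label is hypothetical.
if __name__ == "__main__":
    leaderboard_df = get_df("results.jsonl")

    with gr.Blocks() as demo:
        gr.Markdown(LEADERBOARD_INTRODUCTION)
        with gr.Tab("MMEB-V1 (deprecated)"):
            gr.Markdown(TABLE_INTRODUCTION)
            search_box = gr.Textbox(label="Search models")
            table = gr.Dataframe(
                value=leaderboard_df[COLUMN_NAMES],
                headers=COLUMN_NAMES,
                datatype=DATA_TITLE_TYPE,
                interactive=False,
            )
            # Re-filter the table whenever the search query changes.
            search_box.change(
                fn=lambda q: search_models(leaderboard_df, q)[COLUMN_NAMES],
                inputs=search_box,
                outputs=table,
            )
        gr.Markdown(SUBMIT_INTRODUCTION)

    demo.launch()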