import html
import json
from typing import Any, Dict, List, Optional

import gradio as gr
import pandas as pd


# Example of the results schema consumed by KazTEBLeaderboard: a list with one
# record per model. Each record carries model metadata plus one mapping per
# task ('retrieval', 'classification', 'bitext_mining'); inside a task mapping
# every key except 'average_score' is a dataset name mapped to its score.
# Also returned by load_benchmark_data() when no file path is given.
BENCHMARK_DATA_FORMAT_EXAMPLE = [
    {
        "name": "jinaai/jina-embeddings-v3",
        "url": "https://huggingface.co/jinaai/jina-embeddings-v3",
        "context_length": "8192",
        "num_parameters": "572M",
        "emb_dim": 1024,
        "retrieval": {
            "KazQADRetrieval": 0.63206,
            "average_score": 0.63206,
        },
        "classification": {
            "KazSandraPolarityClassification": 0.75332,
            "KazSandraScoreClassification": 0.519385,
            "average_score": 0.6363525,
        },
        "bitext_mining": {
            "KazParcBitextMining_kaz-to-eng": 0.919131,
            "KazParcBitextMining_eng-to-kaz": 0.912916,
            "KazParcBitextMining_kaz-to-rus": 0.929359,
            "KazParcBitextMining_rus-to-kaz": 0.921656,
            "average_score": 0.9207655,
        },
    },
]


class KazTEBLeaderboard:
    """Build per-task ranking tables and the Gradio UI for KazTEB results.

    ``data`` is a list of model records shaped like
    ``BENCHMARK_DATA_FORMAT_EXAMPLE``: metadata keys (``name``, ``url``,
    ``context_length``, ``num_parameters``, ``emb_dim``) plus one mapping per
    task whose keys are dataset names and the special key ``average_score``.
    """

    # Task keys looked up in every model record, in display order.
    _TASK_NAMES = ('retrieval', 'classification', 'bitext_mining')

    # Columns that precede the per-dataset score columns in every task table.
    # NOTE(review): 'Embedding Dimmension' is misspelled, but it is kept as-is
    # because it is the column name/header returned to callers.
    _BASE_COLUMNS = (
        'Model',
        'Average',
        'Context Length',
        'Parameters',
        'Embedding Dimmension',
    )

    def __init__(self, data: List[Dict[str, Any]]):
        """Store the model records and index the datasets of each task."""
        self.data = data
        self.tasks = self._extract_tasks()

    def _extract_tasks(self) -> Dict[str, List[str]]:
        """Return ``{task_name: [dataset, ...]}`` for tasks present in the data.

        Dataset names are collected across *all* model records (first-seen
        order), so a dataset or task that is missing from the first record is
        still picked up. The ``average_score`` key is not a dataset and is
        skipped. Tasks that appear in no record are omitted.
        """
        tasks: Dict[str, List[str]] = {}
        for task_name in self._TASK_NAMES:
            for model in self.data or ():
                if task_name not in model:
                    continue
                datasets = tasks.setdefault(task_name, [])
                for key in model[task_name]:
                    if key != 'average_score' and key not in datasets:
                        datasets.append(key)
        return tasks

    def _format_score(self, score: float) -> str:
        """Format a score with four decimal places."""
        return f"{score:.4f}"

    def _create_model_link(self, name: str, url: str) -> str:
        """Return an HTML anchor that opens ``url`` in a new tab.

        Both values are HTML-escaped because they come from the results file
        and are rendered verbatim in an ``html`` table column.
        """
        safe_url = html.escape(url, quote=True)
        safe_name = html.escape(name)
        return (
            f'<a href="{safe_url}" target="_blank" '
            f'style="color: #1976d2; text-decoration: none;">{safe_name}</a>'
        )

    def get_task_dataframe(self, task_name: str) -> pd.DataFrame:
        """Return the ranking table for ``task_name``.

        Columns are ``Rank``, the base metadata columns, then one column per
        dataset of the task. Rows are ordered by the raw ``average_score``
        (descending; ties keep input order) and ``Rank`` counts from 1.
        Models without the task are skipped; a dataset score missing for a
        model is shown as ``'N/A'``. If no model has the task, an empty frame
        with the same columns is returned.

        Raises:
            KeyError: if a record that has the task lacks ``name``, ``url`` or
                ``average_score``.
        """
        datasets = self.tasks.get(task_name, [])
        rows = []
        sort_keys = []

        for model in self.data or ():
            if task_name not in model:
                continue

            scores = model[task_name]
            average = scores['average_score']
            row = {
                'Model': self._create_model_link(model['name'], model['url']),
                'Average': self._format_score(average),
                'Context Length': model.get('context_length', 'N/A'),
                'Parameters': model.get('num_parameters', 'N/A'),
                'Embedding Dimmension': model.get('emb_dim', 'N/A'),
            }
            for dataset in datasets:
                row[dataset] = (
                    self._format_score(scores[dataset])
                    if dataset in scores
                    else 'N/A'
                )

            rows.append(row)
            # Rank on the unrounded value so near-equal models are not
            # reordered by the 4-decimal display formatting.
            sort_keys.append(float(average))

        # Explicit columns keep the schema stable even when `rows` is empty.
        df = pd.DataFrame(rows, columns=[*self._BASE_COLUMNS, *datasets])
        df['_sort_key'] = sort_keys
        df = df.sort_values(
            '_sort_key', ascending=False, kind='mergesort'
        ).drop('_sort_key', axis=1)
        df.insert(0, 'Rank', range(1, len(df) + 1))

        return df

    def _render_task_table(self, task_name: str) -> None:
        """Add a read-only table for ``task_name`` to the current Gradio context."""
        df = self.get_task_dataframe(task_name)
        n_cols = len(df.columns)
        gr.DataFrame(
            value=df,
            headers=list(df.columns),
            # Rank is numeric, Model is an HTML link, everything else is text.
            datatype=["number", "html", "str", "str", "str"] + ["str"] * (n_cols - 5),
            col_count=(n_cols, "fixed"),
            interactive=False,
            column_widths=[50, 400] + [200] * (n_cols - 2),
        )

    def create_interface(self):
        """Assemble and return the ``gr.Blocks`` app (not yet launched).

        NOTE(review): several UI strings contain 'π', 'π§' and 'β', which look
        like mis-decoded emoji/arrow characters. They are reproduced verbatim
        here; restore the intended glyphs from the original source.
        """
        # Client-side hook: reload the page with ?__theme=light unless the
        # query string already requests the light theme.
        js_func = """
        function refresh() {
            const url = new URL(window.location);

            if (url.searchParams.get('__theme') !== 'light') {
                url.searchParams.set('__theme', 'light');
                window.location.href = url.href;
            }
        }
        """

        with gr.Blocks(js=js_func) as demo:

            gr.HTML(
                """
                <div style="text-align: center; margin-bottom: 20px;">
                    <h1 style="font-size: 36px; margin-bottom: 10px;">KazTEB Leaderboard π</h1>
                    <p style="font-size: 22px; color: #666;">Kazakh language extension for the <a href="https://github.com/embeddings-benchmark/mteb" target="_blank" style="color: #1976d2; text-decoration: none;">Massive Text Embedding Benchmark</a></p>
                </div>
                """
            )

            gr.HTML(
                """
                <div style="margin-bottom: 30px; padding: 20px; background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #1976d2;">
                    <p style="font-size: 16px; line-height: 1.6; margin: 0; color: #333;">
                        This is a new and ongoing project dedicated to a comprehensive evaluation of existing text embedding models on datasets designed for Kazakh language tasks. <a href="https://github.com/Batyr1203/kazteb">Link</a> to the project code. <br><br>Currently, the leaderboard supports only 3 tasks: <b>retrieval</b>, <b>classification</b>, and <b>bitext mining</b>, based on existing human-annotated datasets. The aim of this project is to extend the list to 8 tasks proposed in MTEB and cover multiple domains within each task. The test datasets are planned to be acquired from real data sources, without using synthetic samples.
                    </p>
                </div>
                """
            )

            with gr.Tabs():
                with gr.Tab("π Task Results"):

                    with gr.Tabs():
                        with gr.Tab("Retrieval"):
                            self._render_task_table('retrieval')

                        with gr.Tab("Classification"):
                            self._render_task_table('classification')

                        with gr.Tab("Bitext Mining"):
                            self._render_task_table('bitext_mining')

                with gr.Tab("π Metrics"):
                    gr.Markdown("## Evaluation Metrics Overview")
                    gr.Markdown("Although the evaluation generates multiple metric values for each task, we retain only a single metric for reference.")

                    # Markdown is whitespace-sensitive, so the cards are built
                    # from explicit lines instead of indented triple-quoted text.
                    with gr.Row():

                        with gr.Column():
                            gr.Markdown(
                                "### π Retrieval\n"
                                "\n"
                                "**Metric:** nDCG@10 (Normalized Discounted Cumulative Gain)\n"
                                "- Measures ranking quality of retrieved documents\n"
                                "- Considers both relevance and position\n"
                                "- **Range:** 0.0 - 1.0 (higher is better)\n"
                                "\n"
                                "**Dataset:** [KazQADRetrieval](https://huggingface.co/datasets/issai/kazqad)\n"
                                "- Question-answer retrieval for Kazakh language\n"
                                "- Human-annotated question-document pairs",
                                elem_classes=["retrieval-card"]
                            )

                        with gr.Column():
                            gr.Markdown(
                                "### π Classification\n"
                                "\n"
                                "**Metric:** Accuracy\n"
                                "- Percentage of correctly classified instances\n"
                                "- Standard classification metric\n"
                                "- **Range:** 0.0 - 1.0 (higher is better)\n"
                                "\n"
                                "**Datasets:**\n"
                                "- [KazSandraPolarityClassification](https://huggingface.co/datasets/issai/kazsandra): Sentiment polarity\n"
                                "- [KazSandraScoreClassification](https://huggingface.co/datasets/issai/kazsandra): Sentiment scoring",
                                elem_classes=["classification-card"]
                            )

                        with gr.Column():
                            gr.Markdown(
                                "### π Bitext Mining\n"
                                "\n"
                                "**Metric:** F1-Score\n"
                                "- Harmonic mean of precision and recall\n"
                                "- Balances correctness and completeness\n"
                                "- **Range:** 0.0 - 1.0 (higher is better)\n"
                                "\n"
                                "**Dataset:** [KazParcBitextMining](https://huggingface.co/datasets/issai/kazparc)\n"
                                "- Parallel sentence mining (kk β en, kk β ru)\n"
                                "- Bidirectional evaluation",
                                elem_classes=["bitext-card"]
                            )

                    gr.Markdown("---")
                    gr.Markdown("### π Scoring & Ranking")

                    with gr.Row():
                        with gr.Column():
                            gr.Markdown("**Task Averaging:** Equal weight per dataset within each task")
                        with gr.Column():
                            gr.Markdown("**Model Ranking:** Based on individual task performance")
                        with gr.Column():
                            # Intentionally empty third column (presumably to
                            # line up with the three metric cards above).
                            pass

            # NOTE(review): the footer is placed below the tabs; the original
            # nesting was not recoverable from the mangled source - confirm.
            gr.Markdown("---")
            gr.HTML(
                """
                <div style="margin-top: 30px; padding: 20px; background-color: #f0f8ff; border-radius: 8px; border-left: 4px solid #4a90e2;">
                    <h3 style="margin-top: 0; color: #2c3e50; display: flex; align-items: center;">
                        π TODO:
                    </h3>
                    <ul style="color: #333; line-height: 1.6; margin-bottom: 0;">
                        <li><strong>Dynamic Data Loading:</strong> Switching to API-based result fetching for real-time updates without manual JSON uploads.</li>
                    </ul>
                </div>
                """
            )

            gr.HTML(
                """
                <div style="text-align: center; margin-top: 20px; padding: 15px; color: #666; font-size: 14px;">
                    π§ Contact: <a href="mailto:arysbatyr@gmail.com" style="color: #1976d2; text-decoration: none;">arysbatyr@gmail.com</a>
                </div>
                """
            )

        return demo


def load_benchmark_data(filepath: Optional[str] = None) -> List[Dict[str, Any]]:
    """Load benchmark results from a JSON file.

    Args:
        filepath: Path to a JSON file holding a list of model records. When
            omitted (or empty), the built-in ``BENCHMARK_DATA_FORMAT_EXAMPLE``
            is returned instead.

    Returns:
        The parsed JSON content, or the example data.

    Raises:
        OSError: if ``filepath`` cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    if not filepath:
        return BENCHMARK_DATA_FORMAT_EXAMPLE

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


if __name__ == "__main__":
    # Script entry point: read the results file from the working directory,
    # build the leaderboard UI and start the Gradio server.
    data = load_benchmark_data("./results.json")

    leaderboard = KazTEBLeaderboard(data)

    demo = leaderboard.create_interface()
    demo.launch()