File size: 11,133 Bytes
098125d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9192c4
098125d
 
 
 
 
 
 
 
 
c9192c4
098125d
 
c9192c4
098125d
 
c9192c4
098125d
 
c9192c4
098125d
 
 
c9192c4
098125d
 
 
 
 
 
 
c9192c4
098125d
 
 
 
c9192c4
098125d
c9192c4
098125d
 
 
 
c9192c4
098125d
c9192c4
098125d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9192c4
098125d
 
 
 
 
 
 
c9192c4
098125d
c9192c4
098125d
 
 
 
 
 
 
 
c9192c4
098125d
 
c9192c4
098125d
 
 
 
 
 
 
 
c9192c4
 
098125d
c9192c4
098125d
 
 
 
 
 
 
c9192c4
 
098125d
c9192c4
098125d
 
 
 
 
 
 
c9192c4
 
098125d
c9192c4
098125d
 
 
c9192c4
098125d
 
 
 
 
c9192c4
098125d
 
 
 
 
 
 
 
 
 
c9192c4
098125d
 
 
c9192c4
098125d
 
 
 
 
 
9a814cd
 
098125d
 
c9192c4
098125d
 
 
c9192c4
098125d
 
 
 
 
 
9a814cd
098125d
 
 
c9192c4
098125d
 
c9192c4
098125d
 
 
 
 
 
 
 
c9192c4
098125d
c9192c4
098125d
 
 
 
 
 
 
 
 
 
 
c9192c4
098125d
c9192c4
098125d
 
 
 
 
 
c9192c4
098125d
 
 
 
 
 
 
 
 
 
 
 
c9192c4
098125d
c9192c4
098125d
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
import html
import json
from typing import Any, Dict, List, Optional

import gradio as gr
import pandas as pd


# Sample data: a single model record showing the schema KazTEBLeaderboard reads.
# load_benchmark_data() returns this list when it is called without a file path.
# Each task section ("retrieval", "classification", "bitext_mining") maps dataset
# names to scores and also carries an "average_score" entry; in this sample that
# entry equals the arithmetic mean of the section's dataset scores.
BENCHMARK_DATA_FORMAT_EXAMPLE = [
  {
    # Display name and link target used to build the "Model" cell.
    "name": "jinaai/jina-embeddings-v3",
    "url": "https://huggingface.co/jinaai/jina-embeddings-v3",
    # Note: context_length and num_parameters are strings, emb_dim is an int.
    "context_length": "8192",
    "num_parameters": "572M",
    "emb_dim": 1024,
    "retrieval": {
      "KazQADRetrieval": 0.63206,
      "average_score": 0.63206
    },
    "classification": {
      "KazSandraPolarityClassification": 0.75332,
      "KazSandraScoreClassification": 0.519385,
      "average_score": 0.6363525
    },
    "bitext_mining": {
      "KazParcBitextMining_kaz-to-eng": 0.919131,
      "KazParcBitextMining_eng-to-kaz": 0.912916,
      "KazParcBitextMining_kaz-to-rus": 0.929359,
      "KazParcBitextMining_rus-to-kaz": 0.921656,
      "average_score": 0.9207655
    }
  }
]


class KazTEBLeaderboard:
    """Turn KazTEB benchmark records into ranked tables and a Gradio UI.

    Each record in ``data`` is a dict with model metadata (``name``, ``url``,
    ``context_length``, ``num_parameters``, ``emb_dim``) and one sub-dict per
    task. A task sub-dict maps dataset names to scores and contains an
    ``average_score`` entry that is used for ranking.
    """

    # (record key, tab label) for every supported task, in display order.
    TASK_TABS = (
        ('retrieval', 'Retrieval'),
        ('classification', 'Classification'),
        ('bitext_mining', 'Bitext Mining'),
    )

    # Columns that precede the per-dataset score columns in every task table.
    # NOTE: 'Embedding Dimmension' (sic) is kept as-is because the column name is
    # part of the frame returned to callers.
    BASE_COLUMNS = (
        'Rank',
        'Model',
        'Average',
        'Context Length',
        'Parameters',
        'Embedding Dimmension',
    )

    def __init__(self, data: List[Dict[str, Any]]):
        """Store the records and index the datasets available for each task."""
        self.data = data
        self.tasks = self._extract_tasks()

    def _extract_tasks(self) -> Dict[str, List[str]]:
        """Return ``{task_name: [dataset, ...]}`` collected across all models.

        Every model is inspected (not only the first one), so a task or dataset
        that appears only in a later record still gets a column. Dataset order
        is the order of first appearance; ``average_score`` is not a dataset.
        """
        tasks: Dict[str, List[str]] = {}
        for task_name, _label in self.TASK_TABS:
            for model in self.data or []:
                if task_name not in model:
                    continue
                datasets = tasks.setdefault(task_name, [])
                for dataset in model[task_name]:
                    if dataset != 'average_score' and dataset not in datasets:
                        datasets.append(dataset)
        return tasks

    def _format_score(self, score: float) -> str:
        """Format a score with four decimal places."""
        return f"{score:.4f}"

    def _create_model_link(self, name: str, url: str) -> str:
        """Return an HTML anchor for the model, or the bare name if ``url`` is empty.

        Both values are HTML-escaped so that characters such as ``<``, ``&`` or
        ``"`` in the results file cannot break (or inject into) the table markup.
        """
        safe_name = html.escape(str(name))
        if not url:
            return safe_name
        safe_url = html.escape(str(url), quote=True)
        return f'<a href="{safe_url}" target="_blank" style="color: #1976d2; text-decoration: none;">{safe_name}</a>'

    def get_task_dataframe(self, task_name: str) -> pd.DataFrame:
        """Build the ranked table for one task.

        Args:
            task_name: Key of the task sub-dict inside each model record.

        Returns:
            A DataFrame with the ``BASE_COLUMNS`` followed by one column per
            dataset of the task. Rows are ordered by descending raw
            ``average_score`` (ties keep input order) and ``Rank`` starts at 1.
            Scores are formatted strings; a dataset a model was not evaluated
            on is shown as ``'N/A'``. If no model has the task, an empty frame
            with the base columns is returned.

        Raises:
            KeyError: If a model has the task but lacks ``name`` or the task's
                ``average_score``.
        """
        datasets = self.tasks.get(task_name, [])
        columns = list(self.BASE_COLUMNS) + list(datasets)

        scored_rows = []
        for model in self.data or []:
            if task_name not in model:
                continue

            task_scores = model[task_name]
            average = task_scores['average_score']
            row = {
                'Model': self._create_model_link(model['name'], model.get('url')),
                'Average': self._format_score(average),
                'Context Length': model.get('context_length', 'N/A'),
                'Parameters': model.get('num_parameters', 'N/A'),
                'Embedding Dimmension': model.get('emb_dim', 'N/A'),
            }
            for dataset in datasets:
                if dataset in task_scores:
                    row[dataset] = self._format_score(task_scores[dataset])
                else:
                    row[dataset] = 'N/A'

            # Keep the raw average next to the row: ranking on the rounded,
            # formatted string would turn small differences into ties.
            scored_rows.append((float(average), row))

        # list.sort is stable, so equal averages keep their input order.
        scored_rows.sort(key=lambda pair: pair[0], reverse=True)
        ranked = [
            {'Rank': rank, **row}
            for rank, (_average, row) in enumerate(scored_rows, start=1)
        ]

        return pd.DataFrame(ranked, columns=columns)

    def _render_task_table(self, task_name: str) -> None:
        """Add the read-only results table for ``task_name`` to the open Gradio context."""
        table = self.get_task_dataframe(task_name)
        n_columns = len(table.columns)
        gr.DataFrame(
            value=table,
            headers=list(table.columns),
            # Rank is numeric, Model is an HTML link, everything else is text.
            datatype=["number", "html", "str", "str", "str"] + ["str"] * (n_columns - 5),
            col_count=(n_columns, "fixed"),
            interactive=False,
            column_widths=[50, 400] + [200] * (n_columns - 2),
        )

    def create_interface(self):
        """Assemble the leaderboard page and return the ``gr.Blocks`` app.

        The app is only built here; the caller is responsible for launching it.
        """

        # we will force the light theme for now :)
        js_func = """
        function refresh() {
            const url = new URL(window.location);

            if (url.searchParams.get('__theme') !== 'light') {
                url.searchParams.set('__theme', 'light');
                window.location.href = url.href;
            }
        }
        """

        with gr.Blocks(js=js_func) as demo:
            # Header
            gr.HTML(
                """
                <div style="text-align: center; margin-bottom: 20px;">
                    <h1 style="font-size: 36px; margin-bottom: 10px;">KazTEB Leaderboard 🏆</h1>
                    <p style="font-size: 22px; color: #666;">Kazakh language extension for the <a href="https://github.com/embeddings-benchmark/mteb" target="_blank" style="color: #1976d2; text-decoration: none;">Massive Text Embedding Benchmark</a></p>
                </div>
                """
            )

            # Subheader -- Project description
            gr.HTML(
                """
                <div style="margin-bottom: 30px; padding: 20px; background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #1976d2;">
                    <p style="font-size: 16px; line-height: 1.6; margin: 0; color: #333;">
                        This is a new and ongoing project dedicated to a comprehensive evaluation of existing text embedding models on datasets designed for Kazakh language tasks. <a href="https://github.com/Batyr1203/kazteb">Link</a> to the project code. <br><br>Currently, the leaderboard supports only 3 tasks: <b>retrieval</b>, <b>classification</b>, and <b>bitext mining</b>, based on existing human-annotated datasets. The aim of this project is to extend the list to 8 tasks proposed in MTEB and cover multiple domains within each task. The test datasets are planned to be acquired from real data sources, without using synthetic samples.
                    </p>
                </div>
                """
            )

            with gr.Tabs():
                with gr.Tab("📊 Task Results"):

                    # One sub-tab per task, each holding that task's ranked table.
                    with gr.Tabs():
                        for task_name, tab_label in self.TASK_TABS:
                            with gr.Tab(tab_label):
                                self._render_task_table(task_name)

                with gr.Tab("📈 Metrics"):
                    gr.Markdown("## Evaluation Metrics Overview")
                    gr.Markdown("Although the evaluation generates multiple metric values for each task, we retain only a single metric for reference.")

                    with gr.Row():

                        with gr.Column():
                            gr.Markdown(
                                """### 🔍 Retrieval

**Metric:** nDCG@10 (Normalized Discounted Cumulative Gain)
- Measures ranking quality of retrieved documents
- Considers both relevance and position
- **Range:** 0.0 - 1.0 (higher is better)

**Dataset:** [KazQADRetrieval](https://huggingface.co/datasets/issai/kazqad)
- Question-answer retrieval for Kazakh language
- Human-annotated question-document pairs""",
                                elem_classes=["retrieval-card"]
                            )

                        with gr.Column():
                            gr.Markdown(
                                """### 📝 Classification

**Metric:** Accuracy
- Percentage of correctly classified instances
- Standard classification metric
- **Range:** 0.0 - 1.0 (higher is better)

**Datasets:**
- [KazSandraPolarityClassification](https://huggingface.co/datasets/issai/kazsandra): Sentiment polarity
- [KazSandraScoreClassification](https://huggingface.co/datasets/issai/kazsandra): Sentiment scoring""",
                                elem_classes=["classification-card"]
                            )

                        with gr.Column():
                            gr.Markdown(
                                """### 🔗 Bitext Mining

**Metric:** F1-Score
- Harmonic mean of precision and recall
- Balances correctness and completeness
- **Range:** 0.0 - 1.0 (higher is better)

**Dataset:** [KazParcBitextMining](https://huggingface.co/datasets/issai/kazparc)
- Parallel sentence mining (kk ↔ en, kk ↔ ru)
- Bidirectional evaluation""",
                                elem_classes=["bitext-card"]
                            )

                    gr.Markdown("---")
                    gr.Markdown("### 📊 Scoring & Ranking")

                    with gr.Row():
                        with gr.Column():
                            gr.Markdown("**Task Averaging:** Equal weight per dataset within each task")
                        with gr.Column():
                            gr.Markdown("**Model Ranking:** Based on individual task performance")
                        with gr.Column():
                            # Empty placeholder that keeps the three-column layout.
                            # TODO: describe overall cross-task scoring here once implemented.
                            pass

            gr.Markdown("---")
            gr.HTML(
                """
                <div style="margin-top: 30px; padding: 20px; background-color: #f0f8ff; border-radius: 8px; border-left: 4px solid #4a90e2;">
                    <h3 style="margin-top: 0; color: #2c3e50; display: flex; align-items: center;">
                        📋 TODO:
                    </h3>
                    <ul style="color: #333; line-height: 1.6; margin-bottom: 0;">
                        <li><strong>Dynamic Data Loading:</strong> Switching to API-based result fetching for real-time updates without manual JSON uploads.</li>
                    </ul>
                </div>
                """
            )

            # Contact information
            gr.HTML(
                """
                <div style="text-align: center; margin-top: 20px; padding: 15px; color: #666; font-size: 14px;">
                    📧 Contact: <a href="mailto:arysbatyr@gmail.com" style="color: #1976d2; text-decoration: none;">arysbatyr@gmail.com</a>
                </div>
                """
            )

        return demo


def load_benchmark_data(filepath: Optional[str] = None) -> List[Dict[str, Any]]:
    """Load benchmark records from a JSON file, or return the bundled sample.

    Args:
        filepath: Path to a JSON results file. When omitted (or empty), the
            module-level ``BENCHMARK_DATA_FORMAT_EXAMPLE`` is returned instead.

    Returns:
        The parsed JSON content of ``filepath``, or the sample record list.

    Raises:
        OSError: If ``filepath`` is given but cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if not filepath:
        return BENCHMARK_DATA_FORMAT_EXAMPLE

    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


if __name__ == "__main__":
    # Script entry point: load the results, build the UI, and launch the app.
    # The path is relative, so it is resolved against the current working
    # directory. Because a path is passed, the bundled sample data is not used.
    data = load_benchmark_data("./results.json")

    leaderboard = KazTEBLeaderboard(data)

    # create_interface() only builds the Blocks app; launch() starts serving it.
    demo = leaderboard.create_interface()
    demo.launch()