batyrme committed on
Commit
098125d
·
verified ·
1 Parent(s): 0bb17cb

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +260 -0
  2. results.json +186 -0
app.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import pandas as pd
4
+ from typing import Dict, List, Any
5
+
6
+
7
# Reference entry showing the schema expected for every model record:
# model metadata at the top level, then one mapping per task that holds the
# per-dataset scores together with their "average_score".
# load_benchmark_data() returns this list when it is called without a file path.
BENCHMARK_DATA_FORMAT_EXAMPLE = [
    {
        # Model metadata
        "name": "jinaai/jina-embeddings-v3",
        "url": "https://huggingface.co/jinaai/jina-embeddings-v3",
        "context_length": "8192",
        "num_parameters": "572M",
        "emb_dim": 1024,
        # Per-task scores
        "retrieval": {
            "KazQADRetrieval": 0.63206,
            "average_score": 0.63206,
        },
        "classification": {
            "KazSandraPolarityClassification": 0.75332,
            "KazSandraScoreClassification": 0.519385,
            "average_score": 0.6363525,
        },
        "bitext_mining": {
            "KazParcBitextMining_kaz-to-eng": 0.919131,
            "KazParcBitextMining_eng-to-kaz": 0.912916,
            "KazParcBitextMining_kaz-to-rus": 0.929359,
            "KazParcBitextMining_rus-to-kaz": 0.921656,
            "average_score": 0.9207655,
        },
    },
]
33
+
34
+
35
class KazTEBLeaderboard:
    """Turn KazTEB benchmark results into ranked tables and a Gradio UI.

    ``data`` is a list with one dict per model.  Each dict carries model
    metadata (``name``, ``url``, ``context_length``, ``num_parameters``,
    ``emb_dim``) and, for every evaluated task, a nested dict mapping dataset
    names to scores plus an ``average_score`` entry.
    """

    # Task keys looked up in every model entry, in display order.
    TASK_NAMES = ('retrieval', 'classification', 'bitext_mining')

    # Columns present in every task table, ahead of the per-dataset columns.
    BASE_COLUMNS = (
        'Rank',
        'Model',
        'Average',
        'Context Length',
        'Parameters',
        'Embedding Dimmension',
    )

    def __init__(self, data: List[Dict[str, Any]]):
        """Store the model entries and index the datasets available per task."""
        self.data = data
        self.tasks = self._extract_tasks()

    def _extract_tasks(self) -> Dict[str, List[str]]:
        """Return ``{task_name: [dataset, ...]}`` collected over all models.

        Every model is scanned (not only the first one) so that a dataset or
        task reported by just some of the models still gets its column.
        Dataset order follows first appearance; ``average_score`` is excluded
        because it is an aggregate, not a dataset.
        """
        tasks: Dict[str, List[str]] = {}
        for task_name in self.TASK_NAMES:
            # A dict is used as an insertion-ordered set of dataset names.
            seen: Dict[str, None] = {}
            task_present = False
            for model in self.data:
                if task_name not in model:
                    continue
                task_present = True
                for dataset in model[task_name]:
                    if dataset != 'average_score':
                        seen.setdefault(dataset, None)
            if task_present:
                tasks[task_name] = list(seen)
        return tasks

    def _format_score(self, score: float) -> str:
        """Format a score with four decimal places."""
        return f"{score:.4f}"

    def _create_model_link(self, name: str, url: str) -> str:
        """Return an HTML anchor that opens the model page in a new tab."""
        return f'<a href="{url}" target="_blank" style="color: #1976d2; text-decoration: none;">{name}</a>'

    def get_task_dataframe(self, task_name: str) -> pd.DataFrame:
        """Build the leaderboard table for one task, best average first.

        Args:
            task_name: Task key such as ``'retrieval'``.

        Returns:
            A DataFrame with the columns of ``BASE_COLUMNS`` followed by one
            column per dataset of the task.  Scores are formatted strings;
            metadata or dataset scores a model does not provide are shown as
            ``'N/A'``.  When no model reports the task the frame is empty but
            still has its columns.

        Raises:
            KeyError: If a model that reports the task lacks ``name``,
                ``url`` or the task's ``average_score``.
        """
        datasets = self.tasks.get(task_name, [])

        scored_rows = []
        for model in self.data:
            if task_name not in model:
                continue

            scores = model[task_name]
            average = scores['average_score']
            row = {
                'Model': self._create_model_link(model['name'], model['url']),
                'Average': self._format_score(average),
                'Context Length': model.get('context_length', 'N/A'),
                'Parameters': model.get('num_parameters', 'N/A'),
                'Embedding Dimmension': model.get('emb_dim', 'N/A'),
            }

            # Dataset-specific scores
            for dataset in datasets:
                if dataset in scores:
                    row[dataset] = self._format_score(scores[dataset])
                else:
                    row[dataset] = 'N/A'

            scored_rows.append((average, row))

        # Rank on the raw average rather than its 4-decimal rendering so that
        # near-ties are ordered correctly; list.sort is stable, so exact ties
        # keep their input order.
        scored_rows.sort(key=lambda pair: pair[0], reverse=True)
        ranked = [
            {'Rank': rank, **row}
            for rank, (_, row) in enumerate(scored_rows, start=1)
        ]

        # Passing the columns explicitly keeps the layout stable even when
        # there are no rows at all.
        return pd.DataFrame(ranked, columns=[*self.BASE_COLUMNS, *datasets])

    def _render_task_table(self, task_name: str) -> None:
        """Add a read-only results table for ``task_name`` to the open layout.

        Must be called inside an active ``gr.Blocks`` / ``gr.Tab`` context.
        """
        df = self.get_task_dataframe(task_name)
        gr.DataFrame(
            value=df,
            headers=list(df.columns),
            # Rank is numeric, Model is an HTML link, everything else is text.
            datatype=["number", "html", "str", "str", "str"] + ["str"] * (len(df.columns) - 5),
            col_count=(len(df.columns), "fixed"),
            interactive=False
        )

    def create_interface(self) -> "gr.Blocks":
        """Assemble the complete leaderboard page and return the Blocks app."""

        # we will force the light theme for now :)
        # The snippet reloads the page with ?__theme=light unless it is
        # already set.
        js_func = """
        function refresh() {
            const url = new URL(window.location);

            if (url.searchParams.get('__theme') !== 'light') {
                url.searchParams.set('__theme', 'light');
                window.location.href = url.href;
            }
        }
        """

        with gr.Blocks(js=js_func) as demo:
            # Header
            gr.Markdown(
                """
                <div style="text-align: center; margin-bottom: 20px;">
                    <h1 style="font-size: 36px; margin-bottom: 10px;">KazTEB Leaderboard 🏆</h1>
                    <p style="font-size: 22px; color: #666;">Kazakh language extension for the <a href="https://github.com/embeddings-benchmark/mteb" target="_blank" style="color: #1976d2; text-decoration: none;">Massive Text Embedding Benchmark</a></p>
                </div>
                """
            )

            # Subheader -- Project description
            gr.Markdown(
                """
                <div style="margin-bottom: 30px; padding: 20px; background-color: #f8f9fa; border-radius: 8px; border-left: 4px solid #1976d2;">
                    <p style="font-size: 16px; line-height: 1.6; margin: 0; color: #333;">
                        This is a new and ongoing project dedicated to a comprehensive evaluation of existing text embedding models on datasets designed for Kazakh language tasks. <a href="https://github.com/Batyr1203/kazteb">Link</a> to the project code. <br><br>Currently, the leaderboard supports only 3 tasks: <b>retrieval</b>, <b>classification</b>, and <b>bitext mining</b>, based on existing human-annotated datasets. The aim of this project is to extend the list to 8 tasks proposed in MTEB and cover multiple domains within each task. The test datasets are planned to be acquired from real data sources, without using synthetic samples.
                    </p>
                </div>
                """
            )

            with gr.Tabs():
                with gr.Tab("📊 Task Results"):

                    with gr.Tabs():
                        with gr.Tab("Retrieval"):
                            self._render_task_table('retrieval')

                        with gr.Tab("Classification"):
                            self._render_task_table('classification')

                        with gr.Tab("Bitext Mining"):
                            self._render_task_table('bitext_mining')

                with gr.Tab("📈 Metrics"):
                    gr.Markdown("## Evaluation Metrics Overview")
                    gr.Markdown("Although the evaluation generates multiple metric values for each task, we retain only a single metric for reference.")

                    with gr.Row():

                        with gr.Column():
                            gr.Markdown(
                                """### 🔍 Retrieval

                                **Metric:** nDCG@10 (Normalized Discounted Cumulative Gain)
                                - Measures ranking quality of retrieved documents
                                - Considers both relevance and position
                                - **Range:** 0.0 - 1.0 (higher is better)

                                **Dataset:** [KazQADRetrieval](https://huggingface.co/datasets/issai/kazqad)
                                - Question-answer retrieval for Kazakh language
                                - Human-annotated question-document pairs""",
                                elem_classes=["retrieval-card"]
                            )

                        with gr.Column():
                            gr.Markdown(
                                """### 📝 Classification

                                **Metric:** Accuracy
                                - Percentage of correctly classified instances
                                - Standard classification metric
                                - **Range:** 0.0 - 1.0 (higher is better)

                                **Datasets:**
                                - **[KazSandraPolarityClassification](https://huggingface.co/datasets/issai/kazsandra):** Sentiment polarity
                                - **[KazSandraScoreClassification](https://huggingface.co/datasets/issai/kazsandra):** Sentiment scoring""",
                                elem_classes=["classification-card"]
                            )

                        with gr.Column():
                            gr.Markdown(
                                """### 🔗 Bitext Mining

                                **Metric:** F1-Score
                                - Harmonic mean of precision and recall
                                - Balances correctness and completeness
                                - **Range:** 0.0 - 1.0 (higher is better)

                                **Dataset:** [KazParcBitextMining](https://huggingface.co/datasets/issai/kazparc)
                                - Parallel sentence mining (Kazakh ↔ English)
                                - Bidirectional evaluation""",
                                elem_classes=["bitext-card"]
                            )

                    gr.Markdown("---")
                    gr.Markdown("### 📊 Scoring & Ranking")

                    with gr.Row():
                        with gr.Column():
                            gr.Markdown("**Task Averaging:** Equal weight per dataset within each task")
                        with gr.Column():
                            gr.Markdown("**Model Ranking:** Based on individual task performance")
                        with gr.Column():
                            # Intentionally empty third column; a "Future
                            # Plans: Overall cross-task scoring
                            # implementation" note is disabled for now.
                            pass

            # Todo section at the bottom
            gr.Markdown("---")
            gr.Markdown(
                """
                <div style="margin-top: 30px; padding: 20px; background-color: #f0f8ff; border-radius: 8px; border-left: 4px solid #4a90e2;">
                    <h3 style="margin-top: 0; color: #2c3e50; display: flex; align-items: center;">
                        📋 TODO:
                    </h3>
                    <ul style="color: #333; line-height: 1.6; margin-bottom: 0;">
                        <li><strong>API-based Model Evaluation:</strong> Adding results of closed-source models such as Google's Gemini embeddings.</li>
                        <li><strong>Dynamic Data Loading:</strong> Switching to API-based result fetching for real-time updates without manual JSON uploads.</li>
                    </ul>
                </div>
                """
            )

            # Contact information
            gr.Markdown(
                """
                <div style="text-align: center; margin-top: 20px; padding: 15px; color: #666; font-size: 14px;">
                    📧 Contact: <a href="mailto:arysbatyr@gmail.com" style="color: #1976d2; text-decoration: none;">arysbatyr@gmail.com</a>
                </div>
                """
            )

        return demo
244
+
245
+
246
def load_benchmark_data(filepath: str = None) -> List[Dict[str, Any]]:
    """Load the list of model result entries shown on the leaderboard.

    Args:
        filepath: Path to a JSON file containing a list of model entries.
            When omitted (or empty) the built-in
            ``BENCHMARK_DATA_FORMAT_EXAMPLE`` sample is returned instead.

    Returns:
        The parsed list of per-model result dictionaries.

    Raises:
        OSError: If ``filepath`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if not filepath:
        return BENCHMARK_DATA_FORMAT_EXAMPLE

    # Read explicitly as UTF-8 (the standard JSON interchange encoding)
    # instead of relying on the platform's default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
251
+
252
+
253
if __name__ == "__main__":
    # Script entry point: read the results file (path is relative to the
    # current working directory), build the leaderboard UI and serve it.
    leaderboard = KazTEBLeaderboard(load_benchmark_data("./results.json"))
    demo = leaderboard.create_interface()
    demo.launch()
260
+
results.json ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "jinaai/jina-embeddings-v3",
4
+ "url": "https://huggingface.co/jinaai/jina-embeddings-v3",
5
+ "context_length": "8192",
6
+ "num_parameters": "572M",
7
+ "emb_dim": 1024,
8
+ "retrieval": {
9
+ "KazQADRetrieval": 0.63206,
10
+ "average_score": 0.63206
11
+ },
12
+ "classification": {
13
+ "KazSandraPolarityClassification": 0.75332,
14
+ "KazSandraScoreClassification": 0.519385,
15
+ "average_score": 0.6363525
16
+ },
17
+ "bitext_mining": {
18
+ "KazParcBitextMining_kaz-to-eng": 0.919131,
19
+ "KazParcBitextMining_eng-to-kaz": 0.912916,
20
+ "KazParcBitextMining_kaz-to-rus": 0.929359,
21
+ "KazParcBitextMining_rus-to-kaz": 0.921656,
22
+ "average_score": 0.9207655
23
+ }
24
+ },
25
+ {
26
+ "name": "Qwen/Qwen3-Embedding-0.6B",
27
+ "url": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B",
28
+ "context_length": "32K",
29
+ "num_parameters": "595M",
30
+ "emb_dim": 1024,
31
+ "retrieval": {
32
+ "KazQADRetrieval": 0.50446,
33
+ "average_score": 0.50446
34
+ },
35
+ "classification": {
36
+ "KazSandraScoreClassification": 0.370898,
37
+ "KazSandraPolarityClassification": 0.66377,
38
+ "average_score": 0.517334
39
+ },
40
+ "bitext_mining": {
41
+ "KazParcBitextMining_kaz-to-eng": 0.731777,
42
+ "KazParcBitextMining_eng-to-kaz": 0.742017,
43
+ "KazParcBitextMining_kaz-to-rus": 0.760971,
44
+ "KazParcBitextMining_rus-to-kaz": 0.766429,
45
+ "average_score": 0.7502985
46
+ }
47
+ },
48
+ {
49
+ "name": "Qwen/Qwen3-Embedding-4B",
50
+ "url": "https://huggingface.co/Qwen/Qwen3-Embedding-4B",
51
+ "context_length": "32K",
52
+ "num_parameters": "4B",
53
+ "emb_dim": 2560,
54
+ "retrieval": {
55
+ "KazQADRetrieval": 0.6153,
56
+ "average_score": 0.6153
57
+ },
58
+ "classification": {
59
+ "KazSandraScoreClassification": 0.394189,
60
+ "KazSandraPolarityClassification": 0.687012,
61
+ "average_score": 0.5406005
62
+ },
63
+ "bitext_mining": {
64
+ "KazParcBitextMining_kaz-to-eng": 0.943184,
65
+ "KazParcBitextMining_eng-to-kaz": 0.939993,
66
+ "KazParcBitextMining_kaz-to-rus": 0.945092,
67
+ "KazParcBitextMining_rus-to-kaz": 0.947474,
68
+ "average_score": 0.9439357500000001
69
+ }
70
+ },
71
+ {
72
+ "name": "Qwen/Qwen3-Embedding-8B",
73
+ "url": "https://huggingface.co/Qwen/Qwen3-Embedding-8B",
74
+ "context_length": "32K",
75
+ "num_parameters": "7B",
76
+ "emb_dim": 4096,
77
+ "retrieval": {
78
+ "KazQADRetrieval": 0.64347,
79
+ "average_score": 0.64347
80
+ },
81
+ "classification": {
82
+ "KazSandraScoreClassification": 0.471484,
83
+ "KazSandraPolarityClassification": 0.735547,
84
+ "average_score": 0.6035155
85
+ },
86
+ "bitext_mining": {
87
+ "KazParcBitextMining_kaz-to-eng": 0.958446,
88
+ "KazParcBitextMining_eng-to-kaz": 0.956327,
89
+ "KazParcBitextMining_kaz-to-rus": 0.957558,
90
+ "KazParcBitextMining_rus-to-kaz": 0.960846,
91
+ "average_score": 0.95829425
92
+ }
93
+ },
94
+ {
95
+ "name": "intfloat/multilingual-e5-small",
96
+ "url": "https://huggingface.co/intfloat/multilingual-e5-small",
97
+ "context_length": "512",
98
+ "num_parameters": "118M",
99
+ "emb_dim": 384,
100
+ "retrieval": {
101
+ "KazQADRetrieval": 0.53556,
102
+ "average_score": 0.53556
103
+ },
104
+ "classification": {
105
+ "KazSandraScoreClassification": 0.479639,
106
+ "KazSandraPolarityClassification": 0.74165,
107
+ "average_score": 0.6106445
108
+ },
109
+ "bitext_mining": {
110
+ "KazParcBitextMining_kaz-to-eng": 0.868082,
111
+ "KazParcBitextMining_eng-to-kaz": 0.873415,
112
+ "KazParcBitextMining_kaz-to-rus": 0.88751,
113
+ "KazParcBitextMining_rus-to-kaz": 0.904797,
114
+ "average_score": 0.883451
115
+ }
116
+ },
117
+ {
118
+ "name": "intfloat/multilingual-e5-large-instruct",
119
+ "url": "https://huggingface.co/intfloat/multilingual-e5-large-instruct",
120
+ "context_length": "512",
121
+ "num_parameters": "560M",
122
+ "emb_dim": 1024,
123
+ "retrieval": {
124
+ "KazQADRetrieval": 0.64164,
125
+ "average_score": 0.64164
126
+ },
127
+ "classification": {
128
+ "KazSandraPolarityClassification": 0.778467,
129
+ "KazSandraScoreClassification": 0.562012,
130
+ "average_score": 0.6702395
131
+ },
132
+ "bitext_mining": {
133
+ "KazParcBitextMining_kaz-to-eng": 0.961832,
134
+ "KazParcBitextMining_eng-to-kaz": 0.958423,
135
+ "KazParcBitextMining_kaz-to-rus": 0.958846,
136
+ "KazParcBitextMining_rus-to-kaz": 0.953091,
137
+ "average_score": 0.958048
138
+ }
139
+ },
140
+ {
141
+ "name": "intfloat/multilingual-e5-large",
142
+ "url": "https://huggingface.co/intfloat/multilingual-e5-large",
143
+ "context_length": "512",
144
+ "num_parameters": "560M",
145
+ "emb_dim": 1024,
146
+ "retrieval": {
147
+ "KazQADRetrieval": 0.61387,
148
+ "average_score": 0.61387
149
+ },
150
+ "classification": {
151
+ "KazSandraScoreClassification": 0.506543,
152
+ "KazSandraPolarityClassification": 0.75332,
153
+ "average_score": 0.6299315
154
+ },
155
+ "bitext_mining": {
156
+ "KazParcBitextMining_kaz-to-eng": 0.938867,
157
+ "KazParcBitextMining_eng-to-kaz": 0.941032,
158
+ "KazParcBitextMining_kaz-to-rus": 0.942812,
159
+ "KazParcBitextMining_rus-to-kaz": 0.945944,
160
+ "average_score": 0.94216375
161
+ }
162
+ },
163
+ {
164
+ "name": "intfloat/multilingual-e5-base",
165
+ "url": "https://huggingface.co/intfloat/multilingual-e5-base",
166
+ "context_length": "512",
167
+ "num_parameters": "278M",
168
+ "emb_dim": 768,
169
+ "retrieval": {
170
+ "KazQADRetrieval": 0.56312,
171
+ "average_score": 0.56312
172
+ },
173
+ "classification": {
174
+ "KazSandraPolarityClassification": 0.747656,
175
+ "KazSandraScoreClassification": 0.482275,
176
+ "average_score": 0.6149655
177
+ },
178
+ "bitext_mining": {
179
+ "KazParcBitextMining_kaz-to-eng": 0.902851,
180
+ "KazParcBitextMining_eng-to-kaz": 0.910523,
181
+ "KazParcBitextMining_kaz-to-rus": 0.918989,
182
+ "KazParcBitextMining_rus-to-kaz": 0.924031,
183
+ "average_score": 0.9140984999999999
184
+ }
185
+ }
186
+ ]