akera committed on
Commit
d5b83bc
·
verified ·
1 Parent(s): a729bca

Update config.py

Browse files
Files changed (1) hide show
  1. config.py +177 -74
config.py CHANGED
@@ -7,96 +7,199 @@ LEADERBOARD_DATASET = "Sunbird/salt-translation-leaderboard"
7
  TEST_SET_DATASET = "Sunbird/salt-translation-test-set"
8
  SALT_DATASET = "sunbird/salt"
9
 
10
- # Language settings - ALL UG40 LANGUAGES (Updated from SALT constants)
11
- ALL_UG40_LANGUAGES = [
12
- 'ach', 'eng', 'lgg', 'lug', 'nyn', 'rny', 'teo', 'swa'
13
- ]
14
 
15
  LANGUAGE_NAMES = {
16
- 'ach': 'Acholi',
17
- 'eng': 'English',
18
- 'lgg': 'Lugbara',
19
- 'lug': 'Luganda',
20
- 'nyn': 'Runyankole',
21
- 'rny': 'Runyoro',
22
- 'teo': 'Ateso',
23
- 'swa': 'Swahili'
24
  }
25
 
26
- # Google Translate supported subset (for comparison)
27
- GOOGLE_SUPPORTED_LANGUAGES = ['lug', 'ach', 'swa', 'eng']
28
 
29
  # Google Translate language mapping
30
- GOOGLE_LANG_MAP = {
31
- 'lug': 'lg',
32
- 'ach': 'ach',
33
- 'swa': 'sw',
34
- 'eng': 'en'
35
- }
36
-
37
- # Evaluation settings
38
- MAX_TEST_SAMPLES = 500 # Per language pair
39
- MIN_SAMPLES_PER_PAIR = 10 # Minimum samples to be valid
40
 
41
- # UI settings
42
- TITLE = "🏆 SALT Translation Leaderboard"
43
- DESCRIPTION = """
44
- Evaluation of translation models on Ugandan languages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- Upload your model's predictions on our standardized test set to see how it performs across all UG40 language pairs.
47
- Compare against Google Translate baseline and other submitted models.
48
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- # File format specifications
51
- PREDICTION_FORMAT = {
52
- 'required_columns': ['sample_id', 'prediction'],
53
- 'optional_columns': ['model_name', 'confidence'],
54
- 'file_types': ['.csv', '.tsv', '.json']
 
 
 
 
 
 
 
 
 
 
55
  }
56
 
57
- # Metrics configuration - Updated to match reference implementation
58
  METRICS_CONFIG = {
59
- 'primary_metrics': ['bleu', 'chrf', 'quality_score'],
60
- 'secondary_metrics': ['rouge1', 'rouge2', 'rougeL', 'cer', 'wer', 'len_ratio'],
61
- 'display_precision': 4,
62
- 'quality_score_components': [
63
- 'bleu', # normalized to 0-1
64
- 'chrf', # already 0-1
65
- 'cer', # inverted (1-cer)
66
- 'wer', # inverted (1-wer)
67
- 'rouge1', # 0-1
68
- 'rougeL' # 0-1
 
 
 
 
69
  ],
70
- 'error_metrics': ['cer', 'wer'], # Lower is better
71
- 'score_metrics': ['bleu', 'chrf', 'quality_score', 'rouge1', 'rouge2', 'rougeL'] # Higher is better
72
  }
73
 
74
- # Display settings for leaderboard
75
- DISPLAY_CONFIG = {
76
- 'max_models_radar': 8,
77
- 'max_models_ranking': 15,
78
- 'max_language_pairs_detail': 20,
79
- 'decimal_places': {
80
- 'quality_score': 4,
81
- 'bleu': 2,
82
- 'chrf': 4,
83
- 'rouge1': 4,
84
- 'rouge2': 4,
85
- 'rougeL': 4,
86
- 'cer': 4,
87
- 'wer': 4,
88
- 'len_ratio': 3,
89
- 'coverage_rate': 1 # percentage
90
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  }
92
 
93
- # Chart colors and styling
94
  CHART_CONFIG = {
95
- 'google_comparable_color': '#1f77b4',
96
- 'ug40_only_color': '#ff7f0e',
97
- 'primary_colorscale': 'Viridis',
98
- 'secondary_colorscale': 'Plasma',
99
- 'bar_height_per_model': 30,
100
- 'min_chart_height': 400,
101
- 'max_chart_height': 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  }
 
7
  TEST_SET_DATASET = "Sunbird/salt-translation-test-set"
8
  SALT_DATASET = "sunbird/salt"
9
 
10
+ # Language settings - ALL UG40 LANGUAGES
11
+ ALL_UG40_LANGUAGES = ["ach", "eng", "lgg", "lug", "nyn", "rny", "teo", "swa"]
 
 
12
 
13
  LANGUAGE_NAMES = {
14
+ "ach": "Acholi",
15
+ "eng": "English",
16
+ "lgg": "Lugbara",
17
+ "lug": "Luganda",
18
+ "nyn": "Runyankole",
19
+ "rny": "Runyoro",
20
+ "teo": "Ateso",
21
+ "swa": "Swahili",
22
  }
23
 
24
+ # Google Translate supported subset (for fair comparison)
25
+ GOOGLE_SUPPORTED_LANGUAGES = ["lug", "ach", "swa", "eng"]
26
 
27
  # Google Translate language mapping
28
+ GOOGLE_LANG_MAP = {"lug": "lg", "ach": "ach", "swa": "sw", "eng": "en"}
 
 
 
 
 
 
 
 
 
29
 
30
+ # SCIENTIFIC EVALUATION TRACKS
31
+ EVALUATION_TRACKS = {
32
+ "google_comparable": {
33
+ "name": "Google-Comparable Track",
34
+ "description": "Models evaluated only on language pairs supported by Google Translate",
35
+ "languages": GOOGLE_SUPPORTED_LANGUAGES,
36
+ "min_samples_per_pair": 50,
37
+ "statistical_power": 0.8,
38
+ "significance_level": 0.05,
39
+ },
40
+ "ug40_complete": {
41
+ "name": "UG40-Complete Track",
42
+ "description": "Models evaluated on all UG40 language pairs",
43
+ "languages": ALL_UG40_LANGUAGES,
44
+ "min_samples_per_pair": 30,
45
+ "statistical_power": 0.8,
46
+ "significance_level": 0.05,
47
+ },
48
+ "language_pair_matrix": {
49
+ "name": "Language-Pair Matrix",
50
+ "description": "Individual language pair analysis with statistical significance",
51
+ "languages": ALL_UG40_LANGUAGES,
52
+ "min_samples_per_pair": 20,
53
+ "statistical_power": 0.7,
54
+ "significance_level": 0.05,
55
+ },
56
+ }
57
 
58
+ # MODEL CATEGORIES
59
+ MODEL_CATEGORIES = {
60
+ "commercial": {
61
+ "name": "Commercial Systems",
62
+ "description": "Production translation systems",
63
+ "examples": ["google_translate", "azure_translator"],
64
+ "color": "#1f77b4",
65
+ },
66
+ "research": {
67
+ "name": "Research Models",
68
+ "description": "Academic and research institution models",
69
+ "examples": ["nllb", "m2m100"],
70
+ "color": "#ff7f0e",
71
+ },
72
+ "baseline": {
73
+ "name": "Baseline Models",
74
+ "description": "Simple baseline and reference models",
75
+ "examples": ["word_lookup", "frequency_baseline"],
76
+ "color": "#2ca02c",
77
+ },
78
+ "community": {
79
+ "name": "Community Submissions",
80
+ "description": "User-submitted models and fine-tuned variants",
81
+ "examples": ["user_submission"],
82
+ "color": "#d62728",
83
+ },
84
+ }
85
 
86
+ # STATISTICAL SETTINGS
87
+ STATISTICAL_CONFIG = {
88
+ "confidence_level": 0.95,
89
+ "bootstrap_samples": 1000,
90
+ "min_samples_for_ci": 20,
91
+ "effect_size_thresholds": {
92
+ "small": 0.2,
93
+ "medium": 0.5,
94
+ "large": 0.8,
95
+ },
96
+ "multiple_testing_correction": "bonferroni",
97
+ "outlier_detection": {
98
+ "method": "iqr",
99
+ "factor": 1.5,
100
+ },
101
  }
102
 
103
+ # METRICS CONFIGURATION - Enhanced for statistical analysis
104
  METRICS_CONFIG = {
105
+ "primary_metrics": ["bleu", "chrf", "quality_score"],
106
+ "secondary_metrics": ["rouge1", "rouge2", "rougeL", "cer", "wer", "len_ratio"],
107
+ "display_precision": 4,
108
+ "quality_score_components": ["bleu", "chrf", "cer", "wer", "rouge1", "rougeL"],
109
+ "error_metrics": ["cer", "wer"], # Lower is better
110
+ "score_metrics": ["bleu", "chrf", "quality_score", "rouge1", "rouge2", "rougeL"],
111
+ "statistical_metrics": [
112
+ "mean",
113
+ "std",
114
+ "median",
115
+ "ci_lower",
116
+ "ci_upper",
117
+ "p_value",
118
+ "effect_size",
119
  ],
 
 
120
  }
121
 
122
+ # VALIDATION REQUIREMENTS
123
+ VALIDATION_CONFIG = {
124
+ "min_samples_per_track": {
125
+ "google_comparable": 200,
126
+ "ug40_complete": 400,
127
+ "language_pair_matrix": 50,
128
+ },
129
+ "max_missing_rate": 0.05, # 5% missing predictions allowed
130
+ "quality_thresholds": {
131
+ "min_valid_predictions": 0.95,
132
+ "max_duplicate_rate": 0.1,
133
+ "min_avg_length": 3,
134
+ "max_avg_length": 500,
135
+ },
136
+ }
137
+
138
+ # UI CONFIGURATION
139
+ UI_CONFIG = {
140
+ "title": "🏆 SALT Translation Leaderboard - Scientific Edition",
141
+ "description": """
142
+ Rigorous evaluation of translation models on Ugandan languages with statistical significance testing.
143
+ Three evaluation tracks ensure fair comparison across different model capabilities and language support.
144
+ """,
145
+ "tracks": {
146
+ "google_comparable": {
147
+ "tab_name": "🤖 Google-Comparable Track",
148
+ "icon": "🤖",
149
+ "color": "#1f77b4",
150
+ },
151
+ "ug40_complete": {
152
+ "tab_name": "🌍 UG40-Complete Track",
153
+ "icon": "🌍",
154
+ "color": "#ff7f0e",
155
+ },
156
+ "language_pair_matrix": {
157
+ "tab_name": "📊 Language-Pair Matrix",
158
+ "icon": "📊",
159
+ "color": "#2ca02c",
160
+ },
161
+ },
162
  }
163
 
164
+ # CHART CONFIGURATION - Research-grade styling
165
  CHART_CONFIG = {
166
+ "statistical_colorscale": "RdYlBu_r",
167
+ "category_colors": {cat: info["color"] for cat, info in MODEL_CATEGORIES.items()},
168
+ "heatmap_config": {
169
+ "colorscale": "Viridis",
170
+ "show_values": True,
171
+ "font_size": 10,
172
+ },
173
+ "confidence_interval_config": {
174
+ "alpha": 0.3,
175
+ "line_width": 2,
176
+ "marker_size": 8,
177
+ },
178
+ "statistical_plot_config": {
179
+ "height": 600,
180
+ "width": 800,
181
+ "margin": {"l": 100, "r": 50, "t": 50, "b": 100},
182
+ },
183
+ }
184
+
185
+ # FILE FORMAT SPECIFICATIONS
186
+ PREDICTION_FORMAT = {
187
+ "required_columns": ["sample_id", "prediction"],
188
+ "optional_columns": ["model_name", "confidence", "category"],
189
+ "file_types": [".csv", ".tsv", ".json"],
190
+ "category_detection": {
191
+ "google": ["google", "translate"],
192
+ "nllb": ["nllb", "meta"],
193
+ "m2m": ["m2m", "facebook"],
194
+ "baseline": ["baseline", "simple", "lookup"],
195
+ },
196
+ }
197
+
198
+ # EVALUATION SETTINGS
199
+ MAX_TEST_SAMPLES = 500 # Per language pair
200
+ MIN_SAMPLES_PER_PAIR = 10 # Minimum for basic statistics
201
+ SAMPLE_SIZE_RECOMMENDATIONS = {
202
+ "basic_comparison": 50,
203
+ "statistical_significance": 100,
204
+ "publication_quality": 200,
205
  }