Files changed (1) hide show
  1. chandar-lab_NeoBERT.json +115 -0
chandar-lab_NeoBERT.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bomFormat": "CycloneDX",
3
+ "specVersion": "1.6",
4
+ "serialNumber": "urn:uuid:1c879a20-16da-4bf5-901d-c0989128f99f",
5
+ "version": 1,
6
+ "metadata": {
7
+ "timestamp": "2025-06-05T09:41:58.948971+00:00",
8
+ "component": {
9
+ "type": "machine-learning-model",
10
+ "bom-ref": "chandar-lab/NeoBERT-d613029c-4864-500f-a464-bcbfff6cda00",
11
+ "name": "chandar-lab/NeoBERT",
12
+ "externalReferences": [
13
+ {
14
+ "url": "https://huggingface.co/chandar-lab/NeoBERT",
15
+ "type": "documentation"
16
+ }
17
+ ],
18
+ "modelCard": {
19
+ "modelParameters": {
20
+ "task": "feature-extraction",
21
+ "architectureFamily": "neobert",
22
+ "modelArchitecture": "NeoBERTLMHead",
23
+ "datasets": [
24
+ {
25
+ "ref": "tiiuae/falcon-refinedweb-0f0f969c-55e1-583f-88a6-b6a6fc0a250c"
26
+ }
27
+ ]
28
+ },
29
+ "properties": [
30
+ {
31
+ "name": "library_name",
32
+ "value": "transformers"
33
+ }
34
+ ]
35
+ },
36
+ "authors": [
37
+ {
38
+ "name": "chandar-lab"
39
+ }
40
+ ],
41
+ "licenses": [
42
+ {
43
+ "license": {
44
+ "id": "MIT",
45
+ "url": "https://spdx.org/licenses/MIT.html"
46
+ }
47
+ }
48
+ ],
49
+ "tags": [
50
+ "transformers",
51
+ "safetensors",
52
+ "neobert",
53
+ "fill-mask",
54
+ "feature-extraction",
55
+ "custom_code",
56
+ "en",
57
+ "dataset:tiiuae/falcon-refinedweb",
58
+ "arxiv:2502.19587",
59
+ "license:mit",
60
+ "autotrain_compatible",
61
+ "region:us"
62
+ ]
63
+ }
64
+ },
65
+ "components": [
66
+ {
67
+ "type": "data",
68
+ "bom-ref": "tiiuae/falcon-refinedweb-0f0f969c-55e1-583f-88a6-b6a6fc0a250c",
69
+ "name": "tiiuae/falcon-refinedweb",
70
+ "data": [
71
+ {
72
+ "type": "dataset",
73
+ "bom-ref": "tiiuae/falcon-refinedweb-0f0f969c-55e1-583f-88a6-b6a6fc0a250c",
74
+ "name": "tiiuae/falcon-refinedweb",
75
+ "contents": {
76
+ "url": "https://huggingface.co/datasets/tiiuae/falcon-refinedweb",
77
+ "properties": [
78
+ {
79
+ "name": "task_categories",
80
+ "value": "text-generation"
81
+ },
82
+ {
83
+ "name": "language",
84
+ "value": "en"
85
+ },
86
+ {
87
+ "name": "size_categories",
88
+ "value": "100B<n<1T"
89
+ },
90
+ {
91
+ "name": "pretty_name",
92
+ "value": "Falcon RefinedWeb"
93
+ },
94
+ {
95
+ "name": "license",
96
+ "value": "odc-by"
97
+ }
98
+ ]
99
+ },
100
+ "governance": {
101
+ "owners": [
102
+ {
103
+ "organization": {
104
+ "name": "tiiuae",
105
+ "url": "https://huggingface.co/tiiuae"
106
+ }
107
+ }
108
+ ]
109
+ },
110
+ "description": "\n\t\n\t\t\n\t\t\ud83d\udcc0 Falcon RefinedWeb\n\t\n\nFalcon RefinedWeb is a massive English web dataset built by TII and released under an ODC-By 1.0 license.\nSee the \ud83d\udcd3 paper on arXiv for more details. \nRefinedWeb is built through stringent filtering and large-scale deduplication of CommonCrawl; we found models trained on RefinedWeb to achieve performance in-line or better than models trained on curated datasets, while only relying on web data. \nRefinedWeb is also \"multimodal-friendly\": it contains links and alt\u2026 See the full description on the dataset page: https://huggingface.co/datasets/tiiuae/falcon-refinedweb."
111
+ }
112
+ ]
113
+ }
114
+ ]
115
+ }