Update README.md
Browse files
README.md
CHANGED
@@ -3,631 +3,161 @@ tags:
|
|
3 |
- sentence-transformers
|
4 |
- sentence-similarity
|
5 |
- feature-extraction
|
6 |
-
-
|
7 |
-
-
|
8 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
base_model: Shuu12121/CodeModernBERT-Crow
|
10 |
-
widget:
|
11 |
-
- source_sentence: 'getAttachment
|
12 |
-
|
13 |
-
Return an attachment from a SharePoint list item
|
14 |
-
|
15 |
-
|
16 |
-
@param $list_name Name of list
|
17 |
-
|
18 |
-
@param $list_item_id ID of record item is attached to
|
19 |
-
|
20 |
-
@return Array of attachment urls'
|
21 |
-
sentences:
|
22 |
-
- "private function getOrderDirection(): string\n {\n $result = !empty($this->searchData['sort_by_order'])\
|
23 |
-
\ ?\n $this->searchData['sort_by_order'] :\n Options::DEFAULT_SORT_BY_ORDER;\n\
|
24 |
-
\n return $result;\n }"
|
25 |
-
- "function(property, value) {\n var values = [];\n for (var i in\
|
26 |
-
\ this._nodes) {\n if (this._nodes.hasOwnProperty(i)) {\n \
|
27 |
-
\ var n = this._nodes[i];\n if ((property in n && n[property]\
|
28 |
-
\ == value) || (n.data && value == n.data[property])) {\n values.push(n);\n\
|
29 |
-
\ }\n }\n }\n\n return (values.length)\
|
30 |
-
\ ? values : null;\n }"
|
31 |
-
- "public function getAttachments ($list_name, $list_item_id) {\n\t\t// Wrap in\
|
32 |
-
\ CAML\n\t\t$CAML = '\n\t\t<GetAttachmentCollection xmlns=\"http://schemas.microsoft.com/sharepoint/soap/\"\
|
33 |
-
>\n\t\t\t<listName>' . $list_name . '</listName>\n\t\t\t<listItemID>' . $list_item_id\
|
34 |
-
\ . '</listItemID>\n\t\t</GetAttachmentCollection>';\n\n\t\t$xmlvar = new \\SoapVar($CAML,\
|
35 |
-
\ XSD_ANYXML);\n\n\t\t// Attempt to run operation\n\t\ttry {\n\t\t\t$rawXml =\
|
36 |
-
\ $this->soapClient->GetAttachmentCollection($xmlvar)->GetAttachmentCollectionResult->any;\n\
|
37 |
-
\t\t} catch (\\SoapFault $fault) {\n\t\t\t$this->onError($fault);\n\t\t}\n\n\t\
|
38 |
-
\t// Load XML in to DOM document and grab all list items.\n\t\t$nodes = $this->getArrayFromElementsByTagName($rawXml,\
|
39 |
-
\ 'Attachment');\n\n\t\t$attachments = array();\n\n\t\t// Format data in to array\
|
40 |
-
\ or object\n\t\tforeach ($nodes as $counter => $node) {\n\t\t\t$attachments[]\
|
41 |
-
\ = $node->textContent;\n\t\t}\n\n\t\t// Return Array of attachment URLs\n\t\t\
|
42 |
-
return $attachments;\n\t}"
|
43 |
-
- source_sentence: 'Load the services into the module if they have
|
44 |
-
|
45 |
-
not loaded already
|
46 |
-
|
47 |
-
@param \OtherCode\FController\Components\Services $services
|
48 |
-
|
49 |
-
@param \OtherCode\FController\Components\Registry $storage
|
50 |
-
|
51 |
-
@param \OtherCode\FController\Components\Messages $messages'
|
52 |
-
sentences:
|
53 |
-
- "public function connect(\\OtherCode\\FController\\Components\\Services $services,\
|
54 |
-
\ \\OtherCode\\FController\\Components\\Registry $storage, \\OtherCode\\FController\\\
|
55 |
-
Components\\Messages $messages)\n {\n if (!isset($this->services)) {\n\
|
56 |
-
\ $this->services = $services;\n }\n\n if (!isset($this->storage))\
|
57 |
-
\ {\n $this->storage = $storage;\n }\n\n if (!isset($this->messages))\
|
58 |
-
\ {\n $this->messages = $messages;\n }\n }"
|
59 |
-
- "function(element) {\n var that = this;\n this.element = element;\n this.signalTap\
|
60 |
-
\ = new Listeners();\n this.signalLong = new Listeners();\n this.signalTouchstart\
|
61 |
-
\ = new Listeners();\n this.signalTouchend = new Listeners();\n this.signalMove\
|
62 |
-
\ = windowMove;\n this._touch = 0; // 0 = Nothing, 1 = Mouse, 2 = Touch.\n\
|
63 |
-
\ this._onMouseDown = function(evt) {\n if (that._touch) return;\n \
|
64 |
-
\ currentTouchListener = that;\n that._touch = 1;\n evt.preventDefault();\n\
|
65 |
-
\ evt.stopPropagation();\n var rect = that.element.getBoundingClientRect();\n\
|
66 |
-
\ var arg = {\n x: evt.pageX - rect.left,\n y: evt.pageY\
|
67 |
-
\ - rect.top,\n rect: rect\n };\n console.info(\"[tfw.touch-event]\
|
68 |
-
\ arg=...\", arg);\n that.signalTouchstart.fire(arg);\n };\n this._onTouchstart\
|
69 |
-
\ = function(evt) {\n if (that._touch) return;\n currentTouchListener\
|
70 |
-
\ = that;\n that._touch = 2;\n evt.preventDefault();\n evt.stopPropagation();\n\
|
71 |
-
\ var rect = that.element.getBoundingClientRect();\n var arg = {\n\
|
72 |
-
\ x: evt.pageX - rect.left,\n y: evt.pageY - rect.top,\n\
|
73 |
-
\ rect: rect\n };\n console.info(\"[tfw.touch-event]\
|
74 |
-
\ arg=...\", arg);\n that.signalTouchstart.fire(arg);\n };\n element.addEventListener(\"\
|
75 |
-
mousedown\", this._onMouseDown, false);\n}"
|
76 |
-
- "func (cfg *TransportConfig) WithBasePath(basePath string) *TransportConfig {\n\
|
77 |
-
\tcfg.BasePath = basePath\n\treturn cfg\n}"
|
78 |
-
- source_sentence: '// DirectoryValidator ensures the input is a valid and **existing**
|
79 |
-
directory
|
80 |
-
|
81 |
-
// Returns modified extended path'
|
82 |
-
sentences:
|
83 |
-
- "func DirectoryValidator(input string) (interface{}, error) {\n\tif input == \"\
|
84 |
-
\" {\n\t\treturn \"\", nil\n\t}\n\n\tpath, err := PathValidator(input)\n\tif err\
|
85 |
-
\ != nil {\n\t\treturn nil, err\n\t}\n\tpathStr := path.(string)\n\n\tinfo, err\
|
86 |
-
\ := os.Stat(pathStr)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\n\tif !info.IsDir()\
|
87 |
-
\ {\n\t\treturn nil, fmt.Errorf(\"%s is not a directory\", pathStr)\n\t}\n\n\t\
|
88 |
-
return pathStr, nil\n}"
|
89 |
-
- "public void addGivenVendor(String source, String name, String value, boolean\
|
90 |
-
\ regex, Confidence confidence) {\n givenVendor.add(new EvidenceMatcher(source,\
|
91 |
-
\ name, value, regex, confidence));\n }"
|
92 |
-
- "func (a *EnableArgs) SetMaxScriptsCacheSize(maxScriptsCacheSize float64) *EnableArgs\
|
93 |
-
\ {\n\ta.MaxScriptsCacheSize = &maxScriptsCacheSize\n\treturn a\n}"
|
94 |
-
- source_sentence: https://url.spec.whatwg.org/#url-miscellaneous
|
95 |
-
sentences:
|
96 |
-
- "public boolean isDescendent(TarEntry desc) {\r\n return desc.header.name.toString().startsWith(this.header.name.toString());\r\
|
97 |
-
\n }"
|
98 |
-
- "public String displayErrors(String pathPrefix) {\n\n if (pathPrefix ==\
|
99 |
-
\ null) {\n pathPrefix = \"\";\n }\n StringBuffer html\
|
100 |
-
\ = new StringBuffer(512);\n html.append(\"<table border='0' cellpadding='5'\
|
101 |
-
\ cellspacing='0' style='width: 100%; height: 100%;'>\");\n html.append(\"\
|
102 |
-
\\t<tr>\");\n html.append(\"\\t\\t<td style='vertical-align: middle; height:\
|
103 |
-
\ 100%;'>\");\n html.append(getHtmlPart(\"C_BLOCK_START\", \"Error\"));\n\
|
104 |
-
\ html.append(\"\\t\\t\\t<table border='0' cellpadding='0' cellspacing='0'\
|
105 |
-
\ style='width: 100%;'>\");\n html.append(\"\\t\\t\\t\\t<tr>\");\n \
|
106 |
-
\ html.append(\"\\t\\t\\t\\t\\t<td><img src='\").append(pathPrefix).append(\"\
|
107 |
-
resources/error.png' border='0'></td>\");\n html.append(\"\\t\\t\\t\\t\\\
|
108 |
-
t<td> </td>\");\n html.append(\"\\t\\t\\t\\t\\t<td style='width:\
|
109 |
-
\ 100%;'>\");\n\n Iterator<String> iter = getErrors().iterator();\n \
|
110 |
-
\ while (iter.hasNext()) {\n String msg = iter.next();\n \
|
111 |
-
\ html.append(\"\\t\\t\\t\\t\\t\\t\");\n html.append(msg);\n \
|
112 |
-
\ html.append(\"<br/>\");\n }\n\n html.append(\"\\t\\\
|
113 |
-
t\\t\\t\\t</td>\");\n html.append(\"\\t\\t\\t\\t</tr>\");\n html.append(\"\
|
114 |
-
\\t\\t\\t</table>\");\n html.append(getHtmlPart(\"C_BLOCK_END\"));\n \
|
115 |
-
\ html.append(\"\\t\\t</td>\");\n html.append(\"\\t</tr>\");\n \
|
116 |
-
\ html.append(\"</table>\");\n return html.toString();\n }"
|
117 |
-
- "function getDefaultPortFromScheme( scheme ) {\n var port = null;\n if ( scheme\
|
118 |
-
\ === 'ftp') {\n port = 21;\n }\n else if ( scheme === 'gopher' ) {\n \
|
119 |
-
\ port = 70;\n }\n else if ( scheme === 'http' || scheme === 'ws' ) {\n port\
|
120 |
-
\ = 80;\n }\n else if ( scheme === 'https' || scheme === 'wss' ) {\n port\
|
121 |
-
\ = 443;\n }\n return port;\n}"
|
122 |
-
- source_sentence: syntactic sugar
|
123 |
-
sentences:
|
124 |
-
- "@Override\n public void sawOpcode(int seen) {\n int groupId = -1;\n\
|
125 |
-
\n try {\n stack.precomputation(this);\n\n if (seen\
|
126 |
-
\ == Const.INVOKEINTERFACE) {\n String className = getClassConstantOperand();\n\
|
127 |
-
\ String methodName = getNameConstantOperand();\n \
|
128 |
-
\ String signature = getSigConstantOperand();\n QMethod methodInfo\
|
129 |
-
\ = new QMethod(methodName, signature);\n\n if (isCollection(className))\
|
130 |
-
\ {\n if (collectionMethods.contains(methodInfo) || ITERATOR.equals(methodInfo))\
|
131 |
-
\ {\n if (stack.getStackDepth() > 0) {\n \
|
132 |
-
\ OpcodeStack.Item itm = stack.getStackItem(0);\n \
|
133 |
-
\ groupId = findCollectionGroup(itm, true);\n \
|
134 |
-
\ }\n } else if (REMOVE.equals(methodInfo)) {\n \
|
135 |
-
\ if (stack.getStackDepth() > 1) {\n \
|
136 |
-
\ OpcodeStack.Item itm = stack.getStackItem(1);\n \
|
137 |
-
\ int id = findCollectionGroup(itm, true);\n if ((id\
|
138 |
-
\ >= 0) && collectionGroups.get(id).isStandardCollection()) {\n \
|
139 |
-
\ Integer it = groupToIterator.get(Integer.valueOf(id));\n \
|
140 |
-
\ Loop loop = loops.get(it);\n \
|
141 |
-
\ if (loop != null) {\n int pc\
|
142 |
-
\ = getPC();\n if (loop.hasPC(pc)) {\n \
|
143 |
-
\ boolean needPop = !Values.SIG_VOID.equals(SignatureUtils.getReturnSignature(signature));\n\
|
144 |
-
\n if (!breakFollows(loop, needPop) &&\
|
145 |
-
\ !returnFollows(needPop)) {\n bugReporter.reportBug(new\
|
146 |
-
\ BugInstance(this, BugType.DWI_DELETING_WHILE_ITERATING.name(), NORMAL_PRIORITY)\n\
|
147 |
-
\ .addClass(this).addMethod(this).addSourceLine(this));\n\
|
148 |
-
\ }\n \
|
149 |
-
\ }\n }\n }\n \
|
150 |
-
\ }\n } else {\n \
|
151 |
-
\ Integer numArgs = modifyingMethods.get(methodInfo);\n \
|
152 |
-
\ if ((numArgs != null) && (stack.getStackDepth() > numArgs.intValue())) {\n\
|
153 |
-
\ OpcodeStack.Item itm = stack.getStackItem(numArgs.intValue());\n\
|
154 |
-
\ int id = findCollectionGroup(itm, true);\n \
|
155 |
-
\ if (id >= 0) {\n Integer\
|
156 |
-
\ it = groupToIterator.get(Integer.valueOf(id));\n \
|
157 |
-
\ if (it != null) {\n Loop loop = loops.get(it);\n\
|
158 |
-
\ if (loop != null) {\n \
|
159 |
-
\ int pc = getPC();\n \
|
160 |
-
\ if (loop.hasPC(pc)) {\n boolean\
|
161 |
-
\ needPop = !Values.SIG_VOID.equals(SignatureUtils.getReturnSignature(signature));\n\
|
162 |
-
\ boolean breakFollows = breakFollows(loop,\
|
163 |
-
\ needPop);\n boolean returnFollows\
|
164 |
-
\ = !breakFollows && returnFollows(needPop);\n\n \
|
165 |
-
\ if (!breakFollows && !returnFollows) {\n \
|
166 |
-
\ bugReporter.reportBug(new BugInstance(this, BugType.DWI_MODIFYING_WHILE_ITERATING.name(),\
|
167 |
-
\ NORMAL_PRIORITY)\n .addClass(this).addMethod(this).addSourceLine(this));\n\
|
168 |
-
\ }\n \
|
169 |
-
\ }\n }\n \
|
170 |
-
\ }\n }\n }\n \
|
171 |
-
\ }\n } else if (\"java/util/Iterator\".equals(className)\
|
172 |
-
\ && HASNEXT.equals(methodInfo) && (stack.getStackDepth() > 0)) {\n \
|
173 |
-
\ OpcodeStack.Item itm = stack.getStackItem(0);\n \
|
174 |
-
\ Integer id = (Integer) itm.getUserValue();\n if (id != null)\
|
175 |
-
\ {\n groupId = id.intValue();\n }\n\
|
176 |
-
\ }\n } else if ((seen == Const.PUTFIELD) || (seen ==\
|
177 |
-
\ Const.PUTSTATIC)) {\n if (stack.getStackDepth() > 1) {\n \
|
178 |
-
\ OpcodeStack.Item itm = stack.getStackItem(0);\n\n \
|
179 |
-
\ Integer id = (Integer) itm.getUserValue();\n if (id\
|
180 |
-
\ == null) {\n FieldAnnotation fa = FieldAnnotation\n \
|
181 |
-
\ .fromFieldDescriptor(new FieldDescriptor(getClassConstantOperand(),\
|
182 |
-
\ getNameConstantOperand(), getSigConstantOperand(), false));\n \
|
183 |
-
\ itm = new OpcodeStack.Item(itm.getSignature(), fa, stack.getStackItem(1).getRegisterNumber());\n\
|
184 |
-
\ removeFromCollectionGroup(itm);\n \
|
185 |
-
\ groupId = findCollectionGroup(itm, true);\n }\n \
|
186 |
-
\ }\n } else if (OpcodeUtils.isAStore(seen)) {\n \
|
187 |
-
\ if (stack.getStackDepth() > 0) {\n OpcodeStack.Item\
|
188 |
-
\ itm = stack.getStackItem(0);\n Integer id = (Integer) itm.getUserValue();\n\
|
189 |
-
\ if (id != null) {\n int reg = RegisterUtils.getAStoreReg(this,\
|
190 |
-
\ seen);\n\n try {\n JavaClass\
|
191 |
-
\ cls = itm.getJavaClass();\n if ((cls != null) &&\
|
192 |
-
\ cls.implementationOf(iteratorClass)) {\n Integer\
|
193 |
-
\ regIt = Integer.valueOf(reg);\n Iterator<Integer>\
|
194 |
-
\ curIt = groupToIterator.values().iterator();\n \
|
195 |
-
\ while (curIt.hasNext()) {\n if (curIt.next().equals(regIt))\
|
196 |
-
\ {\n curIt.remove();\n \
|
197 |
-
\ }\n }\n \
|
198 |
-
\ groupToIterator.put(id, regIt);\n }\n\
|
199 |
-
\n GroupPair pair = collectionGroups.get(id.intValue());\n\
|
200 |
-
\ if (pair != null) {\n \
|
201 |
-
\ pair.addMember(Integer.valueOf(reg));\n }\n \
|
202 |
-
\ } catch (ClassNotFoundException cnfe) {\n \
|
203 |
-
\ bugReporter.reportMissingClass(cnfe);\n \
|
204 |
-
\ }\n } else {\n String cls = itm.getSignature();\n\
|
205 |
-
\ if ((cls != null) && cls.startsWith(Values.SIG_QUALIFIED_CLASS_PREFIX))\
|
206 |
-
\ {\n cls = SignatureUtils.trimSignature(cls);\n \
|
207 |
-
\ if (isCollection(cls) || \"java/util/Iterator\".equals(cls))\
|
208 |
-
\ {\n int reg = RegisterUtils.getAStoreReg(this,\
|
209 |
-
\ seen);\n removeFromCollectionGroup(new OpcodeStack.Item(itm,\
|
210 |
-
\ reg));\n Iterator<Integer> it = groupToIterator.values().iterator();\n\
|
211 |
-
\ while (it.hasNext()) {\n \
|
212 |
-
\ if (it.next().intValue() == reg) {\n \
|
213 |
-
\ it.remove();\n break;\n\
|
214 |
-
\ }\n }\n \
|
215 |
-
\ }\n }\n }\n\
|
216 |
-
\ }\n } else if (OpcodeUtils.isALoad(seen)) {\n \
|
217 |
-
\ int reg = RegisterUtils.getALoadReg(this, seen);\n \
|
218 |
-
\ OpcodeStack.Item itm = new OpcodeStack.Item(new OpcodeStack.Item(), reg);\n\
|
219 |
-
\ groupId = findCollectionGroup(itm, false);\n } else\
|
220 |
-
\ if ((seen == Const.IFEQ) && (stack.getStackDepth() > 0)) {\n \
|
221 |
-
\ OpcodeStack.Item itm = stack.getStackItem(0);\n Integer id =\
|
222 |
-
\ (Integer) itm.getUserValue();\n if (id != null) {\n \
|
223 |
-
\ int target = getBranchTarget();\n int gotoAddr\
|
224 |
-
\ = target - 3;\n int ins = getCode().getCode()[gotoAddr];\n\
|
225 |
-
\ if (ins < 0) {\n ins = 256 + ins;\n\
|
226 |
-
\ }\n if ((ins == Const.GOTO) || (ins ==\
|
227 |
-
\ Const.GOTO_W)) {\n Integer reg = groupToIterator.get(id);\n\
|
228 |
-
\ if (reg != null) {\n loops.put(reg,\
|
229 |
-
\ new Loop(getPC(), gotoAddr));\n }\n \
|
230 |
-
\ }\n }\n }\n } finally {\n TernaryPatcher.pre(stack,\
|
231 |
-
\ seen);\n stack.sawOpcode(this, seen);\n TernaryPatcher.post(stack,\
|
232 |
-
\ seen);\n if ((groupId >= 0) && (stack.getStackDepth() > 0)) {\n \
|
233 |
-
\ OpcodeStack.Item itm = stack.getStackItem(0);\n \
|
234 |
-
\ itm.setUserValue(Integer.valueOf(groupId));\n }\n\n processEndOfScopes(Integer.valueOf(getPC()));\n\
|
235 |
-
\ }\n }"
|
236 |
-
- "function Concept(concept) {\n if (!(concept.prefLabel && concept.id && concept.type\
|
237 |
-
\ === 'Concept')) {\n throw new Error('Invalid concept: \"' + concept.id +\
|
238 |
-
\ '\"');\n }\n this.id = concept.id;\n this.prefLabel = concept.prefLabel;\n\
|
239 |
-
\ this.altLabel = concept.altLabel;\n this.hiddenLabel = concept.hiddenLabel;\n\
|
240 |
-
\ this.definition = concept.definition;\n this._topConceptOf = concept.topConceptOf;\n\
|
241 |
-
\ this._partOfScheme = false;\n this._originalConcept = concept;\n this._broaderConcepts\
|
242 |
-
\ = [];\n this._narrowerConcepts = [];\n this._relatedConcepts = [];\n}"
|
243 |
-
- "public MessageDestinationComponent addDestination() { //3\r\n MessageDestinationComponent\
|
244 |
-
\ t = new MessageDestinationComponent();\r\n if (this.destination == null)\r\
|
245 |
-
\n this.destination = new ArrayList<MessageDestinationComponent>();\r\n\
|
246 |
-
\ this.destination.add(t);\r\n return t;\r\n }"
|
247 |
pipeline_tag: sentence-similarity
|
248 |
library_name: sentence-transformers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
---
|
250 |
|
251 |
-
#
|
252 |
|
253 |
-
|
254 |
|
255 |
-
|
256 |
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
- **Maximum Sequence Length:** 1024 tokens
|
261 |
-
- **Output Dimensionality:** 768 dimensions
|
262 |
-
- **Similarity Function:** Cosine Similarity
|
263 |
-
<!-- - **Training Dataset:** Unknown -->
|
264 |
-
<!-- - **Language:** Unknown -->
|
265 |
-
<!-- - **License:** Unknown -->
|
266 |
|
267 |
-
|
268 |
|
269 |
-
-
|
270 |
-
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
|
271 |
-
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
|
272 |
|
273 |
-
|
274 |
|
275 |
-
|
276 |
-
SentenceTransformer(
|
277 |
-
(0): Transformer({'max_seq_length': 1024, 'do_lower_case': False}) with Transformer model: ModernBertModel
|
278 |
-
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
279 |
-
)
|
280 |
-
```
|
281 |
|
282 |
-
|
|
|
|
|
|
|
283 |
|
284 |
-
|
285 |
|
286 |
-
|
287 |
|
288 |
-
|
289 |
-
pip install -U sentence-transformers
|
290 |
-
```
|
291 |
|
292 |
-
Then you can load this model and run inference.
|
293 |
```python
|
294 |
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
|
295 |
|
296 |
-
#
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
'syntactic sugar',
|
301 |
-
'public MessageDestinationComponent addDestination() { //3\r\n MessageDestinationComponent t = new MessageDestinationComponent();\r\n if (this.destination == null)\r\n this.destination = new ArrayList<MessageDestinationComponent>();\r\n this.destination.add(t);\r\n return t;\r\n }',
|
302 |
-
'function Concept(concept) {\n if (!(concept.prefLabel && concept.id && concept.type === \'Concept\')) {\n throw new Error(\'Invalid concept: "\' + concept.id + \'"\');\n }\n this.id = concept.id;\n this.prefLabel = concept.prefLabel;\n this.altLabel = concept.altLabel;\n this.hiddenLabel = concept.hiddenLabel;\n this.definition = concept.definition;\n this._topConceptOf = concept.topConceptOf;\n this._partOfScheme = false;\n this._originalConcept = concept;\n this._broaderConcepts = [];\n this._narrowerConcepts = [];\n this._relatedConcepts = [];\n}',
|
303 |
]
|
304 |
-
embeddings = model.encode(sentences)
|
305 |
-
print(embeddings.shape)
|
306 |
-
# [3, 768]
|
307 |
-
|
308 |
-
# Get the similarity scores for the embeddings
|
309 |
-
similarities = model.similarity(embeddings, embeddings)
|
310 |
-
print(similarities.shape)
|
311 |
-
# [3, 3]
|
312 |
-
```
|
313 |
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
You can finetune this model on your own dataset.
|
326 |
-
|
327 |
-
<details><summary>Click to expand</summary>
|
328 |
-
|
329 |
-
</details>
|
330 |
-
-->
|
331 |
-
|
332 |
-
<!--
|
333 |
-
### Out-of-Scope Use
|
334 |
-
|
335 |
-
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
336 |
-
-->
|
337 |
-
|
338 |
-
<!--
|
339 |
-
## Bias, Risks and Limitations
|
340 |
-
|
341 |
-
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
342 |
-
-->
|
343 |
-
|
344 |
-
<!--
|
345 |
-
### Recommendations
|
346 |
-
|
347 |
-
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
348 |
-
-->
|
349 |
-
|
350 |
-
## Training Details
|
351 |
-
|
352 |
-
### Training Dataset
|
353 |
-
|
354 |
-
#### Unnamed Dataset
|
355 |
-
|
356 |
-
* Size: 1,761,493 training samples
|
357 |
-
* Columns: <code>sentence_0</code>, <code>sentence_1</code>, and <code>label</code>
|
358 |
-
* Approximate statistics based on the first 1000 samples:
|
359 |
-
| | sentence_0 | sentence_1 | label |
|
360 |
-
|:--------|:-----------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------|:--------------------------------------------------------------|
|
361 |
-
| type | string | string | float |
|
362 |
-
| details | <ul><li>min: 3 tokens</li><li>mean: 48.06 tokens</li><li>max: 914 tokens</li></ul> | <ul><li>min: 28 tokens</li><li>mean: 183.37 tokens</li><li>max: 1024 tokens</li></ul> | <ul><li>min: 1.0</li><li>mean: 1.0</li><li>max: 1.0</li></ul> |
|
363 |
-
* Samples:
|
364 |
-
| sentence_0 | sentence_1 | label |
|
365 |
-
|:-----------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------|
|
366 |
-
| <code>// NewMockUnit creates a new mock instance</code> | <code>func NewMockUnit(ctrl *gomock.Controller) *MockUnit {<br> mock := &MockUnit{ctrl: ctrl}<br> mock.recorder = &MockUnitMockRecorder{mock}<br> return mock<br>}</code> | <code>1.0</code> |
|
367 |
-
| <code>// SetNextPageToken sets the NextPageToken field's value.</code> | <code>func (s *ListBudgetsForResourceOutput) SetNextPageToken(v string) *ListBudgetsForResourceOutput {<br> s.NextPageToken = &v<br> return s<br>}</code> | <code>1.0</code> |
|
368 |
-
| <code>// addHandler adds a handler for a puType/event.</code> | <code>func (r *registerer) addHandler(puType common.PUType, event common.Event, handler common.EventHandler) {<br> r.handlers[puType][event] = handler<br>}</code> | <code>1.0</code> |
|
369 |
-
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
370 |
-
```json
|
371 |
-
{
|
372 |
-
"scale": 20.0,
|
373 |
-
"similarity_fct": "cos_sim"
|
374 |
-
}
|
375 |
-
```
|
376 |
-
|
377 |
-
### Training Hyperparameters
|
378 |
-
#### Non-Default Hyperparameters
|
379 |
-
|
380 |
-
- `per_device_train_batch_size`: 256
|
381 |
-
- `per_device_eval_batch_size`: 256
|
382 |
-
- `num_train_epochs`: 5
|
383 |
-
- `fp16`: True
|
384 |
-
- `multi_dataset_batch_sampler`: round_robin
|
385 |
-
|
386 |
-
#### All Hyperparameters
|
387 |
-
<details><summary>Click to expand</summary>
|
388 |
-
|
389 |
-
- `overwrite_output_dir`: False
|
390 |
-
- `do_predict`: False
|
391 |
-
- `eval_strategy`: no
|
392 |
-
- `prediction_loss_only`: True
|
393 |
-
- `per_device_train_batch_size`: 256
|
394 |
-
- `per_device_eval_batch_size`: 256
|
395 |
-
- `per_gpu_train_batch_size`: None
|
396 |
-
- `per_gpu_eval_batch_size`: None
|
397 |
-
- `gradient_accumulation_steps`: 1
|
398 |
-
- `eval_accumulation_steps`: None
|
399 |
-
- `torch_empty_cache_steps`: None
|
400 |
-
- `learning_rate`: 5e-05
|
401 |
-
- `weight_decay`: 0.0
|
402 |
-
- `adam_beta1`: 0.9
|
403 |
-
- `adam_beta2`: 0.999
|
404 |
-
- `adam_epsilon`: 1e-08
|
405 |
-
- `max_grad_norm`: 1
|
406 |
-
- `num_train_epochs`: 5
|
407 |
-
- `max_steps`: -1
|
408 |
-
- `lr_scheduler_type`: linear
|
409 |
-
- `lr_scheduler_kwargs`: {}
|
410 |
-
- `warmup_ratio`: 0.0
|
411 |
-
- `warmup_steps`: 0
|
412 |
-
- `log_level`: passive
|
413 |
-
- `log_level_replica`: warning
|
414 |
-
- `log_on_each_node`: True
|
415 |
-
- `logging_nan_inf_filter`: True
|
416 |
-
- `save_safetensors`: True
|
417 |
-
- `save_on_each_node`: False
|
418 |
-
- `save_only_model`: False
|
419 |
-
- `restore_callback_states_from_checkpoint`: False
|
420 |
-
- `no_cuda`: False
|
421 |
-
- `use_cpu`: False
|
422 |
-
- `use_mps_device`: False
|
423 |
-
- `seed`: 42
|
424 |
-
- `data_seed`: None
|
425 |
-
- `jit_mode_eval`: False
|
426 |
-
- `use_ipex`: False
|
427 |
-
- `bf16`: False
|
428 |
-
- `fp16`: True
|
429 |
-
- `fp16_opt_level`: O1
|
430 |
-
- `half_precision_backend`: auto
|
431 |
-
- `bf16_full_eval`: False
|
432 |
-
- `fp16_full_eval`: False
|
433 |
-
- `tf32`: None
|
434 |
-
- `local_rank`: 0
|
435 |
-
- `ddp_backend`: None
|
436 |
-
- `tpu_num_cores`: None
|
437 |
-
- `tpu_metrics_debug`: False
|
438 |
-
- `debug`: []
|
439 |
-
- `dataloader_drop_last`: False
|
440 |
-
- `dataloader_num_workers`: 0
|
441 |
-
- `dataloader_prefetch_factor`: None
|
442 |
-
- `past_index`: -1
|
443 |
-
- `disable_tqdm`: False
|
444 |
-
- `remove_unused_columns`: True
|
445 |
-
- `label_names`: None
|
446 |
-
- `load_best_model_at_end`: False
|
447 |
-
- `ignore_data_skip`: False
|
448 |
-
- `fsdp`: []
|
449 |
-
- `fsdp_min_num_params`: 0
|
450 |
-
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
451 |
-
- `tp_size`: 0
|
452 |
-
- `fsdp_transformer_layer_cls_to_wrap`: None
|
453 |
-
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
454 |
-
- `deepspeed`: None
|
455 |
-
- `label_smoothing_factor`: 0.0
|
456 |
-
- `optim`: adamw_torch
|
457 |
-
- `optim_args`: None
|
458 |
-
- `adafactor`: False
|
459 |
-
- `group_by_length`: False
|
460 |
-
- `length_column_name`: length
|
461 |
-
- `ddp_find_unused_parameters`: None
|
462 |
-
- `ddp_bucket_cap_mb`: None
|
463 |
-
- `ddp_broadcast_buffers`: False
|
464 |
-
- `dataloader_pin_memory`: True
|
465 |
-
- `dataloader_persistent_workers`: False
|
466 |
-
- `skip_memory_metrics`: True
|
467 |
-
- `use_legacy_prediction_loop`: False
|
468 |
-
- `push_to_hub`: False
|
469 |
-
- `resume_from_checkpoint`: None
|
470 |
-
- `hub_model_id`: None
|
471 |
-
- `hub_strategy`: every_save
|
472 |
-
- `hub_private_repo`: None
|
473 |
-
- `hub_always_push`: False
|
474 |
-
- `gradient_checkpointing`: False
|
475 |
-
- `gradient_checkpointing_kwargs`: None
|
476 |
-
- `include_inputs_for_metrics`: False
|
477 |
-
- `include_for_metrics`: []
|
478 |
-
- `eval_do_concat_batches`: True
|
479 |
-
- `fp16_backend`: auto
|
480 |
-
- `push_to_hub_model_id`: None
|
481 |
-
- `push_to_hub_organization`: None
|
482 |
-
- `mp_parameters`:
|
483 |
-
- `auto_find_batch_size`: False
|
484 |
-
- `full_determinism`: False
|
485 |
-
- `torchdynamo`: None
|
486 |
-
- `ray_scope`: last
|
487 |
-
- `ddp_timeout`: 1800
|
488 |
-
- `torch_compile`: False
|
489 |
-
- `torch_compile_backend`: None
|
490 |
-
- `torch_compile_mode`: None
|
491 |
-
- `include_tokens_per_second`: False
|
492 |
-
- `include_num_input_tokens_seen`: False
|
493 |
-
- `neftune_noise_alpha`: None
|
494 |
-
- `optim_target_modules`: None
|
495 |
-
- `batch_eval_metrics`: False
|
496 |
-
- `eval_on_start`: False
|
497 |
-
- `use_liger_kernel`: False
|
498 |
-
- `eval_use_gather_object`: False
|
499 |
-
- `average_tokens_across_devices`: False
|
500 |
-
- `prompts`: None
|
501 |
-
- `batch_sampler`: batch_sampler
|
502 |
-
- `multi_dataset_batch_sampler`: round_robin
|
503 |
-
|
504 |
-
</details>
|
505 |
-
|
506 |
-
### Training Logs
|
507 |
-
| Epoch | Step | Training Loss |
|
508 |
-
|:------:|:-----:|:-------------:|
|
509 |
-
| 0.0727 | 500 | 0.7681 |
|
510 |
-
| 0.1453 | 1000 | 0.1157 |
|
511 |
-
| 0.2180 | 1500 | 0.1068 |
|
512 |
-
| 0.2907 | 2000 | 0.0979 |
|
513 |
-
| 0.3633 | 2500 | 0.0969 |
|
514 |
-
| 0.4360 | 3000 | 0.0945 |
|
515 |
-
| 0.5086 | 3500 | 0.0918 |
|
516 |
-
| 0.5813 | 4000 | 0.0905 |
|
517 |
-
| 0.6540 | 4500 | 0.088 |
|
518 |
-
| 0.7266 | 5000 | 0.0854 |
|
519 |
-
| 0.7993 | 5500 | 0.0855 |
|
520 |
-
| 0.8720 | 6000 | 0.0873 |
|
521 |
-
| 0.9446 | 6500 | 0.082 |
|
522 |
-
| 1.0173 | 7000 | 0.0728 |
|
523 |
-
| 1.0900 | 7500 | 0.0427 |
|
524 |
-
| 1.1626 | 8000 | 0.0435 |
|
525 |
-
| 1.2353 | 8500 | 0.0441 |
|
526 |
-
| 1.3079 | 9000 | 0.045 |
|
527 |
-
| 1.3806 | 9500 | 0.0444 |
|
528 |
-
| 1.4533 | 10000 | 0.0433 |
|
529 |
-
| 1.5259 | 10500 | 0.0447 |
|
530 |
-
| 1.5986 | 11000 | 0.0443 |
|
531 |
-
| 1.6713 | 11500 | 0.0439 |
|
532 |
-
| 1.7439 | 12000 | 0.0449 |
|
533 |
-
| 1.8166 | 12500 | 0.0441 |
|
534 |
-
| 1.8893 | 13000 | 0.0443 |
|
535 |
-
| 1.9619 | 13500 | 0.0461 |
|
536 |
-
| 2.0346 | 14000 | 0.0335 |
|
537 |
-
| 2.1073 | 14500 | 0.0192 |
|
538 |
-
| 2.1799 | 15000 | 0.0199 |
|
539 |
-
| 2.2526 | 15500 | 0.0197 |
|
540 |
-
| 2.3252 | 16000 | 0.0199 |
|
541 |
-
| 2.3979 | 16500 | 0.02 |
|
542 |
-
| 2.4706 | 17000 | 0.0206 |
|
543 |
-
| 2.5432 | 17500 | 0.0204 |
|
544 |
-
| 2.6159 | 18000 | 0.0202 |
|
545 |
-
| 2.6886 | 18500 | 0.0206 |
|
546 |
-
| 2.7612 | 19000 | 0.0209 |
|
547 |
-
| 2.8339 | 19500 | 0.0211 |
|
548 |
-
| 2.9066 | 20000 | 0.0207 |
|
549 |
-
| 2.9792 | 20500 | 0.0202 |
|
550 |
-
| 3.0519 | 21000 | 0.014 |
|
551 |
-
| 3.1245 | 21500 | 0.0112 |
|
552 |
-
| 3.1972 | 22000 | 0.0111 |
|
553 |
-
| 3.2699 | 22500 | 0.0113 |
|
554 |
-
| 3.3425 | 23000 | 0.0117 |
|
555 |
-
| 3.4152 | 23500 | 0.0116 |
|
556 |
-
| 3.4879 | 24000 | 0.0118 |
|
557 |
-
| 3.5605 | 24500 | 0.0114 |
|
558 |
-
| 3.6332 | 25000 | 0.0114 |
|
559 |
-
| 3.7059 | 25500 | 0.011 |
|
560 |
-
| 3.7785 | 26000 | 0.0109 |
|
561 |
-
| 3.8512 | 26500 | 0.0113 |
|
562 |
-
| 3.9238 | 27000 | 0.0113 |
|
563 |
-
| 3.9965 | 27500 | 0.0111 |
|
564 |
-
| 4.0692 | 28000 | 0.0085 |
|
565 |
-
| 4.1418 | 28500 | 0.0081 |
|
566 |
-
| 4.2145 | 29000 | 0.0082 |
|
567 |
-
| 4.2872 | 29500 | 0.0083 |
|
568 |
-
| 4.3598 | 30000 | 0.0085 |
|
569 |
-
| 4.4325 | 30500 | 0.0086 |
|
570 |
-
| 4.5052 | 31000 | 0.0082 |
|
571 |
-
| 4.5778 | 31500 | 0.0083 |
|
572 |
-
| 4.6505 | 32000 | 0.0084 |
|
573 |
-
| 4.7232 | 32500 | 0.0085 |
|
574 |
-
| 4.7958 | 33000 | 0.0083 |
|
575 |
-
| 4.8685 | 33500 | 0.0082 |
|
576 |
-
| 4.9411 | 34000 | 0.0081 |
|
577 |
-
|
578 |
-
|
579 |
-
### Framework Versions
|
580 |
-
- Python: 3.11.12
|
581 |
-
- Sentence Transformers: 3.4.1
|
582 |
-
- Transformers: 4.51.3
|
583 |
-
- PyTorch: 2.6.0+cu124
|
584 |
-
- Accelerate: 1.5.2
|
585 |
-
- Datasets: 3.5.0
|
586 |
-
- Tokenizers: 0.21.1
|
587 |
-
|
588 |
-
## Citation
|
589 |
-
|
590 |
-
### BibTeX
|
591 |
-
|
592 |
-
#### Sentence Transformers
|
593 |
-
```bibtex
|
594 |
-
@inproceedings{reimers-2019-sentence-bert,
|
595 |
-
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
596 |
-
author = "Reimers, Nils and Gurevych, Iryna",
|
597 |
-
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
598 |
-
month = "11",
|
599 |
-
year = "2019",
|
600 |
-
publisher = "Association for Computational Linguistics",
|
601 |
-
url = "https://arxiv.org/abs/1908.10084",
|
602 |
-
}
|
603 |
-
```
|
604 |
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
year={2017},
|
611 |
-
eprint={1705.00652},
|
612 |
-
archivePrefix={arXiv},
|
613 |
-
primaryClass={cs.CL}
|
614 |
-
}
|
615 |
```
|
616 |
|
617 |
-
|
618 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
619 |
|
620 |
-
*
|
621 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
622 |
|
623 |
-
<!--
|
624 |
-
## Model Card Authors
|
625 |
|
626 |
-
|
627 |
-
-->
|
628 |
|
629 |
-
|
630 |
-
|
631 |
|
632 |
-
|
633 |
-
-->
|
|
|
3 |
- sentence-transformers
|
4 |
- sentence-similarity
|
5 |
- feature-extraction
|
6 |
+
- code-search
|
7 |
+
- modernbert
|
8 |
+
- code
|
9 |
+
- python
|
10 |
+
- java
|
11 |
+
- javascript
|
12 |
+
- php
|
13 |
+
- ruby
|
14 |
+
- rust
|
15 |
+
- go
|
16 |
base_model: Shuu12121/CodeModernBERT-Crow
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
pipeline_tag: sentence-similarity
|
18 |
library_name: sentence-transformers
|
19 |
+
license: apache-2.0
|
20 |
+
datasets:
|
21 |
+
- code-search-net/code_search_net
|
22 |
+
- Shuu12121/python-codesearch-filtered
|
23 |
+
- Shuu12121/java-codesearch-filtered
|
24 |
+
- Shuu12121/javascript-codesearch-filtered
|
25 |
+
- Shuu12121/rust-codesearch-filtered
|
26 |
+
- Shuu12121/ruby-codesearch-filtered
|
27 |
+
language:
|
28 |
+
- en
|
29 |
+
widget:
|
30 |
+
# Example queries for the widget
|
31 |
+
- src_texts:
|
32 |
+
- "def greet(name): print(f'Hello, {name}!')"
|
33 |
+
- "function calculateSum(a, b) { return a + b; }"
|
34 |
+
- "public static void main(String[] args) { System.out.println(\"Hello World\"); }"
|
35 |
+
example_title: "Code Snippets"
|
36 |
+
- src_texts:
|
37 |
+
- "A function that prints a greeting."
|
38 |
+
- "Function to compute the sum of two numbers."
|
39 |
+
- "The main entry point of a Java program."
|
40 |
+
example_title: "Descriptions"
|
41 |
---
|
42 |
|
43 |
+
# Shuu12121/CodeSearch-ModernBERT-Crow-Plus🐦⬛
|
44 |
|
45 |
+
このモデルは、`Shuu12121/CodeModernBERT-Crow` をベースにした Sentence Transformer モデルであり、特に多言語コード検索タスクにおいて高い性能を発揮するようにファインチューニングされています。
|
46 |
|
47 |
+
This is a Sentence Transformer model based on `Shuu12121/CodeModernBERT-Crow`, fine-tuned for high performance on multilingual code search tasks.
|
48 |
|
49 |
+
開発者 (Developer): [Shuu12121](https://huggingface.co/Shuu12121)
|
50 |
+
ベースモデル (Base Model): [Shuu12121/CodeModernBERT-Crow](https://huggingface.co/Shuu12121/CodeModernBERT-Crow)
|
51 |
+
ライセンス (License): Apache-2.0
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
+
## 概要 / Overview
|
54 |
|
55 |
+
`CodeSearch-ModernBERT-Crow-Plus` は、自然言語のクエリと複数のプログラミング言語(Python, Java, JavaScript, PHP, Ruby, Go, Rust)のコードスニペット(主に関数レベル)間の意味的な類似性を捉えるために設計された Sentence Transformer モデルです。ベースモデルである `CodeModernBERT-Crow` の強力なコード理解能力を継承し、コード検索や類似性判定タスクに最適化されています。
|
|
|
|
|
56 |
|
57 |
+
`CodeSearch-ModernBERT-Crow-Plus` is a Sentence Transformer model designed to capture the semantic similarity between natural language queries and code snippets (primarily at the function level) across multiple programming languages (Python, Java, JavaScript, PHP, Ruby, Go, Rust). It inherits the strong code understanding capabilities of its base model, `CodeModernBERT-Crow`, and is optimized for code search and similarity tasks.
|
58 |
|
59 |
+
## モデル詳細 / Model Details
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
+
* **ベースモデル / Base Model:** [Shuu12121/CodeModernBERT-Crow](https://huggingface.co/Shuu12121/CodeModernBERT-Crow)
|
62 |
+
* **アーキテクチャ / Architecture:** ModernBERT (hidden\_size: 768, layers: 12, heads: 12)
|
63 |
+
* **最大入力長 / Max Sequence Length:** 1024 トークン / 1024 tokens
|
64 |
+
* **ファインチューニング / Fine-tuning:** このモデルは、コードとその対応するドキュメント(例:CodeSearchNet データセット)を用いた類似性学習タスクでファインチューニングされていると考えられます。Sentence Transformer ライブラリで使用するために、Pooling 層が追加されています。
|
65 |
|
66 |
+
## 使用方法 / How to Use
|
67 |
|
68 |
+
`sentence-transformers` ライブラリを使って簡単に利用できます。
|
69 |
|
70 |
+
You can easily use this model with the `sentence-transformers` library.
|
|
|
|
|
71 |
|
|
|
72 |
```python
|
73 |
from sentence_transformers import SentenceTransformer
|
74 |
+
import torch
|
75 |
+
|
76 |
+
# モデルのロード / Load the model
|
77 |
+
model = SentenceTransformer("Shuu12121/CodeSearch-ModernBERT-Crow-Plus")
|
78 |
|
79 |
+
# エンコードしたいテキスト(コードまたは自然言語) / Texts to encode (code or natural language)
|
80 |
+
code_snippets = [
|
81 |
+
"def factorial(n): if n == 0: return 1 else: return n * factorial(n-1)",
|
82 |
+
"function binarySearch(arr, target) { let left = 0, right = arr.length - 1; while (left <= right) { const mid = Math.floor((left + right) / 2); if (arr[mid] === target) return mid; if (arr[mid] < target) left = mid + 1; else right = mid - 1; } return -1; }"
|
|
|
|
|
|
|
83 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
+
natural_language_queries = [
|
86 |
+
"calculate the factorial of a number recursively",
|
87 |
+
"find an element in a sorted array using binary search"
|
88 |
+
]
|
89 |
+
|
90 |
+
# エンベディングの取得 / Get embeddings
|
91 |
+
code_embeddings = model.encode(code_snippets)
|
92 |
+
query_embeddings = model.encode(natural_language_queries)
|
93 |
+
|
94 |
+
print("Code Embeddings Shape:", code_embeddings.shape)
|
95 |
+
print("Query Embeddings Shape:", query_embeddings.shape)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
+
# 類似度の計算(例:コサイン類似度) / Calculate similarity (e.g., cosine similarity)
|
98 |
+
# Requires a similarity function, e.g., from sentence_transformers.util or sklearn.metrics.pairwise
|
99 |
+
# from sentence_transformers.util import cos_sim
|
100 |
+
# similarities = cos_sim(query_embeddings, code_embeddings)
|
101 |
+
# print(similarities)
|
|
|
|
|
|
|
|
|
|
|
102 |
```
|
103 |
|
104 |
+
## 評価 / Evaluation
|
105 |
+
|
106 |
+
このモデルは [MTEB (Massive Text Embedding Benchmark)](https://huggingface.co/spaces/mteb/leaderboard) で評価されています。
|
107 |
+
|
108 |
+
This model has been evaluated on [MTEB (Massive Text Embedding Benchmark)](https://huggingface.co/spaces/mteb/leaderboard).
|
109 |
+
|
110 |
+
**タスク: CodeSearchNet Retrieval**
|
111 |
+
|
112 |
+
* **MTEB 標準評価 (main_score: nDCG@10): 0.8890**
|
113 |
+
* `ndcg_at_1`: 0.8060
|
114 |
+
* `ndcg_at_3`: 0.8748
|
115 |
+
* `ndcg_at_5`: 0.8837
|
116 |
+
* `ndcg_at_10`: 0.8890
|
117 |
+
* `ndcg_at_100`: 0.8940
|
118 |
+
* `map_at_10`: 0.8659
|
119 |
+
* `recall_at_10`: 0.9585
|
120 |
+
* `mrr_at_10`: 0.8659
|
121 |
+
|
122 |
+
* **CoIR (Code Information Retrieval benchmark) 設定での評価 (main_score: nDCG@10): 0.7985**
|
123 |
+
* `ndcg_at_1`: 0.7124
|
124 |
+
* `ndcg_at_3`: 0.7752
|
125 |
+
* `ndcg_at_5`: 0.7874
|
126 |
+
* `ndcg_at_10`: 0.7985
|
127 |
+
* `ndcg_at_100`: 0.8126
|
128 |
+
* `map_at_10`: 0.7717
|
129 |
+
* `recall_at_10`: 0.8818
|
130 |
+
* `mrr_at_10`: 0.7717
|
131 |
+
|
132 |
+
*注: 評価設定の違いにより、同じ CodeSearchNet Retrieval タスクでもスコアが異なります。*
|
133 |
+
*Note: Scores differ for the same CodeSearchNet Retrieval task due to different evaluation settings.*
|
134 |
+
|
135 |
+
参考として、ベースモデル `Shuu12121/CodeModernBERT-Crow` の CodeSearchNet Test Split における MRR@100 スコアは以下の通りです(固定評価スクリプト使用)。
|
136 |
+
|
137 |
+
For reference, the MRR@100 scores for the base model `Shuu12121/CodeModernBERT-Crow` on the CodeSearchNet Test Split (using a fixed evaluation script) are:
|
138 |
+
|
139 |
+
| 言語 / Language | Python | Java | JavaScript | PHP | Ruby | Go |
|
140 |
+
| :-------------- | :----- | :----- | :--------- | :----- | :----- | :----- |
|
141 |
+
| MRR@100 | 0.9372 | 0.8642 | 0.8118 | 0.8388 | 0.8392 | 0.8522 |
|
142 |
+
|
143 |
+
|
144 |
+
## 意図された用途と制限 / Intended Use & Limitations
|
145 |
|
146 |
+
* **意図された用途 / Intended Use:**
|
147 |
+
* 多言語コード検索 (Natural Language to Code, Code to Code)
|
148 |
+
* コードの類似性判定
|
149 |
+
* コード分類やクラスタリングのための特徴抽出
|
150 |
+
* コード推薦システム
|
151 |
+
* **対象言語 / Target Languages:** Python, Java, JavaScript, PHP, Ruby, Go, Rust
|
152 |
+
* **制限 / Limitations:**
|
153 |
+
* 主に関数レベルのコードスニペットに最適化されています。非常に長いコードファイル全体や、構文的に不完全なコードに対する性能は低下する可能性があります。
|
154 |
+
* 特定のドメインやライブラリに特化したタスクでは、追加のファインチューニングが有効な場合があります。
|
155 |
+
* 生成タスクには適していません(これはエンコーダモデルです)。
|
156 |
|
|
|
|
|
157 |
|
158 |
+
## 連絡先 / Contact
|
|
|
159 |
|
160 |
+
質問や提案については、開発者 Shuu12121 までご連絡ください。
|
161 |
+
For questions or suggestions, please contact the developer Shuu12121.
|
162 |
|
163 |
+
📧 shun0212114@outlook.jp
|
|