Rename handler.py to tokenizer.py
Browse files
- handler.py +0 -12
- tokenizer.py +14 -0
handler.py
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
class EndpointHandler:
    """Inference-endpoint wrapper: tokenizes incoming text and returns model logits.

    Loads a sequence-classification model and its tokenizer from the current
    directory, then serves `{"inputs": ...} -> {"logits": [[...]]}` calls.
    """

    def __init__(self):
        # Import deferred to construction time, as in the original layout,
        # so merely importing this module does not require transformers.
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(".")
        self.model = AutoModelForSequenceClassification.from_pretrained(".")

    def __call__(self, inputs: dict) -> dict:
        """Classify `inputs["inputs"]` and return JSON-serializable logits."""
        texts = inputs.get("inputs")
        encoded = self.tokenizer(texts, return_tensors="pt", padding=True)
        model_out = self.model(**encoded)
        # convert outputs to JSON-serializable form
        return {"logits": model_out.logits.tolist()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenizer.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import tiktoken
|
4 |
+
from torch import nn
|
5 |
+
tokenizer = tiktoken.get_encoding("cl100k_base")
|
6 |
+
|
7 |
+
def text_to_token_ids(text, tokenizer):
    """Encode `text` with `tokenizer` and return a batched token-id tensor.

    The `<|endoftext|>` marker is allowed as a literal special token.
    Returns a tensor of shape (1, seq_len), ready for model input.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Prepend a batch dimension with unsqueeze(0).
    return torch.tensor(ids).unsqueeze(0)
|
11 |
+
|
12 |
+
def token_ids_to_text(token_ids, tokenizer):
    """Decode a (1, seq_len) token-id tensor back into a string.

    Inverse of `text_to_token_ids`: strips the batch dimension and hands a
    plain Python list of ids to the tokenizer's decoder.
    """
    return tokenizer.decode(token_ids.squeeze(0).tolist())
|