from collections import Counter

import tiktoken
from transformers import AutoTokenizer


def analyze_tokens_detailed(text, model):
    """
    Tokenize `text` with the tokenizer for `model` and return a dict with:
      - 'model': the model name
      - 'decoded_output': the decoded string for the entire sequence
      - 'tokens': one dict per token, containing the token string, token id,
        decoded value, token length, NSL value (token length / max token
        length in the sequence), and subword fertility (number of tokens in
        the whitespace-delimited word the token belongs to)
    """
    # OpenAI GPT models go through tiktoken; anything else is treated as a
    # Hugging Face model id.
    if 'gpt' in model:
        tokenizer = tiktoken.encoding_for_model(model)
        token_ids = tokenizer.encode(text)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model)
        token_ids = tokenizer.encode(text, add_special_tokens=False)

    # Both tokenizer families expose the same decode() interface, so the
    # per-token strings and the full-sequence decode need no branching.
    tokens = [tokenizer.decode([tid]) for tid in token_ids]
    decoded_output = tokenizer.decode(token_ids)

    # Normalized sequence length (NSL): each token's character length divided
    # by the longest token in the sequence.
    token_lengths = [len(t) for t in tokens]
    max_token_length = max(token_lengths) if token_lengths else 1
    nsl_values = [length / max_token_length for length in token_lengths]

    # Subword fertility: map each token to the whitespace-delimited word it
    # falls in, then count how many tokens each word produced. The mapping
    # assumes the per-token decoded strings concatenate back to `text`
    # (true for lossless BPE tokenizers; an approximation otherwise).
    words = text.split()
    if words:
        # Character span (start, end) of every word in the original text.
        word_spans = []
        cursor = 0
        for word in words:
            start = text.index(word, cursor)
            word_spans.append((start, start + len(word)))
            cursor = start + len(word)

        # Assign each token to the word whose span contains the token's
        # first non-whitespace character; -1 if it lands in no word.
        token_word_map = []
        pointer = 0
        for token in tokens:
            if not token.strip():
                # Pure-whitespace token: belongs to no word.
                token_word_map.append(-1)
                pointer += len(token)
                continue
            offset = len(token) - len(token.lstrip())
            char_pos = pointer + offset
            word_idx = next(
                (i for i, (start, end) in enumerate(word_spans)
                 if start <= char_pos < end),
                -1,
            )
            token_word_map.append(word_idx)
            pointer += len(token)

        # Fertility of a token = total number of tokens in its word.
        fertility_counter = Counter(token_word_map)
        token_fertility = [fertility_counter[idx] if idx >= 0 else 0
                           for idx in token_word_map]
    else:
        token_fertility = [1 for _ in tokens]

    # Assemble the per-token table. 'decoded' equals the token string here,
    # since each token string was itself produced by decoding a single id.
    table = []
    for token, tid, length, nsl, fert in zip(
            tokens, token_ids, token_lengths, nsl_values, token_fertility):
        table.append({
            'token': token,
            'token_id': tid,
            'decoded': token,
            'token_length': length,
            'nsl': nsl,
            'subword_fertility': fert,
        })

    return {
        'model': model,
        'decoded_output': decoded_output,
        'tokens': table,
    }
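
# A minimal usage sketch (the model name below is illustrative; the exact
# tokens, NSL, and fertility values depend on the tokenizer version):
if __name__ == "__main__":
    report = analyze_tokens_detailed("Tokenization is tricky.", "gpt-4o")
    print(report['decoded_output'])
    for row in report['tokens']:
        # A word split into 3 subwords yields subword_fertility == 3 for
        # each of its pieces.
        print(row['token_id'], repr(row['token']),
              round(row['nsl'], 2), row['subword_fertility'])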