from collections import Counter

import tiktoken
from transformers import AutoTokenizer
# ... existing code ...
def analyze_tokens_detailed(text, model):
"""
For a given text and model, returns a list of dicts with details for each token:
- token string
- token id
- decoded value
- token length
- NSL value (token length / max token length in sequence)
- subword fertility (number of tokens per word)
Also returns the decoded output for the entire sequence.
"""
    # Tokenize: tiktoken for OpenAI GPT models, Hugging Face otherwise.
    if 'gpt' in model:
        tokenizer = tiktoken.encoding_for_model(model)
        token_ids = tokenizer.encode(text)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model)
        token_ids = tokenizer.encode(text, add_special_tokens=False)
    # Per-token decoded strings; both tokenizer APIs expose decode().
    tokens = [tokenizer.decode([tid]) for tid in token_ids]
    # Decoded output for the entire sequence (same call for both APIs).
    decoded_output = tokenizer.decode(token_ids)
    # Token lengths and NSL (each length normalized by the max in the sequence).
    token_lengths = [len(t) for t in tokens]
    max_token_length = max(token_lengths) if token_lengths else 1
    nsl_values = [length / max_token_length for length in token_lengths]
    # Subword fertility: number of tokens per word.
    # Align tokens to words greedily via character positions. This is
    # approximate, since decoding single token ids does not always
    # reproduce the original text exactly (e.g. byte-level BPE can split
    # multi-byte characters across tokens).
    words = text.split()
    if words:
        # Character span of each word in the original text.
        word_spans = []
        search_from = 0
        for word in words:
            start = text.index(word, search_from)
            word_spans.append((start, start + len(word)))
            search_from = start + len(word)
        # Walk the decoded tokens through the text, assigning each token
        # to the word whose span it starts in (-1 once past the last word).
        # word_idx only ever advances, so the walk always terminates.
        token_word_map = []
        char_pos = 0
        word_idx = 0
        for token in tokens:
            # Skip words that end at or before this token's start position.
            while word_idx < len(words) and word_spans[word_idx][1] <= char_pos:
                word_idx += 1
            token_word_map.append(word_idx if word_idx < len(words) else -1)
            char_pos += len(token)
        # Count tokens per word, then give each token its word's count.
        fertility_counter = Counter(token_word_map)
        token_fertility = [fertility_counter[idx] if idx >= 0 else 0
                           for idx in token_word_map]
    else:
        token_fertility = [1 for _ in tokens]
    # Build the per-token table.
    table = []
    for token, tid, length, nsl, fert in zip(
            tokens, token_ids, token_lengths, nsl_values, token_fertility):
        table.append({
            'token': token,
            'token_id': tid,
            # Per-id decode; with this scheme it equals the token string.
            'decoded': token,
            'token_length': length,
            'nsl': nsl,
            'subword_fertility': fert
        })
    return {
        'model': model,
        'decoded_output': decoded_output,
        'tokens': table
    }
# ... existing code ...
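
# Hypothetical usage sketch (not part of the original file); the model
# name below is an assumption: any model tiktoken recognizes as a "gpt"
# model, or any Hugging Face checkpoint name, should work the same way.
if __name__ == "__main__":
    result = analyze_tokens_detailed("The tokenizer splits words.", "gpt-4")
    print(result['decoded_output'])
    for row in result['tokens']:
        print(row['token'], row['token_id'], row['nsl'], row['subword_fertility'])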