import tiktoken
from collections import Counter
from transformers import AutoTokenizer

# ... existing code ...
def analyze_tokens_detailed(text, model):
    """
    For a given text and model, returns a list of dicts with details for each token:
    - token string
    - token id
    - decoded value
    - token length
    - NSL value (token length / max token length in sequence)
    - subword fertility (number of tokens per word)
    Also returns the decoded output for the entire sequence.
    """
    # Tokenize: use tiktoken for OpenAI GPT models, Hugging Face otherwise
    if 'gpt' in model:
        tokenizer = tiktoken.encoding_for_model(model)
        token_ids = tokenizer.encode(text)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model)
        token_ids = tokenizer.encode(text, add_special_tokens=False)
    # Decode each token id individually to get a per-token string
    tokens = [tokenizer.decode([tid]) for tid in token_ids]

    # Decoded output for the entire sequence (same call for both tokenizer types)
    decoded_output = tokenizer.decode(token_ids)

    # Token lengths and NSL (token length normalized by the longest token in the sequence)
    token_lengths = [len(t) for t in tokens]
    max_token_length = max(token_lengths) if token_lengths else 1
    nsl_values = [length / max_token_length for length in token_lengths]

    # Subword fertility: number of tokens produced per whitespace-separated word.
    # Each token is mapped to its originating word with a greedy, approximate scan.
    words = text.split()
    if words:
        text_pointer = 0
        word_idx = 0
        token_word_map = []
        for token in tokens:
            # Advance the pointer to the start of the current word
            # (bounded so the scan cannot run past the end of the text)
            while (word_idx < len(words) and text_pointer < len(text)
                   and not text[text_pointer:].startswith(words[word_idx])):
                text_pointer += 1
            if word_idx < len(words):
                token_word_map.append(word_idx)
                text_pointer += len(token)
                # Move to the next word once it begins at the current position
                if text_pointer >= len(text) or (
                        word_idx + 1 < len(words)
                        and text[text_pointer:].startswith(words[word_idx + 1])):
                    word_idx += 1
            else:
                # Token could not be matched to any remaining word
                token_word_map.append(-1)
        # Count tokens per word, then assign each token its word's fertility
        fertility_counter = Counter(token_word_map)
        token_fertility = [fertility_counter[idx] if idx >= 0 else 0
                           for idx in token_word_map]
    else:
        token_fertility = [1 for _ in tokens]

    # Build the per-token table; 'decoded' equals the token string here because
    # each token id was decoded individually above
    table = []
    for token, tid, length, nsl, fert in zip(tokens, token_ids, token_lengths,
                                             nsl_values, token_fertility):
        table.append({
            'token': token,
            'token_id': tid,
            'decoded': token,
            'token_length': length,
            'nsl': nsl,
            'subword_fertility': fert
        })
    return {
        'model': model,
        'decoded_output': decoded_output,
        'tokens': table
    }
# ... existing code ...
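
# Minimal usage sketch (illustrative only; "bert-base-uncased" is an assumed
# example model name and is not referenced elsewhere in this file):
if __name__ == "__main__":
    sample_text = "Tokenization splits words into subword units."
    report = analyze_tokens_detailed(sample_text, "bert-base-uncased")
    print(report['decoded_output'])
    for row in report['tokens']:
        print(f"{row['token']!r:>15}  id={row['token_id']:<8} "
              f"len={row['token_length']:<3} nsl={row['nsl']:.2f} "
              f"fertility={row['subword_fertility']}")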