home / assets /js /search.js
Narsil's picture
Narsil HF Staff
Static space.
4c7b631
raw
history blame
120 kB
var documents = [{
"id": 0,
"url": "http://localhost:4000/404.html",
"title": "",
"body": " 404 Page not found :( The requested page could not be found. "
}, {
"id": 1,
"url": "http://localhost:4000/about/",
"title": "About Me",
"body": "I’m an entrepreneur technical guy. Built french math/physics online training platform Kwyk. Interested in Machine Learning that actually works. Regularly trying to learn and try new languages/frameworks. "
}, {
"id": 2,
"url": "http://localhost:4000/categories/",
"title": "Tags",
"body": "Contents: {% if site. categories. size > 0 %} {% for category in site. categories %} {% capture category_name %}{{ category | first }}{% endcapture %} {{ category_name }}{% endfor %}{% endif %} {% for category in site. categories %} {% capture category_name %}{{ category | first }}{% endcapture %} <h3 id = {{ category_name }} ><i class= fas fa-tags category-tags-icon ></i></i> {{ category_name }}</h3><a name= {{ category_name | slugize }} ></a>{% for post in site. categories[category_name] %}{%- assign date_format = site. minima. date_format | default: %b %-d, %Y -%}<article class= archive-item > <p class= post-meta post-meta-title ><a class= page-meta href= {{ site. baseurl }}{{ post. url }} >{{post. title}}</a> • {{ post. date | date: date_format }}</p></article>{% endfor %} {% endfor %}"
}, {
"id": 3,
"url": "http://localhost:4000/images/copied_from_nb/",
"title": "",
"body": "WarningDo not manually save images into this folder. This is used by GitHub Actions to automatically copy images. Any images you save into this folder could be deleted at build time. "
}, {
"id": 4,
"url": "http://localhost:4000/ml/nlp/2020/07/22/creating-a-translate-app.html",
"title": "Creating a dutch translation app",
"body": "2020/07/22 - TL;DR Recently moved to the Netherlands, in order to avoid Googling translate everything, I did the next best thing to learning the language: I created a clone of translate. google. com Find a correct training loop: My first instinct was to check Hugging Face as this repo contains solid implementations that I know are easy to change. However, in that particular instance, the example for translation does not start from scratch, and I wanted to check what multilingual translation could do here, as I’m using English, Dutch & French on translate. google. com (For food sometimes french is much better than english for me). My second guess was Fairseq from facebook. In their example there is an actual example for multilingual German, French, English. Close enough for my needs. First things first, start to follow the example by the book. Most implementations out there are broken and won’t work out of the box. This time, it turned out particularly smooth. Clone the repo then follow the instructions # First install sacrebleu and sentencepiecepip install sacrebleu sentencepiece# Then download and preprocess the datacd examples/translation/bash prepare-iwslt17-multilingual. shcd . . /. . # Binarize the de-en datasetTEXT=examples/translation/iwslt17. de_fr. en. bpe16kfairseq-preprocess --source-lang de --target-lang en \ --trainpref $TEXT/train. bpe. de-en \ --validpref $TEXT/valid0. bpe. de-en,$TEXT/valid1. bpe. de-en,$TEXT/valid2. bpe. de-en,$TEXT/valid3. bpe. de-en,$TEXT/valid4. bpe. de-en,$TEXT/valid5. bpe. de-en \ --destdir data-bin/iwslt17. de_fr. en. bpe16k \ --workers 10# Binarize the fr-en dataset# NOTE: it's important to reuse the en dictionary from the previous stepfairseq-preprocess --source-lang fr --target-lang en \ --trainpref $TEXT/train. bpe. fr-en \ --validpref $TEXT/valid0. bpe. fr-en,$TEXT/valid1. bpe. fr-en,$TEXT/valid2. bpe. fr-en,$TEXT/valid3. bpe. fr-en,$TEXT/valid4. bpe. fr-en,$TEXT/valid5. bpe. fr-en \ --tgtdict data-bin/iwslt17. de_fr. en. bpe16k/dict. en. txt \ --destdir data-bin/iwslt17. de_fr. en. bpe16k \ --workers 10# Train a multilingual transformer model# NOTE: the command below assumes 1 GPU, but accumulates gradients from# 8 fwd/bwd passes to simulate training on 8 GPUsmkdir -p checkpoints/multilingual_transformerCUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17. de_fr. en. bpe16k/ \ --max-epoch 50 \ --ddp-backend=no_c10d \ --task multilingual_translation --lang-pairs de-en,fr-en \ --arch multilingual_transformer_iwslt_de_en \ --share-decoders --share-decoder-input-output-embed \ --optimizer adam --adam-betas '(0. 9, 0. 98)' \ --lr 0. 0005 --lr-scheduler inverse_sqrt --min-lr '1e-09' \ --warmup-updates 4000 --warmup-init-lr '1e-07' \ --label-smoothing 0. 1 --criterion label_smoothed_cross_entropy \ --dropout 0. 3 --weight-decay 0. 0001 \ --save-dir checkpoints/multilingual_transformer \ --max-tokens 4000 \ --update-freq 8# Generate and score the test set with sacrebleuSRC=desacrebleu --test-set iwslt17 --language-pair ${SRC}-en --echo src \ | python scripts/spm_encode. py --model examples/translation/iwslt17. de_fr. en. bpe16k/sentencepiece. bpe. model \ > iwslt17. test. ${SRC}-en. ${SRC}. bpecat iwslt17. test. ${SRC}-en. ${SRC}. bpe \ | fairseq-interactive data-bin/iwslt17. de_fr. en. bpe16k/ \ --task multilingual_translation --lang-pairs de-en,fr-en \ --source-lang ${SRC} --target-lang en \ --path checkpoints/multilingual_transformer/checkpoint_best. pt \ --buffer-size 2000 --batch-size 128 \ --beam 5 --remove-bpe=sentencepiece \ > iwslt17. test. ${SRC}-en. en. sysThe data: While it’s training, let’s look at where I can get Dutch data. The IWSLT 2017 did not seem to have Dutch data at first glance or here. I also tried just mimicking the adress from facebook prepare-iwslt17-multilingual. sh (The address https://wit3. fbk. eu/archive/2017-01-trnted/texts/de/en/de-en. tgz so I simply tried if https://wit3. fbk. eu/archive/2017-01-trnted/texts/nl/en/nl-en. tgz). Turns out there aren’t. Europarl seemed like a good bet but looking at the data, the langage seems pretty formatted and not very dialogue like. That might explain why it does not seem to be used that often. Looking back at IWSLT 2017 finally found the Dutch data and the training data. Is it me, or are competitions websites really hard to read ? The actual training loop: Ok so let’s reuse the training loop from the german file, so we just need to copy the dutch files in the same layout as the german ones, edit all the scripts and command lines to edit everything. I had to multiply the test files, someone Facebook has tst2011, tst2012 tst2013, tst2014, tst2015 for the german data, which does not seem to exist on the competition website… So here instead of trying to figure out where the information was, I simply copy-pasted the tst2010 file into dummy versions for tst2011…tst2015 (oh yeah simply omitting them will make bash scripts fail because file alignement is a requirement !, and I don’t want to spend more than 5mn editing a bash script). Now with our edited bash script: cd examples/translation/bash prepare-iwslt17-multilingual_nl. shcd . . /. . Preprocess dutch data: TEXT=examples/translation/iwslt17. nl. en. bpe16kfairseq-preprocess --source-lang nl --target-lang en \ --trainpref $TEXT/train. bpe. nl-en \ --validpref $TEXT/valid0. bpe. nl-en,$TEXT/valid1. bpe. nl-en,$TEXT/valid2. bpe. nl-en,$TEXT/valid3. bpe. nl-en,$TEXT/valid4. bpe. nl-en,$TEXT/valid5. bpe. nl-en \ --destdir data-bin/iwslt17. nl_fr. en. bpe16k \ --workers 10Now let’s preprocess french data # NOTE: it's important to reuse the en dictionary from the previous stepfairseq-preprocess --source-lang fr --target-lang en \ --trainpref $TEXT/train. bpe. fr-en \ --validpref $TEXT/valid0. bpe. fr-en,$TEXT/valid1. bpe. fr-en,$TEXT/valid2. bpe. fr-en,$TEXT/valid3. bpe. fr-en,$TEXT/valid4. bpe. fr-en,$TEXT/valid5. bpe. fr-en \ --tgtdict data-bin/iwslt17. nl_fr. en. bpe16k/dict. en. txt \ --destdir data-bin/iwslt17. nl_fr. en. bpe16k \ --workers 10Overall, pretty simple task, just a bit bothering to hit all the various walls. Now that we preformatted the dutch data, we can run the training loop on our own data ! mkdir -p checkpoints/multilingual_transformer_nlCUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17. nl_fr. en. bpe16k/ \ --max-epoch 50 \ --ddp-backend=no_c10d \ --task multilingual_translation --lang-pairs nl-en,fr-en \ # Don't change the arch !\ --arch multilingual_transformer_iwslt_de_en \ --share-decoders --share-decoder-input-output-embed \ --optimizer adam --adam-betas '(0. 9, 0. 98)' \ --lr 0. 0005 --lr-scheduler inverse_sqrt --min-lr '1e-09' \ --warmup-updates 4000 --warmup-init-lr '1e-07' \ --label-smoothing 0. 1 --criterion label_smoothed_cross_entropy \ --dropout 0. 3 --weight-decay 0. 0001 \ # Change the checkpoint \ --save-dir checkpoints/multilingual_transformer_nl \ --max-tokens 4000 \ --update-freq 8Checking the final result: So now we have a model checkpoints/multilingual_transformer_nl/checkpoint_best. pt, let’s run it ! # Generate and score the test set with sacrebleuSRC=nlsacrebleu --test-set iwslt17 --language-pair ${SRC}-en --echo src \ | python scripts/spm_encode. py --model examples/translation/iwslt17. nl_fr. en. bpe16k/sentencepiece. bpe. model \ > iwslt17. test. ${SRC}-en. ${SRC}. bpecat iwslt17. test. ${SRC}-en. ${SRC}. bpe \ | fairseq-interactive data-bin/iwslt17. nl_fr. en. bpe16k/ \ --task multilingual_translation --lang-pairs de-en,fr-en \ --source-lang ${SRC} --target-lang en \ --path checkpoints/multilingual_transformer_nl/checkpoint_best. pt \ --buffer-size 2000 --batch-size 128 \ --beam 5 --remove-bpe=sentencepiece \ > iwslt17. test. ${SRC}-en. en. sysBut woops…sacreBLEU: No such language pair nl-en sacreBLEU: Available language pairs for test set iwslt17 : en-fr, fr-en, en-de, de-en, en-zh, zh-en, en-ar, ar-en, en-ja, ja-en, en-ko, ko-en So it looks like we’re going to need to pipe some of our own data into this pipeline, we can just use the validation set we used to train cat examples/translation/iwslt17. nl_fr. en. bpe16k/valid0. bpe. nl-en. nl |python scripts/spm_encode. py --model examples/translation/iwslt17. nl_fr. en. bpe16k/sentencepiece. bpe. model \ > iwslt17. test. ${SRC}-en. ${SRC}. bpeThere we go we have encoded with our multilingual BPE tokenizer our valid dataset. We can now run our translating command cat iwslt17. test. ${SRC}-en. ${SRC}. bpe | fairseq-interactive data-bin/iwslt17. nl_fr. en. bpe16k/ --task multilingual_translation --lang-pairs nl-en,fr-en --source-lang ${SRC} --target-lang en --path checkpoints/multilingual_transformer_nl/checkpoint_best. pt --buffer-size 2000 --batch-size 128 --beam 5 --remove-bpe=sentencepieceHere are some outputs (not cherry picked): S-999 Iedereen heeft een vissenkom nodig. H-999 -1. 0272072553634644 Everybody needs a fishing ticket. D-999 -1. 0272072553634644 Everybody needs a fishing ticket. P-999 -1. 5687 -0. 2169 -0. 2363 -2. 0637 -2. 6527 -0. 2981 -0. 1540S-998 Het leidt tot meer verlamming en minder tevredenheid. H-998 -0. 32848915457725525 It leads to more paralysis and less satisfaction. D-998 -0. 32848915457725525 It leads to more paralysis and less satisfaction. P-998 -0. 9783 -0. 3836 -0. 1854 -0. 8328 -0. 1779 -0. 0163 -0. 3334 -0. 3619 -0. 2152 -0. 0450 -0. 2831 -0. 1289S-987 Ze maken ons leven minder waard. H-987 -0. 5473383665084839 They make our lives worth less. D-987 -0. 5473383665084839 They make our lives worth less. Seems good enough for now. Productizing: Flask server: Ok, in order to productionize, initially I wanted to move away from fairseq, but a lot of logic is actually tied to fairseq-interative (beam search, loading all the args, ensembling the model, source language selection and so on). It’s definitely possible to move out of it, but it felt like a few days job, so much more than I was willing to invest in this particular approach. So the idea is to have a flask server sitting in front of the model, call the appropriate encoding with spm_encode, pass it to fairseq interactive, and output the D-XXX line back to the caller. We’re going to containerize it and deploy to Kubernetes (it just happens I have a kubernetes cluster running, so less problems with deploying on it). I considered using ONNX-js (or TFlite) to deploy directly on the browser which saves a lot of headaches on deployment and keeping the service running in the long run (Like I did for the glasses project). Here the main problem is the size of the model (600Mo). I could go back and try to optimize but that’s a pretty big model, it’s going to be hard to make it come to a comfortable level for browser-only mode (Again just too much work for what I have in mind here). So let’s get started from the Flask’s hello world from flask import Flaskapp = Flask(__name__)@app. route('/')def hello_world(): return 'Hello, World!'Let’s edit it a bit to include our translate function. from flask import Flaskapp = Flask(__name__)def translate(text): # TODO later return This is a translation ! @app. route('/', methods=[ POST ])def hello(): text = request. form[ input ] print(f IN {text} ) output = translate(text) print(f OUT {output} ) result = json. dumps({ en : output}) return resultWe can run our example and check it’s running with curl $ curl -d input= Ik heft een appel. http://localhost:5000/`{ en : This is a translation ! }Implementing the translate function. : Ok this is where we are super tied to fairseq-interactive code, I had to dig into the source code, copy most of it, and mainly split Model loading code from Model running code. For that I used a lot of globals as the original code does not separate these two concerns (tidying this will be a later goal if it every comes to that). The final implementation is quite verbose but available here. One good point about this implementation is that we load the model early, so that it’s available right away when the server comes up (but it does take some time to come up). A negative point, is that because it’s loaded eagerly it’s going to make forking a nightmare and basically preventing us from using wsgi efficiently which is the recommended way of deploying Flask. It’s fine for now, it’s a personnal project after all, to get more stable deployment I would try to remove python from the equation of the web part if possible, it’s really slow and hard to work with on webservers because of the forking/threading nightmare in Python. So know our backend can really translate ! $ curl -d input= Ik heft een appel. http://localhost:5000/`{ en : I have an apple. }Before moving that to the cloud, let’s build a nice interface in front of it React front: Ok so we’re going to use React with Typescript. React because we’re going JS anyway to get the translation without clicking a button with a form like html. It’s also more convenient to use Material-UI which I find helps make a website nice from scratch (and I’m tired of Bootstrap). Typescript because it’s just saner than VanillaJS (it won’t make much of a difference here. yarn create react-app app --template typescriptcd appyarn add @material-ui/coreLet’s edit our App. tsx to use Material-UI and get the initial layout looking like translate. google. com. import React from react ;import { makeStyles } from @material-ui/core/styles ;import TextField from @material-ui/core/TextField ;import Card from @material-ui/core/Card ;import Grid from @material-ui/core/Grid ;const useStyles = makeStyles(theme => ({ app: { display: flex , justifyContent: center , alignItems: center , height: 100vh }}));function App() { const classes = useStyles(); return ( <div className={classes. app}> <Card> <form> <Grid container> <Grid item xs={12} md={6}> <TextField id= standard-basic label= Dutch multiline autoFocus /> </Grid> <Grid item xs={12} md={6}> <TextField id= standard-basic label= English multiline /> </Grid> </Grid> </form> </Card> </div> );}export default App;Here is the result : Now let’s look at the logic (simplified): type Timeout = ReturnType<typeof setTimeout>;const [text, setText] = useState( );const [time, setTime] = useState<Timeout | null>(null);const url = http://localhost:5000 ;const translate = (text: string) => { if (text === ) { setText( ); return; } const form = new FormData(); form. append( input , text); fetch(url, { method: POST , body: form }). then(response => { response. json(). then(json => { console. log(json); setText(json[ en ]); }); });};Then call it on the onChange attribute of our Dutch field. onChange={event => { // We use a timeout handler to prevent very fast keystrokes // from spamming our server. if (time !== null) { clearTimeout(time); } const text = event. target. value; const timeout = setTimeout(() => { translate(text); }, 500); setTime(timeout);}}There we have it: Let’s dockerize !: As I mentionned loading the whole model in the flask app is going to hinder a lot the wsgi process forking. I did try it, try to come up with easy fixes, but ultimately found that keeping the development server was just easier. Ok so we’re going to need a python docker image, install pytorch, fairseq, and flask to our image (actually we need flask_cors too to make sure we can call from any website as it’s an API. ) As it turns out, fairseq 0. 9 had a bug in the training loop and I was using master from a few month ago, and I needed to work with that specific version since there had been breaking changes since in master. That gives us the following requirements. txt torchflaskflask_cors-e git://github. com/pytorch/fairseq. git@7a6519f84fed06947bbf161c7b66c9099bc4ce53#egg=fairseqsentencepieceNow our Docker file, is going to get the python dependencies, copy all the local files (including model and tokenizer file) and run the flask server. That gives us : FROM python:3. 7-slimRUN pip install -U pipRUN apt-get update && apt-get install -y git build-essential # Required for building fairseq from source. COPY server/requirements. txt /app/requirements. txtRUN pip install -r /app/requirements. txtCOPY . /appWORKDIR /appCMD [ python , translate. py ]Let’s build and check that it works: docker build -t translate:latest . docker run -p 5000:5000 translate:latest# Now check with curl that we can still hit the docker and get a correct answercurl -d input= Ik heft een appel. http://localhost:5000/`# { en : This is a translation ! }Kubernetes cluster: Okay the following part will be pretty specific to my setup. I use a kubernetes cluster on GCP with ingress. I’m going to skip updating the SSL certificate. Let’s start with pushing the image to GCP: docker tag translate:latest gcr. io/myproject-XXXXXX/translate:1. 0docker push gcr. io/myproject-XXXXXX/translate:1. 0kubectl apply -f deployment. yamlkubectl apply -f service. yamlkubectl apply -f ingress. yamlHere are the (edited for brevity&security) service files I used: #deployment. yamlapiVersion: apps/v1kind: Deploymentmetadata: name: translate-deploymentspec: replicas: 1 selector: matchLabels: app: translate template: metadata: labels: app: translate spec: containers: - name: translate image: gcr. io/myproject-XXXXX/translate:1. 0 ports: - containerPort: 5000# service. yamlapiVersion: v1kind: Servicemetadata: name: translate-servicespec: type: NodePort selector: app: translate ports: - protocol: TCP port: 80 targetPort: 5000#ingress. yamlapiVersion: networking. k8s. io/v1beta1kind: Ingressmetadata: name: ingress-front annotations: kubernetes. io/ingress. global-static-ip-name: address-cluster networking. gke. io/managed-certificates: ottomate-certificate-newspec: rules: - host: translate. ottomate. app http: paths: - path: /* backend: serviceName: translate-service servicePort: 80Hopefully within a few minutes you have your pod running and you can hit your live own server with the API. You just need to update your react App to point the the correct URL and boom your done, your very own translate server app. What could/should be done next. : For the model: Add more data to the original training set, some words are missing, translation can become funky on some real world sentences I give the machine (Dutch companies tend to send very verbose emails) Add some data augmentation in the pool as the current translation is very brittle to errors. Using Sentence piece algorihm with sampling instead of BPE could be used, some typo generator, word inversions to name a few. Training some error detection algorithm on top or using ready made ones could help (translate. google. com has some spellfixing magic applied before it seems. ) Making it smaller to make it portable to tflite, mobile phone for offline mode and so on (it’s a pretty big workload to make it work though)For the backend: Battle testing the backend should be the first thing to do to check failure modes and fix naive DOS attacks. Something like TorchServe seems like what we want for the model part. Never used it so far, but it seems to solve some problems encountered here and would make iterations faster on various models (also swapping out models). On the other spectrum I could go for tighter control. Removing the fairseq-interative clutter would be my first move. If I can go pytorch barebones, then using Rust, with Hugging Face’s tokenizers library would probably make inference faster and deployment easier. It would of course make iteration much slower so I would do that only when the model is very stable. It could make mobile offline possible (with a very large app data but doable. )For the frontend: Working a bit more on the mobile part of the design which is a bit broken at the moment. Maybe add buttons to switch languages easily, switch language sides (although I mostly use Dutch->English and Dutch->French) Add a react-native app so that I can translate from my phone. (Without offline mode)"
}, {
"id": 5,
"url": "http://localhost:4000/energy/2020/03/19/solar-energy.html",
"title": "Super simple estimation of available solar energy",
"body": "2020/03/19 - Solar energy Stefan boltzmann's law : $ \text{Surface energy} = \sigma T^4$ For the sun, $T = \text{5,778 }K$ $\sigma = 5. 67 \times 10 ^{-8} W. m^{-2}. K^{-4}$ from sympy. physics. units import K, W, m, gigasigma = 5. 67 * 10**(-8) * W *m**(-2) * K**(-4)T = 5778 * Ksurface_energy = sigma * T**4print(surface_energy) 63196526. 5460292*watt/meter**2 Total emitted solar energy : $ Radiation = \text{Surface of the sun} \times \text{Surface energy} $ $ Radiation = 4 \pi r^2 \times \text{Surface energy} $ from sympy import *r_sun = 696_340 * 1000 *msurface_of_sun = 4 * pi * r_sun ** 2 radiation = surface_of_sun * surface_energyprint(radiation) 1. 22573302243694e+26*pi*watt Energy received at earth average distance : $ \text{Radiation received} = \frac{\text{Total sun radiation}}{ \text{sphere at earth's distance}}$ $ \text{Radiation received} = \frac{Radiation}{ 4 \pi D_{earth-sun}^2} $ R_earth = 6_371 * 1000 * mD_earth_sun = 148. 88 * 10**6 * 1000 * mearth_perp_surface = pi * R_earth **2sphere = 4 * pi * D_earth_sun **2radiation_received = radiation / sphereprint(radiation_received) 1382. 49374484614*watt/meter**2 Energy received by the earth surface (before atmosphere) : $ \text{Energy received} = \text{radiation received} \times \frac{ \text{visible surface}}{ \text{earth's surface}} $ power_received = radiation_received * pi * R_earth **2surface_power_received = power_received / (4 * pi * R_earth **2)print(surface_power_received)print(power_received. n()) 345. 623436211536*watt/meter**21. 76290235470883e+17*watt RADIATION RECEIVED BY SYSTEM EARTH = $345 W. m^{-2}$ MAXIMUM POWER WITH EARTH DYSON SPHERE : $176 PW$ Human consumption 13 511 MTep Source International Energy agency from sympy. physics. units import J, s, Wfrom sympy. physics. units. util import convert_tomillion = 10 **6kilo = 10**3giga = 10 ** 9toe = 41. 868 * giga * Jktoe = kilo * toeMtoe = million * toehour = 60 * 60 * syear = 24 * h * 365. 25base = sum([3852538,2949909,670298,335519,204190,1286064,4329220])Humanity_total_annual_consumption = base * ktoehumanity_power_consumption = Humanity_total_annual_consumption / yearprint(convert_to(humanity_power_consumption. n(), [W]). n()) 18080149776408. 9*watt print(convert_to(humanity_power_consumption / power_received * 100, [J, s]). n()) 0. 0102558997258785 We are currently consuming 0. 01% of the maximum capacity of the earth covered by a Dyson sphere of solar panels. A bit more realistic approach : After the atmosphere only $168 W. m^{-2}$ hit the surface. It's quite complicated to infer it depends on the wavelength of the incoming light, clouds, composition of the atmosphere and so on, so we just take the value from here. Then we only have 29% of the earth surface that is landmass (where we can reasonably put solar panels in large quantity) Of that 31% is covered in forest which are already some natural solar panels we don't want to remove (for other obvious reasons) sourceAnd 38. 4% is covered of agricultural land source. Then solar panels are not 100% efficient. They are roughly only 20% efficient with current technology at a reasonable cost. earth_power_received = 168 * W * m **(-2)available_surface = 4 * pi * R_earth **2 * 0. 29 * (1 -. 31 - . 384)max_power = earth_power_received * available_surface * 0. 2print(max_power. n())print(convert_to(humanity_power_consumption / max_power *100, [J, s]). n()) 1. 52084087357243e+15*watt1. 18882587196246 Conclusion In the end we are currently consuming 1. 2% of the realistic available solar power energy. That's would require posing solar panels everywhere on the planet that is not a forest or agricultural land. And we don't account yet for Energy return on energy invested (EROEI) which is likely to increase that percentage. NB: This is a very superficial attempt to evaluate these numbers, however the result should be correct within an order of magnitude. "
}, {
"id": 6,
"url": "http://localhost:4000/ml/2020/03/10/no-gd-training.html",
"title": "Can we train neural networks without gradient descent ?",
"body": "2020/03/10 - What's the problem ? : ML models usually are not really capable of predicting how well the data youfeed them is close to what was in the dataset. It really matters in production models as they might make really stupid mistakes just because they are offthe training set. Let's train a simple mnist model (straight out from pytorch tutorial https://github. com/pytorch/examples/tree/master/mnist) #collapsefrom __future__ import print_functionimport argparseimport torchimport torch. nn as nnimport torch. nn. functional as Fimport torch. optim as optimfrom torchvision import datasets, transformsfrom torch. optim. lr_scheduler import StepLRimport osclass Net(nn. Module): def __init__(self): super(Net, self). __init__() self. conv1 = nn. Conv2d(1, 32, 3, 1) self. conv2 = nn. Conv2d(32, 64, 3, 1) self. dropout1 = nn. Dropout2d(0. 25) self. dropout2 = nn. Dropout2d(0. 5) self. fc1 = nn. Linear(9216, 128) self. fc2 = nn. Linear(128, 10) def forward(self, x): x = self. conv1(x) x = F. relu(x) x = self. conv2(x) x = F. max_pool2d(x, 2) x = self. dropout1(x) x = torch. flatten(x, 1) x = self. fc1(x) x = F. relu(x) x = self. dropout2(x) x = self. fc2(x) output = F. log_softmax(x, dim=1) return outputdef train(args, model, device, train_loader, optimizer, epoch): model. train() for batch_idx, (data, target) in enumerate(train_loader): data, target = data. to(device), target. to(device) optimizer. zero_grad() output = model(data) loss = F. nll_loss(output, target) loss. backward() optimizer. step() if batch_idx % args. log_interval == 0: print('Train Epoch: {} [{}/{} ({:. 0f}%)]\tLoss: {:. 6f}'. format( epoch, batch_idx * len(data), len(train_loader. dataset), 100. * batch_idx / len(train_loader), loss. item()))def test(args, model, device, test_loader): model. eval() test_loss = 0 correct = 0 with torch. no_grad(): for data, target in test_loader: data, target = data. to(device), target. to(device) output = model(data) test_loss += F. nll_loss(output, target, reduction='sum'). item() # sum up batch loss pred = output. argmax(dim=1, keepdim=True) # get the index of the max log-probability correct += pred. eq(target. view_as(pred)). sum(). item() test_loss /= len(test_loader. dataset) print('\nTest set: Average loss: {:. 4f}, Accuracy: {}/{} ({:. 0f}%)\n'. format( test_loss, correct, len(test_loader. dataset), 100. * correct / len(test_loader. dataset)))def mnist(): filename ="mnist_cnn. pt" if os. path. exists(filename): return # Training settings parser = argparse. ArgumentParser(description='PyTorch MNIST Example') parser. add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser. add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser. add_argument('--epochs', type=int, default=4, metavar='N', help='number of epochs to train (default: 14)') parser. add_argument('--lr', type=float, default=1. 0, metavar='LR', help='learning rate (default: 1. 0)') parser. add_argument('--gamma', type=float, default=0. 7, metavar='M', help='Learning rate step gamma (default: 0. 7)') parser. add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser. add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser. add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser. add_argument('--save-model', action='store_true', default=True, help='For Saving the current Model') args = parser. parse_args() use_cuda = not args. no_cuda and torch. cuda. is_available() torch. manual_seed(args. seed) device = torch. device("cuda" if use_cuda else "cpu") kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch. utils. data. DataLoader( datasets. MNIST('. . /data', train=True, download=True, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=args. batch_size, shuffle=True, **kwargs) test_loader = torch. utils. data. DataLoader( datasets. MNIST(os. path. expanduser('. . /data'), train=False, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=args. test_batch_size, shuffle=True, **kwargs) model = Net(). to(device) optimizer = optim. Adadelta(model. parameters(), lr=args. lr) scheduler = StepLR(optimizer, step_size=1, gamma=args. gamma) for epoch in range(1, args. epochs + 1): train(args, model, device, train_loader, optimizer, epoch) test(args, model, device, test_loader) scheduler. step() if args. save_model: torch. save(model. state_dict(), filename) # mnist() Other out of distribution detector have been proposed. Here is a sample of methods: Genetic algorithmsDFOSimulated annealingExperiments : def train_ticket(args, model, device, train_loader, optimizer, epoch): model. train() for batch_idx, (data, target) in enumerate(train_loader): data, target = data. to(device), target. to(device) optimizer. zero_grad() output = model(data) loss = F. nll_loss(output, target) loss. backward() optimizer. step() if batch_idx % args. log_interval == 0: print('Train Epoch: {} [{}/{} ({:. 0f}%)]\tLoss: {:. 6f}'. format( epoch, batch_idx * len(data), len(train_loader. dataset), 100. * batch_idx / len(train_loader), loss. item()))def test_ticket(args, model, device, test_loader): model. eval() test_loss = 0 correct = 0 with torch. no_grad(): for data, target in test_loader: data, target = data. to(device), target. to(device) output = model(data) test_loss += F. nll_loss(output, target, reduction='sum'). item() # sum up batch loss pred = output. argmax(dim=1, keepdim=True) # get the index of the max log-probability correct += pred. eq(target. view_as(pred)). sum(). item() test_loss /= len(test_loader. dataset) print('\nTest set: Average loss: {:. 4f}, Accuracy: {}/{} ({:. 0f}%)\n'. format( test_loss, correct, len(test_loader. dataset), 100. * correct / len(test_loader. dataset)))def ticket_finder(): filename ="ticket_finder. pt" if os. path. exists(filename): return # Training settings parser = argparse. ArgumentParser(description='PyTorch MNIST Example') parser. add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser. add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser. add_argument('--epochs', type=int, default=4, metavar='N', help='number of epochs to train (default: 14)') parser. add_argument('--lr', type=float, default=1. 0, metavar='LR', help='learning rate (default: 1. 0)') parser. add_argument('--gamma', type=float, default=0. 7, metavar='M', help='Learning rate step gamma (default: 0. 7)') parser. add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser. add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser. add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser. add_argument('--save-model', action='store_true', default=True, help='For Saving the current Model') args = parser. parse_args() use_cuda = not args. no_cuda and torch. cuda. is_available() torch. manual_seed(args. seed) device = torch. device("cuda" if use_cuda else "cpu") kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch. utils. data. DataLoader( datasets. MNIST('. . /data', train=True, download=True, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=args. batch_size, shuffle=True, **kwargs) test_loader = torch. utils. data. DataLoader( datasets. MNIST(os. path. expanduser('. . /data'), train=False, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=args. test_batch_size, shuffle=True, **kwargs) model = Net(). to(device) optimizer = TicketFinder(model. parameters()) for epoch in range(1, args. epochs + 1): train_ticket(args, model, device, train_loader, optimizer, epoch) test_ticket(args, model, device, test_loader) if args. save_model: torch. save(model. state_dict(), filename) "
}, {
"id": 7,
"url": "http://localhost:4000/ml/docker/2020/03/04/running-gpu-enabled-docker.html",
"title": "Running a docker with GPU enabled (for pytorch and tensorflow)",
"body": "2020/03/04 - Sometimes if you want to contain dependencies you might want to use dockerto containerize your projects. You can also use it for GPUIn order to run docker images with GPU enabled, you are going to need: Install dockersudo apt-get install \ apt-transport-https \ ca-certificates \ curl \ gnupg-agent \ software-properties-commoncurl -fsSL https://download. docker. com/linux/ubuntu/gpg | sudo apt-key add -sudo add-apt-repository \ deb [arch=amd64] https://download. docker. com/linux/ubuntu \ $(lsb_release -cs) \ stable sudo apt-get updatesudo apt-get install docker-ce docker-ce-cli containerd. iosource Install nvidia-container-toolkit# Add the package repositoriesdistribution=$(. /etc/os-release;echo $ID$VERSION_ID)curl -s -L https://nvidia. github. io/nvidia-docker/gpgkey | sudo apt-key add -curl -s -L https://nvidia. github. io/nvidia-docker/$distribution/nvidia-docker. list | sudo tee /etc/apt/sources. list. d/nvidia-docker. listsudo apt-get update && sudo apt-get install -y nvidia-container-toolkitsudo systemctl restart dockersource Launch the docker for PyTorchIn order to use cuda you need a nvidia enabled image, that will make everything simpler. You could of course link your own cuda library via volume mounting but it’s cumbersome (and I didn’t check that it works) Create an account on https://ngc. nvidia. com/ Go to the create an API key page https://ngc. nvidia. com/setup/api-key Generate the key and copy itdocker login nvcr. ioUsername: $oauthtokenPassword: <Your Key>docker run --gpus all -it --rm nvcr. io/nvidia/pytorch:20. 02-py3 bashpython -c import torch; print(torch. cuda. is_available()) # TrueIf you fail to login the docker run command will fail with unauthenticated error. Caveat: This is the only option for now, docker-compose CANNOT run the –gpu option. To check updates for docker compose, look at this issue Bonus: Nvidia put up a lot of containers with various libraries enabled check it out in their catalog Enjoy !: "
}, {
"id": 8,
"url": "http://localhost:4000/ml/nlp/kldivergence/2020/02/26/self-kl-models.html",
"title": "Self KL-divergence for detecting out of distribution data and unsupervised text classification",
"body": "2020/02/26 - TL;DR. By training two models in the same dataset order with same architecture, same loss, but different initialization, I was able to obtain a consistent out-of-distribution detector by measuring the kl-divergence between model outputs. This out-of-distribution measure used on text could lead to unsupervised text classification. What's the problem ? : ML models usually are not really capable of predicting how well the data youfeed them is close to what was in the dataset. It really matters in production models as they might make really stupid mistakes just because they are offthe training set. Let's train a simple mnist model (straight out from pytorch tutorial https://github. com/pytorch/examples/tree/master/mnist) #collapsefrom __future__ import print_functionimport argparseimport torchimport torch. nn as nnimport torch. nn. functional as Fimport torch. optim as optimfrom torchvision import datasets, transformsfrom torch. optim. lr_scheduler import StepLRimport osclass Net(nn. Module): def __init__(self): super(Net, self). __init__() self. conv1 = nn. Conv2d(1, 32, 3, 1) self. conv2 = nn. Conv2d(32, 64, 3, 1) self. dropout1 = nn. Dropout2d(0. 25) self. dropout2 = nn. Dropout2d(0. 5) self. fc1 = nn. Linear(9216, 128) self. fc2 = nn. Linear(128, 10) def forward(self, x): x = self. conv1(x) x = F. relu(x) x = self. conv2(x) x = F. max_pool2d(x, 2) x = self. dropout1(x) x = torch. flatten(x, 1) x = self. fc1(x) x = F. relu(x) x = self. dropout2(x) x = self. fc2(x) output = F. log_softmax(x, dim=1) return outputdef train(args, model, device, train_loader, optimizer, epoch): model. train() for batch_idx, (data, target) in enumerate(train_loader): data, target = data. to(device), target. to(device) optimizer. zero_grad() output = model(data) loss = F. nll_loss(output, target) loss. backward() optimizer. step() if batch_idx % args. log_interval == 0: print('Train Epoch: {} [{}/{} ({:. 0f}%)]\tLoss: {:. 6f}'. format( epoch, batch_idx * len(data), len(train_loader. dataset), 100. * batch_idx / len(train_loader), loss. item()))def test(args, model, device, test_loader): model. eval() test_loss = 0 correct = 0 with torch. no_grad(): for data, target in test_loader: data, target = data. to(device), target. to(device) output = model(data) test_loss += F. nll_loss(output, target, reduction='sum'). item() # sum up batch loss pred = output. argmax(dim=1, keepdim=True) # get the index of the max log-probability correct += pred. eq(target. view_as(pred)). sum(). item() test_loss /= len(test_loader. dataset) print('\nTest set: Average loss: {:. 4f}, Accuracy: {}/{} ({:. 0f}%)\n'. format( test_loss, correct, len(test_loader. dataset), 100. * correct / len(test_loader. dataset)))def mnist(): filename = "mnist_cnn. pt" # Training settings if os. path. exists(filename): return parser = argparse. ArgumentParser(description='PyTorch MNIST Example') parser. add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser. add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser. add_argument('--epochs', type=int, default=14, metavar='N', help='number of epochs to train (default: 14)') parser. add_argument('--lr', type=float, default=1. 0, metavar='LR', help='learning rate (default: 1. 0)') parser. add_argument('--gamma', type=float, default=0. 7, metavar='M', help='Learning rate step gamma (default: 0. 7)') parser. add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser. add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser. add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser. add_argument('--save-model', action='store_true', default=True, help='For Saving the current Model') args = parser. parse_args() use_cuda = not args. no_cuda and torch. cuda. is_available() torch. manual_seed(args. seed) device = torch. device("cuda" if use_cuda else "cpu") kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch. utils. data. DataLoader( datasets. MNIST('. . /data', train=True, download=True, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=args. batch_size, shuffle=True, **kwargs) test_loader = torch. utils. data. DataLoader( datasets. MNIST(os. path. expanduser('. . /data'), train=False, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=args. test_batch_size, shuffle=True, **kwargs) model = Net(). to(device) optimizer = optim. Adadelta(model. parameters(), lr=args. lr) scheduler = StepLR(optimizer, step_size=1, gamma=args. gamma) for epoch in range(1, args. epochs + 1): train(args, model, device, train_loader, optimizer, epoch) test(args, model, device, test_loader) scheduler. step() if args. save_model: torch. save(model. state_dict(), filename) # mnist() #collapsefrom torch. distributions import Categoricalfrom torch. nn. parameter import Parameterfrom torchvision import transformsdef attack_simple(model, verbose=False): dummy_input = Parameter(torch. rand(1, 1, 28, 28, requires_grad=True)) lr = 1 optimizer = optim. Adadelta([dummy_input], lr=lr) for i in range(100): output = model(dummy_input) entropy = Categorical(logits = output). entropy() # print(f'Entropy {entropy. item():. 2f}') optimizer. zero_grad() entropy. backward() optimizer. step() MAX = output[0]. exp(). max(dim=-1) pil_img = transforms. Resize((240, 240))(transforms. ToPILImage()(dummy_input[0])) return (MAX. values. item() > 0. 8, MAX, pil_img)def check_attack(): mnist_model = Net() mnist_model. load_state_dict(torch. load('mnist_cnn. pt')) success, MAX, pil_img = attack_simple(mnist_model) print(f"MNIST Model says : This is a {MAX. indices. item()} with probability {MAX. values. item() * 100:. 2f}%") display(pil_img) success_rate = sum(attack_simple(mnist_model)[0] for i in range(100)) / 100. print(f"Success rate {success_rate * 100: . 2f}") # check_attack() Then generate an random image for which the model is highly confident yet it's completely absurd. This new image is out of distribution yet the model does not know it. We want to avoid doing such mistakes in production. Other approaches : Other out of distribution detector have been proposed. Here is a sample of methods: Likelihood Ratios for Out-of-Distribution Detection: Propose to learn 2 distinct models, one raw , one with perturbation instilled into the dataset, and look at the log likelihood ratio of the two models, claim is that the difference between the two will reflect how far input is from the semantic part of the manifold of X. $p(x) = p(x_{background})p(x_{semantic})$, the perturbation needs to lie only on $x_{semantic}$. Out-of-distribution Detection in Classifiers via Generation: Propose to use autoencoder (or GANs) to generate a low dimensional representation of the manifold of the dataset X, then perturb X on that representation. Those perturbated examples are trained to become a new class of the output of the classifier. Enhancing the reliability of Out-of-Distribution Image Detection in Neural Networks (Odin): This one uses temperature scaling regarding softmax to generate perturbated input, then look at the probability of the softmax if it passes a threshold. IMO, this paper is interesting as it supposes smoothness properties on In distribution data, and less smooth for out-of-distribution. It does require some examples of out-of-distribution for fitting 3 hyperparameters (temperature, threshold and magnitude of perturbation)Your classifier is secretly an energy based model and you should treat it like one: This one adds a new term in the loss to estimate p(x) basically. Multiple ood detectors are proposed, the most efficient being the second derivative of p(x), claiming again that density of p(x) will change more widly in ood space, leading to a good ood detector. WAIC, but Why? Generative Ensembles for Robust Anomaly Detection: This paper proposes to use an ensemble of models and look at WAIC criterion to detect OOD. It makes many comparison to VAE and GANs Learning Confidence for Out-of-Distribution Detection in Neural Networks : The core idea in this paper is to change the learning loss, to learn confidence as prior task to classification task, a model is allowed to see real label only when it claims it can solve the problem, outputting via another head directly a confidence score. Caveat is that the model might choose to give up and always claim confidence, and another trick is proposed to emphasize the in-distribution vs out-of-distribution by preprocessing inputs to move them towards region of higher confidence. In-distribution tends to move closer to 1 than out-of-distribution. So the direct confidence estimator seems to be smoother out-of-distribution than in-distribution, where peaks are more likely to be found. Papers with code: More links on that hopefully Our approach : Tl;dr : Make two similar models, with two different random initialization, then train them at the same time. > The ood detector will simply be the a threshold classifier on the KL-divergence between the two outputs. The core argument for this approach is that the neural network captures the dataset manifold (which means it will produce regular outputs for in dataset items). For the range of possible values it has random values for a random initialization. If that is true, then we train the model, we shift it's output only on the dataset manifold, and not anywhere else. If that assumption is correct, then the 2 models have very low probability of concurring in their output outside of the manifold if they have been initialized differently. It's quite close to WAIC, but the two models need to be trained at the same time. The argument is that is should align gradients during the training phase, leading to more correlation for in-dataset prediction for the models. The argument for this supposes that the lottery ticket hypothesis is true, and adds that lottery ticket is unique (or at least that the class of lottery tickets is very thin, and they all highly correlate to each other). If this is true, then the gradients within the network that correspond to this lottery ticket winner in both networks should be the same (or highly correlated). In order to fix the threshold, we found that simply setting it to be 10x the average kl-divergence obtained on the train dataset worked pretty well. As kl divergence is measured in bits, 10x is a quite large margin. More work could be done to study more closely the behaviour of this self kl-divergence. Experiments : Experiment 1 : MNIST attack like failure presented before. class MultiNet(nn. Module): def __init__(self, *models): super(). __init__() self. models = nn. ModuleList(models) def forward(self, x): return [model(x) for model in self. models] def train_multi(args, model, device, train_loader, optimizer, epoch): model. train() for batch_idx, (data, target) in enumerate(train_loader): data, target = data. to(device), target. to(device) optimizer. zero_grad() outputs = model(data) loss = sum(F. nll_loss(output, target) for output in outputs) loss. backward() optimizer. step() if batch_idx % args. log_interval == 0: print('Train Epoch: {} [{}/{} ({:. 0f}%)]\tLoss: {:. 6f}'. format( epoch, batch_idx * len(data), len(train_loader. dataset), 100. * batch_idx / len(train_loader), loss. item()))def test_multi(args, model, device, test_loader): model. eval() test_loss = 0 correct = 0 with torch. no_grad(): for data, target in test_loader: data, target = data. to(device), target. to(device) outputs = model(data) test_loss += sum(F. nll_loss(output, target, reduction='sum'). item() for output in outputs) pred = outputs[0]. argmax(dim=1, keepdim=True) # get the index of the max log-probability correct += pred. eq(target. view_as(pred)). sum(). item() test_loss /= len(test_loader. dataset) print('\nTest set: Average loss: {:. 4f}, Accuracy: {}/{} ({:. 0f}%)\n'. format( test_loss, correct, len(test_loader. dataset), 100. * correct / len(test_loader. dataset)))def mnist_multi(): # Training settings filename = "mnist_multi_cnn. pt" if os. path. exists(filename): return parser = argparse. ArgumentParser(description='PyTorch MNIST Example') parser. add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser. add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser. add_argument('--epochs', type=int, default=14, metavar='N', help='number of epochs to train (default: 14)') parser. add_argument('--lr', type=float, default=1. 0, metavar='LR', help='learning rate (default: 1. 0)') parser. add_argument('--gamma', type=float, default=0. 7, metavar='M', help='Learning rate step gamma (default: 0. 7)') parser. add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser. add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser. add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser. add_argument('--save-model', action='store_true', default=True, help='For Saving the current Model') args = parser. parse_args() use_cuda = not args. no_cuda and torch. cuda. is_available() torch. manual_seed(args. seed) device = torch. device("cuda" if use_cuda else "cpu") kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch. utils. data. DataLoader( datasets. MNIST('. . /data', train=True, download=True, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=args. batch_size, shuffle=True, **kwargs) test_loader = torch. utils. data. DataLoader( datasets. MNIST(os. path. expanduser('. . /data'), train=False, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=args. test_batch_size, shuffle=True, **kwargs) model1 = Net() model2 = Net() model = MultiNet(model1, model2). to(device) optimizer = optim. Adadelta(model. parameters(), lr=args. lr) scheduler = StepLR(optimizer, step_size=1, gamma=args. gamma) for epoch in range(1, args. epochs + 1): train_multi(args, model, device, train_loader, optimizer, epoch) test_multi(args, model, device, test_loader) scheduler. step() if args. save_model: torch. save(model. state_dict(), filename) # mnist_multi() from torchvision import datasetsdef kl(model, device, test_loader): model. eval() test_loss = 0 with torch. no_grad(): for data, target in test_loader: data, target = data. to(device), target. to(device) outputs = model(data) loss = 0 n = 0 for i in range(len(outputs) - 1): for j in range(i + 1, len(outputs)): n += 1 loss += 1/2 * (F. kl_div(outputs[i], outputs[j]. exp(), reduction='sum'). item() + F. kl_div(outputs[j], outputs[i]. exp(), reduction='sum'). item()) loss /= n test_loss += loss test_loss /= len(test_loader. dataset) print('\nTest set: Average loss: {:. 4f}, len {} \n'. format( test_loss, len(test_loader. dataset))) return test_lossdef get_reference_kl(): multi_model = MultiNet(Net(), Net()) multi_model. load_state_dict(torch. load('mnist_multi_cnn. pt')) test_loader = torch. utils. data. DataLoader( datasets. MNIST(os. path. expanduser('. . /data'), train=False, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=1000, shuffle=True) return kl(multi_model, device='cpu', test_loader=test_loader)# ref_kl_loss = get_reference_kl() Now we have 2 models capable of detecting digits, we have instantly 3 checks for checking if the output of our model is valid. The 2 models need to be concording (they need to outputs the same digit as an output), they need to have similar kl-divergence, we actually have a reference for the test set, so we know what kind of divergence we should look for, anything 10x more is definitely ood (we could look at the test set distribution for more fine grain estimation). Because kl divergence is asymetric we have 2 values (it's harder for spiked distribution to have another distribution be close in the kl sense, so taking the max of kl-divergence should be used for out-of-distribution. #collapsefrom torch. distributions import Categoricalfrom torch. nn. parameter import Parameterfrom torchvision import transformsdef attack(loss_fn, verbose=True, n=100, lr=1): multi_model = MultiNet(Net(), Net()) multi_model. load_state_dict(torch. load('mnist_multi_cnn. pt')) dummy_input = Parameter(torch. rand(1, 1, 28, 28, requires_grad=True)) optimizer = optim. Adadelta([dummy_input], lr=lr) for i in range(n): outputs = multi_model(dummy_input) loss = loss_fn(outputs) # print(f'Entropy {entropy. item():. 2f}') optimizer. zero_grad() loss. backward() optimizer. step() MAX1 = outputs[0][0]. exp(). max(dim=-1) MAX2 = outputs[1][0]. exp(). max(dim=-1) kl_loss = F. kl_div(outputs[0], outputs[1]. exp(), reduction='batchmean') kl_loss2 = F. kl_div(outputs[1], outputs[0]. exp(), reduction='batchmean') if (kl_loss / ref_kl_loss) > 10 or kl_loss2 / ref_kl_loss > 10 or MAX1. indices. item() != MAX2. indices. item(): success = False else: success = MAX1. values. item() > 0. 8 and MAX2. values. item() > 0. 8 if verbose: print(f"MNIST Model says : This is a {MAX1. indices. item()} with probability {MAX1. values. item() * 100:. 2f}%") print(f"MNIST Model 2 says : This is a {MAX2. indices. item()} with probability {MAX2. values. item() * 100:. 2f}%") print(f"KL-divergence is {kl_loss / ref_kl_loss} {kl_loss2 / ref_kl_loss}") if success: print("ATTACK SUCCEEDED") else: print("ATTACK FAILED") pil_img = transforms. Resize((240, 240))(transforms. ToPILImage()(dummy_input[0])) display(pil_img) return success Now if we simply attack the first model like we did earlier, we can see that we can trick it as easily as before. BUT the second model, does not get attacked which is to be expected. def loss(outputs): entropy = Categorical(logits = outputs[0]). entropy() loss = entropy return loss_ = attack(loss) MNIST Model says : This is a 3 with probability 99. 32%MNIST Model 2 says : This is a 3 with probability 33. 50%KL-divergence is 587. 7392578125 152. 96902465820312ATTACK FAILED Even if we try a smarter and attack both models at the same time, we can't succeed at a consistent rate. Be warned, it will succeed sometimes, just not consistently. def loss(outputs): entropy1 = Categorical(logits = outputs[0]). entropy() entropy2 = Categorical(logits = outputs[1]). entropy() kl_loss1 = F. kl_div(outputs[0], outputs[1]. exp(), reduction='batchmean') kl_loss2 = F. kl_div(outputs[1], outputs[0]. exp(), reduction='batchmean') distance = F. mse_loss(outputs[0], outputs[1]) loss = entropy1 + entropy2 + kl_loss1 + kl_loss2 + distance return loss _ = attack(loss) MNIST Model says : This is a 1 with probability 11. 50%MNIST Model 2 says : This is a 7 with probability 11. 48%KL-divergence is 0. 474844753742218 0. 47643253207206726ATTACK FAILED Be warned, it will succeed sometimes, just not consistently. For comparison, the first attack succeeds with close to 100% (we couldn't make it fail). Actually because we have 10 classes, and if we supposed out-of-distribution probability distribution is uniformly random, it should be something close to 10%, when our initial random image finds a place where the 2 models intersect on the same digit. def loss(outputs): entropy1 = Categorical(logits = outputs[0]). entropy() entropy2 = Categorical(logits = outputs[1]). entropy() kl_loss1 = F. kl_div(outputs[0], outputs[1]. exp(), reduction='batchmean') kl_loss2 = F. kl_div(outputs[1], outputs[0]. exp(), reduction='batchmean') distance = F. mse_loss(outputs[0], outputs[1]) loss = entropy1 + entropy2 + kl_loss1 + kl_loss2 + distance return loss def attack_rate(): attacks = [] for i in range(100): success = attack(loss, verbose=False, n=200, lr=0. 1) if success: print("F", end='') else: print(". ", end='') attacks. append(success) print('') print(f"Attack success rate {sum(attacks)/len(attacks) * 100:. 2f}%")# attack_rate() The actual attack range seems to stagnate at around 0% (30% if we remove the confidence rate > 80%) with various learning rates and attack steps. There probably are better strategies to attack, this, but the main point is that it became harder. Experiment 2 : Now let's test this on common ood detection for classic datasets. We will add ood detection for the train dataset, just to check that we don't exclude too much of the original dataset. Datasets used will be MNIST, FashionMNIST #collapsefrom torchvision. datasets import MNIST, Omniglot, FashionMNISTfrom torchvision import transformsimport osdef dataset_multi(dataset_cls, filename, model, transform): # Training settings parser = argparse. ArgumentParser(description='PyTorch MNIST Example') parser. add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)') parser. add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser. add_argument('--epochs', type=int, default=14, metavar='N', help='number of epochs to train (default: 14)') parser. add_argument('--lr', type=float, default=1e-2, metavar='LR', help='learning rate (default: 1. 0)') parser. add_argument('--gamma', type=float, default=0. 7, metavar='M', help='Learning rate step gamma (default: 0. 7)') parser. add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser. add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser. add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser. add_argument('--save-model', action='store_true', default=True, help='For Saving the current Model') args = parser. parse_args() use_cuda = not args. no_cuda and torch. cuda. is_available() torch. manual_seed(args. seed) device = torch. device("cuda" if use_cuda else "cpu") kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch. utils. data. DataLoader( dataset_cls('. . /data', train=True, download=True, transform=transform), batch_size=args. batch_size, shuffle=True, **kwargs) test_loader = torch. utils. data. DataLoader( dataset_cls('. . /data', train=False, download=True, transform=transform), batch_size = args. test_batch_size) optimizer = optim. Adam(model. parameters(), lr=args. lr) scheduler = optim. lr_scheduler. CyclicLR( optimizer, base_lr=0, max_lr=args. lr, cycle_momentum=False, step_size_up=200 ) for epoch in range(1, args. epochs + 1): train_multi(args, model, device, train_loader, optimizer, epoch) test_multi(args, model, device, test_loader) scheduler. step() if args. save_model: torch. save(model. state_dict(), filename)def run_datasets(create_model, suffix): datasets = [MNIST, FashionMNIST] device = torch. device("cuda" if torch. cuda. is_available() else "cpu") for dataset_cls in datasets: filename = f'{dataset_cls. __name__}{suffix}. pt' if os. path. exists(filename): continue model = create_model(). to(device) transform = transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ]) dataset_multi(dataset_cls, filename, model, transform) def create_model(): model1 = Net() model2 = Net() model = MultiNet(model1, model2) return model# run_datasets(create_model, suffix='') from sklearn. metrics import roc_auc_score def test_datasets(model_arch, suffix): datasets = [MNIST, FashionMNIST] batch_size = 500 device = 'cuda' if torch. cuda. is_available() else 'cpu' for dataset_cls in datasets: filename = f'{dataset_cls. __name__}{suffix}. pt' model_arch. load_state_dict(torch. load(filename)) model = model_arch test_loader = torch. utils. data. DataLoader( dataset_cls('. . /data', train=False, download=True, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=batch_size) ref_kl_loss = kl(model, device, test_loader) print("Ref loss", ref_kl_loss) all_labels = [] all_scores = [] for dataset_cls2 in datasets: test_loader2 = torch. utils. data. DataLoader( dataset_cls2('. . /data', train=False, download=True, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])),batch_size=batch_size, shuffle=True) OOD = 0 for data, target in test_loader2: outputs = model(data. to(device)) kl_loss = torch. max(F. kl_div(outputs[0], outputs[1]. exp(), reduction='none'), F. kl_div(outputs[1], outputs[0]. exp(), reduction='none')) kl_loss = kl_loss. sum(dim=-1) similar = outputs[0]. argmax(dim=-1) == outputs[1]. argmax(dim=-1) normed = kl_loss / ref_kl_loss kl_anomaly = normed > 10 non_concordant = similar == False out_of_distrib = sum(kl_anomaly | non_concordant) N = normed. shape[0] boolean = dataset_cls2 != dataset_cls all_labels. extend([boolean] * N) all_scores. extend(normed. tolist()) OOD += out_of_distrib print(f"Trained on {dataset_cls. __name__} we detected on {dataset_cls2. __name__} {OOD}/{len(test_loader2. dataset)} ({float(OOD)/len(test_loader2. dataset) * 100:. 2f}%) out of distribution") auc = roc_auc_score(all_labels, all_scores) print(f"AUC for {dataset_cls. __name__} : {auc}")def exp_2(): device = torch. device('cuda' if torch. cuda. is_available() else 'cpu') model = MultiNet(Net(), Net()). to(device) test_datasets(model, suffix='') # exp_2() So we can see that we achieve, with no tuning whatsoever a decent out of distribution detector. We seem to achieve much better AUROC on MNIST, probably because the in-distribution learning seems to be much better (99% test accuracy vs 92% for fastionMNIST). So to False positives for fashionMNIST probably come from this hard to learn in-distribution. Some fine tuning needs to be done to get better results. We also have to keep in mind, that the models to learn this are quite small (2M parameters but only 2 convolution layers) so the lottery hypothesis validity for such a network might be questionned. Experiment 2 bis : Same experiment but with fine tuned, larger networks on the same datasets from torchvision. models. resnet import ResNet, BasicBlockclass MnistResNet(ResNet): def __init__(self): super(MnistResNet, self). __init__(BasicBlock, [2, 2, 2, 2], num_classes=10) self. conv1 = torch. nn. Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False) def forward(self, x): return F. log_softmax(super(MnistResNet, self). forward(x), dim=-1)def run_datasets_res(): datasets = [MNIST, FashionMNIST] device = torch. device("cuda" if torch. cuda. is_available() else "cpu") for dataset_cls in datasets: filename = f'{dataset_cls. __name__}_resnet. pt' if os. path. exists(filename): continue multi_res = MultiNet(MnistResNet(), MnistResNet()). to(device) transform = transforms. Compose([ transforms. Resize((224, 224)),transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,))]) dataset_multi(dataset_cls, filename, multi_res, transform) def test_datasets_bis(model_arch): datasets = [MNIST, FashionMNIST] batch_size = 10 device = 'cuda' if torch. cuda. is_available() else 'cpu' for dataset_cls in datasets: filename = f'{dataset_cls. __name__}_resnet. pt' model_arch. load_state_dict(torch. load(filename)) model = model_arch test_loader = torch. utils. data. DataLoader( dataset_cls('. . /data', train=False, download=True, transform=transforms. Compose([ transforms. Resize((224, 224)), transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=batch_size) ref_kl_loss = kl(model, device, test_loader) print("Ref loss", ref_kl_loss) all_labels = [] all_scores = [] for dataset_cls2 in datasets: test_loader2 = torch. utils. data. DataLoader( dataset_cls2('. . /data', train=False, download=True, transform=transforms. Compose([ transforms. Resize((224, 224)), transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])),batch_size=batch_size, shuffle=True) OOD = 0 for data, target in test_loader2: outputs = model(data. to(device)) kl_loss = torch. max(F. kl_div(outputs[0], outputs[1]. exp(), reduction='none'), F. kl_div(outputs[1], outputs[0]. exp(), reduction='none')) kl_loss = kl_loss. sum(dim=-1) similar = outputs[0]. argmax(dim=-1) == outputs[1]. argmax(dim=-1) normed = kl_loss / ref_kl_loss kl_anomaly = normed > 10 non_concordant = similar == False out_of_distrib = sum(kl_anomaly | non_concordant) N = normed. shape[0] boolean = dataset_cls2 != dataset_cls all_labels. extend([boolean] * N) all_scores. extend(normed. tolist()) OOD += out_of_distrib print(f"Trained on {dataset_cls. __name__} we detected on {dataset_cls2. __name__} {OOD}/{len(test_loader2. dataset)} ({float(OOD)/len(test_loader2. dataset) * 100:. 2f}%) out of distribution") auc = roc_auc_score(all_labels, all_scores) print(f"AUC for {dataset_cls. __name__} : {auc}") def exp_2_bis(): device = torch. device('cuda' if torch. cuda. is_available() else 'cpu') multi_res = MultiNet(MnistResNet(), MnistResNet()). to(device) test_datasets_bis(multi_res)# run_datasets_res()# exp_2_bis() Experiment 3 : Check that two identical networks (same initalization) actually don't work. It's just a sanity check. We should obtain always kl_div = 0 no matter where we are in the input space. def create_same_model(): model1 = Net() model = MultiNet(model1, model1) return modeldef exp_3(): device = torch. device('cuda' if torch. cuda. is_available() else 'cpu') run_datasets(create_same_model, suffix='_exp3') test_datasets(create_same_model(). to(device), suffix='_exp3') # exp_3() Experiment 4 : Run this method with 2, 3, 4, and so on models. We should get exponential improved accuracy, if the random behavious for out-of-distribution for models is correct. def create_n_model(n): models = [Net() for i in range(n)] model = MultiNet(*models) return modeldef test_datasets_4(model_arch, suffix): datasets = [MNIST, FashionMNIST] batch_size = 100 device = 'cuda' if torch. cuda. is_available() else 'cpu' for dataset_cls in datasets: filename = f'{dataset_cls. __name__}{suffix}. pt' model_arch. load_state_dict(torch. load(filename)) model = model_arch test_loader = torch. utils. data. DataLoader( dataset_cls('. . /data', train=False, download=True, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])), batch_size=batch_size) ref_kl_loss = kl(model, device, test_loader) print("Ref loss", ref_kl_loss) all_labels = [] all_scores = [] for dataset_cls2 in datasets: test_loader2 = torch. utils. data. DataLoader( dataset_cls2('. . /data', train=False, download=True, transform=transforms. Compose([ transforms. ToTensor(), transforms. Normalize((0. 1307,), (0. 3081,)) ])),batch_size=batch_size, shuffle=True) OOD = 0 for data, target in test_loader2: outputs = model(data. to(device)) kl_losses = [] for i in range(len(outputs) - 1): for j in range(i + 1, len(outputs)): kl_losses. append(F. kl_div(outputs[i], outputs[j]. exp(), reduction='none')) kl_losses. append(F. kl_div(outputs[j], outputs[i]. exp(), reduction='none')) kl_loss = torch. stack(kl_losses, dim=0). max(dim=0). values kl_loss = kl_loss. sum(dim=-1) similar = outputs[0]. argmax(dim=-1) == outputs[1]. argmax(dim=-1) normed = kl_loss / ref_kl_loss kl_anomaly = normed > 10 non_concordant = similar == False out_of_distrib = sum(kl_anomaly | non_concordant) N = normed. shape[0] boolean = dataset_cls2 != dataset_cls all_labels. extend([boolean] * N) all_scores. extend(normed. tolist()) OOD += out_of_distrib print(f"Trained on {dataset_cls. __name__} we detected on {dataset_cls2. __name__} {OOD}/{len(test_loader2. dataset)} ({float(OOD)/len(test_loader2. dataset) * 100:. 2f}%) out of distribution") auc = roc_auc_score(all_labels, all_scores) print(f"AUC for {dataset_cls. __name__} : {auc}")def exp_4(): device = torch. device('cuda' if torch. cuda. is_available() else 'cpu') for n in [2, 4, 8]: print('=' * 20) print(f"N = {n}") run_datasets(lambda: create_n_model(n), suffix=f'_exp4_{n}') test_datasets_4(create_n_model(n). to(device), suffix=f'_exp4_{n}') # exp_4() Seems not to be working too great, we ARE improving AUC. Not by a strong margin, it is probably just that we are having a better approximator of our metric by ensembling. Experiment 5 : Test on a larger output space, like CIFAR-100 and SVHN, to check that part of the limits are actually due to small number of output classesfor MNIST/FashionMNISTOther idea is to test on Transformers. Early experiment seems to show that we can use that idea to detect different language within text with just the kl_div used as a distance. Found French book within english books dataset, AND english paragraphs within this french book. Needs some work to clean this experiment Show that small network trained on a single english book enables to detect different languagesor different patterns of writing (old english, irish, french, or event dictionnaries)The detection is super fined grained capable of detecting english within a French book. For brevity, we won't include training code. We just trained a simple transformer (6 layers deep) on a english text and checked our metric on some other texts. Experiment 6 : Need to test with various training schemes, regularization schemes (dropout, batchnorm, l2 penalization) and so on. We should find that the smoother in-distribution our models behave the more this method should work. Hopefully test accuracy should be a good smoothness proxy. Limits : The pros for this method are that: It's super simple to implement, and only costs a constant factor in training time. You could also extend this to 3, 4 side models, and it should improve robustness exponentially if the random factors are correct. If we keep this number small, it will still be constant cost factor. It does not require a perturbation model for input data, which in itself is subject to fine-tuning. The cons is that: It does not work so well on low dimensional output spaces. It seems other methods have better results than this one. It only works for models that output probability distributions (hard to extend to object detection, generation and other tasks)Future Work : There is a lot more experiments necessary to verify that the hypothesis in favor of that approach hold. Try to find ways to implement that in other tasks. How to improve out-of-distribution detection. "
}, {
"id": 9,
"url": "http://localhost:4000/ml/nlp/2019/08/06/model-based-bpe-encodings-3.html",
"title": "Model based encodings (3)",
"body": "2019/08/06 - In the first segmentwe looked into how we could make a BPEbased encoding, not only based on frequency in the dataset, but directly on themodel probability measure of the next token. In that article I mention thatdynamic BPE are costly because they stop being a one time operation but have tobe done for every batch because the vocabulary might have changed. In thisarticle I try to completely remove the “static” BPE approach and replace itcompletely with ML blocks. TL;DR In this article we present an idea to replace classical BPE algorithm with a pure ML version of it. What is the goal ?: So the goal is to replace BPE algorithm. So it’s go from something like “T|h|e| |c|a|t| |a|t|e| |t|h|e| |a|p|p|l|e|. ” To something that has less elements : “The |ca|t |at|e |the| |app|le|. ” In one sentence, BPE fuses bytes to form tokens based on frequency in the fulldataset. For a more detailed example, look that the previousarticle. In this example, you can see there is always a split after a space. That’s alimitation of BPE so actually our target might look different, maybe more like “The cat |at|e |the app|le|. ” Here we can notice that “The cat” is a full token and contain 2 actual words. So the goal is to fuse some starting bytes into N tokens (let’s say ~10k) thathopefully capture regularities in our dataset and are at least correlated tofrequency in the original dataset like BPE was. Another property we need to have from BPE is that it can encode an arbitrarystring of text. It does not matter if it’s not the same language or even if itmakes sense, you CAN encode it, that is a very desirable property. It avoidsthe out-of-vocabulary problem. Approach: Tokenization: So let’s imagine we have a trained transformer likeGPT-2. But trained on bytesdirectly NOT on tokens like the original transformer. Now we can use the ideathat when a model is highly confident, it probably means that what it’s aboutto predict is “in the same token”. Let’s take an example. Try to predict thefollowing Character (as in a single letter) in the next 2 sentences Sentence 1: “Who are yo…” Sentence 2 : “I like …” In the first sentence, normally you would vote with very high confidence for“u”, whereas in the second sentence, you lack a lot of context to be exactlysure on what’s coming next. So “you” would be a token, whereas “like …” can’tbe a single token, it has to be at least 2, “like “ and “…”. Here is a small gif of actual probabilities of the language model on a small sentence You can see the in the left of the graph the probabilities drop, those are thetokens that try to get predicted but are missing context (because we have veryfew characters before them. For the right side, you can see the drops in probabilityare pretty consistent and correspond to word boundaries most often. Handling unknown tokens: Now we know how we are going to “fuse” characters, but we are not done yet. BPEtokens are a discrete SET of identified values from 0 to N (~10k in thisexperiment). Also BPE can encode an arbitrary new string by using it’s fusiontable. So we can’t just run our algorithm on some specific dataset, count allthe tokens created and declare that these are the N tokens for eternity. Let’simagine I feed my algorithm a new sentence, in a different language, French forinstance. “J’adore l’Italie. ” We can run our “tokenizer” on this, and receive something like this “J|’|ado|re |l’|Ita|lie. ” Now “ado” might not be in our original list, so what do we do with it ? Do wedeclare the token wrong and split it ? That would be odd. A key insight, is to remember that the first step of the discrete “token” onceit enters the model (all of them do that, it’s really not specific totransformer or GPT-2) it gets embedded, meaning we go from a number between 1and N, to a vector in d dimension space (d is between 100 and 1000 generally). For instance token 3 gets mapped to [0. 3, -0. 15, 1. 4, …] while token 4 gets mappedto [-2. 4, -0. 014, 0. 45, …] So the idea it to generate directly a token embedding (a vector in d-dimension), not necessarily adiscrete value (a number between 0 and vocabulary size). In order to do that we need that all tokens should now be represented in thesame way by a d dimension space vector. One way to achieve that is to use anautoencoder. or with code The core idea is that when we encounter a new unseen token like “ado” it will still havea representation through the VAE, and will probably be close to a known token like “add”. This can help the network overcome odd tokenization or spelling errors. ## The name is VAE but I didn't use the internal KL loss in the end as it prevented/slowed down the learning. class VAE(nn. Module): def __init__(self): super(VAE, self). __init__() self. M = config. CONTEXT_SIZE * config. EMBEDDING_DIM layer = nn. Linear m = 400 self. fc1 = layer(self. M, m) self. fc21 = layer(m, config. EMBEDDING_DIM) self. fc22 = layer(m, config. EMBEDDING_DIM) self. fc3 = layer(config. EMBEDDING_DIM, m) self. fc4 = layer(m, self. M) def encode(self, x): # x is [Batch, Context size, Embedding dim] x = x. view(-1, self. M) h1 = F. relu(self. fc1(x)) return self. fc21(h1), self. fc22(h1) def reparameterize(self, mu, logvar): std = torch. exp(0. 5 * logvar) eps = torch. randn_like(std) return mu + eps * std def decode(self, z): h3 = F. relu(self. fc3(z)) return torch. tanh( self. fc4(h3). view(-1, config. CONTEXT_SIZE, config. EMBEDDING_DIM) ) def forward(self, x): mu, logvar = self. encode(x) z = self. reparameterize(mu, logvar) return mu, logvar, z, self. decode(z)Final network: Results: Here is a summary of the values of the tokenization we got.   Raw BPE Model based Vocabulary size 256 10000 26262 #Tokens 387k 90k 92k Avg token length 1 3. 3 6. 65 Here is a excerpt of the kind of tokenization we created |He w|as on|e of|the |most |n|oticea|ble member|s of the| Reform| Club|, |th|ough| he| s|eemed|always |to |avoid |att|racting at|tention|; an en|ig|mat|i|cal |p|erson|age|,||ab|out whom l|ittle| was |known|, |e|xc|ept that |he| w|as |a |poli|shed m|an|o|f |th|e |wo|rld|. |Pe|ople sa|id| that h|e |re|sembl|ed| |Byron|--at least|t|hat |his hea|d w|as |Byronic|; |but| he was |a |b|earde|d, tranquil| Byron|,who| |might live| on a |thousand year|s |w|ithout g|r|owing o|ld|. ||Certainly| an| English|man|, it |was |m|ore |doubt|ful w|h|ether |Phileas Fogg|w|as |a |London|er|. Full text This version has been done with epsilon=0. 0015. As you can see, “Phileas Fogg” is already a token in this situation, which is a multi-word token notachievable by regular BPE. You can also see, a lot of words contain only single bytes tokens whichis why this method compresses LESS than regular BPE at the same vocabulary size. Another note is that classical words like “was” is already a token (in the last sentence) but it’s not alwaysthe case, this token is context dependent now ! VAE: After the VAE step, the reconstruction is not perfect yet perfectly legible. |He w|as on|e of|the |most |n|oticea|ihe member|s of the| reform| Club|, |th|ough| he| s|eemed|always |to |asoid |att|nacting at|tention|, an en|ig|mat|i|cal |p|erson|age|,||ab|it whom l|ittle| was | nown|, |e|xc| pt that |he| w|as |a |poli|shed m|an|o|f |th|e |wo|rld|. |Pe|ople sa|id| that h|e |re|sembl|ed| |pyron| cat least|t|hat |has hea|d w|as |blronic|; |but| he was |a |b|earde|in tranquil| pyron|who| |eight live| on a |dar and year|s |w|ithout g|r|owing o|ld|. ||rertainly| an| English|man|, it |was |m|ore |doubt|ful w|h|ether |Phileas Fogg|w|as |a |London|er|. Full text Most of the errors tend to lie in the first characters of long tokens. That’s because, I’m forced to paddthe input of the VAE and to mask that padding. In practice that means that the first characters of long tokens get updatedless that the others so necessarily they contain more errors. More information. Upper level: In order to complete the experiment, we need to check that the original language modeldone directly at BPE level can be done with this new model-based BPE encoding. It’s pretty slow to train that upper level because we need to flow thegradients all the way through the VAE decoder, and the lower layer decodingstep, in order to get the character level loss (softmax + nll_loss) to properly train something. That’s a limit of the current approach. If we randomly split the text into train&validation, we can learn almost perfectly (97% top-1 character level accuracy)the language model on top of that Model based BPE. However this can be considered overfitting because even though a specific inputwas never seen in the valid set, a very close one was. If instead we try to compare with a fixed split, where the last part of the bookis considered the valid set, then we get much lower result. We could achieve 25% exact character matching, and ~77%top-10 character matching on the valid set, which is the end of the book !The same results happen with BPE, even worse ! we can’t get past 13% top-1 and 25% top-10on the regular BPE. That’s understandable because the dataset is very small andthe last part of the book is different so it’s very hard to infer it from just thebeginning and no other text. Another note, is that model based BPE are not tokenizing deterministicly, thereis some variance to it, depending on the context of a particular word. This actually seems to be a good property (See this) andmight explain away the better performance of model based BPE over regular BPE. Keep in mind it’s 25% of the characters that are correct. If we looked at a discrete view of tokens we probably would have a much higher prediction rate (it’s left for future work for now). Here is a picture from the tensorboard values, P_1 is probability that thecharacter predicted is the correct one, P_10 is that it is in the top-10values. The overfitting starts happening around the ~1M steps mark. Notes: In the experiment we learned model by model, freezing the lower modelbefore training something on top. It’s because the batching of differentlayers occur differently. Learning the whole thing end-to-end is probably goingto need some thought. The batching is easy for the lower level, every batchneeds a tensor of shape CONTEXT_SIZE (=64) of [0-255] ints. For the VAE, weneed to have a variable length (depending on the length token) times EMBEDDING_DIM(=128). The upper level needs only tensors of size CONTEXT_SIZE *EMBEDDING_DIM yet if we want to try and end-to-end training, we have noidea how many bytes we need to generate 1 correct tensor in the upper layer. We know it’s no more than CONTEXT_SIZE² but that would be prohibitive to usethat value. The loss NEEDS to always be the byte-level nll loss. At first I thought asimple MSE loss in the embedding space could be enough to learn the propermodels. It seems to not be the case. I could only achieve meaningful results byalways referring to the original strings and calculating the NLL Loss. Whenusing this loss, the MSE actually increases. This leads me to think thatencoding/decoding + softmax are highly anisotropic operators. Looking at thesingular values of the embedding matrix, we can see that the highest one is7. 35, the lowest one 0. 12, so there are 2 orders of magnitude between the 2. This anisotropy means that the MSE loss which considers all dimensions of theembeddding equal is actually couting way too much some irrelevant dimensions. It would be much faster and simpler if we could train directly on MSE (it wouldenable us to train without running all the decoding steps to generate theloss). So we need to add some spectral loss on the embedding on the lowerlanguage model to test that hypothesis. The tokens have variable lengths. In order to fix this, we have to padd allsequences during learning. Because we padd, we have to mask the paddingduring training for both VAE and upper LM. Keeping track of this is prettynifty and it means gradients on rarely used places will rarely get updated. Sowe will almost surely miss some letters in our tokens. Either at the front orthe end of the token depending on how we padd the tokens. Future work: Actually testing discretizing the tokens to compare with the regular BPE. In that direction,also comparing with a randomized tokenizer as used in SentencePieceto make sure the results are actually comparable and are indeed linked to tokenization variance. The masking problem really seems to be a current limit of the model. Finding a workaround would be really valuable. The fact that the NLL loss is required slows down upper layers. It would be awesome if we could smooth outthe encoding/decoding matrix so that L2 directly for VAE and the upper layer works. It probably goes against regularlanguage model embedding so not sure it’s doable. Making the epsilon based tokenization directly after the embedding layer. This would help stack those levels hopefully learninghigher and higer representations of text leading the sentence embedding and so on. On the same idea, another direction would be to do actual discrete tokenization to allow for the models to stack. "
}, {
"id": 10,
"url": "http://localhost:4000/ml/nlp/2019/06/06/model-based-bpe-encodings-2.html",
"title": "Model based encodings (2)",
"body": "2019/06/06 - In the first segmentwe looked into how we could make a BPEbased encoding, not only based on frequency in the dataset, but directly on themodel probability measure of the next token. In that article I mention thatdynamic BPE are costly because they stop being a one time operation but have tobe done for every batch because the vocabulary might have changed. In thisarticle I try to completely remove the “static” BPE approach and replace itcompletely with ML blocks. TL;DR In this article we present an idea to replace classical BPE algorithm with a pure ML version of it. What is the goal ?: So the goal is to replace BPE algorithm. So it’s go from something like “T|h|e| |c|a|t| |a|t|e| |t|h|e| |a|p|p|l|e|. ” To something that has less elements : “The |ca|t |at|e |the| |app|le|. ” In one sentence, BPE fuses bytes to form tokens based on frequency in the fulldataset. For a more detailed example, look that the previousarticle. In this example, you can see there is always a split after a space. That’s alimitation of BPE so actually our target might look different, maybe more like “The cat |at|e |the app|le|. ” Here we can notice that “The cat” is a full token and contain 2 actual words. So the goal is to fuse some starting bytes into N tokens (let’s say ~10k) thathopefully capture regularities in our dataset and are at least correlated tofrequency in the original dataset like BPE was. Another property we need to have from BPE is that it can encode an arbitrarystring of text. It does not matter if it’s not the same language or even if itmakes sense, you CAN encode it, that is a very desirable property. It avoidsthe out-of-vocabulary problem. Approach: Tokenization: So let’s imagine we have a trained transformer likeGPT-2. But trained on bytesdirectly NOT on tokens like the original transformer. Now we can use the ideathat when a model is highly confident, it probably means that what it’s aboutto predict is “in the same token”. Let’s take an example. Try to predict thefollowing Character (as in a single letter) in the next 2 sentences Sentence 1: “Who are yo…” Sentence 2 : “I like …” In the first sentence, normally you would vote with very high confidence for“u”, whereas in the second sentence, you lack a lot of context to be exactlysure on what’s coming next. So “you” would be a token, whereas “like …” can’tbe a single token, it has to be at least 2, “like “ and “…”. Here is a small gif of actual probabilities of the language model on a small sentence You can see the in the left of the graph the probabilities drop, those are thetokens that try to get predicted but are missing context (because we have veryfew characters before them. For the right side, you can see the drops in probabilityare pretty consistent and correspond to word boundaries most often. Handling unknown tokens: Now we know how we are going to “fuse” characters, but we are not done yet. BPEtokens are a discrete SET of identified values from 0 to N (~10k in thisexperiment). Also BPE can encode an arbitrary new string by using it’s fusiontable. So we can’t just run our algorithm on some specific dataset, count allthe tokens created and declare that these are the N tokens for eternity. Let’simagine I feed my algorithm a new sentence, in a different language, French forinstance. “J’adore l’Italie. ” We can run our “tokenizer” on this, and receive something like this “J|’|ado|re |l’|Ita|lie. ” Now “ado” might not be in our original list, so what do we do with it ? Do wedeclare the token wrong and split it ? That would be odd. A key insight, is to remember that the first step of the discrete “token” onceit enters the model (all of them do that, it’s really not specific totransformer or GPT-2) it gets embedded, meaning we go from a number between 1and N, to a vector in d dimension space (d is between 100 and 1000 generally). For instance token 3 gets mapped to [0. 3, -0. 15, 1. 4, …] while token 4 gets mappedto [-2. 4, -0. 014, 0. 45, …] So the idea it to generate directly a token embedding (a vector in d-dimension), not necessarily adiscrete value (a number between 0 and vocabulary size). In order to do that we need that all tokens should now be represented in thesame way by a d dimension space vector. One way to achieve that is to use anautoencoder. or with code The core idea is that when we encounter a new unseen token like “ado” it will still havea representation through the VAE, and will probably be close to a known token like “add”. This can help the network overcome odd tokenization or spelling errors. ## The name is VAE but I didn't use the internal KL loss in the end as it prevented/slowed down the learning. class VAE(nn. Module): def __init__(self): super(VAE, self). __init__() self. M = config. CONTEXT_SIZE * config. EMBEDDING_DIM layer = nn. Linear m = 400 self. fc1 = layer(self. M, m) self. fc21 = layer(m, config. EMBEDDING_DIM) self. fc22 = layer(m, config. EMBEDDING_DIM) self. fc3 = layer(config. EMBEDDING_DIM, m) self. fc4 = layer(m, self. M) def encode(self, x): # x is [Batch, Context size, Embedding dim] x = x. view(-1, self. M) h1 = F. relu(self. fc1(x)) return self. fc21(h1), self. fc22(h1) def reparameterize(self, mu, logvar): std = torch. exp(0. 5 * logvar) eps = torch. randn_like(std) return mu + eps * std def decode(self, z): h3 = F. relu(self. fc3(z)) return torch. tanh( self. fc4(h3). view(-1, config. CONTEXT_SIZE, config. EMBEDDING_DIM) ) def forward(self, x): mu, logvar = self. encode(x) z = self. reparameterize(mu, logvar) return mu, logvar, z, self. decode(z)Final network: Results: Here is a summary of the values of the tokenization we got.   Raw BPE Model based Vocabulary size 256 10000 26262 #Tokens 387k 90k 92k Avg token length 1 3. 3 6. 65 Here is a excerpt of the kind of tokenization we created |He w|as on|e of|the |most |n|oticea|ble member|s of the| Reform| Club|, |th|ough| he| s|eemed|always |to |avoid |att|racting at|tention|; an en|ig|mat|i|cal |p|erson|age|,||ab|out whom l|ittle| was |known|, |e|xc|ept that |he| w|as |a |poli|shed m|an|o|f |th|e |wo|rld|. |Pe|ople sa|id| that h|e |re|sembl|ed| |Byron|--at least|t|hat |his hea|d w|as |Byronic|; |but| he was |a |b|earde|d, tranquil| Byron|,who| |might live| on a |thousand year|s |w|ithout g|r|owing o|ld|. ||Certainly| an| English|man|, it |was |m|ore |doubt|ful w|h|ether |Phileas Fogg|w|as |a |London|er|. Full text This version has been done with epsilon=0. 0015. As you can see, “Phileas Fogg” is already a token in this situation, which is a multi-word token notachievable by regular BPE. You can also see, a lot of words contain only single bytes tokens whichis why this method compresses LESS than regular BPE at the same vocabulary size. Another note is that classical words like “was” is already a token (in the last sentence) but it’s not alwaysthe case, this token is context dependent now ! VAE: After the VAE step, the reconstruction is not perfect yet perfectly legible. |He w|as on|e of|the |most |n|oticea|ihe member|s of the| reform| Club|, |th|ough| he| s|eemed|always |to |asoid |att|nacting at|tention|, an en|ig|mat|i|cal |p|erson|age|,||ab|it whom l|ittle| was | nown|, |e|xc| pt that |he| w|as |a |poli|shed m|an|o|f |th|e |wo|rld|. |Pe|ople sa|id| that h|e |re|sembl|ed| |pyron| cat least|t|hat |has hea|d w|as |blronic|; |but| he was |a |b|earde|in tranquil| pyron|who| |eight live| on a |dar and year|s |w|ithout g|r|owing o|ld|. ||rertainly| an| English|man|, it |was |m|ore |doubt|ful w|h|ether |Phileas Fogg|w|as |a |London|er|. Full text Most of the errors tend to lie in the first characters of long tokens. That’s because, I’m forced to paddthe input of the VAE and to mask that padding. In practice that means that the first characters of long tokens get updatedless that the others so necessarily they contain more errors. More information. Upper level: In order to complete the experiment, we need to check that the original language modeldone directly at BPE level can be done with this new model-based BPE encoding. It’s pretty slow to train that upper level because we need to flow thegradients all the way through the VAE decoder, and the lower layer decodingstep, in order to get the character level loss (softmax + nll_loss) to properly train something. That’s a limit of the current approach. If we randomly split the text into train&validation, we can learn almost perfectly (97% top-1 character level accuracy)the language model on top of that Model based BPE. However this can be considered overfitting because even though a specific inputwas never seen in the valid set, a very close one was. If instead we try to compare with a fixed split, where the last part of the bookis considered the valid set, then we get much lower result. We could achieve 25% exact character matching, and ~77%top-10 character matching on the valid set, which is the end of the book !The same results happen with BPE, even worse ! we can’t get past 13% top-1 and 25% top-10on the regular BPE. That’s understandable because the dataset is very small andthe last part of the book is different so it’s very hard to infer it from just thebeginning and no other text. Another note, is that model based BPE are not tokenizing deterministicly, thereis some variance to it, depending on the context of a particular word. This actually seems to be a good property (See this) andmight explain away the better performance of model based BPE over regular BPE. Keep in mind it’s 25% of the characters that are correct. If we looked at a discrete view of tokens we probably would have a much higher prediction rate (it’s left for future work for now). Here is a picture from the tensorboard values, P_1 is probability that thecharacter predicted is the correct one, P_10 is that it is in the top-10values. The overfitting starts happening around the ~1M steps mark. Notes: In the experiment we learned model by model, freezing the lower modelbefore training something on top. It’s because the batching of differentlayers occur differently. Learning the whole thing end-to-end is probably goingto need some thought. The batching is easy for the lower level, every batchneeds a tensor of shape CONTEXT_SIZE (=64) of [0-255] ints. For the VAE, weneed to have a variable length (depending on the length token) times EMBEDDING_DIM(=128). The upper level needs only tensors of size CONTEXT_SIZE *EMBEDDING_DIM yet if we want to try and end-to-end training, we have noidea how many bytes we need to generate 1 correct tensor in the upper layer. We know it’s no more than CONTEXT_SIZE² but that would be prohibitive to usethat value. The loss NEEDS to always be the byte-level nll loss. At first I thought asimple MSE loss in the embedding space could be enough to learn the propermodels. It seems to not be the case. I could only achieve meaningful results byalways referring to the original strings and calculating the NLL Loss. Whenusing this loss, the MSE actually increases. This leads me to think thatencoding/decoding + softmax are highly anisotropic operators. Looking at thesingular values of the embedding matrix, we can see that the highest one is7. 35, the lowest one 0. 12, so there are 2 orders of magnitude between the 2. This anisotropy means that the MSE loss which considers all dimensions of theembeddding equal is actually couting way too much some irrelevant dimensions. It would be much faster and simpler if we could train directly on MSE (it wouldenable us to train without running all the decoding steps to generate theloss). So we need to add some spectral loss on the embedding on the lowerlanguage model to test that hypothesis. The tokens have variable lengths. In order to fix this, we have to padd allsequences during learning. Because we padd, we have to mask the paddingduring training for both VAE and upper LM. Keeping track of this is prettynifty and it means gradients on rarely used places will rarely get updated. Sowe will almost surely miss some letters in our tokens. Either at the front orthe end of the token depending on how we padd the tokens. Future work: Actually testing discretizing the tokens to compare with the regular BPE. In that direction,also comparing with a randomized tokenizer as used in SentencePieceto make sure the results are actually comparable and are indeed linked to tokenization variance. The masking problem really seems to be a current limit of the model. Finding a workaround would be really valuable. The fact that the NLL loss is required slows down upper layers. It would be awesome if we could smooth outthe encoding/decoding matrix so that L2 directly for VAE and the upper layer works. It probably goes against regularlanguage model embedding so not sure it’s doable. Making the epsilon based tokenization directly after the embedding layer. This would help stack those levels hopefully learninghigher and higer representations of text leading the sentence embedding and so on. On the same idea, another direction would be to do actual discrete tokenization to allow for the models to stack. "
}, {
"id": 11,
"url": "http://localhost:4000/ml/nlp/2019/05/16/model-based-bpe-encodings.html",
"title": "Model based encodings",
"body": "2019/05/16 - Byte-pair encodings (BPE) are now very commonly used in NLP. In GPT-2, Byte-pair encodings are used to preformat the raw texts before feeding the model. But this is a relatively costly step for your preprocessing and has some limitations. For instance, you have to split your data on spaces if you want your byte pair algorithm to compute in reasonable time. TL;DR In this article we present an idea to generate Byte pair encodings, not based on frequency in the dataset, but on the quality of the prediction of our model. This enables us to predict multi word tokens like “New York” and address languages that don’t use spaces to split words. What are Byte Pair Encodings ?: Byte-pair encodings are a way to compress information from pairs of bytes that will form tokens. Let’s take an example : “I love carrots and I love apples. ” This sentence read by a computer is only a sequence of bytes (bytes are simply a number between 0 and 255). That means to a computer our sentence looks like “I love carrots and I love apples. ” -> [73, 32, 108, 111, 118, 101, 32, 99, 97, 114, 114, 111, 116, 115, 32, 97, 110, 100, 32, 73, 32, 108, 111, 118, 101, 32, 97, 112, 112, 108, 101, 115, 46] From that example, you may remark that some bytes are occurring multiple times together like [108, 111] that occurs twice (it’s “lo” from “love”). So let’s build a new token for this frequent pair. Numbers from 0 to 255 are already taken so we’ll take the next available number which is 256, and we are going to store that information in a table [108, 111] -> 256 Now if we use that new token to encode our original bytes, whenever we encounter [108, 111], we’ll replace that by 256, so the original byte string becomes : [73, 32, 108, 256, 101, 32, 99, 97, 114, 114, 111, 116, 115, 32, 97, 110, 100, 32, 73, 32, 256, 118, 101, 32, 97, 112, 112, 108, 101, 115, 46] We went from 33 numbers to 31 numbers. We can rinse and repeat to compress the number of numbers even further. Originally, BPE was proposed as a compression algorithm. It’s not the best compression tool, so we won’t look at that side of the algorithm. Now you get what we are looking at when we train a model on BPEs, just a list of numbers. Typically a BPE vocabulary contains ~10k tokens (GPT-2 has 50k), that means it can capture very frequent words like “the” entirely, and parts of words that contain many variations like “ment” (mentally, environment …). What’s great about it it that you can now have words share semantic parts of them for their representation in your model so (environ-ment, environ-ment-al, environ-ment-ally will all share “environ” which will contain most of the semantic meaning, the rest will contain grammar information hopefully). The real advantage of BPE over classical Word Embeddings is that it does not fall into the out-of-vocabulary error (when a word was not seen). At worse you can always fall back to single bytes. What’s the problem with BPE ?: BPE algorithm is pretty bad in terms of complexity to calculate (roughly O(n²), you can look at a very good implementation https://github. com/glample/fastBPE). BPE is also pretty bad when you want to encode some new text. A greedy algorithm will be O(n) but not the best encoding possible, the best encoding possible is actually O(n²) in the general case. To be honest, most implementations split on spaces as mentioned earlier which speeds up the algorithm quite a bit. Once we have encoded a full word like “the” there is no way to add tokens to it, so it’s not necessary to look at it anymore for potential byte pairs, so we can assume the encoding&table creation go from O(n²) to something much closer to O(n). In addition, at encoding time, once we know the encoding for “the” we can cache that information leading to further speed ups. But using spaces as a special character has drawbacks, namely: We can’t address as well languages that don’t use a space to separate words like Chinese (arguably German). We can’t encode frequently occurring multi words like “New York” or “European Union” or “black holes” The second problem is especially bad when you consider examples where semantic is very different from the composing words like “Chicago Bulls” have nothing to do with bulls. ε-BPE or model based BPE encoding: The core idea is that instead of using frequency in the dataset to create the byte pairs, we can use the probability transition of the model to create the BPE. Let’s use some kind of transformer, GPT-2 for instance. The core idea of that model, is to predict the next token (in the BPE sense) given a fixed context size. But we can use the output probability of the model in order to create new tokens, not because they are frequent but because they are easy to predict. For instance in a book that contains a character “Sir Francis” that appears rarely, but there is only one character named “Sir …”, the algorithm might learn quite easily that “Sir “ is followed by “Francis” with great confidence, even if the occurence of the words is pretty low compared to common words like “the”, “like” and “I”. So the core algorithm, will train a simple transformer on a dataset on regular bytes (at least at the start). Then, as the algorithm learns, some predictions will be above 1-ε. We can keep track of those and keep track of the last token we received, to check if we were correct. Let’s keep a hit map to see how successful our algorithm is. For instance, I predicted “Fo” will be followed by “gg” (Phileas Fogg is a character in Around the world in 80 days) with probability > 1-ε. I was correct in 14 cases, and got it wrong in 1 case (let’s say it was classical “Fo” “g “). We were correct 14/15 times that’s 93% accuracy. If we look at the fluctuation interval associated with that, we get [92. 74-93. 25%] range. If 92. 74 > 1–ε we can conclude that our transition prediction is really very good, it’s not a fluke of the model. More generally, if we want 95% confidence when we upgrade this transition, we need to respect the following inequality : k / n - 1/sqrt(n) > 1-ε, where k is the number of successful predictions, n is the total number of predictions and ε the probability margin explained earlier. This model is slightly different from byte pair encoding, but now we don’t suffer from the 2 problems mentioned above, we can get pretty long tokens if the dataset allows for it, and we can use Chinese or German as the space character does not play any special role. Results: Implementation can be found here. On the first run, we ran on a book Around the world in 80 days by Jules Verne. It’s a very small dataset but the idea is to check that we can actually overcome BPE’s limitations. Here are a few telling tokens that were created while running on the dataset : Promotion # Token created 338 “Mr. Fogg” 357 “Phileas Fogg” 360 “Passepartout” 635 “ir Franc” (Sir Francis) 781 “It was” 900 ’” asked’ (contains a quote character) What is interesting, it that: We managed to create multi word tokens like “Phileas Fogg” Multi word tokens are a minority in terms of tokens created by the algorithm. Out of 421 tokens that contain a space character only 27 are multi word tokens like “New York”. The remaining 394 tokens contain an ending space, meaning our algorithm is learning word boundaries. It is reassuring because traditional BPE are usually hardcoding that information. Multi word tokens are name of characters in the book, which are occurring frequently, they are an entity by themselves (Fogg even has 2 tokens associated to him) 2 Multi word tokens are not specific to the book, “it was” is a pretty common 2 word token in English in descriptions, “(…) asked” is a very common continuation when we start a quote and end a sentence with a question mark. We can guess that “(…) said” would be a token further down the line, but it’s harder as there are probably a wider variety of verbs that can fit (said, replied, answered and so on…) Here is a more complete comparison of standard BPE with ε-BPE, with the first 100 tokens generated, as you can see more tokens are dedicated to syntax in eBPE, which Standard BPE ignore gladly by splitting on newlines and spaces. Standard BPE eBPE ‘th’ ‘\r\n’ ‘the ‘ ’, ‘ ‘an’ ‘d ‘ ‘in’ ‘Th’ ‘ou’ ‘ve’ ‘er’ ‘y ‘ ‘ed ‘ ’; ‘ ‘ar’ ‘f ‘ ‘hi’ ’,\r\n’ ‘on’ ‘\r\n\r\n’ ‘re’ ‘th’ ‘en’ ‘qu’ ‘and ‘ ‘the’ ‘of ‘ ’ ‘ ‘st’ ‘the ‘ ‘to ‘ ‘The’ ‘as ‘ ‘\r\n’ ‘se’ ’, ‘ ‘ha’ ‘y ‘ ‘or’ ‘d ‘ ’. \r ‘ ‘Th’ ‘it’ ‘ve’ ‘he ‘ ’; ‘ ‘le’ ‘f ‘ ‘ing ‘ ’,\r\n’ ’,\r ‘ ’ ‘ ‘as’ ‘\r\n’ ‘in ‘ ’, ‘ ‘at’ ‘d ‘ ‘at ‘ ‘y ‘ ‘ro’ ‘Th’ ‘er ‘ ‘ve’ ‘al’ ‘f ‘ ‘es’ ’; ‘ ‘on ‘ ’ ‘ ‘was ‘ ’,\r\n’ ‘no’ ‘th’ ‘his ‘ ‘\r\n’ ‘ed’ ’, ‘ ‘ac’ ‘d ‘ ’“\r ‘ ‘y ‘ ‘ri’ ‘Th’ ‘be’ ‘ve’ ‘ly ‘ ‘f ‘ ‘om’ ’; ‘ ‘li’ ’ ‘ ‘en ‘ ’,\r\n’ ‘ti’ ‘th’ ‘og’ ‘\r\n\r\n’ ‘ra’ ‘the’ ‘di’ ‘the ‘ ‘art’ ‘The’ ‘Fog’ ‘qu’ ‘the’ ’s ‘ ‘ma’ ‘The ‘ ‘ve ‘ ‘g ‘ ‘is ‘ ’,”’ ‘or ‘ ‘no’ ‘ld ‘ ‘t ‘ ‘whi’ ‘th ‘ ‘il’ ‘o ‘ ‘ur’ ’?”’ ’s, ‘ ‘\r\n\r\n”’ ‘de’ ’,” ‘ ‘wh’ ‘Mr’ ‘lo’ ‘e ‘ ‘ch ‘ ‘yo’ ‘ere ‘ ‘Yo’ ‘ith ‘ ‘ou’ ‘The ‘ ’. ‘ ‘am’ ‘nd ‘ ‘ent’ ‘h ‘ ‘un’ ‘n ‘ ‘gh’ ’;\r\n’ ‘with ‘ ‘og’ ‘an ‘ ‘you’ ‘oun’ ‘r ‘ ‘part’ ‘of ‘ ‘ver’ ‘to ‘ ‘si’ ’s F’ ‘had ‘ ‘Pa’ ‘not ‘ ‘as ‘ ‘ould ‘ '’s ‘ ‘ing’ ’. F’ ‘out ‘ ‘is ‘ ‘el’ ‘ld ‘ ‘sa’ ‘ng ‘ ‘ce’ ‘at ‘ ‘that ‘ ‘re’ ‘asse’ ‘ve ‘ ‘fi’ ‘gh’ ‘ol’ ‘ut ‘ ‘sh’ ‘ll’ ‘r. ‘ ‘Pas’ ’. ”\r ‘ ‘re ‘ ‘Passe’ ‘ed ‘ ‘Passepart’ ’. Fog’ ‘ut ‘ ‘ch ‘ ‘which ‘ ‘and ‘ ‘ay’ ‘ea’ I would love to check the tokenization of German or Chinese but I’m not a speaker of either language so it’s hard for me to analyze the results anyway. What’s for sure is that the technique is applicable. I also tried the technique on different types of files like wav files or mp3 files, even jpeg images. Analysis is harder to do. Still some interesting notes, it took longer for the model to emit new tokens on the mp3 files than on the wav files. The mp3 file is encoded, therefore should have a lower entropy (meaning it’s harder to predict the next token) than the wav files so the model takes longer to actually get good at predicting. It’s probable (I haven’t checked) that we have to overfit the mp3 file and jpeg files before we can predict any meaningful content (except maybe the header part) Future Work: Many interesting ideas are still left to explore to continue exploring the idea of models creating their own tokenization. For now a limiting factor is the actual BPE encoding process that takes longer and longer as the model creates new tokens. That’s because the encoding process is done in Python, so it’s quite slow and can’t be precalculated as you would do with fixed BPE encodings. To give a sense of the slowdown, the training loop starts at ~11it/s on a GTX970 and finished at roughly 10s/it. That’s a 100x slowdown over the course of the training, with only 1k tokens in the end, far from the 50k used by GPT-2 for instance. It’s going to be an actual requirement to train on larger and more representative datasets. Training on bigger datasets would help us understand how important are those multi word tokens and maybe what are those multi words. The token “(…) asked” was pretty surprising to me, I’m eager to see what else can be discovered. The actual epsilon used was 40% which actually quite a big (value was chosen with trial and error, to get a small but not null rejection rate of new tokens, to add tokens as fast as possible but not making too many mistakes). That value probably has a sweet spot depending on the number of current tokens, after speeding up the process it would be interesting to look at the best value for epsilon as a function of the number of tokens. "
}];
var idx = lunr(function () {
this.ref('id')
this.field('title')
this.field('body')
this.metadataWhitelist = ['position']
documents.forEach(function (doc) {
this.add(doc)
}, this)
});
function lunr_search(term) {
document.getElementById('lunrsearchresults').innerHTML = '<ul></ul>';
if(term) {
document.getElementById('lunrsearchresults').innerHTML = "<p>Search results for '" + term + "'</p>" + document.getElementById('lunrsearchresults').innerHTML;
//put results on the screen.
var results = idx.search(term);
if(results.length>0){
//console.log(idx.search(term));
//if results
for (var i = 0; i < results.length; i++) {
// more statements
var ref = results[i]['ref'];
var url = documents[ref]['url'];
var title = documents[ref]['title'];
var body = documents[ref]['body'].substring(0,160)+'...';
document.querySelectorAll('#lunrsearchresults ul')[0].innerHTML = document.querySelectorAll('#lunrsearchresults ul')[0].innerHTML + "<li class='lunrsearchresult'><a href='" + url + "'><span class='title'>" + title + "</span><br /><span class='body'>"+ body +"</span><br /><span class='url'>"+ url +"</span></a></li>";
}
} else {
document.querySelectorAll('#lunrsearchresults ul')[0].innerHTML = "<li class='lunrsearchresult'>No results found...</li>";
}
}
return false;
}