Spaces:

shoukaku
/

fake-health-news-detection

Sleeping

App Files Files Community

shoukaku committed on May 18, 2024

Commit

f2abd03

1 Parent(s): ee7d166

initial commit

Browse files

Files changed (6) hide show

.gitignore +163 -0
app.py +81 -0
requirements.txt +3 -0
src/__init__.py +1 -0
src/ckpt/checkpoint_here.txt +0 -0
src/distilbert_tf.py +72 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,163 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Checkpoints
+src/ckpt/*.pt

app.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import Callable
+import gradio as gr
+if gr.NO_RELOAD:
+    import numpy as np
+    from src.distilbert_tf import DistilBertTransferLearningModel
+DEVICE = 'cpu'
+MODELS = [
+    (
+        'distilbert-1linear-1650',
+        lambda: DistilBertTransferLearningModel(
+            'distilbert-base-uncased',
+            [
+                ('linear', ['in', 'out']),
+                ('softmax'),
+            ],
+            2,
+            device=DEVICE,
+            state_dict='src/ckpt/distilbert-1linear-dataset-all-augmented-all-1650.pt',
+        ),
+    ),
+]
+class WebUI:
+    def __init__(self, models: list[(str, Callable)] = [], device: str = 'cpu') -> None:
+        self.models = models
+        self.device = device
+        self.model = self.models[0][1]()
+    def _change_model(self, idx: int) -> None:
+        if gr.NO_RELOAD:
+            try:
+                print(self.models[idx])
+                del self.model
+                self.model = self.models[idx][1]()
+                print('done loading')
+            except Exception as e:
+                print(e)
+                gr.Error(e)
+    def _predict(self, text: str) -> str:
+        print(text)
+        output = self.model.predict(text, self.device).detach().cpu().numpy()[0]
+        return f'Fake: {output[0]}, Real: {output[1]}'
+    def get_ui(self) -> None:
+        with gr.Blocks() as ui:
+            with gr.Row():
+                with gr.Column():
+                    t_inp = gr.Textbox(label='Input')
+                    with gr.Row():
+                        btn_reset = gr.ClearButton(
+                            value='Reset',
+                            components=[
+                                t_inp,
+                            ],
+                        )
+                        btn_submit = gr.Button(value='Submit', variant='primary')
+                with gr.Column():
+                    ddl_model = gr.Dropdown(
+                        label='Model',
+                        choices=[model[0] for model in self.models],
+                        value=self.models[0][0],
+                        type='index',
+                        interactive=True,
+                        filterable=True,
+                    )
+                    t_out = gr.Textbox(label='Output')
+            ddl_model.change(fn=self._change_model, inputs=ddl_model)
+            btn_submit.click(fn=self._predict, inputs=t_inp, outputs=t_out)
+        return ui
+# Build the interface at import time so `webui` exists as a module-level name
+# whether or not this file is the entry point.
+webui = WebUI(models=MODELS, device=DEVICE).get_ui()
+if __name__ == '__main__':
+    # Launch the interface only when this file is run as a script.
+    webui.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+numpy==1.26.4
+torch==2.2.1
+transformers==4.39.3

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .distilbert_tf import DistilBertTransferLearningModel

src/ckpt/checkpoint_here.txt ADDED Viewed

File without changes

src/distilbert_tf.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from typing import Any, Optional, Tuple, Union
+import torch
+import transformers
+class DistilBertTransferLearningModel(torch.nn.Module):
+    def __init__(
+        self,
+        pretrained_model: str = "distilbert-base-uncased",
+        layers: list[Tuple[str, Optional[list[Any]]]] = [
+            ('linear', ['in', 'out']),
+            ('softmax'),
+        ],
+        dim_out: int = 2,
+        use_local_file: bool = False,
+        device: str = 'cpu',
+        state_dict: Optional[Union[str, dict]] = None,
+    ):
+        super(DistilBertTransferLearningModel, self).__init__()
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+            pretrained_model, local_files_only=use_local_file
+        )
+        self.base_model = transformers.AutoModel.from_pretrained(
+            pretrained_model, local_files_only=use_local_file
+        )
+        clf_layers = []
+        for layer in layers:
+            layer_type = layer[0] if isinstance(layer, tuple) else layer
+            if layer_type == 'linear':
+                layer_in, layer_out = [
+                    (
+                        self.base_model.config.hidden_size
+                        if x == 'in'
+                        else dim_out if x == 'out' else x
+                    )
+                    for x in layer[1]
+                ]
+                clf_layers.append(torch.nn.Linear(layer_in, layer_out))
+            elif layer_type == 'softmax':
+                clf_layers.append(torch.nn.Softmax(dim=-1))
+        self.clf = torch.nn.Sequential(*clf_layers)
+        if state_dict is not None:
+            if isinstance(state_dict, str) and state_dict.endswith('.pt'):
+                if device == 'cpu':
+                    state_dict = torch.load(state_dict, map_location='cpu')
+                else:
+                    state_dict = torch.load(state_dict)
+            self.load_state_dict(state_dict)
+    def forward(self, ids: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        y = self.base_model(ids, attention_mask=mask, return_dict=False)[0][:, 0]
+        y = self.clf(y)
+        return y
+    def predict(self, text: str, device: str) -> torch.Tensor:
+        encoded = self.tokenizer.encode_plus(
+            text,
+            add_special_tokens=True,
+            return_token_type_ids=False,
+            return_attention_mask=True,
+            max_length=512,
+            padding='max_length',
+            truncation=True,
+            return_tensors='pt',
+        )
+        with torch.no_grad():
+            ids = encoded['input_ids'].to(device)
+            mask = encoded['attention_mask'].to(device)
+            output = self.forward(ids, mask)
+        return output.to(device)