Spaces:

fkonovalenko
/

llm4career

Sleeping

App Files Files Community

fkonovalenko committed on Apr 25, 2024

Commit

e6dc8c2

1 Parent(s): ea8ee61

first commit

Browse files

Files changed (4) hide show

app.py +148 -0
llm.py +43 -0
ml.py +47 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import gradio as gr
+import os
+import shutil
+import json
+from ml import VacancyAnalyzer
+class GlobalState:
+    """
+    Process-wide container for file locations, label mappings and the
+    vacancy record that is currently being edited in the UI.
+    """
+    # Directory holding this module; every path below is resolved against it.
+    _base_dir = os.path.dirname(__file__)
+    # JSON archive that collects the processed records.
+    result_file_path = os.path.join(_base_dir, 'result/archive.json')
+    # Directory of the archive (also the path exposed to Gradio for downloads).
+    result_dir = os.path.join(_base_dir, 'result')
+    # Weights file handed to VacancyAnalyzer as its transformer checkpoint.
+    bert_path = os.path.join(_base_dir, 'tiny.pt')
+    # Serialized model handed to VacancyAnalyzer as its CatBoost path.
+    catboost_path = os.path.join(_base_dir, 'best_cat.joblib')
+    # Predicted class index -> human readable conversion level.
+    conv_classes = dict(enumerate(('low', 'middle', 'high')))
+    # Template of a vacancy record; key order is significant for the archive.
+    default_data = dict(
+        id='a0000',
+        emp_brand='',
+        mandatory='',
+        additional='',
+        comp_stages='',
+        work_conditions='',
+        conversion=0,
+        conversion_class='unknown',
+    )
+    # Record being filled in; assigned when the application starts.
+    data = None
+def cid(txt):
+    """Change callback of the 'Position id' textbox: store its text under the 'id' key."""
+    GlobalState.data.update({'id': txt})
+def cbrand(txt):
+    """Change callback of the 'Company name' textbox: store its text under 'emp_brand'."""
+    GlobalState.data.update({'emp_brand': txt})
+def cmand(txt):
+    """Change callback of the 'Mandatory' textbox: store its text under 'mandatory'."""
+    GlobalState.data.update({'mandatory': txt})
+def cadd(txt):
+    """Change callback of the 'Additional' textbox: store its text under 'additional'."""
+    GlobalState.data.update({'additional': txt})
+def ccomp(txt):
+    """Change callback of the 'Competition stage' textbox: store its text under 'comp_stages'."""
+    GlobalState.data.update({'comp_stages': txt})
+def ccond(txt):
+    """Change callback of the 'Work conditions' textbox: store its text under 'work_conditions'."""
+    GlobalState.data.update({'work_conditions': txt})
+def submit(chk):
+    """
+    Change callback of the 'Data Filled' checkbox: reveal the Run button.
+
+    Args:
+        chk: Current checkbox state. It is not consulted, so the button is
+            revealed on every change of the checkbox.
+
+    Returns:
+        A Gradio update that sets the button caption and makes it visible.
+    """
+    # The caption must be passed by keyword: the first positional parameter of
+    # gr.update is elem_id, so a positional "Run!" would rename the element id
+    # instead of setting the button text.
+    return gr.update(value="Run!", visible=True)
+def append_to_json(_dict, path):
+    """
+    Append one object to the JSON array stored in ``path`` without rewriting the file.
+
+    The file holds a single top-level JSON array. A missing or zero-length
+    file is initialised with ``[<object>]``; otherwise the closing ``]`` is cut
+    off, the new object is written after a separator and the bracket is restored.
+
+    Args:
+        _dict: JSON-serialisable object to append.
+        path: Location of the archive file; it is created when absent.
+
+    Raises:
+        ValueError: If the existing, non-empty file does not end with ``]``
+            (ignoring trailing whitespace) and therefore cannot be extended in place.
+    """
+    payload = json.dumps(_dict).encode()
+    with open(path, 'ab+') as f:
+        f.seek(0, 2)
+        size = f.tell()
+        if size == 0:
+            f.write(b'[' + payload + b']')
+            return
+        # Inspect only the tail of the file; widen the window in the unlikely
+        # case that it holds nothing but the closing bracket and whitespace.
+        window = min(size, 4096)
+        while True:
+            f.seek(size - window)
+            tail = f.read(window).rstrip()
+            before_bracket = tail[:-1].rstrip()
+            if before_bracket or window == size:
+                break
+            window = min(size, window * 2)
+        if not tail.endswith(b']'):
+            raise ValueError(f'{path!r} does not end with a JSON array')
+        # Drop the closing bracket together with any whitespace that followed it.
+        f.seek(size - window + len(tail) - 1)
+        f.truncate()
+        # An empty array ("[]") takes its first element without a separator.
+        if not before_bracket.endswith(b'['):
+            f.write(b' , ')
+        f.write(payload + b']')
+def predict(btn):
+    """
+    Click callback of the Run button: classify the current record and archive it.
+
+    The record held in ``GlobalState.data`` is passed to ``VacancyAnalyzer``,
+    the status message is shown as a Gradio info notification, the predicted
+    class name is stored in the record, and the record is appended to the JSON
+    archive.
+
+    Args:
+        btn: Value of the clicked button (unused).
+
+    Returns:
+        Two Gradio updates: the result text for the output textbox, and the
+        download button made visible with a link to the archive file.
+    """
+    analyzer = VacancyAnalyzer(GlobalState.bert_path, GlobalState.catboost_path, GlobalState.data)
+    status, result = analyzer.classify()
+    gr.Info(status)
+    # classify() yields either the string sentinel 'unknown' or an array of
+    # class indices. Test for the sentinel explicitly instead of comparing an
+    # array with a string and relying on the truthiness of that comparison.
+    if not (isinstance(result, str) and result == 'unknown'):
+        result = GlobalState.conv_classes[int(result[0])]
+    out_2 = f'Predicted by vacancy description conversion - {result}'
+    GlobalState.data['conversion_class'] = result
+    archive_path = GlobalState.result_file_path
+    append_to_json(GlobalState.data, archive_path)
+    # NOTE(review): this rebinds ``data`` to the ``default_data`` object itself,
+    # not to a copy. main() starts with the same binding, so the record keeps
+    # the values typed into the textboxes. Switching to a copy would blank the
+    # record while the textboxes still show text - confirm the intended behaviour.
+    GlobalState.data = GlobalState.default_data
+    return gr.update(value=out_2), gr.update(link="/file=" + archive_path, visible=True)
+def save(btn):
+    """
+    Click callback of the 'JSON Archive' button.
+
+    Returns a Gradio update that points the button link at the archive file;
+    ``btn`` (the button value) is not used.
+    """
+    archive_url = f"/file={GlobalState.result_file_path}"
+    return gr.update(link=archive_url)
+def main():
+    """
+    Prepare the result directory, build the Gradio interface and launch the app.
+
+    The 'result' directory next to this module is wiped and recreated on every
+    start, so the JSON archive only contains records from the current run.
+    """
+    # rmtree ignores errors, so the directory may survive the cleanup;
+    # makedirs(exist_ok=True) keeps start-up from failing with FileExistsError.
+    shutil.rmtree(GlobalState.result_dir, ignore_errors=True)
+    os.makedirs(GlobalState.result_dir, exist_ok=True)
+    # NOTE(review): ``data`` is bound to the ``default_data`` dict itself (no
+    # copy), so the field callbacks below also modify ``default_data``.
+    GlobalState.data = GlobalState.default_data
+    with gr.Blocks() as demo:
+        with gr.Tab("Load"):
+            with gr.Row():
+                gr.Markdown(
+                    """
+                    # Input the text description of the position
+                    # 👾👾👾 Then press **Run!** 👾👾👾
+                    """)
+            with gr.Row():
+                # Left column: company, position id and mandatory requirements.
+                with gr.Column():
+                    with gr.Row():
+                        brand = gr.Textbox(label='Company name', value=None)
+                    with gr.Row():
+                        vid = gr.Textbox(label='Position id', value=None)
+                    with gr.Row():
+                        req = gr.Textbox(label='Mandatory')
+                # Middle column: the remaining free-text sections.
+                with gr.Column():
+                    with gr.Row():
+                        add = gr.Textbox(label='Additional')
+                    with gr.Row():
+                        comp = gr.Textbox(label='Competition stage')
+                    with gr.Row():
+                        cond = gr.Textbox(label='Work conditions')
+                # Right column: controls and output. The Run and archive
+                # buttons start hidden and are revealed by the callbacks.
+                with gr.Column():
+                    with gr.Row():
+                        with gr.Column():
+                            ready = gr.Checkbox(label='Data Filled')
+                        with gr.Column():
+                            process_button = gr.Button("Run!", visible=False, interactive=True)
+                    with gr.Row():
+                        output_2 = gr.Textbox(label='LLM Result')
+                    with gr.Row():
+                        download_button = gr.Button("JSON Archive", visible=False)
+        # Each textbox writes its content into the shared record on change.
+        brand.change(cbrand, inputs=[brand])
+        vid.change(cid, inputs=[vid])
+        req.change(cmand, inputs=[req])
+        add.change(cadd, inputs=[add])
+        comp.change(ccomp, inputs=[comp])
+        cond.change(ccond, inputs=[cond])
+        ready.change(submit, inputs=[ready], outputs=[process_button])
+        process_button.click(predict, inputs=[process_button], outputs=[output_2, download_button],
+                             show_progress='full')
+        download_button.click(save, inputs=[download_button], outputs=[download_button])
+    # allowed_paths lets Gradio serve the archive from the result directory.
+    demo.launch(share=True, allowed_paths=[GlobalState.result_dir])
+# Build and launch the UI only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

llm.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import torch
+from torch import nn
+from transformers import AutoTokenizer, AutoModel, BertConfig
+class TransformerRegrModel(nn.Module):
+    """
+    BERT encoder followed by a ReLU and a single linear head.
+
+    The encoder is instantiated from the configuration of the selected
+    checkpoint via ``AutoModel.from_config``; the tokenizer comes from the
+    same checkpoint.
+    """
+    def __init__(self, base_transformer_model: str, num_classes: int):
+        """
+        Args:
+            base_transformer_model: Either ``'rubert'`` or ``'base'``.
+            num_classes: Number of outputs of the linear head.
+
+        Raises:
+            Exception: If ``base_transformer_model`` is not a supported key.
+        """
+        super().__init__()
+        self.tr_model = base_transformer_model
+        self.num = num_classes
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # Supported key -> (hub checkpoint, extra tokenizer arguments).
+        checkpoints = {
+            'rubert': ("cointegrated/rubert-tiny2", {}),
+            'base': ("ai-forever/ruBert-base", {'model_max_length': 512}),
+        }
+        if self.tr_model not in checkpoints:
+            raise Exception('unknown model')
+        hub_name, tokenizer_kwargs = checkpoints[self.tr_model]
+        self.tokenizer = AutoTokenizer.from_pretrained(hub_name, **tokenizer_kwargs)
+        self.config = BertConfig.from_pretrained(hub_name, output_hidden_states=True,
+                                                 output_attentions=True)
+        self.model = AutoModel.from_config(self.config)
+        self.a1 = nn.ReLU()
+        # The head takes vectors of the encoder's pooler output width.
+        self.classifier_1 = nn.Linear(self.model.pooler.dense.out_features, self.num)
+    def forward(self, inputs):
+        """
+        Tokenize ``inputs``, run the encoder and apply the head.
+
+        Returns:
+            A tuple ``(outputs, tokens, attentions)``: the head outputs, the
+            token strings of the first input sequence, and the attention
+            tensors of all layers concatenated and moved to the CPU.
+        """
+        encoded = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
+        tokens = self.tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
+        on_device = {name: tensor.to(self.device) for name, tensor in encoded.items()}
+        model_output = self.model(**on_device)
+        attentions = torch.cat(model_output['attentions']).to('cpu')
+        # Use the hidden state at the first token position as the text vector.
+        first_token_state = model_output.last_hidden_state[:, 0, :]
+        text_vector = torch.nn.functional.normalize(first_token_state)
+        outputs = self.classifier_1(self.a1(text_vector))
+        return outputs, tokens, attentions

ml.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import pandas as pd
+from catboost import Pool
+import joblib
+import torch
+import re
+from llm import TransformerRegrModel
+class VacancyAnalyzer:
+    """
+    Wraps the two models applied to a single vacancy record: a serialized
+    regressor loaded with joblib (``predict``) and a transformer text
+    classifier (``classify``).
+    """
+    def __init__(self, transformer_path: str, catboost_path: str, inputs: dict):
+        """
+        Args:
+            transformer_path: File with the state dict of the text classifier.
+            catboost_path: File with the serialized regressor.
+            inputs: One vacancy record; the 'conversion', 'conversion_class'
+                and 'id' entries are dropped, the rest becomes a one-row frame.
+        """
+        self.transformer_path = transformer_path
+        self.catboost_path = catboost_path
+        record = pd.DataFrame(inputs, index=[0])
+        self.inputs = record.drop(columns=['conversion', 'conversion_class', 'id'])
+        self.cat_features = ['profession', 'grade', 'location']
+        self.text_features = ['emp_brand', 'mandatory', 'additional', 'comp_stages', 'work_conditions']
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    def __cleaner__(self, txt: str) -> str:
+        """Remove underscore-delimited fragments, then all newlines and tabs."""
+        without_marked = re.sub(r'\_(.*?)\_', r'', txt)
+        return re.sub(r'([\n\t]*)', r'', without_marked)
+    def predict(self) -> float:
+        """Run the serialized regressor on the non-text columns and return its first prediction."""
+        features = self.inputs.drop(columns=self.text_features)
+        pool = Pool(features, cat_features=self.cat_features)
+        regressor = joblib.load(self.catboost_path)
+        return regressor.predict(pool).tolist()[0]
+    def classify(self) -> tuple:
+        """
+        Classify the concatenated text sections of the record.
+
+        Returns:
+            ``(status, prediction)``. For a cleaned description shorter than
+            100 characters this is ``('Too short text', 'unknown')``;
+            otherwise the prediction is the array of argmax class indices.
+        """
+        texts = self.inputs[self.text_features]
+        # Every section is followed by a single space, including the last one.
+        description = ' '.join(texts[name].values[0] for name in self.text_features) + ' '
+        description = self.__cleaner__(description)
+        if len(description) < 100:
+            return 'Too short text', 'unknown'
+        tbert = TransformerRegrModel('rubert', 3)
+        state = torch.load(self.transformer_path, map_location=self.device)
+        tbert.load_state_dict(state)
+        tbert.to(self.device)
+        tbert.eval()
+        with torch.no_grad():
+            outputs, _, _ = tbert(description)
+            prediction = torch.argmax(outputs, 1).cpu().numpy()
+        return 'Text analyzing finished', prediction

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+pandas==2.0.3
+joblib==1.3.2
+torch==2.0.1+cpu
+catboost==1.2
+transformers==4.40.0
+gradio==4.27.0