Yehor committed on
Commit f55d282 · 1 Parent(s): 8a4cb84

Add vision part

Files changed (1): app.py +131 -7
app.py CHANGED
@@ -22,6 +22,9 @@ from transformers import (
     MoonshineForConditionalGeneration,
 )
 
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+
 use_zero_gpu = is_zero_gpu_space()
 use_cuda = torch.cuda.is_available()
 
 
@@ -64,6 +67,10 @@ audio_model = MoonshineForConditionalGeneration.from_pretrained(
 audio_model.to(device)
 audio_model.to(torch_dtype)
 
+# Load OCR
+ocr_model = ocr_predictor(pretrained=True)
+ocr_model.to(device)
+
 # Examples
 examples_text = [
     "WP: F-16s are unlikely to make a significant difference on the battlefield",
 
@@ -73,7 +80,6 @@ examples_text = [
     "Over the week, the NBU sold almost $800 million on the interbank market",
     "Paris 2024. Day 2: Text broadcast",
 ]
-
 examples_audio = [
     "example_1.wav",
     "example_2.wav",
 
@@ -83,6 +89,14 @@ examples_audio = [
     "example_6.wav",
     "example_7.wav",
 ]
+examples_vision = [
+    "example_1.jpg",
+    "example_2.jpg",
+    "example_3.jpg",
+    "example_4.jpg",
+    "example_5.jpg",
+    "example_6.jpg",
+]
 
 title = "EN-UK Translator"
 
 
@@ -103,7 +117,7 @@ Follow them on social networks and **contact** if you need any help or have any
 description_head = f"""
 # {title}
 
-This space translates your text from English to Ukrainian. Use another spaces: [from Audio](https://huggingface.co/spaces/Yehor/audio-en-uk-translator), [from Images](https://huggingface.co/spaces/Yehor/vision-en-uk-translator). Also, check [UK-EN Translator](https://huggingface.co/spaces/Yehor/uk-en-translator) out for reverse task.
+This space translates your text, audio, or images from English to Ukrainian. Also, check out [UK-EN Translator](https://huggingface.co/spaces/Yehor/uk-en-translator) for the reverse task.
 """.strip()
 
 
 
@@ -120,6 +134,8 @@ tech_env = f"""
 #### Models
 
 - [kulyk-en-uk](https://huggingface.co/Yehor/kulyk-en-uk)
+- [moonshine-base](https://huggingface.co/UsefulSensors/moonshine-base)
+- [doctr](https://github.com/mindee/doctr)
 """.strip()
 
 tech_libraries = f"""
 
@@ -309,11 +325,88 @@ def inference_audio(audio, progress=gr.Progress()):
     result_texts = []
 
     for result in results:
-        result_texts.append(f"> {result['sentence']}")
-        result_texts.append(f"{result['translated_text']}\n")
+        result_texts.append(f"{result['sentence']}: {result['translated_text']}\n")
 
     sum_elapsed_text = sum([result["elapsed_time"] for result in results])
-    result_texts.append(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds")
+    print(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds")
+
+    return "\n".join(result_texts)
+
+
+@spaces.GPU
+def inference_vision(image, progress=gr.Progress()):
+    if not image:
+        raise gr.Error("Please paste your image file.")
+
+    progress(0, desc="Translating...")
+
+    if isinstance(image, str):
+        doc = DocumentFile.from_images(image)
+    else:
+        raise gr.Error("Please paste your image file.")
+
+    result = ocr_model(doc)
+
+    text = result.render()
+
+    print("Text:", text)
+
+    results = []
+
+    sentences = [text.replace("\n", " ")]
+
+    for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
+        t0 = time.time()
+
+        prompt = "Translate the text to Ukrainian:\n" + sentence
+
+        input_ids = tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            return_tensors="pt",
+            tokenize=True,
+        ).to(model.device)
+
+        output = model.generate(
+            input_ids,
+            max_new_tokens=2048,
+            # Greedy Search
+            do_sample=False,
+            repetition_penalty=1.05,
+            # Sampling
+            # do_sample=True,
+            # temperature=0.1,
+            # # top_k=1,
+            # min_p=0.9,
+            # repetition_penalty=1.05,
+        )
+
+        prompt_len = input_ids.shape[1]
+        generated_tokens = output[:, prompt_len:]
+        translated_text = tokenizer.batch_decode(
+            generated_tokens, skip_special_tokens=True
+        )[0]
+
+        elapsed_time = round(time.time() - t0, 2)
+
+        translated_text = translated_text.strip()
+        results.append(
+            {
+                "sentence": sentence,
+                "translated_text": translated_text,
+                "elapsed_time": elapsed_time,
+            }
+        )
+
+    gr.Info("Finished!", duration=2)
+
+    result_texts = []
+
+    for result in results:
+        result_texts.append(f"> {result['sentence']}: {result['translated_text']}\n")
+
+    sum_elapsed_text = sum([result["elapsed_time"] for result in results])
+    print(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds")
 
     return "\n".join(result_texts)
 
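Note: inference_vision wraps the entire rendered OCR output into a single pseudo-sentence (sentences = [text.replace("\n", " ")]), so a dense page becomes one long prompt. A possible refinement, sketched under the assumption that sentence-level prompts are preferable; the split_sentences helper below is hypothetical and not part of this commit:

import re

def split_sentences(text: str) -> list[str]:
    # Flatten line breaks, then split naively after '.', '!' or '?' plus whitespace.
    flat = text.replace("\n", " ")
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", flat) if s.strip()]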
 
@@ -334,6 +427,7 @@ def create_app():
             label="Translated text",
             placeholder=translated_text_value,
             show_copy_button=True,
+            lines=5,
         )
 
         text = gr.Textbox(label="Text", autofocus=True, lines=5)
 
@@ -361,6 +455,7 @@ def create_audio_app():
             label="Translated text",
             placeholder=translated_text_value,
             show_copy_button=True,
+            lines=5,
         )
 
         audio = gr.Audio(label="Audio file", sources="upload", type="filepath")
 
@@ -373,7 +468,34 @@ def create_audio_app():
         )
 
         with gr.Row():
-            gr.Examples(label="Choose an example", inputs=audio, examples=examples_audio)
+            gr.Examples(
+                label="Choose an example", inputs=audio, examples=examples_audio
+            )
+
+    return tab
+
+
+def create_vision_app():
+    with gr.Blocks(theme=Soft()) as tab:
+        translated_text = gr.Textbox(
+            label="Translated text",
+            placeholder=translated_text_value,
+            show_copy_button=True,
+        )
+
+        image = gr.Image(label="Image file", sources="upload", type="filepath")
+
+        gr.Button("Translate").click(
+            inference_vision,
+            concurrency_limit=concurrency_limit,
+            inputs=image,
+            outputs=translated_text,
+        )
+
+        with gr.Row():
+            gr.Examples(
+                label="Choose an example", inputs=image, examples=examples_vision
+            )
 
     return tab
 
 
@@ -396,14 +518,16 @@ def create_authors():
 def create_demo():
     app_tab = create_app()
     app_audio_tab = create_audio_app()
+    app_vision_tab = create_vision_app()
     authors_tab = create_authors()
     env_tab = create_env()
 
     return gr.TabbedInterface(
-        [app_tab, app_audio_tab, authors_tab, env_tab],
+        [app_tab, app_audio_tab, app_vision_tab, authors_tab, env_tab],
         tab_names=[
             "✍️ Translation",
             "🔊 Audio",
+            "👀 Image",
             "👥 Authors",
             "📦 Environment, Models, and Libraries",
         ],
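Note: a quick way to exercise the new tab without the full TabbedInterface is to launch it on its own. A minimal sketch, assuming it is run from app.py itself, where inference_vision, translated_text_value, examples_vision, and concurrency_limit are in scope:

if __name__ == "__main__":
    # Serves only the vision tab for a manual check with the bundled example images.
    create_vision_app().launch()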