Spaces: Running on Zero
Add vision part
app.py CHANGED
@@ -22,6 +22,9 @@ from transformers import (
     MoonshineForConditionalGeneration,
 )
 
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+
 use_zero_gpu = is_zero_gpu_space()
 use_cuda = torch.cuda.is_available()
 
@@ -64,6 +67,10 @@ audio_model = MoonshineForConditionalGeneration.from_pretrained(
 audio_model.to(device)
 audio_model.to(torch_dtype)
 
+# Load OCR
+ocr_model = ocr_predictor(pretrained=True)
+ocr_model.to(device)
+
 # Examples
 examples_text = [
     "WP: F-16s are unlikely to make a significant difference on the battlefield",
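For reference, the OCR hookup above follows doctr's standard two-step API: `ocr_predictor(pretrained=True)` builds a detection-plus-recognition pipeline and `DocumentFile` loads pages. A minimal standalone sketch (the file name is illustrative, not one of the space's assets):

```python
# Minimal sketch of the doctr OCR path used in this commit.
# "page.jpg" is an illustrative file name.
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

ocr = ocr_predictor(pretrained=True)        # pretrained detection + recognition
doc = DocumentFile.from_images("page.jpg")  # load an image file as document pages
text = ocr(doc).render()                    # flatten the OCR result to plain text
print(text)
```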
@@ -73,7 +80,6 @@ examples_text = [
     "Over the week, the NBU sold almost $800 million on the interbank market",
     "Paris 2024. Day 2: Text broadcast",
 ]
-
 examples_audio = [
     "example_1.wav",
     "example_2.wav",
@@ -83,6 +89,14 @@ examples_audio = [
     "example_6.wav",
     "example_7.wav",
 ]
+examples_vision = [
+    "example_1.jpg",
+    "example_2.jpg",
+    "example_3.jpg",
+    "example_4.jpg",
+    "example_5.jpg",
+    "example_6.jpg",
+]
 
 title = "EN-UK Translator"
 
@@ -103,7 +117,7 @@ Follow them on social networks and **contact** if you need any help or have any
 description_head = f"""
 # {title}
 
-This space translates your text from English to Ukrainian.
+This space translates your text, audio, or image from English to Ukrainian. Also, check out [UK-EN Translator](https://huggingface.co/spaces/Yehor/uk-en-translator) for the reverse task.
 """.strip()
 
 
@@ -120,6 +134,8 @@ tech_env = f"""
 #### Models
 
 - [kulyk-en-uk](https://huggingface.co/Yehor/kulyk-en-uk)
+- [moonshine-base](https://huggingface.co/UsefulSensors/moonshine-base)
+- [doctr](https://github.com/mindee/doctr)
 """.strip()
 
 tech_libraries = f"""
@@ -309,11 +325,88 @@ def inference_audio(audio, progress=gr.Progress()):
     result_texts = []
 
     for result in results:
-        result_texts.append(f"
-        result_texts.append(f"{result['translated_text']}\n")
+        result_texts.append(f"{result['sentence']}: {result['translated_text']}\n")
 
     sum_elapsed_text = sum([result["elapsed_time"] for result in results])
-
+    print(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds")
+
+    return "\n".join(result_texts)
+
+
+@spaces.GPU
+def inference_vision(image, progress=gr.Progress()):
+    if not image:
+        raise gr.Error("Please paste your image file.")
+
+    progress(0, desc="Translating...")
+
+    if isinstance(image, str):
+        doc = DocumentFile.from_images(image)
+    else:
+        raise gr.Error("Please paste your image file.")
+
+    result = ocr_model(doc)
+
+    text = result.render()
+
+    print("Text:", text)
+
+    results = []
+
+    sentences = [text.replace("\n", " ")]
+
+    for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
+        t0 = time.time()
+
+        prompt = "Translate the text to Ukrainian:\n" + sentence
+
+        input_ids = tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            return_tensors="pt",
+            tokenize=True,
+        ).to(model.device)
+
+        output = model.generate(
+            input_ids,
+            max_new_tokens=2048,
+            # Greedy Search
+            do_sample=False,
+            repetition_penalty=1.05,
+            # Sampling
+            # do_sample=True,
+            # temperature=0.1,
+            # # top_k=1,
+            # min_p=0.9,
+            # repetition_penalty=1.05,
+        )
+
+        prompt_len = input_ids.shape[1]
+        generated_tokens = output[:, prompt_len:]
+        translated_text = tokenizer.batch_decode(
+            generated_tokens, skip_special_tokens=True
+        )[0]
+
+        elapsed_time = round(time.time() - t0, 2)
+
+        translated_text = translated_text.strip()
+        results.append(
+            {
+                "sentence": sentence,
+                "translated_text": translated_text,
+                "elapsed_time": elapsed_time,
+            }
+        )
+
+    gr.Info("Finished!", duration=2)
+
+    result_texts = []
+
+    for result in results:
+        result_texts.append(f"> {result['sentence']}: {result['translated_text']}\n")
+
+    sum_elapsed_text = sum([result["elapsed_time"] for result in results])
+    print(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds")
 
     return "\n".join(result_texts)
 
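A note on the new `inference_vision`: `sentences = [text.replace("\n", " ")]` wraps the whole OCR output in a single-element list, so the entire page is translated as one chunk and the progress bar advances once per image rather than once per sentence. Schematically:

```python
# Schematic only: the OCR text is collapsed into a single translation unit.
text = "First line\nSecond line"
sentences = [text.replace("\n", " ")]
assert sentences == ["First line Second line"]  # one chunk, one progress step
```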
@@ -334,6 +427,7 @@ def create_app():
             label="Translated text",
             placeholder=translated_text_value,
             show_copy_button=True,
+            lines=5,
         )
 
         text = gr.Textbox(label="Text", autofocus=True, lines=5)
@@ -361,6 +455,7 @@ def create_audio_app():
             label="Translated text",
             placeholder=translated_text_value,
             show_copy_button=True,
+            lines=5,
         )
 
         audio = gr.Audio(label="Audio file", sources="upload", type="filepath")
@@ -373,7 +468,34 @@ def create_audio_app():
         )
 
         with gr.Row():
-            gr.Examples(label="Choose an example", inputs=audio, examples=examples_audio)
+            gr.Examples(
+                label="Choose an example", inputs=audio, examples=examples_audio
+            )
+
+    return tab
+
+
+def create_vision_app():
+    with gr.Blocks(theme=Soft()) as tab:
+        translated_text = gr.Textbox(
+            label="Translated text",
+            placeholder=translated_text_value,
+            show_copy_button=True,
+        )
+
+        image = gr.Image(label="Image file", sources="upload", type="filepath")
+
+        gr.Button("Translate").click(
+            inference_vision,
+            concurrency_limit=concurrency_limit,
+            inputs=image,
+            outputs=translated_text,
+        )
+
+        with gr.Row():
+            gr.Examples(
+                label="Choose an example", inputs=image, examples=examples_vision
+            )
 
     return tab
 
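Because `gr.Image` is created with `type="filepath"`, Gradio hands `inference_vision` the upload as a path string, so the `isinstance(image, str)` branch that feeds `DocumentFile.from_images` is the one taken for any valid upload; the `else` branch only fires if the component is reconfigured.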
@@ -396,14 +518,16 @@ def create_demo():
 def create_demo():
     app_tab = create_app()
     app_audio_tab = create_audio_app()
+    app_vision_tab = create_vision_app()
     authors_tab = create_authors()
     env_tab = create_env()
 
     return gr.TabbedInterface(
-        [app_tab, app_audio_tab, authors_tab, env_tab],
+        [app_tab, app_audio_tab, app_vision_tab, authors_tab, env_tab],
         tab_names=[
             "✍️ Translation",
             "🔊 Audio",
+            "👀 Image",
             "👥 Authors",
             "📦 Environment, Models, and Libraries",
         ],
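`gr.TabbedInterface` pairs the interface list with `tab_names` by position, so the vision tab is inserted third in both sequences above. Assuming the space keeps the usual entry point (it is not shown in this diff), launching is unchanged; a minimal sketch:

```python
# Hedged sketch: typical entry point for a Gradio space (not shown in this diff).
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()
```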