Yehor committed on
Commit f55d282 · 1 Parent(s): 8a4cb84

Add vision part

Files changed (1): app.py +131 -7
app.py CHANGED
@@ -22,6 +22,9 @@ from transformers import (
     MoonshineForConditionalGeneration,
 )
 
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+
 use_zero_gpu = is_zero_gpu_space()
 use_cuda = torch.cuda.is_available()
 
 
@@ -64,6 +67,10 @@ audio_model = MoonshineForConditionalGeneration.from_pretrained(
 audio_model.to(device)
 audio_model.to(torch_dtype)
 
+# Load OCR
+ocr_model = ocr_predictor(pretrained=True)
+ocr_model.to(device)
+
 # Examples
 examples_text = [
     "WP: F-16s are unlikely to make a significant difference on the battlefield",
 
@@ -73,7 +80,6 @@ examples_text = [
     "Over the week, the NBU sold almost $800 million on the interbank market",
     "Paris 2024. Day 2: Text broadcast",
 ]
-
 examples_audio = [
     "example_1.wav",
     "example_2.wav",
 
@@ -83,6 +89,14 @@ examples_audio = [
     "example_6.wav",
     "example_7.wav",
 ]
+examples_vision = [
+    "example_1.jpg",
+    "example_2.jpg",
+    "example_3.jpg",
+    "example_4.jpg",
+    "example_5.jpg",
+    "example_6.jpg",
+]
 
 title = "EN-UK Translator"
 
 
@@ -103,7 +117,7 @@ Follow them on social networks and **contact** if you need any help or have any
 description_head = f"""
 # {title}
 
-This space translates your text from English to Ukrainian. Use another spaces: [from Audio](https://huggingface.co/spaces/Yehor/audio-en-uk-translator), [from Images](https://huggingface.co/spaces/Yehor/vision-en-uk-translator). Also, check [UK-EN Translator](https://huggingface.co/spaces/Yehor/uk-en-translator) out for reverse task.
+This space translates your text, audio, or images from English to Ukrainian. Also, check out [UK-EN Translator](https://huggingface.co/spaces/Yehor/uk-en-translator) for the reverse task.
 """.strip()
 
 
 
@@ -120,6 +134,8 @@ tech_env = f"""
 #### Models
 
 - [kulyk-en-uk](https://huggingface.co/Yehor/kulyk-en-uk)
+- [moonshine-base](https://huggingface.co/UsefulSensors/moonshine-base)
+- [doctr](https://github.com/mindee/doctr)
 """.strip()
 
 tech_libraries = f"""
 
@@ -309,11 +325,88 @@ def inference_audio(audio, progress=gr.Progress()):
     result_texts = []
 
     for result in results:
-        result_texts.append(f"> {result['sentence']}")
-        result_texts.append(f"{result['translated_text']}\n")
+        result_texts.append(f"{result['sentence']}: {result['translated_text']}\n")
 
     sum_elapsed_text = sum([result["elapsed_time"] for result in results])
-    result_texts.append(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds")
+    print(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds")
+
+    return "\n".join(result_texts)
+
+
+@spaces.GPU
+def inference_vision(image, progress=gr.Progress()):
+    if not image:
+        raise gr.Error("Please paste your image file.")
+
+    progress(0, desc="Translating...")
+
+    if isinstance(image, str):
+        doc = DocumentFile.from_images(image)
+    else:
+        raise gr.Error("Please paste your image file.")
+
+    result = ocr_model(doc)
+
+    text = result.render()
+
+    print("Text:", text)
+
+    results = []
+
+    sentences = [text.replace("\n", " ")]
+
+    for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
+        t0 = time.time()
+
+        prompt = "Translate the text to Ukrainian:\n" + sentence
+
+        input_ids = tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            return_tensors="pt",
+            tokenize=True,
+        ).to(model.device)
+
+        output = model.generate(
+            input_ids,
+            max_new_tokens=2048,
+            # Greedy Search
+            do_sample=False,
+            repetition_penalty=1.05,
+            # Sampling
+            # do_sample=True,
+            # temperature=0.1,
+            # # top_k=1,
+            # min_p=0.9,
+            # repetition_penalty=1.05,
+        )
+
+        prompt_len = input_ids.shape[1]
+        generated_tokens = output[:, prompt_len:]
+        translated_text = tokenizer.batch_decode(
+            generated_tokens, skip_special_tokens=True
+        )[0]
+
+        elapsed_time = round(time.time() - t0, 2)
+
+        translated_text = translated_text.strip()
+        results.append(
+            {
+                "sentence": sentence,
+                "translated_text": translated_text,
+                "elapsed_time": elapsed_time,
+            }
+        )
+
+    gr.Info("Finished!", duration=2)
+
+    result_texts = []
+
+    for result in results:
+        result_texts.append(f"> {result['sentence']}: {result['translated_text']}\n")
+
+    sum_elapsed_text = sum([result["elapsed_time"] for result in results])
+    print(f"Elapsed time: {round(sum_elapsed_text, 4)} seconds")
 
     return "\n".join(result_texts)
 
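Note: inference_vision wraps the entire rendered OCR output into a single pseudo-sentence (sentences = [text.replace("\n", " ")]), so a dense page becomes one long prompt. A possible refinement, sketched under the assumption that sentence-level prompts are preferable; the split_sentences helper below is hypothetical and not part of this commit:

import re

def split_sentences(text: str) -> list[str]:
    # Flatten line breaks, then split naively after '.', '!' or '?' plus whitespace.
    flat = text.replace("\n", " ")
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", flat) if s.strip()]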
 
@@ -334,6 +427,7 @@ def create_app():
             label="Translated text",
             placeholder=translated_text_value,
             show_copy_button=True,
+            lines=5,
         )
 
         text = gr.Textbox(label="Text", autofocus=True, lines=5)
 
@@ -361,6 +455,7 @@ def create_audio_app():
             label="Translated text",
             placeholder=translated_text_value,
             show_copy_button=True,
+            lines=5,
         )
 
         audio = gr.Audio(label="Audio file", sources="upload", type="filepath")
 
@@ -373,7 +468,34 @@ def create_audio_app():
         )
 
         with gr.Row():
-            gr.Examples(label="Choose an example", inputs=audio, examples=examples_audio)
+            gr.Examples(
+                label="Choose an example", inputs=audio, examples=examples_audio
+            )
+
+    return tab
+
+
+def create_vision_app():
+    with gr.Blocks(theme=Soft()) as tab:
+        translated_text = gr.Textbox(
+            label="Translated text",
+            placeholder=translated_text_value,
+            show_copy_button=True,
+        )
+
+        image = gr.Image(label="Image file", sources="upload", type="filepath")
+
+        gr.Button("Translate").click(
+            inference_vision,
+            concurrency_limit=concurrency_limit,
+            inputs=image,
+            outputs=translated_text,
+        )
+
+        with gr.Row():
+            gr.Examples(
+                label="Choose an example", inputs=image, examples=examples_vision
+            )
 
     return tab
 
 
@@ -396,14 +518,16 @@ def create_authors():
 def create_demo():
     app_tab = create_app()
     app_audio_tab = create_audio_app()
+    app_vision_tab = create_vision_app()
     authors_tab = create_authors()
     env_tab = create_env()
 
     return gr.TabbedInterface(
-        [app_tab, app_audio_tab, authors_tab, env_tab],
+        [app_tab, app_audio_tab, app_vision_tab, authors_tab, env_tab],
         tab_names=[
             "✍️ Translation",
             "🔊 Audio",
+            "👀 Image",
             "👥 Authors",
             "📦 Environment, Models, and Libraries",
         ],
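Note: a quick way to exercise the new tab without the full TabbedInterface is to launch it on its own. A minimal sketch, assuming it is run from app.py itself, where inference_vision, translated_text_value, examples_vision, and concurrency_limit are in scope:

if __name__ == "__main__":
    # Serves only the vision tab for a manual check with the bundled example images.
    create_vision_app().launch()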