import sys
import time

try:
    import spaces
except ImportError:
    print("ZeroGPU is not available, skipping...")

    # Minimal fallback so the `@spaces.GPU` decorators below still work without ZeroGPU.
    class spaces:
        @staticmethod
        def GPU(func):
            return func

import torch
import torchaudio
import gradio as gr
import torchaudio.transforms as T
import polars as pl

from importlib.metadata import version

from gradio.utils import is_zero_gpu_space
from gradio.themes import Base
from paddleocr import PaddleOCR
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModelForCTC,
    Wav2Vec2BertProcessor,
)

use_zero_gpu = is_zero_gpu_space()
use_cuda = torch.cuda.is_available()

if use_zero_gpu:
    spaces_version = version("spaces")
    print("ZeroGPU is available, changing inference call.")
else:
    spaces_version = "N/A"
    print("ZeroGPU is not available, skipping...")

print(f"Spaces version: {spaces_version}")

if use_cuda:
    print("CUDA is available, setting correct `device` variable.")
    device = "cuda"
    torch_dtype = torch.bfloat16
else:
    device = "cpu"
    torch_dtype = torch.bfloat16

# Config
model_name = "Yehor/kulyk-uk-en"
concurrency_limit = 5
min_duration = 0.5
max_duration = 60

current_theme = Base()

# Load the translation model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=torch_dtype,
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load ASR
audio_model = AutoModelForCTC.from_pretrained(
    "Yehor/w2v-bert-uk-v2.1-bf16", torch_dtype=torch_dtype, device_map=device
)
processor = Wav2Vec2BertProcessor.from_pretrained("Yehor/w2v-bert-uk-v2.1-bf16")

# Load OCR
ocr_model = PaddleOCR(
    lang="uk",
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)

# Examples
examples_text = [
    "WP: F-16 навряд чи суттєво змінять ситуацію на полі бою",
    "Над Україною збито ракету та 7 із 8 «Шахедів»",
    "Олімпійські ігри 2024. Розклад змагань українських спортсменів на 28 липня",
    "Кампанія Гарріс зібрала понад 200 мільйонів доларів менш ніж за тиждень",
    "За тиждень НБУ продав майже 800 мільйонів доларів на міжбанківському ринку",
    "Париж 2024. День 2: Текстова трансляція",
]

examples_audio = [
    "example_1.wav",
    "example_2.wav",
    "example_3.wav",
    "example_4.wav",
    "example_5.wav",
    "example_6.wav",
]

examples_image = [
    "example_1.jpg",
    "example_2.jpg",
    "example_3.jpg",
    "example_4.jpg",
    "example_5.jpg",
    "example_6.jpg",
]

title = "UK-EN Translator"

authors_table = """
## Authors

Follow them on social networks and **contact them** if you need any help or have any questions:

| **Yehor Smoliakov**                           |
|-----------------------------------------------|
| https://t.me/smlkw in Telegram                |
| https://x.com/yehor_smoliakov at X            |
| https://github.com/egorsmkv at GitHub         |
| https://huggingface.co/Yehor at Hugging Face  |
| or use egorsmkv@gmail.com                     |
""".strip()

description_head = f"""
# {title}

This space translates your text, audio, or images from Ukrainian to English using the [kulyk-uk-en](https://huggingface.co/Yehor/kulyk-uk-en) model.

Also, check out the [EN-UK Translator](https://huggingface.co/spaces/Yehor/en-uk-translator).
""".strip()

tech_env = f"""
#### Environment

- Python: {sys.version}
- Torch device: {device}
- Torch dtype: {torch_dtype}

#### Models

- [kulyk-uk-en](https://huggingface.co/Yehor/kulyk-uk-en)
- [wav2vec2-bert](https://huggingface.co/Yehor/w2v-bert-uk-v2.1-bf16)
- [PaddleOCR](https://huggingface.co/PaddlePaddle/eslav_PP-OCRv5_mobile_rec)
""".strip()

tech_libraries = f"""
#### Libraries

- torch: {version("torch")}
- torchaudio: {version("torchaudio")}
- transformers: {version("transformers")}
- accelerate: {version("accelerate")}
- gradio: {version("gradio")}
""".strip()


def translate(text: str) -> str:
    # The model translates Ukrainian to English, so prompt it in that direction.
    prompt = "Translate the text to English:\n" + text

    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    output = model.generate(
        input_ids,
        max_new_tokens=2048,
        # Greedy Search
        do_sample=False,
        repetition_penalty=1.05,
        # Sampling
        # do_sample=True,
        # temperature=0.1,
        # top_k=1,
        # min_p=0.9,
        # repetition_penalty=1.05,
    )

    # Decode only the newly generated tokens, skipping the prompt.
    prompt_len = input_ids.shape[1]
    generated_tokens = output[:, prompt_len:]
    translated_text = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )[0]

    return translated_text.strip()


@spaces.GPU
def inference_text(text, progress=gr.Progress()):
    if not text:
        raise gr.Error("Please paste your text.")

    progress(0, desc="Translating...")

    # Translate each non-empty line separately.
    sentences = text.split("\n")
    non_empty_sentences = [s.strip() for s in sentences if s.strip()]

    results = []
    for sentence in progress.tqdm(
        non_empty_sentences, desc="Translating...", unit="sentence"
    ):
        t0 = time.time()

        translated_text = translate(sentence)

        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)


@spaces.GPU
def inference_audio(audio, progress=gr.Progress()):
    if not audio:
        raise gr.Error("Please upload an audio file.")

    progress(0, desc="Translating...")

    meta = torchaudio.info(audio)
    duration = meta.num_frames / meta.sample_rate

    if duration < min_duration:
        raise gr.Error(
            f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
        )

    if duration > max_duration:
        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")

    audio_input, sr = torchaudio.load(audio)

    # Downmix to mono and resample to 16 kHz, as expected by the ASR model.
    if meta.num_channels > 1:
        audio_input = torch.mean(audio_input, dim=0, keepdim=True)

    if meta.sample_rate != 16_000:
        resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
        audio_input = resampler(audio_input)

    audio_input = audio_input.squeeze().numpy()

    features = processor([audio_input], sampling_rate=16_000).input_features
    features = torch.tensor(features).to(device, dtype=torch_dtype)

    with torch.inference_mode():
        logits = audio_model(features).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predictions = processor.batch_decode(predicted_ids)

    print("Predictions:", predictions)

    if not predictions:
        text = "-"
    else:
        text = "\n".join(predictions)

    print("Text:", text)

    # Translate each non-empty transcribed line separately.
    sentences = text.split("\n")
    non_empty_sentences = [s.strip() for s in sentences if s.strip()]

    results = []
    for sentence in progress.tqdm(
        non_empty_sentences, desc="Translating...", unit="sentence"
    ):
        t0 = time.time()

        translated_text = translate(sentence)

        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)


@spaces.GPU
def inference_image(image, progress=gr.Progress()):
    if not image:
        raise gr.Error("Please upload an image file.")

    progress(0, desc="Translating...")

    if not isinstance(image, str):
        raise gr.Error("Please upload an image file.")

    # Run OCR and join the recognized text lines into a single string.
    predictions = ocr_model.predict(image)

    ocr_lines = []
    for prediction in predictions:
        ocr_lines.append(" ".join(prediction["rec_texts"]))

    text = " ".join(ocr_lines)

    print("Text:", text)

    results = []
    sentences = [text]

    for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
        t0 = time.time()

        translated_text = translate(sentence)

        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)


def create_app():
    tab = gr.Blocks(
        title=title,
        analytics_enabled=False,
        theme=current_theme,
    )

    with tab:
        gr.Markdown(description_head)

        gr.Markdown("## Usage")

        translated_text = gr.DataFrame(
            label="Translated text",
        )

        text = gr.Textbox(label="Text", autofocus=True, lines=5)

        gr.Button("Translate").click(
            inference_text,
            concurrency_limit=concurrency_limit,
            inputs=text,
            outputs=translated_text,
        )

        with gr.Row():
            gr.Examples(label="Choose an example", inputs=text, examples=examples_text)

    return tab


def create_audio_app():
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(description_head)

        gr.Markdown("## Usage")

        translated_text = gr.DataFrame(
            label="Translated text",
        )

        audio = gr.Audio(label="Audio file", sources="upload", type="filepath")

        gr.Button("Translate").click(
            inference_audio,
            concurrency_limit=concurrency_limit,
            inputs=audio,
            outputs=translated_text,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example", inputs=audio, examples=examples_audio
            )

        gr.Markdown(
            f"> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds."
        )

    return tab


def create_image_app():
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(description_head)

        gr.Markdown("## Usage")

        translated_text = gr.DataFrame(
            label="Translated text",
        )

        image = gr.Image(label="Image file", sources="upload", type="filepath")

        gr.Button("Translate").click(
            inference_image,
            concurrency_limit=concurrency_limit,
            inputs=image,
            outputs=translated_text,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example", inputs=image, examples=examples_image
            )

    return tab


def create_env():
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(tech_env)
        gr.Markdown(tech_libraries)

    return tab


def create_authors():
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(authors_table)

    return tab


def create_demo():
    app_tab = create_app()
    app_audio_tab = create_audio_app()
    app_image_tab = create_image_app()
    authors_tab = create_authors()
    env_tab = create_env()

    return gr.TabbedInterface(
        [app_tab, app_audio_tab, app_image_tab, authors_tab, env_tab],
        tab_names=[
            "✍️ Text",
            "🔊 Audio",
            "👀 Image",
            "👥 Authors",
            "📦 Environment, Models, and Libraries",
        ],
    )


if __name__ == "__main__":
    demo = create_demo()

    demo.queue()
    demo.launch()