Spaces:

fastrtc
/

talk-to-oai-gpt-oss-20b

Running

App Files Community

freddyaboulton HF Staff committed 11 days ago

Commit

ae054df

verified ·

1 Parent(s): a67cb3f

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -6

app.py CHANGED Viewed

@@ -1,19 +1,43 @@
-import numpy as np
 import gradio as gr
 from dotenv import load_dotenv
 from fastrtc import (
     ReplyOnPause,
     Stream,
-    AdditionalOutputs,
     get_current_context,
     get_hf_turn_credentials,
     get_hf_turn_credentials_async,
     get_stt_model,
     get_tts_model,
-    WebRTCError,
 )
-import gradio as gr
 from huggingface_hub import InferenceClient
 load_dotenv()
@@ -30,7 +54,12 @@ def response(
     if hf_token is None or hf_token == "":
         raise WebRTCError("HF Token is required")
-    llm_client = InferenceClient(provider="groq", token=hf_token)
     context = get_current_context()
     if context.webrtc_id not in conversations:
@@ -46,7 +75,6 @@ def response(
     messages = conversations[context.webrtc_id]
-    transcription = stt_model.stt(audio)
     messages.append({"role": "user", "content": transcription})
     output = llm_client.chat.completions.create(  # type: ignore
@@ -82,6 +110,8 @@ stream = Stream(
     additional_outputs=[chatbot],
     additional_outputs_handler=lambda old, new: new,
     ui_args={"title": "Talk To OpenAI GPT-OSS 20B (Powered by FastRTC ⚡️)"},
 )
 stream.ui.launch()

 import gradio as gr
+import numpy as np
+import torch
 from dotenv import load_dotenv
 from fastrtc import (
+    AdditionalOutputs,
     ReplyOnPause,
     Stream,
+    WebRTCError,
+    audio_to_float32,
     get_current_context,
     get_hf_turn_credentials,
     get_hf_turn_credentials_async,
     get_stt_model,
     get_tts_model,
 )
 from huggingface_hub import InferenceClient
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+model_id = "openai/whisper-large-v3-turbo"
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+)
+model.to(device)
+processor = AutoProcessor.from_pretrained(model_id)
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    torch_dtype=torch_dtype,
+    device=device,
+)
 load_dotenv()
     if hf_token is None or hf_token == "":
         raise WebRTCError("HF Token is required")
+    llm_client = InferenceClient(provider="auto", token=hf_token)
+    result = pipe(
+        {"array": audio_to_float32(audio[1]).squeeze(), "sampling_rate": audio[0]}
+    )
+    transcription = result["text"]
     context = get_current_context()
     if context.webrtc_id not in conversations:
     messages = conversations[context.webrtc_id]
     messages.append({"role": "user", "content": transcription})
     output = llm_client.chat.completions.create(  # type: ignore
     additional_outputs=[chatbot],
     additional_outputs_handler=lambda old, new: new,
     ui_args={"title": "Talk To OpenAI GPT-OSS 20B (Powered by FastRTC ⚡️)"},
+    time_limit=90,
+    concurrency_limit=5,
 )
 stream.ui.launch()