hfendpoints-images
/

nvidia-nemo-asr

Model card | Files and versions | Community

feat(parakeet): pin `torch` and fix formatting

by alvarobartt HF Staff - opened 29 days ago

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+45

-33

Files changed (3) hide show

Dockerfile +4 -4
handler.py +34 -25
requirements.txt +7 -4

Dockerfile CHANGED Viewed

@@ -1,4 +1,5 @@
-ARG SDK_VERSION=latest
 FROM huggingface/hfendpoints-sdk:${SDK_VERSION} AS sdk
 FROM nvcr.io/nvidia/nemo:25.04
@@ -7,8 +8,7 @@ RUN --mount=type=bind,from=sdk,source=/opt/hfendpoints/dist,target=/usr/local/en
     python3 -m pip install -r /tmp/requirements.txt && \
     python3 -m pip install /usr/local/endpoints/dist/*.whl
-COPY handler.py /usr/local/endpoint/
 # Disable TQDM
 ENV TQDM_DISABLE=1
@@ -20,4 +20,4 @@ ENV PORT=80
 EXPOSE 80
 ENTRYPOINT ["python3"]
-CMD ["/usr/local/endpoint/handler.py"]

+# ARG SDK_VERSION=6751aaa
+ARG SDK_VERSION=v0.2.0
 FROM huggingface/hfendpoints-sdk:${SDK_VERSION} AS sdk
 FROM nvcr.io/nvidia/nemo:25.04
     python3 -m pip install -r /tmp/requirements.txt && \
     python3 -m pip install /usr/local/endpoints/dist/*.whl
+COPY handler.py /usr/local/endpoint/handler.py
 # Disable TQDM
 ENV TQDM_DISABLE=1
 EXPOSE 80
 ENTRYPOINT ["python3"]
+CMD ["/usr/local/endpoint/handler.py"]

handler.py CHANGED Viewed

@@ -4,35 +4,40 @@ from functools import partial
 from io import BytesIO
 import torch
 from hfendpoints.openai import Context, run
-from hfendpoints.openai.audio import AutomaticSpeechRecognitionEndpoint, SegmentBuilder, Segment, \
-    TranscriptionRequest, TranscriptionResponse, TranscriptionResponseKind, VerboseTranscription
-from librosa import load as load_audio, get_duration
 from loguru import logger
 from nemo.collections.asr.models import ASRModel
-from hfendpoints import EndpointConfig, Handler, __version__
 def compression_ratio(text: str) -> float:
-    """
-    :param text:
-    :return:
-    """
     text_bytes = text.encode("utf-8")
     return len(text_bytes) / len(zlib.compress(text_bytes))
 def get_segment(idx: int, segment, tokenizer, request: TranscriptionRequest) -> Segment:
-    return SegmentBuilder() \
-        .id(idx) \
-        .start(segment['start']) \
-        .end(segment['end']) \
-        .text(segment['segment']) \
-        .tokens(tokenizer.text_to_ids(segment['segment'])) \
-        .temperature(request.temperature) \
-        .compression_ratio(compression_ratio(segment['segment'])) \
         .build()
 class NemoAsrHandler(Handler):
@@ -43,7 +48,9 @@ class NemoAsrHandler(Handler):
         self._model = ASRModel.from_pretrained(model_name=str(config.repository)).eval()
         self._model = self._model.to(torch.bfloat16)
-    async def __call__(self, request: TranscriptionRequest, ctx: Context) -> TranscriptionResponse:
         with logger.contextualize(request_id=ctx.request_id):
             with memoryview(request) as audio:
                 (waveform, sampling) = load_audio(BytesIO(audio), sr=16000, mono=True)
@@ -52,13 +59,15 @@ class NemoAsrHandler(Handler):
                 )
                 # Do we need to compute the timestamps?
-                needs_timestamps = request.response_kind == TranscriptionResponseKind.VERBOSE_JSON
-                transcribe_f = partial(self._model.transcribe, timestamps=needs_timestamps, verbose=False)
                 outputs = await asyncio.get_running_loop().run_in_executor(
-                    None,
-                    transcribe_f,
-                    (waveform,)
                 )
                 output = outputs[0]
@@ -66,7 +75,7 @@ class NemoAsrHandler(Handler):
                 match request.response_kind:
                     case TranscriptionResponseKind.VERBOSE_JSON:
-                        segment_timestamps = output.timestamp['segment']
                         segments = [
                             get_segment(idx, stamp, self._model.tokenizer, request)
                             for (idx, stamp) in enumerate(segment_timestamps)
@@ -102,5 +111,5 @@ def entrypoint():
     run(endpoint, config.interface, config.port)
-if __name__ == '__main__':
     entrypoint()

 from io import BytesIO
 import torch
+from hfendpoints import EndpointConfig, Handler, __version__
 from hfendpoints.openai import Context, run
+from hfendpoints.openai.audio import (
+    AutomaticSpeechRecognitionEndpoint,
+    Segment,
+    SegmentBuilder,
+    TranscriptionRequest,
+    TranscriptionResponse,
+    TranscriptionResponseKind,
+    VerboseTranscription,
+)
+from librosa import get_duration
+from librosa import load as load_audio
 from loguru import logger
 from nemo.collections.asr.models import ASRModel
 def compression_ratio(text: str) -> float:
     """
     Compute how much ``text`` shrinks under zlib compression.
     :param text: String to measure; it is encoded as UTF-8 before compressing.
     :return: UTF-8 byte length divided by the zlib-compressed byte length.
     """
     text_bytes = text.encode("utf-8")
     return len(text_bytes) / len(zlib.compress(text_bytes))
 def get_segment(idx: int, segment, tokenizer, request: TranscriptionRequest) -> Segment:
+    return (
+        SegmentBuilder()
+        .id(idx)
+        .start(segment["start"])
+        .end(segment["end"])
+        .text(segment["segment"])
+        .tokens(tokenizer.text_to_ids(segment["segment"]))
+        .temperature(request.temperature)
+        .compression_ratio(compression_ratio(segment["segment"]))
         .build()
+    )
 class NemoAsrHandler(Handler):
         self._model = ASRModel.from_pretrained(model_name=str(config.repository)).eval()
         self._model = self._model.to(torch.bfloat16)
+    async def __call__(
+        self, request: TranscriptionRequest, ctx: Context
+    ) -> TranscriptionResponse:
         with logger.contextualize(request_id=ctx.request_id):
             with memoryview(request) as audio:
                 (waveform, sampling) = load_audio(BytesIO(audio), sr=16000, mono=True)
                 )
                 # Do we need to compute the timestamps?
+                needs_timestamps = (
+                    request.response_kind == TranscriptionResponseKind.VERBOSE_JSON
+                )
+                transcribe_f = partial(
+                    self._model.transcribe, timestamps=needs_timestamps, verbose=False
+                )
                 outputs = await asyncio.get_running_loop().run_in_executor(
+                    None, transcribe_f, (waveform,)
                 )
                 output = outputs[0]
                 match request.response_kind:
                     case TranscriptionResponseKind.VERBOSE_JSON:
+                        segment_timestamps = output.timestamp["segment"]
                         segments = [
                             get_segment(idx, stamp, self._model.tokenizer, request)
                             for (idx, stamp) in enumerate(segment_timestamps)
     run(endpoint, config.interface, config.port)
+if __name__ == "__main__":
     entrypoint()

requirements.txt CHANGED Viewed

@@ -1,5 +1,8 @@
-huggingface_hub [hf_xet]
-librosa >= 0.11.0
-nemo_toolkit [asr] >= 2.3.0
 numpy
-tqdm

+--extra-index-url https://download.pytorch.org/whl/cu124
+torch>=2.6.0,<2.7.0
+torchvision
+huggingface_hub[hf_xet]
+librosa>=0.11.0
+nemo_toolkit[asr]>=2.3.0
 numpy
+tqdm