import argparse
import io
from typing import Any, Dict, List, Tuple, Union

import moviepy.editor as mp
import numpy as np
import streamlit as st
import torch
import torchaudio
import torchaudio.transforms as T
from scipy.io import wavfile
from streamlit_mic_recorder import mic_recorder
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
)


def parse_arguments() -> argparse.Namespace:
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Streamlit app for speech transcription."
    )
    parser.add_argument(
        "--model_id", type=str, required=True, help="Path to the model directory"
    )
    return parser.parse_args()


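# Heavy resources are cached with st.cache_resource so they are built once and
# reused across Streamlit reruns instead of being reloaded on every interaction.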
@st.cache_resource
def load_model_and_processor(
    model_id: str,
) -> Tuple[AutoModelForSpeechSeq2Seq, AutoProcessor]:
    """Load the speech-to-text model and its processor from `model_id`."""
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    model.generation_config.median_filter_width = 3
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor


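# The leading underscore on _model/_processor tells st.cache_resource not to
# hash these (unhashable) arguments when computing the cache key.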
@st.cache_resource
def setup_pipeline(
    _model: AutoModelForSpeechSeq2Seq, _processor: AutoProcessor
) -> AutomaticSpeechRecognitionPipeline:
    """Build the automatic-speech-recognition pipeline around the loaded model."""
    return pipeline(
        "automatic-speech-recognition",
        model=_model,
        tokenizer=_processor.tokenizer,
        feature_extractor=_processor.feature_extractor,
        chunk_length_s=30,
        batch_size=1,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )


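# The black-screen MP4 exists only so st.video can play the audio back with
# subtitles; the video track itself carries no information.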
def wav_to_black_mp4(wav_path: str, output_path: str, fps: int = 25) -> None:
    """Convert a WAV file to a black-screen MP4 with the same audio."""
    waveform, sample_rate = torchaudio.load(wav_path)
    duration: float = waveform.shape[1] / sample_rate
    audio = mp.AudioFileClip(wav_path)
    black_clip = mp.ColorClip((256, 250), color=(0, 0, 0), duration=duration)
    final_clip = black_clip.set_audio(audio)
    final_clip.write_videofile(output_path, fps=fps)


def timestamps_to_vtt(timestamps: List[Dict[str, Any]]) -> str:
    """Convert word-level timestamps to WebVTT format."""
    vtt_content: str = "WEBVTT\n\n"
    for word in timestamps:
        start_time, end_time = word["timestamp"]
        start_time_str = f"{int(start_time // 3600):02d}:{int(start_time // 60 % 60):02d}:{start_time % 60:06.3f}"
        end_time_str = f"{int(end_time // 3600):02d}:{int(end_time // 60 % 60):02d}:{end_time % 60:06.3f}"
        vtt_content += f"{start_time_str} --> {end_time_str}\n{word['text']}\n\n"
    return vtt_content


def process_audio_bytes(audio_bytes: bytes) -> torch.Tensor:
    """Normalize raw WAV bytes, resample to 16 kHz, and save them to sample.wav."""
    audio_stream = io.BytesIO(audio_bytes)
    sr, y = wavfile.read(audio_stream)
    y = y.astype(np.float32)
    # Zero-mean / unit-variance normalization, then scale down (/ 8) so the
    # waveform stays well within [-1, 1].
    y_mean = np.mean(y)
    y_std = np.std(y)
    y_normalized = (y - y_mean) / y_std
    # Resample to the 16 kHz rate the ASR pipeline expects.
    transform = T.Resample(sr, 16000)
    waveform = transform(torch.unsqueeze(torch.tensor(y_normalized / 8), 0))
    torchaudio.save("sample.wav", waveform, sample_rate=16000)
    return waveform


def transcribe(audio_bytes: bytes) -> Dict[str, Any]:
    """Transcribe the given audio bytes."""
    waveform = process_audio_bytes(audio_bytes)
    transcription = pipe(waveform[0, :].numpy(), return_timestamps="word")
    return transcription


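# Script-level setup. With `streamlit run`, arguments after a `--` separator are
# forwarded to the script, e.g. (assuming this file is app.py):
#   streamlit run app.py -- --model_id /path/to/model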
args = parse_arguments()
model_id = args.model_id

device: str = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype: torch.dtype = (
    torch.float16 if torch.cuda.is_available() else torch.float32
)
model, processor = load_model_and_processor(model_id)
pipe = setup_pipeline(model, processor)


st.title("CrisperWhisper++ 🦻")
st.subheader("Caution when using. Make sure you can handle the crispness. ⚠️")
st.write("🎙️ Record an audio clip to transcribe or 📁 upload an audio file.")


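# streamlit-mic-recorder renders start/stop buttons in the browser and returns a
# dict whose "bytes" entry holds the recorded audio as WAV bytes.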
audio = mic_recorder(
    start_prompt="Start recording",
    stop_prompt="Stop recording",
    just_once=False,
    use_container_width=False,
    format="wav",
    callback=None,
    args=(),
    kwargs={},
    key=None,
)

audio_bytes: Union[bytes, None] = audio["bytes"] if audio else None


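# Note: process_audio_bytes decodes uploads with scipy.io.wavfile, so only WAV
# files are actually parsed; mp3/ogg uploads will fail and show the error below.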
audio_file = st.file_uploader("Or upload an audio file", type=["wav", "mp3", "ogg"])

if audio_file is not None:
    audio_bytes = audio_file.getvalue()

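# At this point audio_bytes holds the uploaded file if one was provided,
# otherwise the microphone recording (or None if neither exists).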
if audio_bytes:
    try:
        transcription = transcribe(audio_bytes)
        vtt = timestamps_to_vtt(transcription["chunks"])

        with open("subtitles.vtt", "w") as file:
            file.write(vtt)

        wav_to_black_mp4("sample.wav", "video.mp4")

        st.video("video.mp4", subtitles="subtitles.vtt")
        st.subheader("Transcription:")
        st.markdown(
            f"""
            <div style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">
            <p style="font-size: 16px; color: #333;">{transcription['text']}</p>
            </div>
            """,
            unsafe_allow_html=True,
        )
    except Exception as e:
        st.error(f"An error occurred during transcription: {e}")


|
|
st.markdown( |
|
""" |
|
<hr> |
|
<footer> |
|
<p style="text-align: center;">© 2024 nyra health GmbH</p> |
|
</footer> |
|
""", |
|
unsafe_allow_html=True, |
|
) |
|
|