# Transform audio to text with language detection.
# Author: Pratiksha Patel
# Description: This script records audio, transcribes it to text, detects the
# language of the recording, and saves the transcription to a .txt file.

# import required modules
import numpy as np
import streamlit as st
import torch
from audio_recorder_streamlit import audio_recorder
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq


def transcribe_audio(audio_bytes):
    # wav2vec2-base loaded with AutoModelForPreTraining has no text-generation
    # head, so it cannot produce a transcription. Whisper is a multilingual
    # seq2seq model that both transcribes and identifies the spoken language.
    processor = AutoProcessor.from_pretrained("openai/whisper-large")
    model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")

    # Convert audio bytes to a numpy array of 16-bit PCM samples, skipping
    # the 44-byte RIFF/WAV header the recorder prepends (assumes the
    # canonical header layout).
    audio_array = np.frombuffer(audio_bytes[44:], dtype=np.int16)

    # Normalize the samples to floats in [-1.0, 1.0].
    audio_float = audio_array.astype(np.float32) / 32768.0

    # Extract log-mel input features with the processor.
    input_features = processor(
        audio_float, sampling_rate=16000, return_tensors="pt"
    ).input_features

    # Generate token ids.
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # With Whisper's default prompt, position 0 is <|startoftranscript|> and
    # position 1 is the detected-language token, e.g. <|en|>.
    detected_language = processor.batch_decode(predicted_ids[:, 1:2])[0]

    # Decode token ids to text, dropping the special tokens.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription, detected_language


# Streamlit app
st.title("Audio to Text Transcription")
audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")
    transcription, detected_language = transcribe_audio(audio_bytes)
    if transcription:
        st.write("Detected language token:", detected_language)
        st.write("Transcription:")
        st.write(transcription)
        # Save the transcription to a text file.
        with open("transcription.txt", "w") as f:
            f.write("\n".join(transcription))
    else:
        st.write("Error: Failed to transcribe audio.")
else:
    st.write("No audio recorded.")
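
# Usage (a sketch; assumes this file is saved as app.py):
#   pip install torch streamlit audio-recorder-streamlit transformers
#   streamlit run app.py
# Note: the first run downloads the openai/whisper-large checkpoint (several
# GB); a smaller checkpoint such as openai/whisper-small can be swapped in
# for faster startup at some cost in accuracy.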