Vosk is an open-source speech recognition toolkit widely used for building speech-to-text (STT) systems. It supports multiple languages, including Hindi, and can be used in various applications due to its efficient, real-time transcription capabilities. Here’s an overview of how to use the Vosk model for Hindi language transcriptions in a speech-to-text application:
Overview of the Vosk Model
Vosk is built on top of the Kaldi speech recognition toolkit, which is known for its accuracy and efficiency. Vosk provides a set of pre-trained models that can be used out-of-the-box for recognizing speech in different languages.
Key Features of Vosk
- Multilingual Support: Supports multiple languages including Hindi.
- Offline Capabilities: Can run offline without needing an internet connection.
- Real-time Processing: Capable of processing speech in real-time.
- Compact Models: Efficient models that can run on various devices, including mobile phones and Raspberry Pi.
Here is the code you provided:
import os
import sys
import pyaudio
import time
import threading
from vosk import Model, KaldiRecognizer
from gtts import gTTS
from playsound import playsound
# Path to the Vosk model and its configuration
model_path = "vosk_model/Vosk_small_hindi"
if not os.path.exists(model_path):
    # Fail fast with a clear message rather than a confusing Vosk load error.
    print(f"Please download a Vosk model to {model_path}")
    sys.exit(1)
# Initialize the Vosk model and recognizer
model = Model(model_path)
# 16000 Hz must match the microphone stream's RATE below, otherwise the
# recognizer receives audio at the wrong speed and mis-transcribes.
rec = KaldiRecognizer(model, 16000)
# Setup audio stream: 16-bit mono PCM at 16 kHz, read in 1024-frame chunks.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
print("Listening...")
# Flag to prevent transcription during TTS playback (so the recognizer does
# not transcribe the robot's own spoken replies).
transcribing_enabled = True
def play_tts(text):
    """Speak *text* aloud in Hindi and suspend transcription while playing.

    Synthesizes an MP3 with Google Text-to-Speech, plays it with playsound,
    then deletes the file. The module-level flag ``transcribing_enabled`` is
    cleared for the duration of playback so the recognizer does not
    transcribe the robot's own voice.

    Args:
        text: The Hindi text to speak.
    """
    import tempfile  # local import: only needed by this helper
    global transcribing_enabled
    transcribing_enabled = False  # Disable transcription temporarily
    # Use a unique temp file instead of a fixed "jai_shri_ram.mp3": two
    # overlapping TTS threads would otherwise save/delete the same file.
    fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        tts = gTTS(text=text, lang='hi')
        tts.save(mp3_path)
        playsound(mp3_path)
    finally:
        # Always clean up and re-enable transcription, even if gTTS (a
        # network call) or playback fails -- the original code left
        # transcription disabled forever on any exception here.
        if os.path.exists(mp3_path):
            os.remove(mp3_path)
        transcribing_enabled = True  # Re-enable transcription
import json

# Main capture loop: read microphone audio in CHUNK-sized buffers, feed it
# to the recognizer, and react to trigger phrases until Ctrl+C.
try:
    while True:
        try:
            data = stream.read(CHUNK, exception_on_overflow=False)
        except OSError as e:
            # Transient device hiccup (e.g. buffer overrun): log and retry.
            print(f"Error reading audio stream: {e}")
            continue
        if len(data) == 0:
            break
        # Skip recognition while a TTS reply is playing so the robot does
        # not transcribe its own voice.
        if transcribing_enabled and rec.AcceptWaveform(data):
            result = rec.Result()
            # json.loads, not eval(): Result() is a JSON string, and eval()
            # would execute arbitrary code if the string were malformed.
            result_dict = json.loads(result)
            if 'text' in result_dict:
                timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
                transcription = result_dict['text']
                print(f"[{timestamp}] {transcription}")
                # Trigger phrases: reply in a background thread so audio
                # playback never blocks the capture loop.
                if "जय श्री राम" in transcription or "जय श्रीराम" in transcription:
                    tts_thread = threading.Thread(target=play_tts, args=("जय श्री राम",))
                    tts_thread.start()
                if "नाव" in transcription or "नौ" in transcription:
                    tts_thread = threading.Thread(target=play_tts, args=("आपका स्वागत है। मैं एक नव रोबोट हूँ। मैं आपकी कैसे मदद कर सकता हूँ?",))
                    tts_thread.start()
except KeyboardInterrupt:
    print("Stopping...")
finally:
    # Release the audio device even when the loop exits via an exception.
    try:
        stream.stop_stream()
    except OSError as e:
        print(f"Error stopping stream: {e}")
    stream.close()
    p.terminate()
This script sets up a voice recognition system using the Vosk speech recognition toolkit and provides responses with text-to-speech using the gTTS
library. It listens for specific phrases in Hindi and responds accordingly.
Explanation of Code
- Importing Libraries: `vosk` provides the speech model and `KaldiRecognizer`; `pyaudio` captures live microphone audio; `gTTS` and `playsound` generate and play the spoken replies; `threading` runs playback in the background so listening is never blocked.
- Loading the Model: The Vosk Hindi model is loaded from the specified directory, and the script exits with a helpful message if the model folder is missing.
- Capturing Audio: A 16 kHz, mono, 16-bit PyAudio input stream is opened and read in 1024-frame chunks inside the main loop.
- Recognizing Speech: Each audio chunk is passed to `KaldiRecognizer.AcceptWaveform`. When a final result is ready, the transcription is printed with a timestamp and checked for trigger phrases ("जय श्री राम", "नाव"/"नौ"), which are answered with Hindi text-to-speech in a separate thread.
Applications
- Transcription Services: Automated Hindi audio transcription for documentation, media, and content creation.
- Assistive Technologies: Helping individuals with hearing impairments by converting speech to text in real-time.
- Voice Commands: Enabling voice control for applications in Hindi.
By using the Vosk model for Hindi, developers can create robust and efficient speech-to-text applications tailored for Hindi-speaking users.
Updated Code for Long Transcriptions (with partial results)
import os
import sys
import pyaudio
import time
import threading
from vosk import Model, KaldiRecognizer
from gtts import gTTS
from playsound import playsound
# Path to the Vosk model and its configuration
model_path = "vosk_model/Vosk_small_hindi"
if not os.path.exists(model_path):
    # Fail fast with a clear message rather than a confusing Vosk load error.
    print(f"Please download a Vosk model to {model_path}")
    sys.exit(1)
# Initialize the Vosk model and recognizer
model = Model(model_path)
# 16000 Hz must match the microphone stream's RATE below, otherwise the
# recognizer receives audio at the wrong speed and mis-transcribes.
rec = KaldiRecognizer(model, 16000)
# Setup audio stream: 16-bit mono PCM at 16 kHz, read in 1024-frame chunks.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)
print("Listening...")
# Flag to prevent transcription during TTS playback (so the recognizer does
# not transcribe the robot's own spoken replies).
transcribing_enabled = True
def play_tts(text):
    """Speak *text* aloud in Hindi and suspend transcription while playing.

    Synthesizes an MP3 with Google Text-to-Speech, plays it with playsound,
    then deletes the file. The module-level flag ``transcribing_enabled`` is
    cleared for the duration of playback so the recognizer does not
    transcribe the robot's own voice.

    Args:
        text: The Hindi text to speak.
    """
    import tempfile  # local import: only needed by this helper
    global transcribing_enabled
    transcribing_enabled = False  # Disable transcription temporarily
    # Use a unique temp file instead of a fixed "jai_shri_ram.mp3": two
    # overlapping TTS threads would otherwise save/delete the same file.
    fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        tts = gTTS(text=text, lang='hi')
        tts.save(mp3_path)
        playsound(mp3_path)
    finally:
        # Always clean up and re-enable transcription, even if gTTS (a
        # network call) or playback fails -- the original code left
        # transcription disabled forever on any exception here.
        if os.path.exists(mp3_path):
            os.remove(mp3_path)
        transcribing_enabled = True  # Re-enable transcription
def process_result(result):
    """Handle one final recognition result from the Vosk recognizer.

    Prints the transcription with a timestamp and, when a trigger phrase is
    detected, starts a background thread that speaks the canned Hindi reply
    (so audio playback never blocks the capture loop).

    Args:
        result: JSON string as returned by ``KaldiRecognizer.Result()``,
            e.g. ``'{"text": "..."}'``.
    """
    import json  # local import: keeps this helper self-contained
    # json.loads, not eval(): Result() is a JSON string, and eval() would
    # execute arbitrary code if the string were malformed.
    result_dict = json.loads(result)
    if 'text' in result_dict:
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        transcription = result_dict['text']
        print(f"[{timestamp}] {transcription}")
        if "जय श्री राम" in transcription or "जय श्रीराम" in transcription:
            tts_thread = threading.Thread(target=play_tts, args=("जय श्री राम",))
            tts_thread.start()
        if "नाव" in transcription or "नौ" in transcription:
            tts_thread = threading.Thread(target=play_tts, args=("आपका स्वागत है। मैं एक नव रोबोट हूँ। मैं आपकी कैसे मदद कर सकता हूँ?",))
            tts_thread.start()
import json

# Main capture loop: read microphone audio in CHUNK-sized buffers, feed it
# to the recognizer, and track partial results for long transcriptions.
try:
    partial_transcription = ""
    while True:
        try:
            data = stream.read(CHUNK, exception_on_overflow=False)
        except OSError as e:
            # Transient device hiccup (e.g. buffer overrun): log and retry.
            print(f"Error reading audio stream: {e}")
            continue
        if len(data) == 0:
            break
        # Skip recognition while a TTS reply is playing so the robot does
        # not transcribe its own voice.
        if transcribing_enabled:
            if rec.AcceptWaveform(data):
                # Utterance finished: handle the final result and reset the
                # running partial text.
                result = rec.Result()
                process_result(result)
                partial_transcription = ""
            else:
                # Utterance still in progress: keep the in-flight text.
                # json.loads, not eval(): PartialResult() is a JSON string,
                # and eval() would execute arbitrary code if malformed.
                partial_result = rec.PartialResult()
                partial_transcription = json.loads(partial_result).get('partial', '')
                # Uncomment to watch in-progress transcription:
                # print(f"Partial: {partial_transcription}")
except KeyboardInterrupt:
    print("Stopping...")
finally:
    # Release the audio device even when the loop exits via an exception.
    try:
        stream.stop_stream()
    except OSError as e:
        print(f"Error stopping stream: {e}")
    stream.close()
    p.terminate()