# Speech-to-text helper: buffers raw audio and transcribes it via speech_recognition.
import speech_recognition as sr
import numpy as np
import io
import config

class SpeechTranscriber:
    """Accumulate raw audio bytes and transcribe them on demand.

    Audio is appended with :meth:`add_audio_chunk` and turned into text with
    :meth:`get_transcript_chunk`, which submits the buffered audio to the
    Google Web Speech API through ``speech_recognition``.

    The recognizer is tuned from the project ``config`` module
    (``ENERGY_THRESHOLD``, ``DYNAMIC_ENERGY_THRESHOLD``, ``PAUSE_THRESHOLD``);
    ``SAMPLE_RATE`` and ``MIN_PROCESSING_DURATION`` are read at transcription
    time.
    """

    # Bytes per sample reported to ``sr.AudioData`` and used to size the
    # minimum buffer.
    # NOTE(review): assumes callers feed 16-bit samples - confirm at the
    # capture site, since the chunk's bytes are appended without conversion.
    SAMPLE_WIDTH = 2

    def __init__(self):
        """Create the recognizer and an empty audio buffer."""
        self.recognizer = sr.Recognizer()
        self.recognizer.energy_threshold = config.ENERGY_THRESHOLD
        self.recognizer.dynamic_energy_threshold = config.DYNAMIC_ENERGY_THRESHOLD
        self.recognizer.pause_threshold = config.PAUSE_THRESHOLD
        self.audio_buffer = bytearray()

    def add_audio_chunk(self, audio_chunk):
        """Append one chunk of audio to the pending buffer.

        Args:
            audio_chunk: Object exposing ``tobytes()`` (e.g. a numpy array).
                Its raw bytes are appended as-is, with no dtype conversion.
        """
        self.audio_buffer.extend(audio_chunk.tobytes())

    def get_transcript_chunk(self):
        """Transcribe the buffered audio, if there is enough of it.

        Returns:
            The recognized text, or ``None`` when the buffer is shorter than
            ``config.MIN_PROCESSING_DURATION`` seconds, when the audio could
            not be understood, or when the recognition request failed.
        """
        min_bytes = (
            config.SAMPLE_RATE * config.MIN_PROCESSING_DURATION * self.SAMPLE_WIDTH
        )
        if len(self.audio_buffer) < min_bytes:
            return None

        # Snapshot what is submitted so that, afterwards, exactly these bytes
        # (and nothing appended while the request was in flight) are dropped.
        snapshot = bytes(self.audio_buffer)
        audio_data = sr.AudioData(snapshot, config.SAMPLE_RATE, self.SAMPLE_WIDTH)

        try:
            # Use Google Web Speech API for best accuracy
            text = self.recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            # Unintelligible audio will not improve on retry; discard it.
            self._discard_consumed(len(snapshot))
            return None
        except sr.RequestError as e:
            # The buffer is intentionally kept so the same audio is retried
            # on the next call.
            print(f"Speech recognition error: {str(e)}")
            return None

        self._discard_consumed(len(snapshot))
        return text

    def _discard_consumed(self, byte_count):
        """Remove the first ``byte_count`` bytes from the audio buffer.

        Trimming the prefix, rather than replacing the whole buffer, preserves
        any audio that was appended after the snapshot was taken.
        """
        del self.audio_buffer[:byte_count]