import time
from typing import Any, Optional

from google import genai
from google.genai import types

import utils


class AudioTranscriber:
    """A class to transcribe audio files."""

    SYSTEM_INSTRUCTION = '''You are an advanced audio transcription model. Your task is to accurately transcribe provided audio input into a structured JSON format.

**Output Format Specification:**

Your response MUST be a valid JSON object with the following structure:

```json
{
  "segments": [
    {
      "text": "The transcribed text for the segment.",
      "start": "The start time of the segment in seconds.",
      "end": "The end time of the segment in seconds.",
      "speaker": "The speaker ID for the segment."
    }
  ],
  "language": "The language of the transcribed text in ISO 639-1 format."
}
```

**Detailed Instructions and Rules:**

1. Segments:
   - A "segment" is defined as a continuous section of speech from a single speaker and may include multiple sentences or phrases.
   - Each segment object MUST contain `text`, `start`, `end`, and `speaker` fields.
   - `text`: The verbatim transcription of the speech within that segment.
   - `start`: The precise start time of the segment in seconds, represented as an integer (e.g., 1, 5).
   - `end`: The precise end time of the segment in seconds, represented as an integer (e.g., 2, 6).
   - `speaker`: An integer representing the speaker ID.
     + Speaker IDs start at `0` for the first detected speaker.
     + The speaker ID MUST increment by 1 each time a new, distinct speaker is identified in the audio. Do not reuse speaker IDs within the same transcription.
     + If the same speaker talks again after another speaker, they retain their original speaker ID.
     + **Segment Splitting Rule**: A segment for the same speaker should only be split if there is a period of silence lasting more than 5 seconds, or if the segment would otherwise exceed 30 seconds in length. Otherwise, continuous speech from the same speaker, even with short pauses, should remain within a single segment.

2. Language:
   - `language`: A two-letter ISO 639-1 code representing the primary language of the transcribed text (e.g., "en" for English, "es" for Spanish, "fr" for French).
   - If multiple languages are detected in the audio, you MUST select and output only the ISO 639-1 code for the primary language used throughout the audio.
'''

    RESPONSE_SCHEMA = {
        'type': 'object',
        'properties': {
            'segments': {
                'type': 'array',
                'description': 'A list of transcribed segments from the audio file.',
                'items': {
                    'type': 'object',
                    'properties': {
                        'text': {
                            'type': 'string',
                            'description': 'The transcribed text for the segment.',
                        },
                        'start': {
                            'type': 'integer',
                            'description': 'The start time of the segment in seconds.',
                        },
                        'end': {
                            'type': 'integer',
                            'description': 'The end time of the segment in seconds.',
                        },
                        'speaker': {
                            'type': 'integer',
                            'description': 'The speaker ID for the segment.',
                        },
                    },
                    'required': ['text', 'start', 'end', 'speaker'],
                    'propertyOrdering': ['text', 'start', 'end', 'speaker'],
                },
            },
            'language': {
                'type': 'string',
                'description': 'The language of the transcribed text in ISO 639-1 format.',
            },
        },
        'required': ['segments', 'language'],
        'propertyOrdering': ['segments', 'language'],
    }

    def __init__(self, model: str = 'gemini-2.0-flash', api_key: Optional[str] = None):
        self.model = model
        self.client = genai.Client(api_key=api_key)

    def transcribe(self, audio_path: str) -> dict[str, Any]:
        """Transcribe an audio file from the given path.

        Args:
            audio_path (str): The path to the audio file to be transcribed.

        Returns:
            dict[str, Any]: The transcription result, e.g.:
            {
                "segments": [
                    {
                        "text": "Transcribed text",
                        "start": 0,
                        "end": 5,
                        "speaker": 0
                    }
                ],
                "language": "en"
            }
        """
        uploaded_file = self.client.files.upload(file=audio_path)
        # Poll until the uploaded file has been processed. The FAILED check
        # belongs inside the loop; otherwise a failed upload would never
        # become ACTIVE and the loop would spin forever.
        while uploaded_file.state != 'ACTIVE':
            if uploaded_file.state == 'FAILED':
                raise ValueError('Failed to upload the audio file')
            time.sleep(1)
            uploaded_file = self.client.files.get(name=uploaded_file.name)

        # Pass the audio duration as extra context so the model can keep
        # segment timestamps within the actual length of the file.
        audio_duration = utils.get_media_duration(audio_path)
        response = self.client.models.generate_content(
            model=self.model,
            contents=[uploaded_file, f'Audio duration: {int(audio_duration)} seconds'],
            config=types.GenerateContentConfig(
                system_instruction=self.SYSTEM_INSTRUCTION,
                temperature=0.2,
                response_mime_type='application/json',
                response_schema=self.RESPONSE_SCHEMA,
            ),
        )
        if response.parsed is None:
            raise ValueError('Failed to transcribe the audio file')
        return response.parsed  # type: ignore
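

# `utils` is a project-local module not shown here. A minimal sketch of what
# `utils.get_media_duration` might look like, assuming ffprobe (part of
# FFmpeg) is available on PATH; this is an illustrative guess, not the
# project's actual implementation:
#
#     import subprocess
#
#     def get_media_duration(media_path: str) -> float:
#         """Return the duration of a media file in seconds via ffprobe."""
#         output = subprocess.check_output([
#             'ffprobe', '-v', 'error',
#             '-show_entries', 'format=duration',
#             '-of', 'default=noprint_wrappers=1:nokey=1',
#             media_path,
#         ])
#         return float(output)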
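

# Example usage: a minimal sketch, assuming the GEMINI_API_KEY environment
# variable is set and a file 'sample.mp3' exists. Both names are illustrative
# placeholders, not part of the module above.
if __name__ == '__main__':
    import os

    transcriber = AudioTranscriber(api_key=os.environ.get('GEMINI_API_KEY'))
    result = transcriber.transcribe('sample.mp3')
    # Print each segment with its timing and speaker ID, then the language.
    for segment in result['segments']:
        print(f"[{segment['start']}s-{segment['end']}s] "
              f"speaker {segment['speaker']}: {segment['text']}")
    print(f"Language: {result['language']}")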