"""Gradio demo: extract text from a PDF, run it through Cohere, speak it with gTTS."""

import os
import tempfile  # To create temporary files
from io import BytesIO  # To handle audio in memory

import cohere
import gradio as gr
from dotenv import load_dotenv  # To load environment variables from a .env file
from gtts import gTTS  # Google Text-to-Speech
from loguru import logger
from pypdf import PdfReader

# Load environment variables from a .env file (if one is present).
load_dotenv()

# Read the Cohere API key from the environment rather than hard-coding it.
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# Fail fast at import time: the Cohere client below cannot be built without a key.
if not COHERE_API_KEY:
    raise ValueError(
        "Cohere API key not found. "
        "Please set the COHERE_API_KEY environment variable."
    )

cohere_client = cohere.Client(COHERE_API_KEY)

# (display label, gTTS language code) pairs offered in the language dropdown.
language_options = [
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Chinese", "zh-CN"),
    ("Japanese", "ja"),
    ("Hindi", "hi"),
]


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, with pages separated by newlines.

    Args:
        pdf_file: The PDF to read. The value is handed unchanged to
            ``pypdf.PdfReader``.

    Returns:
        The extracted text of all pages that yielded any text, joined with
        ``"\\n"``. An empty string when no page yields text.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next; pages that yield no text are skipped.
    return "\n".join(page_text for page_text in page_texts if page_text)


def text_to_speech(text, language_code):
    """Convert text to speech with gTTS and save it as a temporary MP3 file.

    Args:
        text: The text to speak. Must be a non-blank string.
        language_code: gTTS language code (e.g. one of the codes in
            ``language_options``).

    Returns:
        The path of the generated ``.mp3`` file, or ``None`` when the text is
        unusable or the conversion fails (the failure is logged).
    """
    if not isinstance(text, str) or not text.strip():
        logger.error("No valid text available for speech conversion.")
        return None

    try:
        tts = gTTS(text, lang=language_code)

        # Synthesize into memory first, so a file is only created on disk once
        # the audio data actually exists.
        audio_fp = BytesIO()
        tts.write_to_fp(audio_fp)

        # delete=False: only the path is returned, so the file must outlive
        # this function for the caller to be able to read it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
            temp_audio_file.write(audio_fp.getvalue())
            temp_audio_path = temp_audio_file.name

        return temp_audio_path
    except Exception as e:
        # Top-level boundary for the conversion: log with traceback and signal
        # failure to the caller through the None return value.
        logger.exception(f"Error during text-to-speech conversion: {e}")
        return None
# Convert PDF text to audio via Cohere and gTTS, and expose it through Gradio.

# Language used when the caller does not supply one (e.g. nothing selected in
# the dropdown); also the dropdown's initial selection.
DEFAULT_LANGUAGE_CODE = "en"


def pdf_to_audio(pdf_file, language_code):
    """Extract text from a PDF, process it with Cohere and convert it to speech.

    Args:
        pdf_file: The uploaded PDF, passed on to ``extract_text_from_pdf``.
            ``None`` (nothing uploaded) is reported as an error message.
        language_code: gTTS language code for the generated speech. Falls back
            to ``DEFAULT_LANGUAGE_CODE`` when empty or ``None``.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the text
        generated by Cohere and ``audio_path`` the path of the MP3 file. On any
        failure ``text`` is a human-readable error message and ``audio_path``
        is ``None``.
    """
    if pdf_file is None:
        logger.error("No PDF file was provided.")
        return "No PDF file was provided. Please upload a PDF.", None

    # NOTE: language_code only selects the gTTS voice/language below; it is not
    # part of the prompt sent to Cohere.
    language_code = language_code or DEFAULT_LANGUAGE_CODE

    try:
        text = extract_text_from_pdf(pdf_file)

        # Nothing to process (e.g. no page yielded any text).
        if not text.strip():
            logger.error("The PDF contains no extractable text.")
            return "The PDF contains no extractable text. Please try a different file.", None

        # Process the text with Cohere before audio generation. The raw PDF
        # text is used as the prompt.
        response = cohere_client.generate(
            model='c4ai-aya-23',
            prompt=text,
            max_tokens=500,  # Adjust based on your needs
        )

        if not response or not response.generations:
            logger.error("Cohere API did not return a valid response.")
            return "Error: Cohere API did not return a valid response.", None

        processed_text = response.generations[0].text.strip()
        if not processed_text:
            logger.error("Cohere generated an empty response.")
            return "Error: Cohere generated an empty response.", None

        # Convert the processed text to speech; None signals a logged failure.
        audio_file_path = text_to_speech(processed_text, language_code)
        if audio_file_path is None:
            return "Error: Failed to generate speech from the provided text.", None

        return processed_text, audio_file_path
    except Exception as e:
        # UI boundary: never let an exception escape to Gradio; log it with
        # its traceback and show a generic message instead.
        logger.exception(f"Error during PDF to audio conversion: {e}")
        return "An error occurred while processing the PDF.", None


def gradio_interface(pdf_file, language_code):
    """Gradio callback: forward the uploaded file and language to ``pdf_to_audio``."""
    return pdf_to_audio(pdf_file, language_code)


# Build the Gradio interface: file input, language dropdown, text output and
# audio output.
demo = gr.Interface(
    fn=gradio_interface,
    inputs=[
        "file",
        # An explicit initial value ensures the callback never receives None
        # just because the user did not touch the dropdown.
        gr.Dropdown(
            choices=language_options,
            value=DEFAULT_LANGUAGE_CODE,
            label="Select Language",
        ),
    ],
    outputs=[
        "text",
        "audio",
    ],
    title="PDF to Audio using Cohere (Multi-language)",
)

demo.launch(debug=True)