# Spaces: mobenta/pdf_audio (Sleeping)
# File size: 4,471 Bytes

import cohere
import gradio as gr
from pypdf import PdfReader
from gtts import gTTS  # Import Google Text-to-Speech
from io import BytesIO  # To handle audio in memory
import os
from loguru import logger
import tempfile  # To create temporary files
from dotenv import load_dotenv  # To load environment variables from a .env file

# Bring variables from a .env file (when one is used) into the environment
load_dotenv()

# The Cohere API key is taken from the COHERE_API_KEY environment variable
COHERE_API_KEY = os.environ.get('COHERE_API_KEY')

# Stop at startup when the key is unset or empty
if COHERE_API_KEY in (None, ""):
    raise ValueError("Cohere API key not found. Please set the COHERE_API_KEY environment variable.")

# Shared Cohere client used by the rest of the module
cohere_client = cohere.Client(COHERE_API_KEY)

# Languages offered in the UI, as (display label, gTTS language code) pairs.
# Built from an ordered mapping so each label appears next to its code.
language_options = list(
    {
        "English": "en",
        "Spanish": "es",
        "French": "fr",
        "German": "de",
        "Italian": "it",
        "Chinese": "zh-CN",
        "Japanese": "ja",
        "Hindi": "hi",
    }.items()
)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the extractable text of every page in a PDF.

    Args:
        pdf_file: The PDF to read; passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages joined with a newline between pages. Pages
        for which ``extract_text()`` yields nothing are skipped, so the
        result is an empty string when no page has extractable text.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page does not fuse with
    # the first word of the next, and to avoid repeated string concatenation.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Function to convert text to speech using gTTS
def text_to_speech(text, language_code):
    """Synthesize ``text`` with gTTS and save it as a temporary MP3 file.

    Args:
        text: The text to speak. Must be a non-blank string.
        language_code: Language code passed to gTTS as ``lang``.

    Returns:
        The path of the generated ``.mp3`` file, or ``None`` when ``text``
        is not a usable string or the conversion fails. The file is created
        with ``delete=False``; removing it afterwards is up to the caller.
    """
    # Blank or non-string input cannot be spoken; reject it before calling gTTS.
    if not isinstance(text, str) or not text.strip():
        logger.error("No valid text available for speech conversion.")
        return None

    temp_audio_path = None
    try:
        tts = gTTS(text, lang=language_code)
        # delete=False keeps the file on disk after the handle is closed so
        # the returned path stays usable (e.g. by the Gradio audio output).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
            temp_audio_path = temp_audio_file.name
            # Stream the audio straight into the file instead of buffering
            # the whole MP3 in memory and copying it afterwards.
            tts.write_to_fp(temp_audio_file)
        return temp_audio_path
    except Exception as e:
        logger.error(f"Error during text-to-speech conversion: {e}")
        # Best-effort cleanup: do not leave a partial/empty file behind.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                # The file may already be gone; nothing more to do here.
                pass
        return None

# Function to convert PDF text to audio via Cohere and gTTS
def pdf_to_audio(pdf_file, language_code):
    """Turn a PDF into Cohere-generated text and a spoken MP3 of that text.

    The PDF text is extracted, sent to Cohere as the prompt, and the
    generated text is converted to speech.

    Args:
        pdf_file: The uploaded PDF; forwarded to ``extract_text_from_pdf``.
        language_code: Language code forwarded to ``text_to_speech``.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the text
        generated by Cohere and ``audio_path`` is the path of the audio
        file. On any failure ``text`` is a user-facing message and
        ``audio_path`` is ``None``.
    """
    # Validate the inputs first so a missing upload or language gets a clear
    # message and no API call is made for a request that cannot succeed.
    if pdf_file is None:
        return "Please upload a PDF file.", None
    if not language_code:
        return "Please select a language.", None

    try:
        text = extract_text_from_pdf(pdf_file)

        # Check if the extracted text is empty
        if not text.strip():
            logger.error("The PDF contains no extractable text.")
            return "The PDF contains no extractable text. Please try a different file.", None

        # Process the text with Cohere before audio generation
        response = cohere_client.generate(
            model='c4ai-aya-23',  # Using your specified model
            prompt=text,
            max_tokens=500  # Adjust based on your needs
        )

        # Check if the response is valid
        if not response or not response.generations:
            logger.error("Cohere API did not return a valid response.")
            return "Error: Cohere API did not return a valid response.", None

        processed_text = response.generations[0].text.strip()

        # Check if processed_text is valid
        if not processed_text:
            logger.error("Cohere generated an empty response.")
            return "Error: Cohere generated an empty response.", None

        # Convert the processed text to speech and return the file path
        audio_file_path = text_to_speech(processed_text, language_code)

        if audio_file_path is None:
            return "Error: Failed to generate speech from the provided text.", None

        return processed_text, audio_file_path  # Return the text and the path to the audio file
    except Exception as e:
        # Top-level boundary for the UI callback: log the cause and show a
        # generic message instead of letting the exception reach the user.
        logger.error(f"Error during PDF to audio conversion: {e}")
        return "An error occurred while processing the PDF.", None

# Gradio interface
def gradio_interface(pdf_file, language_code):
    """Callback for the Gradio UI; delegates to ``pdf_to_audio``.

    Returns the (text, audio file path) pair produced by ``pdf_to_audio``
    for the given PDF and language code.
    """
    output_text, output_audio = pdf_to_audio(pdf_file, language_code)
    return output_text, output_audio

# Build the Gradio UI: a file upload and a language dropdown as inputs,
# a text box and an audio player as outputs, then start it in debug mode.
demo = gr.Interface(
    fn=gradio_interface,
    inputs=["file", gr.Dropdown(choices=language_options, label="Select Language")],
    outputs=["text", "audio"],
    title="PDF to Audio using Cohere (Multi-language)",
)
demo.launch(debug=True)