File size: 4,471 Bytes
53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 53e0ff0 b3226f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import cohere
import gradio as gr
from pypdf import PdfReader
from gtts import gTTS # Import Google Text-to-Speech
from io import BytesIO # To handle audio in memory
import os
from loguru import logger
import tempfile # To create temporary files
from dotenv import load_dotenv # To load environment variables from a .env file
# Pull any settings defined in a local .env file into the process environment.
load_dotenv()

# The Cohere client below cannot be built without a key, so stop at startup
# with a clear message when the variable is unset or empty.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
if COHERE_API_KEY is None or COHERE_API_KEY == "":
    raise ValueError("Cohere API key not found. Please set the COHERE_API_KEY environment variable.")

# Shared client used for every text-generation request in this module.
cohere_client = cohere.Client(COHERE_API_KEY)
# Display name -> gTTS language code, exposed as (label, value) pairs so the
# dropdown shows the name while the callback receives the code.
language_options = list(
    {
        "English": "en",
        "Spanish": "es",
        "French": "fr",
        "German": "de",
        "Italian": "it",
        "Chinese": "zh-CN",
        "Japanese": "ja",
        "Hindi": "hi",
    }.items()
)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source; it is passed
            through unchanged.

    Returns:
        The per-page texts concatenated in page order, with nothing inserted
        between pages. Pages whose extraction yields nothing are skipped, so
        the result is ``""`` when no page has extractable text.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join once instead of growing a string with += on every page.
    return "".join(chunk for chunk in page_texts if chunk)
# Function to convert text to speech using gTTS
def text_to_speech(text, language_code):
    """Synthesize *text* with gTTS and save it to a temporary MP3 file.

    Args:
        text: The text to speak. Must be a string with at least one
            non-whitespace character.
        language_code: Language code handed to gTTS as ``lang``.

    Returns:
        The path of the generated ``.mp3`` file, or ``None`` when the input is
        not usable or synthesis fails. The file is created with
        ``delete=False``, so it stays on disk until something removes it.
    """
    # Whitespace-only input has nothing to speak; reject it before building
    # a gTTS request.
    if not isinstance(text, str) or not text.strip():
        logger.error("No valid text available for speech conversion.")
        return None

    temp_audio_path = None
    try:
        tts = gTTS(text, lang=language_code)
        # Write the audio straight into the file Gradio will serve; an
        # intermediate in-memory buffer would only hold a second copy.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
            temp_audio_path = temp_audio_file.name
            tts.write_to_fp(temp_audio_file)
        return temp_audio_path
    except Exception as e:
        # Boundary handler: the caller only needs to know that no audio was
        # produced, but the traceback is kept in the log for diagnosis.
        logger.exception(f"Error during text-to-speech conversion: {e}")
        # delete=False means a failed write would otherwise leave an empty or
        # truncated file behind.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass
        return None
# Function to convert PDF text to audio via Cohere and gTTS
def pdf_to_audio(pdf_file, language_code):
    """Turn a PDF into Cohere-generated text plus spoken audio of that text.

    The PDF text is extracted, sent to Cohere as the prompt, and the generated
    text is converted to speech in the requested language.

    Args:
        pdf_file: The uploaded PDF, passed on to ``extract_text_from_pdf``.
            ``None`` means nothing was uploaded.
        language_code: Language code passed on to ``text_to_speech``.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the generated
        text and ``audio_path`` is the path of the audio file. On any failure
        ``text`` is a human-readable message and ``audio_path`` is ``None``.
        No exception derived from ``Exception`` escapes.
    """
    # Reject incomplete input before doing any work: without these checks a
    # missing language is only discovered after the Cohere request has been
    # made, and a missing file surfaces as a generic processing error.
    if pdf_file is None:
        return "Please upload a PDF file.", None
    if not language_code:
        return "Please select a language.", None

    try:
        text = extract_text_from_pdf(pdf_file)

        # Check if the extracted text is empty
        if not text.strip():
            logger.error("The PDF contains no extractable text.")
            return "The PDF contains no extractable text. Please try a different file.", None

        # Process the text with Cohere before audio generation
        response = cohere_client.generate(
            model='c4ai-aya-23',
            prompt=text,
            max_tokens=500,
        )

        # Check if the response is valid
        if not response or not response.generations:
            logger.error("Cohere API did not return a valid response.")
            return "Error: Cohere API did not return a valid response.", None

        processed_text = response.generations[0].text.strip()
        if not processed_text:
            logger.error("Cohere generated an empty response.")
            return "Error: Cohere generated an empty response.", None

        # Convert the processed text to speech; None signals a failure that
        # text_to_speech has already logged.
        audio_file_path = text_to_speech(processed_text, language_code)
        if audio_file_path is None:
            return "Error: Failed to generate speech from the provided text.", None

        return processed_text, audio_file_path
    except Exception as e:
        # Boundary handler for the UI callback: keep the traceback in the log
        # and show the user a generic message instead of crashing the request.
        logger.exception(f"Error during PDF to audio conversion: {e}")
        return "An error occurred while processing the PDF.", None
# Gradio interface
def gradio_interface(pdf_file, language_code):
    """Gradio callback: run the PDF-to-audio pipeline for one submission.

    Args:
        pdf_file: Value of the file input.
        language_code: Value of the language dropdown.

    Returns:
        The ``(text, audio_path)`` pair produced by ``pdf_to_audio``, in the
        order the interface's text and audio outputs expect.
    """
    transcript, audio_path = pdf_to_audio(pdf_file, language_code)
    return transcript, audio_path
# Build the UI: a file upload and a language dropdown feed gradio_interface,
# whose two results are shown as text and as playable audio.
demo = gr.Interface(
    fn=gradio_interface,
    inputs=["file", gr.Dropdown(choices=language_options, label="Select Language")],
    outputs=["text", "audio"],
    title="PDF to Audio using Cohere (Multi-language)",
)

# Start serving the interface with Gradio's debug mode enabled.
demo.launch(debug=True)
|