File size: 4,471 Bytes
53e0ff0
 
 
b3226f0
 
53e0ff0
 
b3226f0
 
 
 
 
 
 
 
 
 
 
 
53e0ff0
b3226f0
 
 
 
 
 
 
 
 
 
 
 
 
53e0ff0
 
 
 
 
 
b3226f0
 
 
53e0ff0
 
b3226f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53e0ff0
 
 
b3226f0
 
 
 
 
 
53e0ff0
b3226f0
53e0ff0
 
 
 
b3226f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53e0ff0
b3226f0
53e0ff0
 
b3226f0
53e0ff0
 
b3226f0
 
53e0ff0
b3226f0
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import cohere
import gradio as gr
from pypdf import PdfReader
from gtts import gTTS  # Import Google Text-to-Speech
from io import BytesIO  # To handle audio in memory
import os
from loguru import logger
import tempfile  # To create temporary files
from dotenv import load_dotenv  # To load environment variables from a .env file

# Pull settings from a .env file (when one is used) into the process environment
load_dotenv()

# The Cohere credential comes from the environment rather than from the source
COHERE_API_KEY = os.environ.get('COHERE_API_KEY')

# Fail fast at startup when the key is missing or empty
if not COHERE_API_KEY:
    raise ValueError("Cohere API key not found. Please set the COHERE_API_KEY environment variable.")

cohere_client = cohere.Client(COHERE_API_KEY)

# Display name -> gTTS language code, in the order shown to the user
_LANGUAGE_CODES = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Chinese": "zh-CN",
    "Japanese": "ja",
    "Hindi": "hi",
}

# (display name, language code) pairs used as the dropdown choices
language_options = list(_LANGUAGE_CODES.items())

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file* as a single string.

    Args:
        pdf_file: The PDF to read; passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        An empty string when no page produced text.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next; pages that yield no text are skipped.
    return "\n".join(chunk for chunk in page_texts if chunk)

# Function to convert text to speech using gTTS
def text_to_speech(text, language_code):
    """Synthesize *text* with gTTS and store it in a temporary MP3 file.

    Args:
        text: The text to speak; must be a non-empty string.
        language_code: Language code passed to gTTS as ``lang``.

    Returns:
        The path of the written ``.mp3`` file, or ``None`` when *text* is
        empty or not a string, or when any step of the conversion raises.
    """
    if not (text and isinstance(text, str)):
        logger.error("No valid text available for speech conversion.")
        return None

    try:
        # Render the speech into memory first
        mp3_buffer = BytesIO()
        gTTS(text, lang=language_code).write_to_fp(mp3_buffer)
        mp3_bytes = mp3_buffer.getvalue()

        # delete=False keeps the file on disk after the handle is closed,
        # so the returned path remains usable by the caller
        mp3_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        with mp3_file:
            mp3_file.write(mp3_bytes)
        return mp3_file.name
    except Exception as exc:
        logger.error(f"Error during text-to-speech conversion: {exc}")
        return None

# Function to convert PDF text to audio via Cohere and gTTS
def pdf_to_audio(pdf_file, language_code):
    """Turn a PDF into Cohere-generated text plus an MP3 of that text.

    The PDF text is extracted, sent as the prompt to the Cohere ``generate``
    endpoint, and the first generation is converted to speech.

    Args:
        pdf_file: The PDF to process, handed to ``extract_text_from_pdf``.
            ``None`` means no file was provided.
        language_code: Language code forwarded to ``text_to_speech``.

    Returns:
        A ``(message, audio_path)`` tuple. On success ``message`` is the
        generated text and ``audio_path`` is the path of the MP3 file. On any
        failure ``message`` is a user-facing error string and ``audio_path``
        is ``None``. Exceptions raised along the way are logged and reported
        as a generic error message instead of propagating.
    """
    # Without an upload there is nothing to read; say so explicitly instead of
    # letting the PDF reader fail and surfacing only the generic error below.
    if pdf_file is None:
        logger.error("No PDF file was provided.")
        return "No PDF file was provided. Please upload a PDF.", None

    try:
        text = extract_text_from_pdf(pdf_file)

        # Check if the extracted text is empty
        if not text.strip():
            logger.error("The PDF contains no extractable text.")
            return "The PDF contains no extractable text. Please try a different file.", None

        # Process the text with Cohere before audio generation
        response = cohere_client.generate(
            model='c4ai-aya-23',  # Using your specified model
            prompt=text,
            max_tokens=500  # Adjust based on your needs
        )

        # Check if the response is valid
        if not response or not response.generations:
            logger.error("Cohere API did not return a valid response.")
            return "Error: Cohere API did not return a valid response.", None

        processed_text = response.generations[0].text.strip()

        # Check if processed_text is valid
        if not processed_text:
            logger.error("Cohere generated an empty response.")
            return "Error: Cohere generated an empty response.", None

        # Convert the processed text to speech and return the file path
        audio_file_path = text_to_speech(processed_text, language_code)

        if audio_file_path is None:
            return "Error: Failed to generate speech from the provided text.", None

        return processed_text, audio_file_path  # Return the text and the path to the audio file
    except Exception as e:
        # Top-level boundary for the UI callback: keep the traceback in the
        # log so failures can be diagnosed, but show the user a generic message.
        logger.exception(f"Error during PDF to audio conversion: {e}")
        return "An error occurred while processing the PDF.", None

# Gradio interface
def gradio_interface(pdf_file, language_code):
    """Callback wired into the Gradio UI; delegates to ``pdf_to_audio``.

    Returns the ``(text, audio_path)`` pair produced by ``pdf_to_audio``.
    """
    output_text, audio_path = pdf_to_audio(pdf_file, language_code)
    return output_text, audio_path

# Build the Gradio UI: file upload and language dropdown in, text and audio out
demo = gr.Interface(
    fn=gradio_interface,
    inputs=["file", gr.Dropdown(choices=language_options, label="Select Language")],
    outputs=["text", "audio"],
    title="PDF to Audio using Cohere (Multi-language)",
)

# Start the app with debug mode enabled
demo.launch(debug=True)