import os
import uuid

import gradio as gr
import speech_recognition as sr
from gtts import gTTS
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory

# Initialize the model and prompt template
chat = Ollama(model="llama3:latest")

prompt = ChatPromptTemplate.from_messages([
    ("system", """
    You are a helpful AI assistant. Your task is to engage in conversation with users,
    answer their questions, and assist them with various tasks.
    Communicate politely and maintain focus on the user's needs.
    Keep responses concise, typically two to three sentences.
    """),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}"),
])

runnable = prompt | chat

# Keep one ChatMessageHistory per session id so the conversation persists
# between turns. (Returning a fresh ChatMessageHistory from a lambda would
# silently discard the history on every call.)
message_histories = {}

def get_session_history(session_id):
    if session_id not in message_histories:
        message_histories[session_id] = ChatMessageHistory()
    return message_histories[session_id]

with_message_history = RunnableWithMessageHistory(
    runnable,
    get_session_history,
    input_messages_key="input",
    history_messages_key="history",
)

def text_to_speech(text, file_name):
    # Synthesize the response with gTTS and save it in the working directory.
    tts = gTTS(text=text, lang='en', slow=False)
    file_path = os.path.join(os.getcwd(), file_name)
    tts.save(file_path)
    return file_path

def speech_to_text(audio):
    if audio is None:
        return "No audio input received."
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data)
            print(text)
            return text
        except sr.UnknownValueError:
            return "Speech recognition could not understand the audio"
        except sr.RequestError:
            return "Could not request results from the speech recognition service"
    except Exception as e:
        return f"Error processing audio: {str(e)}"

def chat_function(input_type, text_input=None, audio_input=None, history=None):
    if history is None:
        history = []

    if input_type == "text":
        user_input = text_input
    elif input_type == "audio":
        if audio_input is not None:
            user_input = speech_to_text(audio_input)
        else:
            user_input = "No audio input received."
    else:
        return history, None

    print(f"User input: {user_input}")  # Debug information

    # Get LLM response
    response = with_message_history.invoke(
        {"input": user_input},
        config={"configurable": {"session_id": "chat_history"}},
    )

    # Generate audio for the LLM response
    audio_file = f"response_{uuid.uuid4()}.mp3"
    audio_path = text_to_speech(response, audio_file)

    # Update history in the (user, assistant) tuple format gr.Chatbot expects
    history.append((user_input, response))

    return history, audio_path

# Gradio interface
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    with gr.Row():
        text_input = gr.Textbox(placeholder="Type your message here...")
        audio_input = gr.Audio(sources=['microphone'], type="filepath")
    with gr.Row():
        text_button = gr.Button("Send Text")
        audio_button = gr.Button("Send Audio")
    audio_output = gr.Audio()

    def on_audio_change(audio):
        # Show the transcription in the textbox as soon as a recording arrives.
        if audio is not None:
            return speech_to_text(audio)
        return ""

    audio_input.change(on_audio_change, inputs=[audio_input], outputs=[text_input])

    # gr.State carries the constant input_type flag without rendering a stray
    # Textbox in the layout, and the chatbot only needs to appear once in outputs.
    text_button.click(
        chat_function,
        inputs=[gr.State("text"), text_input, audio_input, chatbot],
        outputs=[chatbot, audio_output],
    )
    audio_button.click(
        chat_function,
        inputs=[gr.State("audio"), text_input, audio_input, chatbot],
        outputs=[chatbot, audio_output],
    )

demo.launch(server_name='0.0.0.0', share=True, max_threads=10)
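
# --- Optional cleanup sketch (an assumption, not part of the original script) ---
# chat_function writes a new response_<uuid>.mp3 into the working directory on
# every turn and nothing removes them. Since demo.launch() blocks until the
# server shuts down, a best-effort sweep here runs on exit. The glob pattern
# below is illustrative; adjust it if you change the file naming above.
import glob

for _path in glob.glob(os.path.join(os.getcwd(), "response_*.mp3")):
    try:
        os.remove(_path)
    except OSError:
        pass  # best-effort cleanup; skip files that cannot be deleted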