"""Retrieval-augmented chatbot over a Tiruvāsagam PDF.

At import time the script loads ``tiru.pdf``, splits it into overlapping
chunks, embeds the chunks with a multilingual sentence-transformers model,
and stores them in a Chroma vector store. Each chat turn retrieves the three
most similar chunks and sends them, together with the user's question, to an
Azure OpenAI chat deployment. The conversation is served through a Gradio
``ChatInterface``.

Required environment variables:
    AZURE_OPENAI_API_KEY: API key for the Azure OpenAI resource.
    AZURE_OPENAI_ENDPOINT: Endpoint URL of the Azure OpenAI resource.
"""

import os

import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import AzureOpenAI

# Reply the system prompt asks the model to give for off-topic questions;
# reused when the service returns no text at all.
_FALLBACK_REPLY = "எனக்கு தெரியாது"


def _require_env(name):
    """Return the whitespace-stripped value of environment variable *name*.

    Raises:
        RuntimeError: If the variable is unset or contains only whitespace.
            Calling ``.strip()`` directly on ``os.getenv(name)`` would instead
            fail with an ``AttributeError`` on ``None`` that does not say
            which variable is missing.
    """
    value = (os.getenv(name) or "").strip()
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is required but is not set."
        )
    return value


# Load PDF (Tiruvāsagam)
loader = PyPDFLoader("tiru.pdf")
docs = loader.load()

# Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

# Local multilingual embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Store in Chroma and expose a top-3 similarity retriever
vectorstore = Chroma.from_documents(chunks, embedding=embedding_model)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Azure OpenAI client
client = AzureOpenAI(
    api_key=_require_env("AZURE_OPENAI_API_KEY"),
    api_version="2025-01-01-preview",
    azure_endpoint=_require_env("AZURE_OPENAI_ENDPOINT"),
)


def chat_fn(message, history):
    """Answer one chat message using passages retrieved from the PDF.

    Args:
        message: The user's question as plain text.
        history: Prior turns supplied by ``gr.ChatInterface``. It is accepted
            to satisfy that callback signature but is not used, so every
            question is answered independently of earlier turns.

    Returns:
        The assistant's reply text. If the completion carries no text, the
        fallback reply defined by the system prompt is returned instead.
    """
    # `invoke` is the supported retriever entry point; the older
    # `get_relevant_documents` is deprecated in LangChain.
    matches = retriever.invoke(message)
    context = "\n\n".join(match.page_content for match in matches)

    completion = client.chat.completions.create(
        model="gpt-4.1",  # your Azure deployment name
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant answering only from Tiruvāsagam. "
                    "Always reply in Tamil with simple, clear, and correct grammar. "
                    "Be token efficient. "
                    "If the question is not related to Tiruvāsagam, Lord Shiva, or "
                    "Manikkavasagar, just reply: 'எனக்கு தெரியாது'."
                ),
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {message}",
            },
        ],
        temperature=0.8,
        max_tokens=500,
    )

    # `content` may be None; never hand None back to the chat UI.
    answer = completion.choices[0].message.content
    return answer or _FALLBACK_REPLY


# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="திருவாசகம் RAG Chatbot",
    description="திருவாசகத்தை அடிப்படையாகக் கொண்டு கேள்விகளை கேளுங்கள் (Tamil/English supported).",
)

if __name__ == "__main__":
    # Listens on all network interfaces, port 7860, with Gradio debug mode on.
    chatbot.launch(server_name="0.0.0.0", server_port=7860, debug=True)