"""Flask server exposing a Gemini-backed RAG question-answering endpoint.

On startup the module loads local PDFs plus a fixed set of scraped web
pages, chunks them, embeds them into an in-memory Chroma store, and wires
a RetrievalQA chain. The `/query` POST endpoint answers questions against
that corpus and reports the source documents used.
"""

import os

import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS

# --- Import updated RAG-specific libraries ---
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA

# --- Basic Flask App Setup ---
# Static files (index.html etc.) are served from the repo root.
app = Flask(__name__, static_folder='.', static_url_path='')
CORS(app)

# Global variable to hold the RAG chain; stays None if initialization fails.
rag_chain = None


# --- Route for the homepage ---
@app.route('/')
def serve_index():
    """Serves the index.html file as the homepage."""
    return send_from_directory('.', 'index.html')


# --- API Route for queries ---
@app.route("/query", methods=["POST"])
def handle_query():
    """Handles incoming queries from the frontend.

    Expects a JSON body ``{"query": "..."}``. Returns a JSON object with
    ``answer`` and a de-duplicated ``sources`` list, or an ``error`` key
    with status 400 (missing query) or 500 (pipeline/LLM failure).
    """
    global rag_chain
    if not rag_chain:
        return jsonify({"error": "Failed to process the query. Details: RAG pipeline not initialized. Check server logs."}), 500

    # Bug fix: request.json aborts with an unhandled 400/415 (or yields None)
    # when the client omits the application/json Content-Type header.
    # get_json(silent=True) tolerates that and lets us return our own 400.
    data = request.get_json(silent=True) or {}
    query_text = data.get("query")
    if not query_text:
        return jsonify({"error": "No query provided."}), 400

    try:
        print(f"Received query: {query_text}")
        result = rag_chain.invoke(query_text)
        answer = result.get('result', 'No answer found.')
        # Preserve first-seen order while removing duplicate source names.
        sources = [doc.metadata.get('source', 'Unknown source')
                   for doc in result.get('source_documents', [])]
        unique_sources = list(dict.fromkeys(sources))
        print(f"Generated response: {answer}")
        return jsonify({
            "answer": answer,
            "sources": unique_sources
        })
    except Exception as e:
        # Boundary handler: surface the failure to the client as a 500.
        print(f"Error during query processing: {e}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500


# --- RAG Pipeline Initialization ---
def initialize_rag_pipeline():
    """
    This function loads documents, creates the vector store,
    and initializes the RAG chain.

    On any failure (missing API key, PDF load error, model init error) it
    prints a diagnostic and returns early, leaving ``rag_chain`` as None so
    ``/query`` reports a 500 instead of crashing the server.
    """
    global rag_chain
    print("--- Starting RAG pipeline initialization ---")

    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("ERROR: GOOGLE_API_KEY environment variable not set. Halting.")
        return
    print("Step 1: Google API Key found.")

    # Load local PDF sources; missing files are skipped with a warning.
    pdf_files = ["Augusta rule 101 CPE Webinar.pdf", "Augusta rule workshop.pdf"]
    pdf_docs = []
    try:
        for file in pdf_files:
            if os.path.exists(file):
                loader = PyPDFLoader(file)
                pdf_docs.extend(loader.load())
            else:
                print(f"Warning: PDF file not found at {file}")
    except Exception as e:
        print(f"ERROR loading PDFs: {e}. Halting.")
        return
    print(f"Step 2: Loaded {len(pdf_docs)} pages from PDF files.")

    def scrape_url(url):
        """Fetch *url* and return its visible text as a Document, or None on failure."""
        try:
            response = requests.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return Document(page_content=soup.get_text(separator=" ", strip=True),
                            metadata={"source": url})
        except requests.RequestException as e:
            print(f"Warning: Could not scrape URL {url}. Error: {e}")
            return None

    urls = [
        "https://www.instead.com/blog/the-augusta-rule-a-tax-strategy-for-business-owners",
        "https://www.instead.com/blog/s-corp-reasonable-salary-guide",
        "https://www.instead.com/blog/how-to-start-an-s-corp"
    ]
    web_docs = [doc for doc in [scrape_url(url) for url in urls] if doc is not None]
    print(f"Step 3: Scraped {len(web_docs)} web pages.")

    all_docs = pdf_docs + web_docs
    if not all_docs:
        print("ERROR: No documents were loaded. Halting.")
        return

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(all_docs)
    print(f"Step 4: Split documents into {len(chunks)} chunks.")

    try:
        embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
        print("Step 5: Gemini embedding model loaded successfully.")

        # Create vector store in-memory for better stability on free servers
        vectorstore = Chroma.from_documents(chunks, embedding)
        print("Step 6: In-memory vector store created successfully.")

        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.3)
        print("Step 7: Gemini LLM loaded successfully.")
    except Exception as e:
        print(f"ERROR during AI model initialization: {e}. Halting.")
        return

    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True
    )
    print("--- RAG pipeline initialized successfully! ---")


# --- Initialize the RAG pipeline when the app starts ---
# This is now called in the global scope to ensure it runs on Hugging Face
initialize_rag_pipeline()

# --- Main Execution Block (Only used for local testing) ---
if __name__ == "__main__":
    if not rag_chain:
        print("\nCould not start the server because the RAG pipeline failed to initialize.")
    else:
        # This app.run is for local development and will not be used by Gunicorn on Hugging Face
        app.run(host='0.0.0.0', port=5000)