import os
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS
# --- Import updated RAG-specific libraries ---
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA
# --- Basic Flask App Setup ---
app = Flask(__name__, static_folder='.', static_url_path='')
CORS(app)
# Global variable to hold the RAG chain
rag_chain = None
# --- Route for the homepage ---
@app.route('/')
def serve_index():
"""Serves the index.html file as the homepage."""
return send_from_directory('.', 'index.html')
# --- API Route for queries ---
@app.route("/query", methods=["POST"])
def handle_query():
"""Handles incoming queries from the frontend."""
global rag_chain
if not rag_chain:
return jsonify({"error": "Failed to process the query. Details: RAG pipeline not initialized. Check server logs."}), 500
data = request.json
query_text = data.get("query")
if not query_text:
return jsonify({"error": "No query provided."}), 400
try:
print(f"Received query: {query_text}")
result = rag_chain.invoke(query_text)
answer = result.get('result', 'No answer found.')
sources = [doc.metadata.get('source', 'Unknown source') for doc in result.get('source_documents', [])]
unique_sources = list(dict.fromkeys(sources))
print(f"Generated response: {answer}")
return jsonify({
"answer": answer,
"sources": unique_sources
})
except Exception as e:
print(f"Error during query processing: {e}")
return jsonify({"error": f"An error occurred: {str(e)}"}), 500
# --- RAG Pipeline Initialization ---
def initialize_rag_pipeline():
"""
This function loads documents, creates the vector store, and initializes the RAG chain.
"""
global rag_chain
print("--- Starting RAG pipeline initialization ---")
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
print("ERROR: GOOGLE_API_KEY environment variable not set. Halting.")
return
print("Step 1: Google API Key found.")
pdf_files = ["Augusta rule 101 CPE Webinar.pdf", "Augusta rule workshop.pdf"]
pdf_docs = []
try:
for file in pdf_files:
if os.path.exists(file):
loader = PyPDFLoader(file)
pdf_docs.extend(loader.load())
else:
print(f"Warning: PDF file not found at {file}")
except Exception as e:
print(f"ERROR loading PDFs: {e}. Halting.")
return
print(f"Step 2: Loaded {len(pdf_docs)} pages from PDF files.")
def scrape_url(url):
try:
response = requests.get(url, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
return Document(page_content=soup.get_text(separator=" ", strip=True), metadata={"source": url})
except requests.RequestException as e:
print(f"Warning: Could not scrape URL {url}. Error: {e}")
return None
urls = [
"https://www.instead.com/blog/the-augusta-rule-a-tax-strategy-for-business-owners",
"https://www.instead.com/blog/s-corp-reasonable-salary-guide",
"https://www.instead.com/blog/how-to-start-an-s-corp"
]
web_docs = [doc for doc in [scrape_url(url) for url in urls] if doc is not None]
print(f"Step 3: Scraped {len(web_docs)} web pages.")
all_docs = pdf_docs + web_docs
if not all_docs:
print("ERROR: No documents were loaded. Halting.")
return
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(all_docs)
print(f"Step 4: Split documents into {len(chunks)} chunks.")
try:
embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
print("Step 5: Gemini embedding model loaded successfully.")
# Create vector store in-memory for better stability on free servers
vectorstore = Chroma.from_documents(chunks, embedding)
print("Step 6: In-memory vector store created successfully.")
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.3)
print("Step 7: Gemini LLM loaded successfully.")
except Exception as e:
print(f"ERROR during AI model initialization: {e}. Halting.")
return
rag_chain = RetrievalQA.from_chain_type(
llm=llm,
retriever=vectorstore.as_retriever(),
return_source_documents=True
)
print("--- RAG pipeline initialized successfully! ---")
# --- Initialize the RAG pipeline when the app starts ---
# This is now called in the global scope to ensure it runs on Hugging Face
initialize_rag_pipeline()
# --- Main Execution Block (Only used for local testing) ---
if __name__ == "__main__":
    if not rag_chain:
        print("\nCould not start the server because the RAG pipeline failed to initialize.")
    else:
        # This app.run is for local development and will not be used by Gunicorn on Hugging Face
        app.run(host='0.0.0.0', port=5000)
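
# On Hugging Face Spaces this module is typically served by Gunicorn instead of
# app.run, e.g. with something like the command below (the exact port and flags
# depend on the Space's configuration; 7860 is the usual Spaces default):
#   gunicorn --bind 0.0.0.0:7860 app:app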