import os
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS
# --- Import updated RAG-specific libraries ---
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA
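
# Approximate pip requirements implied by the imports above (a sketch; exact
# package versions/pins depend on the environment):
#   flask flask-cors requests beautifulsoup4 pypdf chromadb
#   langchain langchain-community langchain-google-genai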

# --- Basic Flask App Setup ---
app = Flask(__name__, static_folder='.', static_url_path='')
CORS(app)

# Global variable to hold the RAG chain
rag_chain = None

# --- Route for the homepage ---
@app.route('/')
def serve_index():
    """Serves the index.html file as the homepage."""
    return send_from_directory('.', 'index.html')

# --- API Route for queries ---
# NOTE: the route path here is an assumption; it must match the URL the frontend posts to.
@app.route('/api/query', methods=['POST'])
def handle_query():
    """Handles incoming queries from the frontend."""
    global rag_chain
    if not rag_chain:
        return jsonify({"error": "RAG pipeline not initialized. Check server logs."}), 500
    data = request.json
    query_text = data.get("query")
    if not query_text:
        return jsonify({"error": "No query provided."}), 400
    try:
        print(f"Received query: {query_text}")
        result = rag_chain.invoke(query_text)
        answer = result.get('result', 'No answer found.')
        sources = [doc.metadata.get('source', 'Unknown source') for doc in result.get('source_documents', [])]
        # Deduplicate sources while preserving their original order
        unique_sources = list(dict.fromkeys(sources))
        print(f"Generated response: {answer}")
        return jsonify({
            "answer": answer,
            "sources": unique_sources
        })
    except Exception as e:
        print(f"Error during query processing: {e}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500

# --- RAG Pipeline Initialization ---
def initialize_rag_pipeline():
    """
    This function loads documents, creates the vector store, and initializes the RAG chain.
    """
    global rag_chain
    print("--- Starting RAG pipeline initialization ---")
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("ERROR: GOOGLE_API_KEY environment variable not set. Halting.")
        return
    print("Step 1: Google API Key found.")
    pdf_files = ["Augusta rule 101 CPE Webinar.pdf", "Augusta rule workshop.pdf"]
    pdf_docs = []
    try:
        for file in pdf_files:
            if os.path.exists(file):
                loader = PyPDFLoader(file)
                pdf_docs.extend(loader.load())
            else:
                print(f"Warning: PDF file not found at {file}")
    except Exception as e:
        print(f"ERROR loading PDFs: {e}. Halting.")
        return
    print(f"Step 2: Loaded {len(pdf_docs)} pages from PDF files.")

    def scrape_url(url):
        try:
            response = requests.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return Document(page_content=soup.get_text(separator=" ", strip=True), metadata={"source": url})
        except requests.RequestException as e:
            print(f"Warning: Could not scrape URL {url}. Error: {e}")
            return None

    urls = [
        "https://www.instead.com/blog/the-augusta-rule-a-tax-strategy-for-business-owners",
        "https://www.instead.com/blog/s-corp-reasonable-salary-guide",
        "https://www.instead.com/blog/how-to-start-an-s-corp"
    ]
    web_docs = [doc for doc in [scrape_url(url) for url in urls] if doc is not None]
    print(f"Step 3: Scraped {len(web_docs)} web pages.")
    all_docs = pdf_docs + web_docs
    if not all_docs:
        print("ERROR: No documents were loaded. Halting.")
        return
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(all_docs)
    print(f"Step 4: Split documents into {len(chunks)} chunks.")
    try:
        embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
        print("Step 5: Gemini embedding model loaded successfully.")
        # Create vector store in-memory for better stability on free servers
        vectorstore = Chroma.from_documents(chunks, embedding)
        print("Step 6: In-memory vector store created successfully.")
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.3)
        print("Step 7: Gemini LLM loaded successfully.")
    except Exception as e:
        print(f"ERROR during AI model initialization: {e}. Halting.")
        return
    # chain_type defaults to "stuff": all retrieved chunks are inserted into a single prompt
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True
    )
    print("--- RAG pipeline initialized successfully! ---")

# --- Initialize the RAG pipeline when the app starts ---
# This is now called in the global scope to ensure it runs on Hugging Face
initialize_rag_pipeline()

# --- Main Execution Block (Only used for local testing) ---
if __name__ == "__main__":
    if not rag_chain:
        print("\nCould not start the server because the RAG pipeline failed to initialize.")
    else:
        # This app.run is for local development and will not be used by Gunicorn on Hugging Face
        app.run(host='0.0.0.0', port=5000)
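
# Example local test (a sketch; assumes the /api/query path chosen above and
# that this file is named app.py):
#   python app.py
#   curl -X POST http://localhost:5000/api/query \
#        -H "Content-Type: application/json" \
#        -d '{"query": "What is the Augusta rule?"}'
# On Hugging Face, Gunicorn serves the same app object, e.g.: gunicorn app:app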