Spaces:

Akmr008
/

ai-blog-assistant

Sleeping

File size: 5,482 Bytes

import os
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS

# --- Import updated RAG-specific libraries ---
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA

# --- Basic Flask App Setup ---
# static_folder='.' together with an empty static_url_path makes Flask serve
# static files from the application's own directory at the site root.
# NOTE(review): this presumably exposes every file in that directory
# (including this source file and the PDFs) over HTTP — confirm that is intended.
app = Flask(__name__, static_folder='.', static_url_path='')
# Enable CORS on the app using flask_cors' default settings.
CORS(app)

# Global variable to hold the RAG chain.
# Stays None until initialize_rag_pipeline() completes successfully;
# handle_query() checks it before trying to answer a request.
rag_chain = None

# --- Route for the homepage ---
@app.route('/')
def serve_index():
    """Return index.html from the current directory as the site's landing page."""
    homepage_file = 'index.html'
    return send_from_directory('.', homepage_file)

# --- API Route for queries ---
@app.route("/query", methods=["POST"])
def handle_query():
    """Answer a question posted by the frontend using the RAG chain.

    Expects a JSON object body containing a non-empty string under "query".

    Returns:
        On success, a JSON response ``{"answer": str, "sources": [str, ...]}``
        where ``sources`` lists each retrieved document's source once, in
        retrieval order.
        ``({"error": ...}, 400)`` when the body is missing, is not valid JSON,
        is not a JSON object, or carries no usable "query" string.
        ``({"error": ...}, 500)`` when the RAG chain has not been initialized
        or the chain raises while answering.
    """
    if not rag_chain:
        return jsonify({"error": "Failed to process the query. Details: RAG pipeline not initialized. Check server logs."}), 500

    # silent=True returns None instead of raising on a missing, mistyped or
    # malformed JSON body, so bad requests get the JSON 400 below rather than
    # an HTML error page or an AttributeError on `None.get`.
    data = request.get_json(silent=True)
    query_text = data.get("query") if isinstance(data, dict) else None
    if not isinstance(query_text, str) or not query_text.strip():
        return jsonify({"error": "No query provided."}), 400

    try:
        print(f"Received query: {query_text}")
        result = rag_chain.invoke(query_text)

        answer = result.get('result', 'No answer found.')
        sources = [doc.metadata.get('source', 'Unknown source') for doc in result.get('source_documents', [])]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_sources = list(dict.fromkeys(sources))

        print(f"Generated response: {answer}")
        return jsonify({
            "answer": answer,
            "sources": unique_sources
        })
    except Exception as e:
        # Top-level request boundary: log and report instead of crashing the worker.
        print(f"Error during query processing: {e}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500

# --- RAG Pipeline Initialization ---
def initialize_rag_pipeline():
    """
    Build the retrieval-QA chain and publish it in the module-level ``rag_chain``.

    Steps, each logged to stdout: verify GOOGLE_API_KEY is set, load the two
    bundled PDFs, scrape a fixed list of blog URLs, split everything into
    chunks, embed the chunks into a Chroma vector store, create the Gemini
    chat model, and finally assemble the RetrievalQA chain.

    Any failing step prints an error and returns early, leaving ``rag_chain``
    untouched. Missing PDFs and unreachable URLs only produce warnings.
    """
    global rag_chain
    print("--- Starting RAG pipeline initialization ---")

    # The key itself is not used here; only its presence is checked.
    if not os.environ.get("GOOGLE_API_KEY"):
        print("ERROR: GOOGLE_API_KEY environment variable not set. Halting.")
        return
    print("Step 1: Google API Key found.")

    pdf_pages = []
    try:
        for pdf_path in ["Augusta rule 101 CPE Webinar.pdf", "Augusta rule workshop.pdf"]:
            if not os.path.exists(pdf_path):
                print(f"Warning: PDF file not found at {pdf_path}")
                continue
            pdf_pages.extend(PyPDFLoader(pdf_path).load())
    except Exception as exc:
        # A failure while reading any PDF aborts the whole initialization.
        print(f"ERROR loading PDFs: {exc}. Halting.")
        return
    print(f"Step 2: Loaded {len(pdf_pages)} pages from PDF files.")

    def fetch_page(url):
        """Download *url* and return its visible text as a Document, or None on a request error."""
        try:
            reply = requests.get(url, timeout=15)
            reply.raise_for_status()
            parsed = BeautifulSoup(reply.text, "html.parser")
            page_text = parsed.get_text(separator=" ", strip=True)
            return Document(page_content=page_text, metadata={"source": url})
        except requests.RequestException as exc:
            print(f"Warning: Could not scrape URL {url}. Error: {exc}")
            return None

    blog_urls = [
        "https://www.instead.com/blog/the-augusta-rule-a-tax-strategy-for-business-owners",
        "https://www.instead.com/blog/s-corp-reasonable-salary-guide",
        "https://www.instead.com/blog/how-to-start-an-s-corp"
    ]
    web_pages = []
    for blog_url in blog_urls:
        scraped = fetch_page(blog_url)
        if scraped is not None:
            web_pages.append(scraped)
    print(f"Step 3: Scraped {len(web_pages)} web pages.")

    documents = [*pdf_pages, *web_pages]
    if not documents:
        print("ERROR: No documents were loaded. Halting.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    pieces = text_splitter.split_documents(documents)
    print(f"Step 4: Split documents into {len(pieces)} chunks.")

    try:
        embedder = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
        print("Step 5: Gemini embedding model loaded successfully.")

        # No persist directory is given: the store is built in memory on every start.
        store = Chroma.from_documents(pieces, embedder)
        print("Step 6: In-memory vector store created successfully.")

        chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.3)
        print("Step 7: Gemini LLM loaded successfully.")
    except Exception as exc:
        print(f"ERROR during AI model initialization: {exc}. Halting.")
        return

    # return_source_documents=True lets the query route report where answers came from.
    rag_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        retriever=store.as_retriever(),
        return_source_documents=True
    )

    print("--- RAG pipeline initialized successfully! ---")

# --- Build the RAG pipeline at import time ---
# Called at module scope so it also runs when a WSGI server (Gunicorn on
# Hugging Face) imports this module instead of executing it as a script.
initialize_rag_pipeline()

# --- Local development entry point ---
if __name__ == "__main__":
    if rag_chain:
        # Flask's built-in server, for local testing only; Gunicorn never reaches this branch.
        app.run(host='0.0.0.0', port=5000)
    else:
        print("\nCould not start the server because the RAG pipeline failed to initialize.")