File size: 7,599 Bytes
437d8b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import os
import json
import tempfile
import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Union
from dotenv import load_dotenv

from processing_utility import download_and_parse_document, extract_schema_from_file

# Import functions and constants from the colbert_utils.py file
# Make sure colbert_utils.py is in the same directory or accessible via PYTHONPATH
from rag_utils import (
    process_markdown_with_manual_sections,
    perform_vector_search,
    generate_answer_with_groq,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    TOP_K_CHUNKS,
    GROQ_MODEL_NAME
)

load_dotenv()


# --- FastAPI App Initialization ---
# --- FastAPI App Initialization ---
app = FastAPI(
    title="HackRX RAG API",
    description="API for Retrieval-Augmented Generation from PDF documents.",
    version="1.0.0",
)

# --- Groq API Key Setup ---
# Read the Groq key from the environment. "NOT_FOUND" is the sentinel for a
# missing key; downstream code receives it as-is, so only a warning is printed
# here rather than failing fast.
_MISSING_KEY = "NOT_FOUND"
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", _MISSING_KEY)
if GROQ_API_KEY == _MISSING_KEY:
    print("WARNING: GROQ_API_KEY is using a placeholder or hardcoded value. Please set GROQ_API_KEY environment variable for production.")

# --- Pydantic Models for Request and Response ---
class RunRequest(BaseModel):
    """Request body for the /hackrx/run endpoint."""
    documents: str  # URL to the PDF document
    questions: List[str]  # questions to answer against the document

class Answer(BaseModel):
    """A single generated answer for one input question."""
    answer: str

class RunResponse(BaseModel):
    """Response body for /hackrx/run: one answer per input question, in order."""
    answers: List[Answer]

# --- Pseudo-functions (Replace with actual implementations if needed) ---

def convert_to_markdown(pdf_url: str) -> str:
    """
    PSEUDO-FUNCTION: Downloads the PDF from the URL and returns its local path.

    In a real scenario, this might involve converting PDF to Markdown,
    but for process_pdf_with_manual_sections, we just need the local PDF path.

    Args:
        pdf_url: HTTP(S) URL of the PDF to download.

    Returns:
        Filesystem path of the downloaded temporary PDF. The caller is
        responsible for deleting it when done.

    Raises:
        HTTPException: 500 on any download or write failure.
    """
    print(f"Downloading PDF from: {pdf_url}")
    temp_path = None
    try:
        # timeout prevents a hung connection from blocking the request forever.
        response = requests.get(pdf_url, stream=True, timeout=60)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        # delete=False so the file survives past this function; the context
        # manager guarantees the handle is closed even if a write fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf_file:
            temp_path = temp_pdf_file.name
            for chunk in response.iter_content(chunk_size=8192):
                temp_pdf_file.write(chunk)
        print(f"PDF downloaded to temporary path: {temp_path}")
        return temp_path
    except requests.exceptions.RequestException as e:
        # Don't leak a partially-written temp file on failure.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
        raise HTTPException(status_code=500, detail=f"Failed to download PDF from URL: {e}")
    except Exception as e:
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during PDF download: {e}")


def fetch_headings_json(pdf_url: str) -> Dict:
    """
    PSEUDO-FUNCTION: Fetches section headings for the PDF.

    In a real scenario this would call a dedicated extraction service; here a
    hardcoded dummy payload is returned that matches the schema expected by
    process_pdf_with_manual_sections, i.e. {"data": {"headings": [...]}}.
    """
    print(f"Fetching headings for PDF URL (pseudo-function): {pdf_url}")

    # Section titles mirroring a typical insurance policy document.
    section_titles = [
        "Policy Wordings",
        "SECTION A) PREAMBLE",
        "SECTION B) DEFINITIONS - STANDARD DEFINITIONS",
        "SECTION B) DEFINITIONS - SPECIFIC DEFINITIONS",
        "SECTION C) BENEFITS COVERED UNDER THE POLICY",
        "PART A- COVERAGE- Domestic (Within India Only, for Imperial and Imperial Plus Plans)",
        "PART B- COVERAGE- International",
        "SECTION D) EXCLUSIONS- STANDARD EXCLUSIONS APPLICABLE TO PART A- DOMESTIC COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
        "SECTION D) EXCLUSIONS– SPECIFIC EXCLUSIONS APPLICABLE TO PART A- DOMESTIC COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
        "SECTION D) EXCLUSIONS- STANDARD EXCLUSIONS APPLICABLE TO PART B- INTERNATIONAL COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
        "SECTION D) EXCLUSIONS– SPECIFIC EXCLUSIONS APPLICABLE TO INTERNATIONAL COVER UNDER SECTION C) BENEFITS COVERED UNDER THE POLICY",
        "SECTION E) GENERAL TERMS AND CONDITIONS - STANDARD GENERAL TERMS AND CONDITIONS",
        "SECTION E) GENERAL TERMS AND CONDITIONS - SPECIFIC TERMS AND CONDITIONS",
        "SECTION E) GENERAL TERMS AND CLAUSES - STANDARD GENERAL TERMS AND CLAUSES",
    ]

    # Usage numbers are fixed dummy values matching the sample extraction run.
    usage_stats = {
        "num_pages_extracted": 49,
        "num_document_tokens": 48701,
        "num_output_tokens": 1229,
    }

    return {
        "run_id": "dummy-run-id",
        "extraction_agent_id": "dummy-agent-id",
        "data": {"headings": section_titles},
        "extraction_metadata": {
            "field_metadata": {},
            "usage": usage_stats,
        },
    }

# --- API Endpoint ---
@app.post("/hackrx/run", response_model=RunResponse)
async def run_rag_pipeline(request: RunRequest):
    """
    Runs the RAG pipeline for a given PDF document and a list of questions.

    Pipeline: download/parse the document to markdown, extract section
    headings, chunk the markdown by section, then for each question perform a
    vector search over the chunks and generate an answer with Groq.

    Raises:
        HTTPException: 400 if no headings can be extracted, 500 on any
            processing failure.
    """
    pdf_url = request.documents
    questions = request.questions

    # Track the downloaded file so the finally-block can always clean it up.
    # (Previously a separate never-assigned variable was checked, so the temp
    # file leaked on every request.)
    local_markdown_path = None
    try:
        # Step 1: Download and parse the document to a local markdown file.
        local_markdown_path = await download_and_parse_document(pdf_url)

        # Step 2: Extract the heading schema from the parsed document and dump
        # it to output.json for debugging/inspection.
        headings_json = extract_schema_from_file(local_markdown_path)
        with open("output.json", 'w', encoding='utf-8') as f:
            json.dump(headings_json, f, indent=4, ensure_ascii=False)
        if not headings_json or not headings_json.get("headings"):
            raise HTTPException(status_code=400, detail="Could not retrieve valid headings from the provided PDF URL.")

        # Step 3: Process the markdown into chunks with section metadata.
        print("Processing PDF into chunks with manual sections...")
        processed_documents = process_markdown_with_manual_sections(
            local_markdown_path,
            headings_json,
            CHUNK_SIZE,
            CHUNK_OVERLAP
        )
        if not processed_documents:
            raise HTTPException(status_code=500, detail="Failed to process PDF into document chunks.")

        all_answers = []
        # Step 4: Iterate through questions, perform search, and generate answers.
        for i, question in enumerate(questions):
            print(f"Processing question {i+1}/{len(questions)}: '{question}'")
            # Perform vector search
            retrieved_results = perform_vector_search(processed_documents, question, TOP_K_CHUNKS)

            if retrieved_results:
                # Generate answer using Groq
                answer_text = generate_answer_with_groq(question, retrieved_results, GROQ_API_KEY)
            else:
                answer_text = "No relevant information found in the document to answer this question."

            all_answers.append(Answer(answer=answer_text))

        return RunResponse(answers=all_answers)

    except HTTPException:
        # Re-raise untouched so FastAPI returns the intended status code.
        raise
    except Exception as e:
        print(f"An unhandled error occurred: {e}")
        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}")
    finally:
        # Clean up the temporary file produced by download_and_parse_document.
        # NOTE(review): assumes that helper returns a disposable temp path —
        # confirm it does not return a cached/shared file.
        if local_markdown_path and os.path.exists(local_markdown_path):
            os.unlink(local_markdown_path)
            print(f"Cleaned up temporary PDF file: {local_markdown_path}")