import httpx      # An asynchronous HTTP client.
import os         # To handle file paths and create directories.
import asyncio    # To run synchronous libraries in an async environment.
from urllib.parse import unquote, urlparse # To get the filename from the URL.
import uuid       # To generate unique filenames if needed.

from pydantic import HttpUrl
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

import argparse
from typing import Optional

# Ensure required libraries are installed.
# You can install them using:
# pip install httpx langchain-pymupdf4llm llama-cloud-services pydantic python-dotenv

from llama_cloud_services import LlamaExtract
from pydantic import BaseModel, Field
from dotenv import load_dotenv

# Load environment variables (e.g. LLAMA_CLOUD_API_KEY) from a .env file.
load_dotenv()

class Insurance(BaseModel):
    """
    A Pydantic model defining the data schema for extraction.
    The field description helps guide the AI model.
    """
    headings: list[str] = Field(description="An array of the document's section headings")
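
# Illustrative only: with this schema a successful extraction is expected to
# deserialize into something like:
#   {"headings": ["Introduction", "Definitions", "Scope", "Liabilities"]}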



def extract_schema_from_file(file_path: str) -> Optional[Insurance]:
    """
    Initializes the LlamaExtract client, retrieves an agent, and extracts
    data from the provided file path based on the Insurance schema.

    Args:
        file_path (str): The path to the local markdown file.

    Returns:
        The structured data extracted according to the Insurance schema,
        or None if the extraction fails or the file doesn't exist.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: The file '{file_path}' was not found.")
        return None

    print(f"πŸš€ Initializing extractor and sending '{file_path}' to LlamaCloud...")

    try:
        # Initialize the LlamaExtract client.
        # It automatically uses the LLAMA_CLOUD_API_KEY loaded from the environment.
        extractor = LlamaExtract()

        # System prompt that steers the agent towards top-level section headings.
        # Note: this configuration only takes effect when the agent is first
        # created; it is not applied by the get_agent() call below.
        agent_config = {
            "system_prompt": "Identify and extract the primary section/segment headings within this legal or policy document. Focus on headings that establish the overarching theme or context for the entire block of text they introduce. Examples include 'Introduction', 'Definitions', 'Scope', 'Liabilities', or 'Terms and Conditions'. Do not extract subheadings or any headings that merely denote a list item."
        }

        # On first run, create the agent with the Insurance schema:
        #   agent = extractor.create_agent(name="insurance-parser", data_schema=Insurance)
        # On later runs, retrieve the existing agent by name.
        agent = extractor.get_agent(name="insurance-parser")

        # Call the agent to extract data from the specified document.
        print("πŸ€– Agent ready. Starting extraction...")
        result = agent.extract(file_path)

        if result and result.data:
            print("βœ… Extraction successful!")
            # The function returns the structured data.
            return result.data
        else:
            print("⚠️ Extraction did not return any data.")
            return None

    except Exception as e:
        print(f"\n❌ An error occurred during the API call: {e}")
        print("Please check your API key, network connection, and file format.")
        return None



async def download_and_parse_document(doc_url: HttpUrl) -> str:
    """
    Asynchronously downloads a document, saves it to a local directory,
    parses it with LangChain's PyMuPDF4LLMLoader, and writes the result
    to a Markdown file.

    Args:
        doc_url: The Pydantic-validated URL of the document to process.

    Returns:
        The path to the Markdown file containing the parsed content.
    """
    print(f"Initiating download from: {doc_url}")
    try:
        # Create the local storage directory if it doesn't exist.
        LOCAL_STORAGE_DIR = "data/"
        os.makedirs(LOCAL_STORAGE_DIR, exist_ok=True)

        async with httpx.AsyncClient() as client:
            response = await client.get(str(doc_url), timeout=30.0, follow_redirects=True)
            response.raise_for_status()
        
        doc_bytes = response.content
        print("Download successful.")

        # --- Logic to determine the local filename ---
        # Parse the URL to extract the path.
        parsed_path = urlparse(str(doc_url)).path
        # Get the last part of the path and decode URL-encoded characters (like %20 for space).
        filename = unquote(os.path.basename(parsed_path))
        
        # If the filename is empty, create a unique one.
        if not filename:
            filename = f"{uuid.uuid4()}.pdf"
            
        # Construct the full path where the file will be saved.
        local_file_path = os.path.join(LOCAL_STORAGE_DIR, filename)

        # Save the downloaded document to the local file.
        with open(local_file_path, "wb") as f:
            f.write(doc_bytes)
        
        print(f"Document saved locally at: {local_file_path}")
        print("Parsing document with LangChain's PyMuPDF4LLMLoader...")

        # The loader's 'load' method is synchronous. Run it in a separate thread.
        def load_document():
            loader = PyMuPDF4LLMLoader(local_file_path)
            documents = loader.load()
            return documents

        documents = await asyncio.to_thread(load_document)
        
        if documents:
            parsed_markdown = "\n\n".join(doc.page_content for doc in documents)
            print(f"Parsing complete. Extracted {len(parsed_markdown)} characters as Markdown.")

            # Write the Markdown next to the downloaded file, reusing its base
            # name. The downloaded original is intentionally kept on disk.
            md_path = os.path.splitext(local_file_path)[0] + ".md"
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(parsed_markdown)
            print(f"Markdown successfully saved to {md_path}")
            return md_path
        else:
            raise ValueError("PyMuPDF4LLMLoader did not return any content.")

    except httpx.HTTPStatusError as e:
        print(f"Error downloading document: {e}")
        raise
    except Exception as e:
        print(f"Error during processing: {e}")
        raise
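

if __name__ == "__main__":
    # A minimal end-to-end sketch, assuming a hypothetical --url argument
    # pointing at a downloadable document (e.g. a PDF). This entry point is
    # illustrative wiring of the two functions above, not a hardened CLI.
    parser = argparse.ArgumentParser(
        description="Download a document, convert it to Markdown, and extract its top-level headings."
    )
    parser.add_argument("--url", required=True, help="URL of the document to process.")
    args = parser.parse_args()

    # Step 1: download and parse the document into a local Markdown file.
    markdown_path = asyncio.run(download_and_parse_document(args.url))

    # Step 2: send the Markdown to LlamaCloud and print the extracted headings.
    extracted = extract_schema_from_file(markdown_path)
    if extracted:
        print(extracted)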