import re
import duckdb
import pandas as pd
import gradio as gr
from io import StringIO
from langchain_community.vectorstores.duckdb import DuckDB
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_transformers import Html2TextTransformer
from langsmith import traceable



TAB_LINES = 22  # height (in lines) of each output textbox

# Embedding Model args
model_name = "BAAI/bge-small-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}

# HuggingFace Embeddings
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# DuckDB Connection
con = duckdb.connect('Collections.duckdb')

# DuckDB Vector Store
vector_store = DuckDB(connection=con, embedding=hf)
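
# Not wired into this UI, but once chunks are indexed the store can be
# queried directly, e.g.:
#   hits = vector_store.similarity_search("your query", k=4)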

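# Keep source URL and content type only for HTML responses; anything else
# (PDFs, images, etc.) gets empty metadata.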
def html_only_metadata_extractor(raw_html, url, response):
    content_type = response.headers.get("Content-Type", "")
    if "text/html" in content_type:
        return {"source": url, "content_type": content_type}
    return {}

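# Recursively crawl `url` up to `max_depth`, staying on the starting domain
# (prevent_outside=True) and fetching pages asynchronously.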
def scrape_text(url, max_depth):
    try:
        loader = RecursiveUrlLoader(
                url=url,
                max_depth=max_depth,
                check_response_status=True,
                metadata_extractor=html_only_metadata_extractor,
                prevent_outside=True,
                use_async=True
            )
        documents = loader.load()
    except Exception as e:
        print(f"Error loading URL: {e}")
        return None
    return documents

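# Convert the scraped HTML to plain text and collapse all whitespace runs
# into single spaces.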
@traceable()
def clean_text(docs):
    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents(docs)
    for doc in docs_transformed:
        doc.page_content = re.sub(r'\s+', ' ', doc.page_content)
    return docs_transformed


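# Strip <table> markup from the raw HTML; tables are captured separately as
# DataFrames by get_tables, so this avoids duplicating them in the text.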
def remove_tables(docs):
    table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
    for doc in docs:
        doc.page_content = table_pattern.sub('', doc.page_content)
    return docs


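# Display helpers: render chunks, metadata, and page content as
# "---"-separated blocks for the output textboxes.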
def format_chunks_with_spaces(chunks):
    separator = "\n\n---\n\n"  
    formatted_chunks = ""
    for i, chunk in enumerate(chunks):
        formatted_chunks += f"Chunk {i+1}: \n\n"
        formatted_chunks += chunk.page_content
        formatted_chunks += separator 
    return formatted_chunks

def format_metadata(docs):
    formatted_metadata = ""
    for i, doc in enumerate(docs):
        formatted_metadata += f"Metadata {i+1}: \n\n"
        formatted_metadata += str(doc.metadata)
        formatted_metadata += "\n\n---\n\n"
    return formatted_metadata

def format_page_content(docs):
    formatted_docs = ""
    for i, doc in enumerate(docs):
        formatted_docs += f"Page Content {i+1}: \n\n"
        formatted_docs += str(doc.page_content)
        formatted_docs += "\n\n---\n\n"
    return formatted_docs

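# Parse any <table> elements in the raw HTML into pandas DataFrames;
# pages without tables raise in read_html and are skipped.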
@traceable()
def get_tables(raw_docs):
    tables_list = []
    for raw_doc in raw_docs:
        try:
            tables = pd.read_html(StringIO(str(raw_doc.page_content)))
            tables_list.extend(tables)
        except Exception as e:
            print(f"Error reading table: {e}")
            continue
        
    return tables_list


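# Stack the per-page DataFrames into a single table for display.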
def concat_dfs(df_list):
    concatenated_df = pd.concat(df_list, ignore_index=True)
    return concatenated_df

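# Embed the chunks into the DuckDB vector store, then read back just the
# newly inserted rows ("embeddings" is the store's default table name).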
def create_embeddings(docs):
    ids = vector_store.add_documents(docs)
    result = con.execute("SELECT * FROM embeddings").fetchdf()
    return result[result['id'].isin(ids)]

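# End-to-end pipeline: crawl the URL, pull out tables, strip table markup,
# clean the remaining HTML, chunk it, and index the chunks in DuckDB.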
def get_docs(url, max_depth):
    raw_html = scrape_text(url, max_depth)
    if raw_html is None:
        # Six outputs: raw content, clean content, tables, metadata, chunks, embeddings.
        return None, None, None, None, None, None

    # Render raw pages and metadata first: remove_tables mutates the
    # documents in place, so the raw view must be captured before that.
    raw_content = format_page_content(raw_html)
    raw_metadata = format_metadata(raw_html)

    tables_list = get_tables(raw_html)
    if tables_list:
        concat_tables = concat_dfs(tables_list)
    else:
        concat_tables = None

    tables_rmv_html = remove_tables(raw_html)
    clean_docs = clean_text(tables_rmv_html)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    documents_splits = text_splitter.split_documents(clean_docs)
    formatted_chunks = format_chunks_with_spaces(documents_splits)
    embeddings = create_embeddings(documents_splits)

    return raw_content, format_page_content(clean_docs), concat_tables, raw_metadata, formatted_chunks, embeddings


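# Gradio UI: URL/depth inputs on the left; tabbed views of each pipeline
# stage (raw HTML, clean text, tables, chunks, metadata, embeddings) on the right.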
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
    
    gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
    
    gr.Markdown("""
    <div style='text-align: center;'>
    <strong style='font-size: 36px;'>Domain Document Indexing</strong>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
            with gr.Row():
                max_depth = gr.Slider(1, 50, value=1, step=1, label="Max Depth", interactive=True)
                scrape_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")

        with gr.Column(elem_id="col_container", scale=2):
            with gr.Tabs():
                with gr.Tab("RAW HTML"):
                    raw_page_content = gr.Textbox(lines=TAB_LINES, label="Page Content HTML", value="", interactive=False,
                                                  autoscroll=False)
                with gr.Tab("Clean Content"):
                    page_content = gr.Textbox(lines=TAB_LINES, label="Clean Page Content", value="", interactive=False,
                                              autoscroll=False)
                with gr.Tab("Tables"):
                    tables = gr.Textbox(lines=TAB_LINES, label="Tables", value="", interactive=False,
                                        autoscroll=False)
                with gr.Tab("Chunks"):
                    parsed_chunks = gr.Textbox(lines=TAB_LINES, label="Parsed Chunks", value="", interactive=False,
                                               autoscroll=False)
                with gr.Tab("Metadata"):
                    metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False,
                                          autoscroll=False)
                with gr.Tab("Embeddings"):
                    embeddings = gr.Dataframe(label="Vector Store", interactive=False)
        
        scrape_url_button.click(get_docs, inputs=[url_input, max_depth],
                                outputs=[raw_page_content, page_content, tables,
                                         metadata, parsed_chunks, embeddings])

        
if __name__ == "__main__":
    demo.launch()