import re
import duckdb
import pandas as pd
import gradio as gr
from io import StringIO
from langchain_community.vectorstores.duckdb import DuckDB
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_transformers import Html2TextTransformer
from langsmith import traceable
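
# Fixed height (in text lines) for each output tab's textbox.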
TAB_LINES = 22
# Embedding Model args
model_name = "BAAI/bge-small-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
# HuggingFace Embeddings
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
# DuckDB Connection
con = duckdb.connect('Collections.duckdb')
# DuckDB Vector Store
vector_store = DuckDB(connection=con, embedding=hf)
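
# Attach metadata only for HTML responses; non-HTML resources get an empty dict.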
def html_only_metadata_extractor(raw_html, url, response):
    content_type = response.headers.get("Content-Type", "")
    if "text/html" in content_type:
        return {"source": url, "content_type": content_type}
    return {}
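
# Recursively crawl the given URL up to `max_depth`, staying within the start domain.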
def scrape_text(url, max_depth):
    try:
        loader = RecursiveUrlLoader(
            url=url,
            max_depth=max_depth,
            check_response_status=True,
            metadata_extractor=html_only_metadata_extractor,
            prevent_outside=True,
            use_async=True
        )
        documents = loader.load()
    except Exception as e:
        print(f"Error loading URL: {e}")
        return None
    return documents
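
# Convert the scraped HTML to plain text and collapse whitespace runs to single spaces.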
@traceable()
def clean_text(docs):
    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents(docs)
    for doc in docs_transformed:
        doc.page_content = re.sub(r'\s+', ' ', doc.page_content)
    return docs_transformed
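
# Strip <table> elements from the raw HTML; tables are extracted separately with pandas.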
def remove_tables(docs):
    table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
    for doc in docs:
        doc.page_content = table_pattern.sub('', doc.page_content)
    return docs
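
# --- Display helpers: pretty-print chunks, metadata, and page content for the UI tabs ---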
def format_chunks_with_spaces(chunks):
    separator = "\n\n---\n\n"
    formatted_chunks = ""
    for i, chunk in enumerate(chunks):
        formatted_chunks += f"Chunk {i+1}: \n\n"
        formatted_chunks += chunk.page_content
        formatted_chunks += separator
    return formatted_chunks

def format_metadata(docs):
    formatted_metadata = ""
    for i, doc in enumerate(docs):
        formatted_metadata += f"Metadata {i+1}: \n\n"
        formatted_metadata += str(doc.metadata)
        formatted_metadata += "\n\n---\n\n"
    return formatted_metadata

def format_page_content(docs):
    formatted_docs = ""
    for i, doc in enumerate(docs):
        formatted_docs += f"Page Content {i+1}: \n\n"
        formatted_docs += str(doc.page_content)
        formatted_docs += "\n\n---\n\n"
    return formatted_docs
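
# Parse any HTML tables into pandas DataFrames, skipping documents without tables.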
@traceable()
def get_tables(raw_docs):
    tables_list = []
    for raw_doc in raw_docs:
        try:
            tables = pd.read_html(StringIO(str(raw_doc.page_content)))
            tables_list.extend(tables)
        except Exception as e:
            print(f"Error reading table: {e}")
            continue
    return tables_list
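
# Merge all extracted tables into a single DataFrame for display.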
def concat_dfs(df_list):
    concatenated_df = pd.concat(df_list, ignore_index=True)
    return concatenated_df
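
# Embed the chunks into the DuckDB vector store and return the newly inserted rows.
# The SELECT below assumes the LangChain DuckDB vector store's default table name, "embeddings".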
def create_embeddings(docs):
    ids = vector_store.add_documents(docs)
    result = con.execute("SELECT * FROM embeddings").fetchdf()
    return result[result['id'].isin(ids)]
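
# End-to-end pipeline: scrape -> extract tables -> strip tables -> clean -> chunk -> embed.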
def get_docs(url, max_depth):
    raw_html = scrape_text(url, max_depth)
    if raw_html is None:
        return None, None, None, None, None, None
    tables_list = get_tables(raw_html)
    if tables_list:
        concat_tables = concat_dfs(tables_list)
    else:
        concat_tables = None
    tables_rmv_html = remove_tables(raw_html)
    clean_docs = clean_text(tables_rmv_html)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    documents_splits = text_splitter.split_documents(clean_docs)
    formatted_chunks = format_chunks_with_spaces(documents_splits)
    embeddings = create_embeddings(documents_splits)
    return format_page_content(raw_html), format_page_content(clean_docs), concat_tables, format_metadata(raw_html), formatted_chunks, embeddings
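
# --- Gradio UI ---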
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
    gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
    gr.Markdown("""
    <div style='text-align: center;'>
        <strong style='font-size: 36px;'>Domain Document Indexing</strong>
    </div>
    """)
    with gr.Row():
        with gr.Column(scale=1):
            url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
            with gr.Row():
                max_depth = gr.Slider(1, 50, value=1, step=1, label="Max Depth", interactive=True)
            scrape_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")
        with gr.Column(elem_id="col_container", scale=2):
            with gr.Tabs():
                with gr.Tab("RAW HTML"):
                    raw_page_content = gr.Textbox(lines=TAB_LINES, label="Page Content HTML", value="",
                                                  interactive=False, autoscroll=False)
                with gr.Tab("Clean Content"):
                    page_content = gr.Textbox(lines=TAB_LINES, label="Clean Page Content", value="",
                                              interactive=False, autoscroll=False)
                with gr.Tab("Tables"):
                    tables = gr.Textbox(lines=TAB_LINES, label="Tables", value="",
                                        interactive=False, autoscroll=False)
                with gr.Tab("Chunks"):
                    parsed_chunks = gr.Textbox(lines=TAB_LINES, label="Parsed Chunks", value="",
                                               interactive=False, autoscroll=False)
                with gr.Tab("Metadata"):
                    metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="",
                                          interactive=False, autoscroll=False)
                with gr.Tab("Embeddings"):
                    embeddings = gr.Dataframe(label="Vector Store", interactive=False)

    scrape_url_button.click(get_docs, inputs=[url_input, max_depth],
                            outputs=[raw_page_content, page_content, tables,
                                     metadata, parsed_chunks, embeddings])

if __name__ == "__main__":
    demo.launch()