Spaces:
Sleeping
Sleeping
File size: 3,938 Bytes
4fdeec5 3eef18c 4fdeec5 3eef18c 4fdeec5 3eef18c 4fdeec5 3eef18c 4fdeec5 3eef18c be79cad 4fdeec5 be79cad 4fdeec5 be79cad 4fdeec5 3eef18c be79cad 3eef18c be79cad 3eef18c be79cad 3eef18c 4fdeec5 3eef18c 4fdeec5 be79cad 4fdeec5 be79cad 4fdeec5 be79cad 4fdeec5 be79cad c0de0f6 be79cad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# app.py
import logging
import os
import re

import faiss
import gradio as gr
import numpy as np
import requests
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Together
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from sentence_transformers import SentenceTransformer
# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Load Embedding Model
logger.info("π Loading sentence transformer...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Load LLM (Replace with your API Key)
llm = Together(
model="togethercomputer/llama-3-70b-chat",
temperature=0.7,
max_tokens=512,
together_api_key="your_together_api_key"
)
def fetch_webpage_text(url):
try:
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
content_div = soup.find("div", {"id": "mw-content-text"}) or soup.body
return content_div.get_text(separator="\n", strip=True)
except Exception as e:
logger.error(f"Error fetching content: {e}")
return ""
def clean_text(text):
text = re.sub(r'\[\s*\d+\s*\]', '', text)
text = re.sub(r'\[\s*[a-zA-Z]+\s*\]', '', text)
text = re.sub(r'^\[\s*\d+\s*\]$', '', text, flags=re.MULTILINE)
text = re.sub(r'\n{2,}', '\n', text)
text = re.sub(r'[ \t]+', ' ', text)
return text.strip()
def chunk_text(text, chunk_size=500, overlap=50):
splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=overlap
)
return splitter.split_text(text)
def create_vectorstore(chunks):
texts = [chunk for chunk in chunks]
embeddings = [embed_model.encode(text) for text in texts]
dim = embeddings[0].shape[0]
index = faiss.IndexFlatL2(dim)
index.add(np.array(embeddings).astype(np.float32))
return index, texts, embeddings
def get_summary(chunks):
full_doc = Document(page_content="\n\n".join(chunks))
summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
return summarize_chain.run([full_doc])
def retrieve_answer(query, chunks, embeddings, texts):
query_vector = embed_model.encode(query).astype(np.float32)
index = faiss.IndexFlatL2(embeddings[0].shape[0])
index.add(np.array(embeddings).astype(np.float32))
D, I = index.search(np.array([query_vector]), k=5)
top_chunks = [texts[i] for i in I[0]]
rag_doc = "\n\n".join(top_chunks)
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=None)
return qa_chain.run(input_documents=[Document(page_content=rag_doc)], question=query)
# Gradio Interface
def run_chatbot(url, query):
raw_text = fetch_webpage_text(url)
if not raw_text:
return "β Failed to fetch content.", ""
cleaned = clean_text(raw_text)
chunks = chunk_text(cleaned)
if not chunks:
return "β No valid content to process.", ""
summary = get_summary(chunks)
index, texts, embeddings = create_vectorstore(chunks)
answer = retrieve_answer(query, chunks, embeddings, texts)
return summary, answer
demo = gr.Interface(
fn=run_chatbot,
inputs=[
gr.Textbox(label="Webpage URL", placeholder="Enter a Wikipedia link"),
gr.Textbox(label="Your Question", placeholder="Ask a question about the webpage")
],
outputs=[
gr.Textbox(label="Webpage Summary"),
gr.Textbox(label="Answer")
],
title="π¦ LLaMA RAG Chatbot",
description="Enter a Wikipedia article URL and ask a question. Powered by Together AI and LangChain.",
allow_flagging="never"
)
if __name__ == "__main__":
demo.launch()
|