"""Fetch the arXiv cs.AI RSS feed, rank papers by similarity to a query,
and generate plain-language summaries of the top matches with a local LLM."""

import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = torch.device("cpu")

model_id = "TheBloke/Mistral-7B-Instruct-v0.1"  # replace if needed
tokenizer = AutoTokenizer.from_pretrained(model_id)
llm = AutoModelForCausalLM.from_pretrained(model_id).to(device)
embedder = SentenceTransformer('all-MiniLM-L6-v2')


def search_and_summarize(query, max_papers=5):
    """Return LLM summaries of the arXiv cs.AI papers most similar to *query*.

    Args:
        query: Free-text research topic to match against paper abstracts.
        max_papers: Maximum number of top-ranked papers to summarize.

    Returns:
        A list of dicts with keys ``title``, ``summary``, ``link``,
        ordered by descending cosine similarity to the query.
        Empty list if the feed yields no usable items.

    Raises:
        requests.HTTPError: If the arXiv feed request fails.
    """
    url = "https://arxiv.org/rss/cs.AI"
    res = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of silently parsing an error page as RSS.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'xml')

    papers = []
    for item in soup.find_all('item'):
        # Skip malformed items rather than crash with AttributeError on a
        # missing <title>/<description>/<link> tag.
        if item.title is None or item.description is None or item.link is None:
            continue
        papers.append({
            'title': item.title.text,
            'abstract': item.description.text,
            'link': item.link.text,
        })

    # Empty feed: encoding an empty batch would fail downstream.
    if not papers:
        return []

    # Embed the query and all abstracts, then rank by cosine similarity.
    query_emb = embedder.encode(query)
    paper_embs = embedder.encode([p['abstract'] for p in papers])
    sims = util.cos_sim(query_emb, paper_embs)[0]
    top_idx = sims.argsort(descending=True)[:max_papers]

    results = []
    for idx in top_idx:
        # idx is a 0-d torch tensor; convert explicitly for list indexing.
        paper = papers[int(idx)]
        context = f"Title: {paper['title']}\nAbstract: {paper['abstract']}"
        prompt = f"{context}\n\nExplain this paper in simple terms for an AI researcher:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        # Inference only — no_grad avoids building an unused autograd graph.
        with torch.no_grad():
            outputs = llm.generate(**inputs, max_new_tokens=200)
        explanation = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Strip the echoed prompt. removeprefix is a no-op if decode did not
        # reproduce the prompt exactly (tokenization is not round-trip safe),
        # unlike blind slicing which could cut into the generated text.
        explanation = explanation.removeprefix(prompt).strip()
        results.append({
            'title': paper['title'],
            'summary': explanation,
            'link': paper['link'],
        })
    return results