import json
import os
import re
import uuid
from typing import Dict, Generator, List, Optional, Tuple

import gradio as gr
import openai
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
from pypdf import PdfReader

# Configuration
OPENAI_API_BASE = "https://openrouter.ai/api/v1"
OPENAI_API_KEY = os.environ.get("OR_KEY", "")
REPO_NAME = "LPX55/ArxivPapers"
SAVE_DATA_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
HF_TOKEN = os.environ.get("HF_TOKEN", "")
api = HfApi(token=HF_TOKEN)

# Initialize the OpenAI client (legacy pre-1.0 `openai` SDK interface,
# pointed at OpenRouter's OpenAI-compatible endpoint)
openai.api_base = OPENAI_API_BASE
openai.api_key = OPENAI_API_KEY
VERBOSE = True  # Set to False to disable debug logging
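

def log(msg: str) -> None:
    """Print a debug message when VERBOSE is enabled; errors that must
    always be visible use plain print()."""
    if VERBOSE:
        print(msg)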


# Indexing Constants
INDEX_PROMPT = """Compile this data into a structured JSON format with these keys:
- "keywords": List of important keywords
- "title": Descriptive title
- "description": Brief summary
- "content": Main content
- "url": Source URL if available
"""

def extract_paper_metadata(content: str) -> Dict:
    """Extract structured metadata from a paper's content."""
    metadata = {
        "keywords": [],
        "title": "Untitled",
        "description": "No description",
        "content": content[:1000],
        "url": ""
    }
    
    # Extract URL
    url_match = re.search(r'https?://[^\s]+', content)
    if url_match:
        metadata['url'] = url_match.group(0)
    
    # Extract title: first reasonably long line that starts uppercase and
    # ends like a sentence (lines are stripped so trailing whitespace does
    # not defeat the end-of-sentence check)
    for line in (ln.strip() for ln in content.split('\n')):
        if len(line) > 20 and line[0].isupper() and line[-1] in ('.', '?', '!'):
            metadata['title'] = line
            break
    
    # Extract description (first paragraph)
    paragraphs = [p for p in content.split('\n\n') if len(p) > 50]
    if paragraphs:
        metadata['description'] = paragraphs[0]
    
    # Extract keywords (from title and description)
    text_for_keywords = f"{metadata['title']} {metadata['description']}"
    words = [w.lower() for w in re.findall(r'\w+', text_for_keywords) if len(w) > 3]
    metadata['keywords'] = sorted(set(words))[:10]  # First 10 unique keywords, alphabetically
    
    return metadata
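
# A minimal illustration of the heuristics above (hypothetical input):
#   meta = extract_paper_metadata(
#       "Attention Is All You Need.\n\n"
#       "We propose a new architecture: https://arxiv.org/abs/1706.03762"
#   )
#   meta["title"] == "Attention Is All You Need."
#   meta["url"]   == "https://arxiv.org/abs/1706.03762"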


def save_paper_to_memory(content: str) -> Dict:
    """Save a paper to memory with proper metadata extraction."""
    metadata = extract_paper_metadata(content)
    
    # Additional processing for academic papers
    if 'arxiv' in metadata['url'].lower():
        metadata['keywords'].extend(['arxiv', 'paper', 'research'])
        metadata['description'] = f"Academic paper: {metadata['description']}"
    
    return metadata
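
# Note: save_paper_to_memory() layers arXiv-specific tagging on top of
# extract_paper_metadata(); summarize() below goes through save_memory(),
# which skips that extra step.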

def create_index() -> None:
    """Create or update the search index from memory files."""
    uid = uuid.uuid4()
    
    # Load existing index
    index_url = f"{SAVE_DATA_URL}mem-test2/index.json"
    r = requests.get(index_url, timeout=30)
    index_data = json.loads(r.text) if r.status_code == 200 else [{}]
    
    # Load main memory data
    main_url = f"{SAVE_DATA_URL}mem-test2/main.json"
    m = requests.get(main_url, timeout=30)
    main_data = json.loads(m.text) if m.status_code == 200 else []
    
    # Update index: map each keyword to the list of files that mention it
    for entry in main_data:
        try:
            for keyword in entry.get('keywords', []):
                if keyword in index_data[0]:
                    if entry['file_name'] not in index_data[0][keyword]:
                        index_data[0][keyword].append(entry['file_name'])
                else:
                    index_data[0][keyword] = [entry['file_name']]
        except Exception as e:
            print(f"Indexing error: {e}")
    log(f"Indexed {len(main_data)} memory entries")
    
    # Save the updated index and clean up the temporary file
    index_path = f"tmp-index-{uid}.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f)
    
    api.upload_file(
        path_or_fileobj=index_path,
        path_in_repo="mem-test2/index.json",
        repo_id=REPO_NAME,
        repo_type="dataset",
    )
    os.remove(index_path)

 
def fetch_url_content(url: str) -> Tuple[bool, str]:
    """Fetch content from a URL and return status and content."""
    try:
        if not url:
            return False, "Enter valid URL"
            
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml")
            return True, str(soup)
        return False, f"Status: {response.status_code}"
    except Exception as e:
        return False, f"Error: {e}"

def read_file_content(file_path: str) -> str:
    """Read content from a file (txt or pdf)."""
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        # extract_text() can return None for image-only pages
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    elif file_path.endswith(".txt"):
        with open(file_path, "r") as f:
            return f.read()
    return ""

def generate_response(prompt: str, model: str = "meta-llama/llama-4-maverick:free") -> str:
    """Generate response using OpenRouter API."""
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"


def process_pdf_url(pdf_url: str) -> str:
    """Process PDF from URL and extract text."""
    try:
        response = requests.get(pdf_url, stream=True, timeout=60)
        if response.status_code == 200:
            temp_path = f"temp_{uuid.uuid4()}.pdf"
            with open(temp_path, "wb") as f:
                f.write(response.content)
            try:
                return read_file_content(temp_path)
            finally:
                os.remove(temp_path)  # Clean up the temporary download
        return f"Error: Status {response.status_code}"
    except Exception as e:
        return f"Error: {e}"


def save_memory(purpose: str, content: str) -> List[Dict]:
    """Save processed content to memory with proper metadata extraction."""
    metadata = extract_paper_metadata(content)
    return [metadata]
   
def summarize(
    inp: str,
    history: List[Tuple[str, str]],
    report_check: bool,
    sum_check: bool,
    mem_check: bool,
    data: str = "",
    file: Optional[str] = None,
    url: str = "",
    pdf_url: str = "",
    model: str = "meta-llama/llama-4-maverick:free"
) -> Generator[Tuple[str, List[Tuple[str, str]], str, Dict], None, None]:
    """Main summarization function with memory support."""
    history = [(inp, "Processing...")]
    yield "", history, "", {}

    processed_data = ""
    if pdf_url.startswith("http"):
        processed_data += f"PDF URL: {pdf_url}\n"
    if url.startswith("http"):
        processed_data += f"URL: {url}\n"
    if file:
        processed_data += f"File: {file}\n"
    if data:
        processed_data += f"Data: {data[:1000]}\n"

    summary = f"Summary for: {inp[:100]}\n{processed_data[:500]}"
    memory_entries = []

    if mem_check:
        memory_entries = save_memory(inp, processed_data)
        if memory_entries:
            summary += "\n\nSaved to memory"
        else:
            summary += "\n\nMemory save failed"

    yield summary, history, "", memory_entries[0] if memory_entries else {}
    
def create_app():
    with gr.Blocks() as app:
        gr.Markdown("## Llama 4 Maverick Summarizer")
        
        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(label="Instruction")
            with gr.Column(scale=1):
                report_check = gr.Checkbox(label="Return report", value=True)
                sum_check = gr.Checkbox(label="Summarize", value=True)
                mem_check = gr.Checkbox(label="Memory", value=True)
                submit_btn = gr.Button("Submit")
        
        with gr.Row():
            with gr.Tab("Text"):
                data = gr.Textbox(label="Input text")
            with gr.Tab("File"):
                file = gr.File(label="Upload file")
            with gr.Tab("URL"):
                url = gr.Textbox(label="Website URL")
            with gr.Tab("PDF"):
                pdf_url = gr.Textbox(label="PDF URL")
        
        chatbot = gr.Chatbot()
        error_box = gr.Textbox(label="Errors")
        json_output = gr.JSON(label="Memory entry")

        submit_btn.click(
            summarize,
            [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url],
            [prompt, chatbot, error_box, json_output]
        )

    return app

if __name__ == "__main__":
    app = create_app()
    # summarize() is a generator, so enable the queue for streaming output
    app.queue().launch()