Spaces:
Sleeping
Sleeping
import os | |
import fitz # PyMuPDF | |
import docx | |
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string holding the output of ``page.get_text()`` for each
        page, with no extra separator inserted between pages.
    """
    with fitz.open(file_path) as doc:
        # Join once instead of growing a string with ``+=`` per page; the
        # generator is fully consumed here, while the document is still open.
        return "".join(page.get_text() for page in doc)
def extract_text_from_docx(file_path):
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Args:
        file_path: Path of the DOCX file, passed straight to ``docx.Document``.

    Returns:
        The text of each paragraph in ``doc.paragraphs`` whose text is not
        empty or whitespace-only, joined with a single newline.
    """
    document = docx.Document(file_path)
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        # Skip paragraphs that are empty or contain only whitespace.
        if content.strip():
            kept.append(content)
    return "\n".join(kept)
def process_documents(uploaded_files, *, min_chars=100):
    """Split uploaded PDF and DOCX files into text passages.

    Args:
        uploaded_files: Iterable of files to process. Each item is either a
            path (``str`` or ``os.PathLike``) or an object whose ``name``
            attribute holds the file path.
        min_chars: A chunk is kept only when its stripped length is strictly
            greater than this value. Defaults to 100.

    Returns:
        A list of dicts, each with a ``"text"`` key (the stripped chunk) and
        a ``"source"`` key (file base name plus the 1-based chunk position
        within that file). Files whose extension is neither ``.pdf`` nor
        ``.docx`` are skipped.
    """
    passages = []
    for file in uploaded_files:
        # Accept plain paths as well as upload objects exposing ``.name``.
        # (A ``Path`` has ``.name`` too, but that is only the base name.)
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name

        # Compare extensions case-insensitively so "REPORT.PDF" is not skipped.
        lowered = file_path.lower()
        if lowered.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
            separator = "\n\n"
        elif lowered.endswith(".docx"):
            text = extract_text_from_docx(file_path)
            # extract_text_from_docx joins paragraphs with a single newline,
            # so splitting on blank lines would return the whole document as
            # one chunk; split on the separator it actually uses.
            separator = "\n"
        else:
            continue

        source_name = os.path.basename(file_path)
        for j, chunk in enumerate(text.split(separator)):
            chunk = chunk.strip()
            if len(chunk) > min_chars:
                passages.append({
                    "text": chunk,
                    "source": f"{source_name} - فقرة {j+1}",
                })
    return passages