RagGV1 / utils.py
ramysaidagieb's picture
Upload 5 files
4028152 verified
import os
import fitz # PyMuPDF
import docx
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for each page, in document
        order, concatenated with no separator between pages.
    """
    with fitz.open(file_path) as doc:
        # Join once instead of growing a string with ``+=`` per page; the
        # generator is fully consumed before the document is closed.
        return "".join(page.get_text() for page in doc)
def extract_text_from_docx(file_path):
    """Return the non-blank paragraphs of a .docx file, one per line.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with a single newline.
    """
    document = docx.Document(file_path)
    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)
def process_documents(uploaded_files):
passages = []
for i, file in enumerate(uploaded_files):
file_path = file.name
if file_path.endswith(".pdf"):
text = extract_text_from_pdf(file_path)
elif file_path.endswith(".docx"):
text = extract_text_from_docx(file_path)
else:
continue
chunks = text.split("\n\n")
for j, chunk in enumerate(chunks):
chunk = chunk.strip()
if len(chunk) > 100:
passages.append({
"text": chunk,
"source": f"{os.path.basename(file_path)} - فقرة {j+1}"
})
return passages