Spaces:

ramysaidagieb
/

RagGV1

Sleeping

RagGV1 / utils.py

Upload 5 files

4028152 verified 3 months ago

1.05 kB

	import os
	import fitz # PyMuPDF
	import docx

	def extract_text_from_pdf(file_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        file_path: Path of the PDF file, passed straight to ``fitz.open``.

	    Returns:
	        A single string made of each page's ``get_text()`` output joined
	        with no separator; an empty string when the document has no pages.
	    """
	    with fitz.open(file_path) as doc:
	        # Join once instead of growing a string with ``+=`` page by page;
	        # the pages must be read while the document is still open.
	        return "".join(page.get_text() for page in doc)

	def extract_text_from_docx(file_path):
	    """Return the non-blank paragraphs of a .docx file, one per line.

	    Args:
	        file_path: Path of the Word document, passed to ``docx.Document``.

	    Returns:
	        The text of every paragraph that is not empty or whitespace-only,
	        joined with a single newline. Paragraph text is kept unstripped.
	    """
	    document = docx.Document(file_path)
	    kept = []
	    for paragraph in document.paragraphs:
	        content = paragraph.text
	        # Whitespace-only paragraphs are dropped; others are kept verbatim.
	        if content.strip():
	            kept.append(content)
	    return "\n".join(kept)

	def process_documents(uploaded_files):
	    """Extract passages from the PDF and DOCX files in ``uploaded_files``.

	    Each supported file is converted to text, split on blank lines
	    (``"\\n\\n"``), and every stripped chunk longer than 100 characters
	    becomes one passage. Files with any other extension are skipped.

	    Args:
	        uploaded_files: Iterable of uploads. Each item is either an object
	            whose ``name`` attribute holds the file path, or a plain
	            ``str`` / ``os.PathLike`` path.

	    Returns:
	        A list of dicts with two keys: ``"text"`` (the stripped chunk) and
	        ``"source"`` (the file's base name followed by the Arabic word for
	        "paragraph" and the chunk's 1-based position in the split text).
	        Positions count every chunk, including the short ones that are
	        dropped, so the numbers in the result may have gaps.
	    """
	    passages = []
	    for file in uploaded_files:
	        # Plain paths are accepted as-is; other objects are expected to
	        # expose the path through ``.name``. PathLike is checked explicitly
	        # because ``Path.name`` is only the base name, not the full path.
	        if isinstance(file, (str, os.PathLike)):
	            file_path = os.fspath(file)
	        else:
	            file_path = file.name
	        # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
	        lowered = file_path.lower()
	        if lowered.endswith(".pdf"):
	            text = extract_text_from_pdf(file_path)
	        elif lowered.endswith(".docx"):
	            text = extract_text_from_docx(file_path)
	        else:
	            continue
	        source_name = os.path.basename(file_path)
	        for j, chunk in enumerate(text.split("\n\n")):
	            chunk = chunk.strip()
	            # Short fragments (100 characters or fewer) are not kept.
	            if len(chunk) > 100:
	                passages.append({
	                    "text": chunk,
	                    "source": f"{source_name} - فقرة {j+1}",
	                })
	    return passages