Spaces:
Sleeping
Sleeping
import os | |
from PyPDF2 import PdfReader | |
from docx import Document | |
def process_pdf(file_path):
    """Extract the text of a PDF file and split it into paragraphs.

    Args:
        file_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        A list of paragraph strings obtained by splitting the extracted
        text on blank lines. Whitespace-only paragraphs are omitted, so a
        PDF with no extractable text yields an empty list.
    """
    reader = PdfReader(file_path)
    # Defensive: treat a page that yields no text (None or "") as empty so
    # the concatenation below cannot fail. Each page is terminated with a
    # newline, and the pieces are joined once instead of growing a string.
    page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    text = "".join(page_texts)
    # Split the text into paragraphs, skipping blank ones.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
def process_docx(file_path):
    """Return the text of every non-blank paragraph in a .docx file.

    Args:
        file_path: Path of the document, handed to ``Document``.

    Returns:
        A list with the text of each paragraph, in document order, leaving
        out paragraphs that are empty or contain only whitespace.
    """
    kept = []
    for para in Document(file_path).paragraphs:
        content = para.text
        # Skip paragraphs that carry no visible text.
        if content.strip() == "":
            continue
        kept.append(content)
    return kept
def process_txt(file_path):
    """Read a UTF-8 text file and split it into paragraphs.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of paragraph strings obtained by splitting the file content
        on blank lines. Whitespace-only paragraphs are omitted, so an empty
        file yields an empty list.
    """
    # 'utf-8-sig' decodes plain UTF-8 and additionally strips a leading
    # byte-order mark, which would otherwise end up inside the first
    # paragraph as '\ufeff'.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        text = f.read()
    # Drop blank chunks produced by runs of empty lines or an empty file.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
def process_documents(file_path):
    """Dispatch a file to the paragraph extractor matching its extension.

    Args:
        file_path: Path of the document; only its extension (compared
            case-insensitively) decides which extractor runs.

    Returns:
        The list returned by ``process_pdf``, ``process_docx`` or
        ``process_txt`` for '.pdf', '.docx' and '.txt' files respectively,
        or an empty list for any other extension.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        return process_pdf(file_path)
    if suffix == '.docx':
        return process_docx(file_path)
    if suffix == '.txt':
        return process_txt(file_path)

    # Unsupported file type: nothing to extract.
    return []