Harsh Upadhyay
removing local file support and adding SQLAlchemy support for all database operations.
fca1742
raw
history blame contribute delete
767 Bytes
import tempfile
from pdfminer.high_level import extract_text
import os
from PyPDF2 import PdfReader
def extract_text_from_pdf(file_or_path):
"""
Extract text from a PDF file. Accepts either a file path (str) or a file-like object (e.g., BytesIO).
"""
if isinstance(file_or_path, (str, bytes)):
# Assume it's a file path
with open(file_or_path, 'rb') as f:
reader = PdfReader(f)
text = ""
for page in reader.pages:
text += page.extract_text() or ""
return text
else:
# Assume it's a file-like object
reader = PdfReader(file_or_path)
text = ""
for page in reader.pages:
text += page.extract_text() or ""
return text