saherPervaiz committed on
Commit
7cc953c
·
verified ·
1 Parent(s): cb19513

Update text_extractor.py

Browse files
Files changed (1) hide show
  1. text_extractor.py +20 -13
text_extractor.py CHANGED
@@ -1,18 +1,25 @@
1
  # text_extractor.py
 
 
2
  import docx2txt
3
- import fitz # PyMuPDF
4
 
5
  def extract_text_from_file(file_path):
6
- if file_path.endswith(".pdf"):
7
- return extract_text_from_pdf(file_path)
8
- elif file_path.endswith(".docx"):
9
- return docx2txt.process(file_path)
10
- else:
11
- return "Unsupported file type."
 
 
 
12
 
13
- def extract_text_from_pdf(file_path):
14
- text = ""
15
- with fitz.open(file_path) as doc:
16
- for page in doc:
17
- text += page.get_text()
18
- return text
 
 
 
1
  # text_extractor.py
2
+
3
+ import os
4
  import docx2txt
5
+ import PyPDF2
6
 
def extract_text_from_file(file_path):
    """Extract plain text from a PDF or DOCX file.

    The file type is chosen from the extension of ``file_path``, compared
    case-insensitively, so ``report.PDF`` is handled like ``report.pdf``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text on success. On failure a bracketed marker string
        is returned instead of raising:

        * ``"[Error extracting PDF text]"`` if reading or parsing a PDF fails.
        * ``"[Error extracting DOCX text]"`` if processing a DOCX fails.
        * ``"[Unsupported file type]"`` for any other extension.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may yield a falsy value for a page; treat
                # that as an empty string so the join never sees None.
                return " ".join(
                    page.extract_text() or "" for page in reader.pages
                )
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        except Exception:
            return "[Error extracting PDF text]"

    if ext == ".docx":
        try:
            return docx2txt.process(file_path)
        # Same reasoning as above: never swallow interpreter-exit signals.
        except Exception:
            return "[Error extracting DOCX text]"

    return "[Unsupported file type]"