Spaces:
Sleeping
Sleeping
Update text_extractor.py
Browse files- text_extractor.py +20 -13
text_extractor.py
CHANGED
@@ -1,18 +1,25 @@
|
|
1 |
# text_extractor.py
|
|
|
|
|
2 |
import docx2txt
|
3 |
-
import
|
4 |
|
5 |
def extract_text_from_file(file_path):
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
1 |
# text_extractor.py
|
2 |
+
|
3 |
+
import os
|
4 |
import docx2txt
|
5 |
+
import PyPDF2
|
6 |
|
7 |
def extract_text_from_file(file_path) -> str:
    """Return the plain text of a PDF or DOCX file.

    The file type is chosen from the extension of *file_path*, compared
    case-insensitively.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text on success. On failure a bracketed marker string
        is returned instead of raising:
        ``"[Error extracting PDF text]"``, ``"[Error extracting DOCX text]"``
        or ``"[Unsupported file type]"`` for any other extension.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() result is guarded with `or ""` so a page
                # that yields nothing contributes an empty string.
                return " ".join(
                    page.extract_text() or "" for page in reader.pages
                )
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as a PDF error.
        except Exception:
            return "[Error extracting PDF text]"

    if ext == ".docx":
        try:
            return docx2txt.process(file_path)
        # Same reasoning as above: do not swallow interpreter-exit signals.
        except Exception:
            return "[Error extracting DOCX text]"

    return "[Unsupported file type]"
|