pdf-parser-api / backend /file_utils.py
blaxx14's picture
update KTP scanning
605c260
import time
import os
from pdf2docx import Converter
from docx import Document
def convert_pdf_to_word(pdf_path, docx_path):
    """Convert the PDF at ``pdf_path`` into a Word document at ``docx_path``.

    Args:
        pdf_path: Path of the source PDF, passed to ``pdf2docx.Converter``.
        docx_path: Path the resulting ``.docx`` is written to.

    Raises:
        Whatever ``Converter`` or ``Converter.convert`` raises; the converter
        is closed before the exception propagates.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path)
    finally:
        # Close even when conversion fails so the converter's resources are
        # not left open.
        cv.close()
def convert_image_to_word(text, filename):
    """Write ``text`` into a new Word document saved under ``/tmp``.

    The document contains ``filename`` as a level-1 heading, ``text`` as a
    single paragraph, and a dashed separator line.

    The file is saved as ``/tmp/<filename>``. If that path already exists, a
    numeric suffix is appended (``<filename>(1)``, ``<filename>(2)``, ...)
    until an unused name is found, so earlier copies are never overwritten.

    Args:
        text: Paragraph text to store (presumably text recognised from an
            image -- confirm against the caller).
        filename: Heading text and base file name inside ``/tmp``.

    Returns:
        The path the document was saved to.
    """
    doc = Document()
    doc.add_heading(f"{filename}", level=1)
    doc.add_paragraph(text)
    doc.add_paragraph("\n" + "-"*50 + "\n")

    filepath = os.path.join('/tmp', filename)
    copy_num = 0
    # Keep counting up: stopping at (1) would overwrite an existing copy when
    # the same name is used a third time.
    while os.path.exists(filepath):
        copy_num += 1
        filepath = os.path.join('/tmp', f'{filename}({copy_num})')

    doc.save(filepath)
    return filepath
def wait_for_file_release(file_path, timeout=5):
    """Wait until ``file_path`` can be opened for reading.

    The file is opened in binary read mode; a ``PermissionError`` is treated
    as "still locked" and the attempt is repeated every 0.5 seconds until
    ``timeout`` seconds have elapsed. At least one attempt is always made,
    even when ``timeout`` is zero or negative.

    Args:
        file_path: Path of the file to probe.
        timeout: Maximum number of seconds to keep retrying.

    Returns:
        True as soon as the file could be opened, False if every attempt up
        to the deadline raised ``PermissionError``.

    Raises:
        OSError: Any error from ``open`` other than ``PermissionError``
            (for example ``FileNotFoundError``) is propagated unchanged.
    """
    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
    deadline = time.monotonic() + timeout
    while True:
        try:
            with open(file_path, 'rb'):
                return True
        except PermissionError:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            # Never sleep past the deadline.
            time.sleep(min(0.5, remaining))
def delete_temp_folder(temp_path="/tmp"):
    """Delete every regular file directly inside ``temp_path``.

    After an initial 0.5 second pause, each entry is probed with
    ``wait_for_file_release`` and removed once it can be opened. Entries that
    are not regular files (sub-directories, broken links, ...) are skipped,
    and a failure on one entry does not stop the remaining entries from being
    processed. Progress and failures are reported with ``print``.

    NOTE(review): with the default argument this removes *all* readable files
    in ``/tmp``, not only the ones this application created -- confirm that
    is intended for the deployment environment.

    Args:
        temp_path: Directory whose files are removed.

    Raises:
        OSError: If ``temp_path`` itself cannot be listed.
    """
    time.sleep(0.5)
    for filename in os.listdir(temp_path):
        file_path = os.path.join(temp_path, filename)

        # Opening a directory raises instead of reporting a lock, which would
        # abort the whole cleanup; only regular files are candidates.
        if not os.path.isfile(file_path):
            continue

        try:
            released = wait_for_file_release(file_path)
        except OSError as e:
            # E.g. the file vanished between listing and probing.
            print(f"Gagal hapus {file_path}: {e}")
            continue

        if not released:
            print(f"File terkunci terlalu lama: {file_path}")
            continue

        try:
            os.remove(file_path)
        except OSError as e:
            print(f"Gagal hapus {file_path}: {e}")
        else:
            print(f"Hapus: {file_path}")
def _read_cell(cell):
    """Return ``(text, has_bold)`` for one table cell.

    ``text`` is the stripped text of every run in the cell joined by single
    spaces; ``has_bold`` is True when at least one run has a truthy ``bold``.
    """
    runs = [run for paragraph in cell.paragraphs for run in paragraph.runs]
    text = " ".join(run.text.strip() for run in runs).strip()
    has_bold = any(run.bold for run in runs)
    return text, has_bold


def extract_tables_from_docx(docx_path):
    """Read all tables of a Word document together with their bold flags.

    Args:
        docx_path: Path of the ``.docx`` file to open.

    Returns:
        A list with one dict per table, in document order. Each dict has
        ``"table_data"`` (rows of cell text) and ``"bold_map"`` (rows of
        booleans of the same shape, True where the cell contains a bold run).
    """
    document = Document(docx_path)
    extracted = []
    for table in document.tables:
        # One (text, has_bold) pair per cell, grouped by row.
        parsed_rows = [
            [_read_cell(cell) for cell in row.cells]
            for row in table.rows
        ]
        extracted.append({
            "table_data": [[text for text, _ in row] for row in parsed_rows],
            "bold_map": [[flag for _, flag in row] for row in parsed_rows],
        })
    return extracted