import os
import time

from docx import Document
from pdf2docx import Converter


def convert_pdf_to_word(pdf_path, docx_path):
    """Convert the PDF at ``pdf_path`` into a Word file at ``docx_path``.

    Args:
        pdf_path: Path of the PDF handed to ``pdf2docx.Converter``.
        docx_path: Path the converted document is written to.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path)
    finally:
        # Always release the converter, even when the conversion fails.
        cv.close()


def convert_image_to_word(text, filename, output_dir="/tmp"):
    """Write ``text`` into a new Word document saved under ``output_dir``.

    The document holds a level-1 heading with ``filename``, one paragraph
    with ``text`` and a closing separator line made of 50 dashes.

    If ``output_dir/filename`` already exists, the document is saved as
    ``filename(1)``, ``filename(2)``, ... using the first name that is
    still free, so existing documents are never overwritten.

    Args:
        text: Body text of the document.
        filename: Heading text and base name of the saved file.
        output_dir: Directory the document is saved in.

    Returns:
        The path the document was saved to.
    """
    doc = Document()
    doc.add_heading(f"{filename}", level=1)
    doc.add_paragraph(text)
    doc.add_paragraph("\n" + "-"*50 + "\n")

    filepath = os.path.join(output_dir, filename)
    copy_num = 0
    # NOTE(review): the counter is appended after any extension
    # ("name.docx(1)"); kept as-is so existing file names do not change.
    while os.path.exists(filepath):
        copy_num += 1
        filepath = os.path.join(output_dir, f'{filename}({copy_num})')

    doc.save(filepath)
    return filepath


def wait_for_file_release(file_path, timeout=5):
    """Wait until ``file_path`` can be opened for reading.

    The file is opened in binary read mode; a ``PermissionError`` is taken
    to mean it is still in use, and the attempt is repeated every 0.5
    seconds until ``timeout`` seconds have elapsed. At least one attempt
    is made, even when ``timeout`` is zero or negative.

    Args:
        file_path: Path of the file to probe.
        timeout: Maximum number of seconds to keep retrying.

    Returns:
        True as soon as the file could be opened, False if every attempt
        within the timeout raised ``PermissionError``.

    Raises:
        OSError: Any other failure to open the file (for example a
            missing file) is propagated to the caller.
    """
    # A monotonic clock keeps the deadline correct if the wall clock jumps.
    deadline = time.monotonic() + timeout
    while True:
        try:
            with open(file_path, 'rb'):
                return True
        except PermissionError:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            # Never sleep past the deadline.
            time.sleep(min(0.5, remaining))


def delete_temp_folder(temp_path="/tmp"):
    """Delete every regular file directly inside ``temp_path``.

    After a 0.5 second pause, each file is probed with
    ``wait_for_file_release`` and removed once it can be opened. The
    outcome for each file is printed. Sub-directories and other
    non-regular entries are skipped, and a failure on one file does not
    stop the sweep.

    Args:
        temp_path: Directory whose files are removed.
    """
    time.sleep(0.5)
    for filename in os.listdir(temp_path):
        file_path = os.path.join(temp_path, filename)

        # Opening a directory raises IsADirectoryError on POSIX (and
        # PermissionError on Windows), so only regular files are handled.
        if not os.path.isfile(file_path):
            continue

        try:
            released = wait_for_file_release(file_path)
        except OSError as e:
            # E.g. the file vanished between listdir() and open().
            print(f"Gagal hapus {file_path}: {e}")
            continue

        if not released:
            print(f"File terkunci terlalu lama: {file_path}")
            continue

        try:
            os.remove(file_path)
            print(f"Hapus: {file_path}")
        except OSError as e:
            print(f"Gagal hapus {file_path}: {e}")


def extract_tables_from_docx(docx_path):
    """Read the text and bold flags of every table in a Word document.

    For each cell, the text of every run is stripped and the pieces are
    joined with single spaces (empty runs still contribute a separator).
    A cell is flagged bold when at least one of its runs has a truthy
    ``bold`` attribute.

    NOTE(review): python-docx reports bold inherited from a style as
    ``None``, so only runs with bold set explicitly are detected here.

    Args:
        docx_path: Path of the ``.docx`` file to open.

    Returns:
        A list with one dict per table, in document order. Each dict has
        ``"table_data"`` (rows of cell strings) and ``"bold_map"`` (rows
        of booleans with the same layout as ``"table_data"``).
    """
    doc = Document(docx_path)
    all_tables = []

    for table in doc.tables:
        table_data = []
        bold_map = []

        for row in table.rows:
            row_data = []
            row_bold_flags = []

            for cell in row.cells:
                texts = []
                is_bold = False
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        texts.append(run.text.strip())
                        if run.bold:
                            is_bold = True

                row_data.append(" ".join(texts).strip())
                row_bold_flags.append(is_bold)

            table_data.append(row_data)
            bold_map.append(row_bold_flags)

        all_tables.append({
            "table_data": table_data,
            "bold_map": bold_map
        })

    return all_tables