Spaces:
Sleeping
Sleeping
import time | |
import os | |
from pdf2docx import Converter | |
from docx import Document | |
def convert_pdf_to_word(pdf_path, docx_path):
    """Convert the PDF at ``pdf_path`` into a Word document at ``docx_path``.

    Args:
        pdf_path: Path of the source PDF handed to ``pdf2docx.Converter``.
        docx_path: Path the converted ``.docx`` is written to.

    Raises:
        Whatever ``Converter`` or ``Converter.convert`` raises; the error is
        not swallowed here.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path)
    finally:
        # Always close the converter, even on a failed conversion, so the
        # converter is not left open behind a propagating exception.
        cv.close()
def convert_image_to_word(text, filename):
    """Save ``text`` as a new Word document under ``/tmp``.

    The document contains ``filename`` as a level-1 heading, ``text`` as a
    paragraph and a dashed separator paragraph.

    If ``/tmp/<filename>`` already exists, a counter is inserted before the
    extension (``name(1).docx``, ``name(2).docx``, ...) and incremented until
    an unused name is found, so earlier copies are never overwritten.

    Args:
        text: Body text written into the document.
        filename: File name (not a full path) used for both the heading and
            the output file inside ``/tmp``.

    Returns:
        The path the document was saved to.
    """
    doc = Document()
    doc.add_heading(f"{filename}", level=1)
    doc.add_paragraph(text)
    doc.add_paragraph("\n" + "-"*50 + "\n")

    filepath = os.path.join('/tmp', filename)
    stem, ext = os.path.splitext(filename)
    copy_num = 0
    # Keep counting until the candidate name is free; a single increment
    # would overwrite the first copy on every later collision.
    while os.path.exists(filepath):
        copy_num += 1
        filepath = os.path.join('/tmp', f'{stem}({copy_num}){ext}')

    doc.save(filepath)
    return filepath
def wait_for_file_release(file_path, timeout=5):
    """Wait until ``file_path`` can be opened for reading.

    The file is opened in binary read mode; a ``PermissionError`` is treated
    as "still locked" and retried every 0.5 seconds until ``timeout``
    seconds have elapsed. At least one attempt is always made, even when
    ``timeout`` is zero or negative.

    Args:
        file_path: Path of the file to probe.
        timeout: Maximum number of seconds to keep retrying.

    Returns:
        True as soon as the file could be opened, False if it was still
        raising ``PermissionError`` when the timeout expired.

    Raises:
        OSError: Any error from ``open`` other than ``PermissionError``
            (for example ``FileNotFoundError``) propagates to the caller.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while True:
        try:
            with open(file_path, 'rb'):
                return True
        except PermissionError:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            # Never sleep past the deadline.
            time.sleep(min(0.5, remaining))
def delete_temp_folder(temp_path="/tmp"):
    """Delete every regular file directly inside ``temp_path``.

    Each file is first probed with ``wait_for_file_release``; files that
    stay locked are reported and left in place. Sub-directories and other
    non-regular entries are skipped. Progress and failures are printed,
    never raised, so one bad entry does not stop the cleanup.

    Args:
        temp_path: Directory whose files are removed.

    Raises:
        OSError: If ``temp_path`` itself cannot be listed.
    """
    # NOTE(review): presumably a short grace period so that writers can
    # close their handles before cleanup starts - confirm with callers.
    time.sleep(0.5)
    for filename in os.listdir(temp_path):
        file_path = os.path.join(temp_path, filename)

        # os.listdir also yields directories; opening or os.remove-ing one
        # fails, so only regular files are handled.
        if not os.path.isfile(file_path):
            continue

        try:
            released = wait_for_file_release(file_path)
        except OSError as e:
            # e.g. the file vanished between listing and probing.
            print(f"Gagal hapus {file_path}: {e}")
            continue

        if not released:
            print(f"File terkunci terlalu lama: {file_path}")
            continue

        try:
            os.remove(file_path)
            print(f"Hapus: {file_path}")
        except Exception as e:
            print(f"Gagal hapus {file_path}: {e}")
def extract_tables_from_docx(docx_path):
    """Read every table of a Word document into plain Python lists.

    Args:
        docx_path: Path of the ``.docx`` file opened with ``Document``.

    Returns:
        A list with one dict per table, in document order. Each dict has:

        * ``"table_data"``: rows of cell texts. A cell's text is the
          stripped text of each of its runs joined with single spaces,
          then stripped again.
        * ``"bold_map"``: rows of booleans with the same layout; a cell is
          True when at least one of its runs has a truthy ``bold``.
    """
    document = Document(docx_path)
    extracted = []

    for tbl in document.tables:
        text_rows = []
        bold_rows = []

        for tbl_row in tbl.rows:
            text_cells = []
            bold_cells = []

            for tbl_cell in tbl_row.cells:
                # Flatten all runs of all paragraphs in the cell.
                cell_runs = [
                    piece
                    for para in tbl_cell.paragraphs
                    for piece in para.runs
                ]
                joined = " ".join(piece.text.strip() for piece in cell_runs)
                text_cells.append(joined.strip())
                bold_cells.append(any(piece.bold for piece in cell_runs))

            text_rows.append(text_cells)
            bold_rows.append(bold_cells)

        extracted.append({"table_data": text_rows, "bold_map": bold_rows})

    return extracted