import os
import re
import glob
import time
from collections import defaultdict

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# PyMuPDF library
try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
    print("✅ PyMuPDF library available")
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")

# PDF processing utilities
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import pdfplumber
from pymupdf4llm import LlamaMarkdownReader
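
# NOTE: Besides PyMuPDF, this script assumes the following packages are installed
# (inferred from the imports used throughout the file; versions are not pinned):
#   pip install langchain langchain-community pypdf pymupdf4llm pdfplumber \
#       pdf2image pytesseract pillow chardet faiss-cpu sentence-transformers
# pdf2image also requires the poppler utilities on the system, and pytesseract
# requires a Tesseract installation with Korean language data for lang='kor+eng'.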

# --------------------------------
# Log Output
# --------------------------------
def log(msg):
    print(f"[{time.strftime('%H:%M:%S')}] {msg}")

# --------------------------------
# Text Cleaning Functions
# --------------------------------
def clean_text(text):
    # Keep Hangul, word characters, whitespace, and common punctuation; drop the rest
    return re.sub(r"[^\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F\w\s.,!?\"'()$:\-]", "", text)

def apply_corrections(text):
    # Replace character sequences that are frequently mis-encoded during extraction
    corrections = {
        'º©': 'info', '×': 'of', '½': 'operation', 'Ã': '', '©': '',
        'â€™': "'", 'â€œ': '"', 'â€': '"'
    }
    for k, v in corrections.items():
        text = text.replace(k, v)
    return text

# --------------------------------
# HWPX Processing (Section-wise Processing Only)
# --------------------------------
def load_hwpx(file_path):
    """Load an HWPX file (using the XML parsing method only)."""
    import zipfile
    import xml.etree.ElementTree as ET
    import chardet

    log(f"📥 Starting HWPX section-wise processing: {file_path}")
    start = time.time()
    documents = []

    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            file_list = zip_ref.namelist()
            section_files = [f for f in file_list
                             if f.startswith('Contents/section') and f.endswith('.xml')]
            section_files.sort()  # Sort in section0.xml, section1.xml, ... order
            log(f"📄 Found section files: {len(section_files)}")

            for section_idx, section_file in enumerate(section_files):
                with zip_ref.open(section_file) as xml_file:
                    raw = xml_file.read()
                    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
                    try:
                        text = raw.decode(encoding)
                    except UnicodeDecodeError:
                        text = raw.decode("cp949", errors="replace")

                    tree = ET.ElementTree(ET.fromstring(text))
                    root = tree.getroot()

                    # Find text elements without relying on the namespace
                    t_elements = [elem for elem in root.iter() if elem.tag.endswith('}t') or elem.tag == 't']
                    body_text = ""
                    for elem in t_elements:
                        if elem.text:
                            body_text += clean_text(elem.text) + " "

                    # HWPX sections carry no page numbers, so leave the page metadata empty
                    page_value = ""

                    if body_text.strip():
                        documents.append(Document(
                            page_content=apply_corrections(body_text),
                            metadata={
                                "source": file_path,
                                "filename": os.path.basename(file_path),
                                "type": "hwpx_body",
                                "page": page_value,
                                "total_sections": len(section_files)
                            }
                        ))
                        log(f"✅ Section text extraction complete (chars: {len(body_text)})")

                    # Find tables
                    table_elements = [elem for elem in root.iter() if elem.tag.endswith('}table') or elem.tag == 'table']
                    if table_elements:
                        table_text = ""
                        for table_idx, table in enumerate(table_elements):
                            table_text += f"[Table {table_idx + 1}]\n"
                            rows = [elem for elem in table.iter() if elem.tag.endswith('}tr') or elem.tag == 'tr']
                            for row in rows:
                                row_text = []
                                cells = [elem for elem in row.iter() if elem.tag.endswith('}tc') or elem.tag == 'tc']
                                for cell in cells:
                                    cell_texts = []
                                    for t_elem in cell.iter():
                                        if (t_elem.tag.endswith('}t') or t_elem.tag == 't') and t_elem.text:
                                            cell_texts.append(clean_text(t_elem.text))
                                    row_text.append(" ".join(cell_texts))
                                if row_text:
                                    table_text += "\t".join(row_text) + "\n"
                        if table_text.strip():
                            documents.append(Document(
                                page_content=apply_corrections(table_text),
                                metadata={
                                    "source": file_path,
                                    "filename": os.path.basename(file_path),
                                    "type": "hwpx_table",
                                    "page": page_value,
                                    "total_sections": len(section_files)
                                }
                            ))
                            log("📊 Table extraction complete")

                    # Find images
                    if [elem for elem in root.iter() if elem.tag.endswith('}picture') or elem.tag == 'picture']:
                        documents.append(Document(
                            page_content="[Image included]",
                            metadata={
                                "source": file_path,
                                "filename": os.path.basename(file_path),
                                "type": "hwpx_image",
                                "page": page_value,
                                "total_sections": len(section_files)
                            }
                        ))
                        log("🖼️ Image found")

    except Exception as e:
        log(f"❌ HWPX processing error: {e}")

    duration = time.time() - start

    # Print a summary of the extracted documents
    if documents:
        log(f"📊 Number of extracted documents: {len(documents)}")
    log(f"✅ HWPX processing complete: {file_path} ⏱️ {duration:.2f}s, total {len(documents)} documents")
    return documents

# --------------------------------
# PDF Processing Functions (same as before)
# --------------------------------
def run_ocr_on_image(image: Image.Image, lang='kor+eng'):
    return pytesseract.image_to_string(image, lang=lang)


def extract_images_with_ocr(pdf_path, lang='kor+eng'):
    try:
        images = convert_from_path(pdf_path)
        page_ocr_data = {}
        for idx, img in enumerate(images):
            page_num = idx + 1
            text = run_ocr_on_image(img, lang=lang)
            if text.strip():
                page_ocr_data[page_num] = text.strip()
        return page_ocr_data
    except Exception as e:
        print(f"❌ Image OCR failed: {e}")
        return {}


def extract_tables_with_pdfplumber(pdf_path):
    page_table_data = {}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                page_num = i + 1
                tables = page.extract_tables()
                table_text = ""
                for t_index, table in enumerate(tables):
                    if table:
                        table_text += f"[Table {t_index + 1}]\n"
                        for row in table:
                            row_text = "\t".join(cell if cell else "" for cell in row)
                            table_text += row_text + "\n"
                if table_text.strip():
                    page_table_data[page_num] = table_text.strip()
        return page_table_data
    except Exception as e:
        print(f"❌ Table extraction failed: {e}")
        return {}


def extract_body_text_with_pages(pdf_path):
    page_body_data = {}
    try:
        pdf_processor = LlamaMarkdownReader()
        docs = pdf_processor.load_data(file_path=pdf_path)

        combined_text = ""
        for d in docs:
            if isinstance(d, dict) and "text" in d:
                combined_text += d["text"]
            elif hasattr(d, "text"):
                combined_text += d.text

        if combined_text.strip():
            # The markdown reader returns one continuous text, so split it into
            # pseudo-pages of about 2000 characters with a 100-character overlap
            chars_per_page = 2000
            start = 0
            page_num = 1
            while start < len(combined_text):
                end = start + chars_per_page
                if end > len(combined_text):
                    end = len(combined_text)
                page_text = combined_text[start:end]
                if page_text.strip():
                    page_body_data[page_num] = page_text.strip()
                    page_num += 1
                if end == len(combined_text):
                    break
                start = end - 100
    except Exception as e:
        print(f"❌ Body extraction failed: {e}")
    return page_body_data

def load_pdf_with_metadata(pdf_path):
    """Extract page-specific information from a PDF file."""
    log(f"📄 Starting PDF page-wise processing: {pdf_path}")
    start = time.time()

    # First, check the actual number of pages using PyPDFLoader
    try:
        from langchain_community.document_loaders import PyPDFLoader
        loader = PyPDFLoader(pdf_path)
        pdf_pages = loader.load()
        actual_total_pages = len(pdf_pages)
        log(f"📄 Actual page count as verified by PyPDFLoader: {actual_total_pages}")
    except Exception as e:
        log(f"❌ PyPDFLoader page count verification failed: {e}")
        actual_total_pages = 1

    try:
        page_tables = extract_tables_with_pdfplumber(pdf_path)
    except Exception as e:
        page_tables = {}
        print(f"❌ Table extraction failed: {e}")

    try:
        page_ocr = extract_images_with_ocr(pdf_path)
    except Exception as e:
        page_ocr = {}
        print(f"❌ Image OCR failed: {e}")

    try:
        page_body = extract_body_text_with_pages(pdf_path)
    except Exception as e:
        page_body = {}
        print(f"❌ Body extraction failed: {e}")

    duration = time.time() - start
    log(f"✅ PDF page-wise processing complete: {pdf_path} ⏱️ {duration:.2f}s")

    # Set the total page count based on the actual number of pages
    all_pages = set(page_tables.keys()) | set(page_ocr.keys()) | set(page_body.keys())
    if all_pages:
        max_extracted_page = max(all_pages)
        # Use the greater of the actual and extracted page numbers
        total_pages = max(actual_total_pages, max_extracted_page)
    else:
        total_pages = actual_total_pages
    log(f"📄 Final total page count set to: {total_pages}")

    docs = []
    for page_num in sorted(all_pages):
        if page_num in page_tables and page_tables[page_num].strip():
            docs.append(Document(
                page_content=clean_text(apply_corrections(page_tables[page_num])),
                metadata={
                    "source": pdf_path,
                    "filename": os.path.basename(pdf_path),
                    "type": "table",
                    "page": page_num,
                    "total_pages": total_pages
                }
            ))
            log(f"📊 Page {page_num}: Table extraction complete")

        if page_num in page_body and page_body[page_num].strip():
            docs.append(Document(
                page_content=clean_text(apply_corrections(page_body[page_num])),
                metadata={
                    "source": pdf_path,
                    "filename": os.path.basename(pdf_path),
                    "type": "body",
                    "page": page_num,
                    "total_pages": total_pages
                }
            ))
            log(f"📝 Page {page_num}: Body extraction complete")

        if page_num in page_ocr and page_ocr[page_num].strip():
            docs.append(Document(
                page_content=clean_text(apply_corrections(page_ocr[page_num])),
                metadata={
                    "source": pdf_path,
                    "filename": os.path.basename(pdf_path),
                    "type": "ocr",
                    "page": page_num,
                    "total_pages": total_pages
                }
            ))
            log(f"🖼️ Page {page_num}: OCR extraction complete")

    if not docs:
        docs.append(Document(
            page_content="[Content extraction failed]",
            metadata={
                "source": pdf_path,
                "filename": os.path.basename(pdf_path),
                "type": "error",
                "page": 1,
                "total_pages": total_pages
            }
        ))

    # Print a summary of the extracted pages
    if docs:
        page_numbers = [doc.metadata.get('page', 0) for doc in docs if doc.metadata.get('page')]
        if page_numbers:
            log(f"📄 Extracted page range: {min(page_numbers)} ~ {max(page_numbers)}")
    log(f"📄 PDF documents with extracted pages: {len(docs)} documents (total {total_pages} pages)")
    return docs

# --------------------------------
# Document Loading and Splitting
# --------------------------------
def load_documents(folder_path):
    documents = []

    for file in glob.glob(os.path.join(folder_path, "*.hwpx")):
        log(f"📄 HWPX file found: {file}")
        docs = load_hwpx(file)
        documents.extend(docs)

    for file in glob.glob(os.path.join(folder_path, "*.pdf")):
        log(f"📄 PDF file found: {file}")
        documents.extend(load_pdf_with_metadata(file))

    log(f"📦 Document loading complete! Total documents: {len(documents)}")
    return documents


def split_documents(documents, chunk_size=800, chunk_overlap=100):
    log("🔪 Starting chunk splitting")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    chunks = []
    for doc in documents:
        split = splitter.split_text(doc.page_content)
        for i, chunk in enumerate(split):
            enriched_chunk = f"passage: {chunk}"
            chunks.append(Document(
                page_content=enriched_chunk,
                metadata={**doc.metadata, "chunk_index": i}
            ))

    log(f"✅ Chunk splitting complete: Created {len(chunks)} chunks")
    return chunks
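
# NOTE: The "passage: " prefix added in split_documents follows the intfloat E5
# embedding family's convention (documents are embedded as "passage: ...",
# queries as "query: ..."). With a different embedding model, the prefix can
# simply be omitted.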

# --------------------------------
# Main Execution
# --------------------------------
if __name__ == "__main__":
    folder = "dataset_test"
    log("🚀 PyMuPDF-based document processing started")
    docs = load_documents(folder)
    log("📦 Document loading complete")

    # Page information check
    log("📄 Page information summary:")
    page_info = {}
    for doc in docs:
        source = doc.metadata.get('source', 'unknown')
        page = doc.metadata.get('page', 'unknown')
        doc_type = doc.metadata.get('type', 'unknown')
        if source not in page_info:
            page_info[source] = {'pages': set(), 'types': set()}
        page_info[source]['pages'].add(page)
        page_info[source]['types'].add(doc_type)

    for source, info in page_info.items():
        # Only numeric page values count toward the page total (HWPX pages are "")
        numeric_pages = [p for p in info['pages'] if isinstance(p, int)]
        max_page = max(numeric_pages) if numeric_pages else 'unknown'
        log(f"  📄 {os.path.basename(source)}: {max_page} pages, type: {info['types']}")

    chunks = split_documents(docs)

    log("📡 E5-Large embedding preparation (intfloat/e5-large-v2)")
    embedding_model = HuggingFaceEmbeddings(
        model_name="intfloat/e5-large-v2",
        model_kwargs={"device": "cuda"}  # assumes a CUDA GPU; use "cpu" if none is available
    )

    vectorstore = FAISS.from_documents(chunks, embedding_model)
    vectorstore.save_local("vector_db")

    log(f"📊 Total number of documents: {len(docs)}")
    log(f"📊 Total number of chunks: {len(chunks)}")
    log("✅ FAISS save complete: vector_db")

    # Sample output with page information
    log("\n📄 Sample including actual page information:")
    for i, chunk in enumerate(chunks[:5]):
        meta = chunk.metadata
        log(f"  Chunk {i+1}: {meta.get('type')} | Page {meta.get('page')} | {os.path.basename(meta.get('source', 'unknown'))}")