washeed
/

ocr

Model card Files Files and versions

ocr / inputPDFToOutputOCR.py

washeed's picture

Upload 18 files

b692870 verified almost 2 years ago

history blame contribute delete

3.04 kB

	import os
	import abc_1
	import shutil

	def get_subfolder_names(folder_path):
	    """Return the names of the immediate subdirectories of ``folder_path``.

	    Args:
	        folder_path: Directory whose direct children are inspected.

	    Returns:
	        A sorted list of subdirectory names (names only, not full paths).
	        An empty list is returned, after printing an error message, when
	        ``folder_path`` does not exist or is not a directory.
	    """
	    try:
	        entries = os.listdir(folder_path)
	    except FileNotFoundError:
	        print(f"Error: Folder not found: {folder_path}")
	        return []
	    except NotADirectoryError:
	        # The path exists but is a regular file; treat it like a missing folder.
	        print(f"Error: Not a folder: {folder_path}")
	        return []

	    # os.listdir returns entries in arbitrary order; sort so runs are reproducible.
	    return sorted(
	        entry for entry in entries
	        if os.path.isdir(os.path.join(folder_path, entry))
	    )


	def create_folder(folder_path):
	    """Create ``folder_path`` and any missing parent directories.

	    Does nothing when the directory already exists. Failures (for example a
	    regular file already occupying the path, or insufficient permissions)
	    are printed rather than raised, so callers never see an exception.

	    Args:
	        folder_path: Directory path to create.
	    """
	    try:
	        # exist_ok=True replaces a separate os.path.exists() check, which
	        # could race with another process creating the same directory.
	        os.makedirs(folder_path, exist_ok=True)
	    except OSError as e:
	        print(f"Error creating folder {folder_path}: {e}")


	def move_file(source_path, destination_path):
	    """Move a file from ``source_path`` to ``destination_path``.

	    Failures are printed rather than raised, so the caller must check the
	    filesystem if it needs to know whether the move succeeded.

	    Args:
	        source_path: Path of the file to move.
	        destination_path: Target path. If it names an existing directory,
	            ``shutil.move`` places the file inside that directory.
	    """
	    try:
	        # shutil.move falls back to copy-and-delete when a plain rename is
	        # impossible (e.g. source and destination on different filesystems).
	        shutil.move(source_path, destination_path)
	    except OSError as e:  # shutil.Error is a subclass of OSError
	        print(f"Error moving file {source_path} to {destination_path}: {e}")


	def process_file(folder_path, name):
	    """Run OCR for one item, categorize its text, and move its documents.

	    The text returned by ``abc_1.use_ocr`` for ``folder_path/name`` is passed
	    to ``abc_1.categorize_text_chunk``; the resulting category names a
	    subfolder of ``folder_output``. Any ``name.pdf`` / ``name.docx`` found in
	    ``folder_path`` is then moved into that category folder.

	    Relies on the module-level names ``compiled_keywords`` and
	    ``folder_output``, which are assigned in the ``__main__`` block.

	    Args:
	        folder_path: Folder containing both the OCR input for ``name`` and
	            the ``name.pdf`` / ``name.docx`` documents.
	        name: Base name (without extension) of the item to process.
	    """
	    text = abc_1.use_ocr(os.path.join(folder_path, name))
	    # NOTE(review): assumes categorize_text_chunk always returns a string
	    # usable as a folder name -- confirm against abc_1.
	    category = abc_1.categorize_text_chunk(text, compiled_keywords)

	    category_folder = os.path.join(folder_output, category)
	    create_folder(category_folder)

	    has_pdf, has_docx = check_file_existence(folder_path, name)
	    for extension, present in (('.pdf', has_pdf), ('.docx', has_docx)):
	        if not present:
	            continue
	        source_file = os.path.join(folder_path, name + extension)
	        destination_file = os.path.join(category_folder, name + extension)
	        move_file(source_file, destination_file)
	        # move_file prints its own error and does not raise, so verify the
	        # result on disk before announcing success.
	        if os.path.exists(destination_file) and not os.path.exists(source_file):
	            print(f"File '{name}' categorized as '{category}' and moved to '{category_folder}'.")


	def check_file_existence(folder_path, filename):
	    """Report which document formats exist for ``filename`` in ``folder_path``.

	    Only regular files count; a directory that happens to be named
	    ``filename.pdf`` is ignored. A missing ``folder_path`` yields
	    ``(False, False)`` instead of raising.

	    Args:
	        folder_path: Folder to look in.
	        filename: Base name without extension.

	    Returns:
	        A ``(has_pdf, has_docx)`` tuple of booleans.
	    """
	    # Probe the two candidate paths directly instead of listing the folder.
	    has_pdf = os.path.isfile(os.path.join(folder_path, filename + '.pdf'))
	    has_docx = os.path.isfile(os.path.join(folder_path, filename + '.docx'))
	    return has_pdf, has_docx

	def runOCR(subfolder_names):
	    """Process every named item, then delete its buffer subfolder.

	    For each name, ``process_file`` is called with the module-level
	    ``folder_path`` (assigned in the ``__main__`` block); afterwards the
	    subfolder ``folder_path/name`` is removed if it is still a directory.
	    If ``process_file`` raises, the exception propagates and that subfolder
	    is left in place.

	    Args:
	        subfolder_names: Iterable of subfolder names located in
	            ``folder_path``.
	    """
	    for name in subfolder_names:
	        process_file(folder_path, name)
	        buffer_folder = os.path.join(folder_path, name)
	        # isdir (not exists) so rmtree is never pointed at a regular file.
	        if os.path.isdir(buffer_folder):
	            shutil.rmtree(buffer_folder)


	if __name__ == '__main__':
	    # Input root scanned for subfolders; per the original author's note this
	    # is the output folder of the pdftoimage step. Categorized documents are
	    # moved under the output root.
	    folder_path, folder_output = 'input', 'output'

	    # Category name -> keywords handed to abc_1.compile_keywords.
	    categories_keywords_dict = dict(
	        AI=['Artificial', 'Intelligence'],
	        Automata=['finite', 'state', 'machines'],
	        DT=['game', 'theory'],
	    )
	    # Read as a module-level name by process_file.
	    compiled_keywords = abc_1.compile_keywords(categories_keywords_dict)

	    subfolder_names = get_subfolder_names(folder_path)
	    runOCR(subfolder_names)