| import os
|
| import abc_1
|
| import shutil
|
|
|
def get_subfolder_names(folder_path):
    """Return the names of the immediate subdirectories of ``folder_path``.

    Args:
        folder_path: Path of the directory to scan.

    Returns:
        A list of entry names (not full paths) for which
        ``os.path.isdir`` is true. An empty list is returned, after an
        error message is printed, when ``folder_path`` does not exist or
        is not a directory.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Error: Folder not found: {folder_path}")
        return []
    except NotADirectoryError:
        # os.listdir raises this when the path exists but is a regular
        # file; report it like a missing folder instead of crashing.
        print(f"Error: Not a folder: {folder_path}")
        return []

    return [
        entry
        for entry in entries
        if os.path.isdir(os.path.join(folder_path, entry))
    ]
|
|
|
|
|
def create_folder(folder_path):
    """Create ``folder_path`` (including missing parents) if it doesn't exist.

    Failures are reported on stdout instead of being raised, so the
    function always returns ``None``.

    Args:
        folder_path: Path of the directory to create.
    """
    try:
        # exist_ok=True replaces the racy "check, then create" sequence:
        # an already-existing directory is accepted silently, while a path
        # that exists but is NOT a directory still raises FileExistsError
        # (an OSError) and is reported below instead of being ignored.
        os.makedirs(folder_path, exist_ok=True)
    except OSError as e:
        print(f"Error creating folder {folder_path}: {e}")
|
|
|
|
|
def move_file(source_path, destination_path):
    """Move a file from ``source_path`` to ``destination_path``.

    Failures are reported on stdout instead of being raised.

    Args:
        source_path: Path of the existing file.
        destination_path: Full target path, including the file name. If
            this names an existing directory, ``shutil.move`` places the
            source inside that directory.
    """
    try:
        # shutil.move tries a plain rename first and falls back to
        # copy-then-delete, so the move also works when source and
        # destination live on different filesystems (where os.rename fails).
        shutil.move(source_path, destination_path)
    except OSError as e:
        # shutil.Error derives from OSError, so it is covered here as well.
        print(f"Error moving file {source_path} to {destination_path}: {e}")
|
|
|
|
|
def process_file(folder_path, name):
    """Run OCR on one entry, categorize it, and move its documents.

    The text returned by ``abc_1.use_ocr`` for ``folder_path/name`` is passed
    to ``abc_1.categorize_text_chunk`` together with the module-level
    ``compiled_keywords``. A folder named after the resulting category is
    created under the module-level ``folder_output``, and ``name.pdf`` and/or
    ``name.docx`` found in ``folder_path`` are moved into it.

    Args:
        folder_path: Directory containing the entry and its documents.
        name: Base name (without extension) of the entry to process.
    """
    ocr_text = abc_1.use_ocr(os.path.join(folder_path, name))
    category = abc_1.categorize_text_chunk(ocr_text, compiled_keywords)

    category_folder = os.path.join(folder_output, category)
    create_folder(category_folder)

    has_pdf, has_docx = check_file_existence(folder_path, name)

    # Same handling for both document types, PDF first and then DOCX.
    for is_present, extension in ((has_pdf, '.pdf'), (has_docx, '.docx')):
        if not is_present:
            continue
        document = name + extension
        move_file(
            os.path.join(folder_path, document),
            os.path.join(category_folder, document),
        )
        # Printed once move_file returns; move_file reports its own errors.
        print(f"File '{name}' categorized as '{category}' and moved to '{category_folder}'.")
|
|
|
|
|
def check_file_existence(folder_path, filename):
    """Tell whether ``folder_path`` holds a PDF and/or DOCX called ``filename``.

    Every directory entry is split with ``os.path.splitext``; an entry counts
    when its base name equals ``filename`` exactly. The extension comparison
    is case-sensitive.

    Args:
        folder_path: Directory whose entries are inspected.
        filename: Base name to look for, without extension.

    Returns:
        A ``(has_pdf, has_docx)`` tuple of booleans.
    """
    # Extensions of all entries whose base name matches the requested one.
    matching_extensions = {
        extension
        for stem, extension in map(os.path.splitext, os.listdir(folder_path))
        if stem == filename
    }
    return '.pdf' in matching_extensions, '.docx' in matching_extensions
|
|
|
def runOCR(subfolder_names):
    """Process every named subfolder, then delete it.

    For each name, ``process_file`` is called with the module-level
    ``folder_path``; afterwards the subfolder ``folder_path/name`` is removed
    if it is still present.

    Args:
        subfolder_names: Iterable of subfolder names located in the
            module-level ``folder_path``.
    """
    for name in subfolder_names:
        process_file(folder_path, name)

        # Build the path once with os.path.join instead of concatenating
        # with a hard-coded '/' separator.
        subfolder = os.path.join(folder_path, name)

        # shutil.rmtree only accepts directories, so check for exactly that.
        if os.path.isdir(subfolder):
            shutil.rmtree(subfolder)
|
|
|
|
|
if __name__ == '__main__':
    # Input and output roots. process_file and runOCR read these names as
    # module-level globals, so they must keep exactly these names.
    folder_path = 'input'
    folder_output = 'output'

    # Category name -> keywords passed to abc_1.compile_keywords.
    categories_keywords_dict = {
        'AI': ['Artificial', 'Intelligence'],
        'Automata': ['finite', 'state', 'machines'],
        'DT': ['game', 'theory'],
    }

    # Also read as a global by process_file.
    compiled_keywords = abc_1.compile_keywords(categories_keywords_dict)

    # Process every subfolder found directly under the input root.
    subfolder_names = get_subfolder_names(folder_path)
    runOCR(subfolder_names)
|
|
|
|
|
|
|