import os
import subprocess
import tempfile

import gradio as gr

# Install the system tools the OCR pipeline shells out to: poppler-utils
# provides `pdftoppm`, tesseract-ocr provides `tesseract`, and
# tesseract-ocr-eng provides the English language data.
try:
    # Update the package lists
    subprocess.run(['apt-get', 'update'], check=True)
    # Install the required packages
    for package in ('poppler-utils', 'tesseract-ocr', 'tesseract-ocr-eng'):
        subprocess.run(['apt-get', 'install', '-y', package], check=True)
    print("Packages installed successfully!")
except (subprocess.CalledProcessError, OSError) as e:
    # OSError covers hosts without apt-get; the tools may already be present,
    # so report the problem and let the app start anyway.
    print(f"An error occurred: {e}")


def _run_tool(command):
    """Run an external command without a shell and report whether it succeeded.

    Failures are printed rather than raised so that one bad page (or a
    missing binary) does not abort the whole request.
    """
    try:
        completed = subprocess.run(command, check=False)
    except OSError as e:
        print(f"Could not run {command[0]}: {e}")
        return False
    if completed.returncode != 0:
        print(f"{command[0]} exited with status {completed.returncode}")
        return False
    return True


def _page_sort_key(image_name):
    """Return a sort key that orders `img-<page>.png` names by page number."""
    page = image_name[len('img'):-len('.png')].lstrip('-')
    return (int(page) if page.isdigit() else 0, image_name)


def process_pdf(file):
    """Convert an uploaded PDF to a text file by running OCR on every page.

    Each page is rendered to a PNG with `pdftoppm`, recognised with
    `tesseract`, and the per-page text is concatenated in page order with a
    newline after each page.

    Args:
        file: The upload received from the Gradio File input. Either an
            object exposing the path as `.name` or a plain path string.

    Returns:
        Path of the combined text file, written next to the uploaded PDF with
        its extension replaced by `.txt`. Returns None when nothing was
        uploaded.
    """
    if file is None:
        return None

    # Accept both a file-like object with `.name` and a bare path string.
    input_pdf = getattr(file, 'name', file)
    output_txt_file = f"{os.path.splitext(input_pdf)[0]}.txt"

    # Keep intermediate files in a private directory so concurrent requests
    # cannot clobber each other and cleanup never touches unrelated files.
    with tempfile.TemporaryDirectory() as workdir:
        # Argument lists (no shell) keep the uploaded file name from being
        # interpreted as shell syntax.
        _run_tool(['pdftoppm', '-png', input_pdf, os.path.join(workdir, 'img')])

        images = sorted(
            (
                name for name in os.listdir(workdir)
                if name.startswith('img') and name.endswith('.png')
            ),
            key=_page_sort_key,
        )

        with open(output_txt_file, 'w', encoding='utf-8') as output_file:
            for image in images:
                # Tesseract appends ".txt" to the output base itself. No -l
                # flag is passed, so its default language is used.
                output_base = os.path.join(workdir, f"ocr_{image}")
                _run_tool(['tesseract', os.path.join(workdir, image), output_base])

                text_file = f"{output_base}.txt"
                if not os.path.exists(text_file):
                    # OCR failed for this page; skip it and keep the rest.
                    continue
                with open(text_file, 'r', encoding='utf-8') as f:
                    output_file.write(f.read())
                output_file.write("\n")  # Separate the text of each page

    return output_txt_file


# Example Gradio Interface
interface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(),
    outputs=gr.File(),
    title="PDF to Text with OCR",
    description="Upload a PDF, perform OCR on it."
)

# Launch the interface
interface.launch(debug=True)