import os
import subprocess
import tempfile

import gradio as gr

# Best-effort installation of the command-line tools that process_pdf shells
# out to: pdftoppm (poppler-utils) and tesseract with its English data.
# A failure here is reported but never aborts start-up, so the app can still
# launch on hosts where the tools are already present.
try:
    # Refresh the package lists so the install below can resolve packages.
    subprocess.run(['apt-get', 'update'], check=True)

    # Install all required packages in a single apt-get transaction.
    subprocess.run(
        [
            'apt-get', 'install', '-y',
            'poppler-utils',
            'tesseract-ocr',
            'tesseract-ocr-eng',
        ],
        check=True,
    )
except (subprocess.CalledProcessError, OSError) as e:
    # CalledProcessError: apt-get ran but exited non-zero.
    # OSError: apt-get could not be executed at all (missing binary,
    # permission denied), e.g. on a non-Debian host.
    print(f"An error occurred: {e}")
else:
    print("Packages installed successfully!")

def process_pdf(file):
    """Run OCR over every page of an uploaded PDF and write the text to a file.

    The PDF is rasterised to one PNG per page with ``pdftoppm``; each page is
    then passed to ``tesseract`` and the recognised text is concatenated, in
    page order, into a ``.txt`` file placed next to the input PDF.

    Args:
        file: The uploaded PDF, either as a path (``str`` / ``os.PathLike``)
            or as an object exposing the path through a ``name`` attribute.

    Returns:
        Path of the combined text file (the input path with its extension
        replaced by ``.txt``).

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
        subprocess.CalledProcessError: If ``pdftoppm`` or ``tesseract`` exits
            with a non-zero status.
        OSError: If either executable cannot be started.
    """
    if file is None:
        raise ValueError("No PDF file was provided.")

    # Accept a plain path as well as a file-like wrapper carrying ``.name``.
    if isinstance(file, (str, os.PathLike)):
        input_pdf = os.fspath(file)
    else:
        input_pdf = file.name

    # splitext copes with any extension length (or none), unlike slicing [:-4].
    output_txt_file = f"{os.path.splitext(input_pdf)[0]}.txt"

    # A private scratch directory keeps concurrent requests from clobbering
    # each other's page images and guarantees cleanup, even on failure,
    # without touching unrelated files in the working directory.
    with tempfile.TemporaryDirectory() as workdir:
        # Argument lists (no shell) so the uploaded file name can never be
        # interpreted as shell syntax.
        subprocess.run(
            ["pdftoppm", "-png", input_pdf, os.path.join(workdir, "img")],
            check=True,
        )

        # os.listdir() order is arbitrary; sorting by (length, name) yields
        # numeric page order whether or not the page numbers are zero-padded.
        page_images = sorted(
            (
                entry
                for entry in os.listdir(workdir)
                if entry.startswith("img") and entry.endswith(".png")
            ),
            key=lambda entry: (len(entry), entry),
        )

        with open(output_txt_file, "w", encoding="utf-8") as output_file:
            for page_image in page_images:
                # tesseract appends ".txt" to the output base name itself.
                ocr_base = os.path.join(workdir, f"ocr_{page_image}")
                subprocess.run(
                    ["tesseract", os.path.join(workdir, page_image), ocr_base],
                    check=True,
                )
                with open(f"{ocr_base}.txt", encoding="utf-8") as page_text:
                    output_file.write(page_text.read())
                output_file.write("\n")  # Separate consecutive pages

    return output_txt_file


# Web UI: a single file upload goes in, process_pdf runs on it, and the
# resulting text file is offered back as a download.
interface = gr.Interface(
    process_pdf,
    gr.File(),
    gr.File(),
    description="Upload a PDF, perform OCR on it.",
    title="PDF to Text with OCR",
)

# Start serving the UI (debug mode enabled).
interface.launch(debug=True)