"""Streamlit app that binarizes an uploaded image and runs Tesseract OCR on it."""

import os
import re
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract
import seaborn as sns
import skimage.io as io
import streamlit as st
from PIL import Image
from pytesseract import Output
from skimage.filters import threshold_local


def _ensure_tesseract():
    """Run the apt-get install of Tesseract only when the binary is missing."""
    # Streamlit re-executes this script on every interaction, so avoid
    # invoking the package manager again once `tesseract` is on PATH.
    if shutil.which("tesseract") is None:
        os.system('apt-get install tesseract-ocr')


_ensure_tesseract()


def plot_gray(image):
    """Show a single-channel image on a new 16x10 figure with a gray colormap.

    Returns the object produced by ``plt.imshow``.
    """
    plt.figure(figsize=(16, 10))
    return plt.imshow(image, cmap='Greys_r')


def plot_rgb(image):
    """Show a BGR image on a new 16x10 figure, converting it to RGB first.

    Returns the object produced by ``plt.imshow``.
    """
    plt.figure(figsize=(16, 10))
    return plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))


def bw_scanner(image, block_size=21, offset=5):
    """Binarize an image with a Gaussian local (adaptive) threshold.

    Args:
        image: BGR(A) image array, or an already-grayscale 2-D array.
        block_size: Neighbourhood size passed to ``threshold_local``.
        offset: Offset passed to ``threshold_local``.

    Returns:
        A 2-D ``uint8`` array in which pixels brighter than their local
        threshold are 255 and all others are 0.
    """
    # A 2-D array has no colour channels to collapse; cvtColor would reject it.
    if image.ndim == 2:
        gray = image
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    local_threshold = threshold_local(
        gray, block_size, offset=offset, method="gaussian"
    )
    return (gray > local_threshold).astype("uint8") * 255


def text_box_detection(image):
    """Draw a green rectangle around every box Tesseract reports for *image*.

    Args:
        image: Grayscale (2-D) or BGR(A) image array.

    Returns:
        A new RGB array with 2-pixel-wide green rectangles drawn on it; the
        input array is left untouched.
    """
    d = pytesseract.image_to_data(image, output_type=Output.DICT)
    # COLOR_BGR2RGB only accepts 3- or 4-channel input, so a binarized
    # (single-channel) image must be expanded with COLOR_GRAY2RGB instead.
    if image.ndim == 2:
        boxes = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    else:
        boxes = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    for x, y, w, h in zip(d['left'], d['top'], d['width'], d['height']):
        boxes = cv2.rectangle(boxes, (x, y), (x + w, y + h), (0, 255, 0), 2)
    return boxes


def ui():
    """Render the page: upload an image, show detected boxes and OCR text."""
    st.markdown("# Text Extraction")
    uploaded_file = st.file_uploader(
        "Upload an Image", type=['png', 'jpeg', 'jpg']
    )
    if uploaded_file is None:
        return

    image = Image.open(uploaded_file)
    # Grayscale/palette/other modes yield arrays the colour conversions below
    # cannot handle, so normalise everything except RGB/RGBA to RGB.
    if image.mode not in ("RGB", "RGBA"):
        image = image.convert("RGB")
    img_array = np.array(image)

    # PIL delivers RGB(A) channel order, while bw_scanner follows the OpenCV
    # BGR convention; swap the channels (dropping alpha) before binarizing.
    to_bgr = (
        cv2.COLOR_RGBA2BGR if img_array.shape[2] == 4 else cv2.COLOR_RGB2BGR
    )
    gray_image = bw_scanner(cv2.cvtColor(img_array, to_bgr))
    boxes = text_box_detection(gray_image)
    st.image(boxes, width=500, channels='RGB')

    extracted_text = pytesseract.image_to_string(img_array)
    st.markdown(f"Predicted Text {extracted_text}")


if __name__ == '__main__':
    ui()