Spaces:

mdasad3617
/

lab-report-analyzer

Running

App Files Community

mdasad3617 committed on Dec 1, 2024

Commit

e97c412

verified ·

1 Parent(s): 0d68aff

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -193

app.py CHANGED Viewed

@@ -1,201 +1,34 @@
-import streamlit as st
-import logging
-from concurrent.futures import ThreadPoolExecutor
-import subprocess
-import sys
-# Attempt to import libraries, with fallback
-try:
-    import pytesseract
-    import cv2
-    import numpy as np
-    from PIL import Image
-    import fitz  # PyMuPDF for PDF processing
-    from transformers import pipeline
-except ImportError:
-    st.error("Required libraries are missing. Please install them using pip.")
-    st.stop()
-# Setup logging
-def setup_logging():
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(levelname)s - %(message)s",
-    )
-# Tesseract installation check and guide
-def check_tesseract():
-    try:
-        # Try to get Tesseract version
-        version = subprocess.check_output(['tesseract', '--version'],
-                                          stderr=subprocess.STDOUT).decode('utf-8')
-        return True
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        # Provide installation instructions based on operating system
-        st.error("Tesseract OCR is not installed.")
-        st.markdown("### Tesseract Installation Guide:")
-        if sys.platform.startswith('linux'):
-            st.code("""
-            # For Ubuntu/Debian
-            sudo apt-get update
-            sudo apt-get install -y tesseract-ocr
-            # For Fedora
-            sudo dnf install -y tesseract
-            # For CentOS/RHEL
-            sudo yum install -y tesseract
-            """)
-        elif sys.platform.startswith('darwin'):
-            st.code("""
-            # For macOS (using Homebrew)
-            brew install tesseract
-            """)
-        elif sys.platform.startswith('win'):
-            st.markdown("""
-            1. Download Tesseract installer from:
-               https://github.com/UB-Mannheim/tesseract/wiki
-            2. Run the installer
-            3. Add Tesseract directory to your system PATH
-            """)
-        st.info("After installation, restart your application.")
-        return False
-# Load models globally for faster performance
-@st.cache_resource
-def load_models():
-    logging.info("Loading Hugging Face models...")
-    # Translation models
-    translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
-    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
-    # Summarization model
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-    return translator_hi, translator_ur, summarizer
-# Function to preprocess image for better OCR
-def preprocess_image(image):
-    # Convert PIL Image to OpenCV format
-    img_np = np.array(image)
-    # Convert to grayscale
-    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
-    # Apply thresholding to preprocess the image
-    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
-    # Apply deskewing if needed
-    coords = np.column_stack(np.where(gray > 0))
-    # Prevent error if no foreground pixels found
-    if coords.size == 0:
-        return gray
-    angle = cv2.minAreaRect(coords)[-1]
-    # cv2.minAreaRect returns angle values in the range [-90, 0)
-    # so we need to take the inverse to get the rotation from the horizontal axis
-    if angle < -45:
-        angle = -(90 + angle)
-    else:
-        angle = -angle
-    # Rotate the image to deskew
-    (h, w) = gray.shape[:2]
-    center = (w // 2, h // 2)
-    M = cv2.getRotationMatrix2D(center, angle, 1.0)
-    rotated = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
-    return rotated
-# Function to extract text from images
-def extract_text_from_image(image):
-    logging.info("Extracting text from image...")
-    # Preprocess image
-    preprocessed_img = preprocess_image(image)
-    # Use pytesseract for OCR
-    text = pytesseract.image_to_string(preprocessed_img)
-    return text.strip()
-# Function to extract text from PDFs
-def extract_text_from_pdf(pdf_file):
-    logging.info("Extracting text from PDF...")
-    doc = fitz.open(pdf_file)
-    text = ""
-    for page in doc:
-        text += page.get_text()
-    return text
-# Function to process text in chunks for better performance
-def process_chunks(text, model, chunk_size=500):
-    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
-    results = []
-    with ThreadPoolExecutor() as executor:
-        results = list(executor.map(lambda chunk: model(chunk, max_length=200), chunks))
-    return " ".join([result[0]["translation_text"] for result in results])
-# Main app logic
-def main():
-    # Check Tesseract installation first
-    if not check_tesseract():
-        return
-    setup_logging()
-    st.title("Advanced Lab Report Analyzer")
-    st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")
-    # Load all models
-    translator_hi, translator_ur, summarizer = load_models()
-    file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])
-    if file:
-        text = ""
-        try:
-            if file.type in ["image/jpeg", "image/png", "image/jpg"]:
-                image = Image.open(file)
-                text = extract_text_from_image(image)
-            elif file.type == "application/pdf":
-                text = extract_text_from_pdf(file)
-            elif file.type == "text/plain":
-                text = file.read().decode("utf-8")
-            if text:
-                with st.spinner("Analyzing the report..."):
-                    # Generate summary
-                    summary = summarizer(text, max_length=130, min_length=30)[0]["summary_text"]
-                    # Generate translations
-                    hindi_translation = process_chunks(text, translator_hi)
-                    urdu_translation = process_chunks(text, translator_ur)
-                    # Display results
-                    st.subheader("Original Text:")
-                    st.write(text)
-                    st.subheader("Analysis Summary (English):")
-                    st.write(summary)
-                    st.subheader("Hindi Translation:")
-                    st.write(hindi_translation)
-                    st.subheader("Urdu Translation:")
-                    st.write(urdu_translation)
-            else:
-                st.warning("No text could be extracted. Please check the file and try again.")
-        except Exception as e:
-            logging.error(f"Error processing the file: {e}")
-            st.error(f"An error occurred while processing the file: {e}")
-    else:
-        st.info("Please upload a file to begin.")
 if __name__ == "__main__":
-    main()

+from models import initialize_models
+from models.pdf_handler import parse_pdf
+from models.image_handler import analyze_image
+from models.summarizer import summarize_text
+from models.translator import translate_text
+from models.problem_checker import flag_lab_problems
+def main():
+    # Initialize Hugging Face models
+    models = initialize_models()
+    # Example 1: Parse and summarize a PDF lab report
+    pdf_path = "example_lab_report.pdf"
+    pdf_text = parse_pdf(pdf_path)
+    print("Extracted Text from PDF:\n", pdf_text)
+    summary = summarize_text(pdf_text, models["summarize_model"])
+    print("\nSummary:\n", summary)
+    # Check for problems in the lab report
+    problems = flag_lab_problems(summary)
+    print("\nDetected Problems:\n", problems)
+    # Example 2: Translate the summary if needed
+    translated_summary = translate_text(summary, models["translation_model"])
+    print("\nTranslated Summary:\n", translated_summary)
+    # Example 3: Analyze an image
+    image_path = "example_lab_image.jpg"
+    image_results = analyze_image(image_path, models["image_model"])
+    print("\nImage Analysis Results:\n", image_results)
 if __name__ == "__main__":
+    main()