mdasad3617 committed on
Commit
e97c412
·
verified ·
1 Parent(s): 0d68aff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -193
app.py CHANGED
@@ -1,201 +1,34 @@
1
- import streamlit as st
2
- import logging
3
- from concurrent.futures import ThreadPoolExecutor
4
- import subprocess
5
- import sys
 
6
 
7
- # Attempt to import libraries, with fallback
8
- try:
9
- import pytesseract
10
- import cv2
11
- import numpy as np
12
- from PIL import Image
13
- import fitz # PyMuPDF for PDF processing
14
- from transformers import pipeline
15
- except ImportError:
16
- st.error("Required libraries are missing. Please install them using pip.")
17
- st.stop()
18
-
19
- # Setup logging
20
- def setup_logging():
21
- logging.basicConfig(
22
- level=logging.INFO,
23
- format="%(asctime)s - %(levelname)s - %(message)s",
24
- )
25
-
26
- # Tesseract installation check and guide
27
- def check_tesseract():
28
- try:
29
- # Try to get Tesseract version
30
- version = subprocess.check_output(['tesseract', '--version'],
31
- stderr=subprocess.STDOUT).decode('utf-8')
32
- return True
33
- except (subprocess.CalledProcessError, FileNotFoundError):
34
- # Provide installation instructions based on operating system
35
- st.error("Tesseract OCR is not installed.")
36
- st.markdown("### Tesseract Installation Guide:")
37
-
38
- if sys.platform.startswith('linux'):
39
- st.code("""
40
- # For Ubuntu/Debian
41
- sudo apt-get update
42
- sudo apt-get install -y tesseract-ocr
43
-
44
- # For Fedora
45
- sudo dnf install -y tesseract
46
-
47
- # For CentOS/RHEL
48
- sudo yum install -y tesseract
49
- """)
50
- elif sys.platform.startswith('darwin'):
51
- st.code("""
52
- # For macOS (using Homebrew)
53
- brew install tesseract
54
- """)
55
- elif sys.platform.startswith('win'):
56
- st.markdown("""
57
- 1. Download Tesseract installer from:
58
- https://github.com/UB-Mannheim/tesseract/wiki
59
- 2. Run the installer
60
- 3. Add Tesseract directory to your system PATH
61
- """)
62
-
63
- st.info("After installation, restart your application.")
64
- return False
65
-
66
- # Load models globally for faster performance
67
- @st.cache_resource
68
- def load_models():
69
- logging.info("Loading Hugging Face models...")
70
-
71
- # Translation models
72
- translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
73
- translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
74
-
75
- # Summarization model
76
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
77
-
78
- return translator_hi, translator_ur, summarizer
79
-
80
- # Function to preprocess image for better OCR
81
- def preprocess_image(image):
82
- # Convert PIL Image to OpenCV format
83
- img_np = np.array(image)
84
-
85
- # Convert to grayscale
86
- gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
87
-
88
- # Apply thresholding to preprocess the image
89
- gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
90
-
91
- # Apply deskewing if needed
92
- coords = np.column_stack(np.where(gray > 0))
93
-
94
- # Prevent error if no foreground pixels found
95
- if coords.size == 0:
96
- return gray
97
-
98
- angle = cv2.minAreaRect(coords)[-1]
99
-
100
- # The cv2.minAreaRect returns values in the range [:-90, 0)
101
- # so we need to take the inverse to get the rotation from the horizontal axis
102
- if angle < -45:
103
- angle = -(90 + angle)
104
- else:
105
- angle = -angle
106
-
107
- # Rotate the image to deskew
108
- (h, w) = gray.shape[:2]
109
- center = (w // 2, h // 2)
110
- M = cv2.getRotationMatrix2D(center, angle, 1.0)
111
- rotated = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
112
-
113
- return rotated
114
-
115
- # Function to extract text from images
116
- def extract_text_from_image(image):
117
- logging.info("Extracting text from image...")
118
-
119
- # Preprocess image
120
- preprocessed_img = preprocess_image(image)
121
-
122
- # Use pytesseract for OCR
123
- text = pytesseract.image_to_string(preprocessed_img)
124
 
125
- return text.strip()
 
 
 
126
 
127
- # Function to extract text from PDFs
128
- def extract_text_from_pdf(pdf_file):
129
- logging.info("Extracting text from PDF...")
130
- doc = fitz.open(pdf_file)
131
- text = ""
132
- for page in doc:
133
- text += page.get_text()
134
- return text
135
 
136
- # Function to process text in chunks for better performance
137
- def process_chunks(text, model, chunk_size=500):
138
- chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
139
- results = []
140
- with ThreadPoolExecutor() as executor:
141
- results = list(executor.map(lambda chunk: model(chunk, max_length=200), chunks))
142
- return " ".join([result[0]["translation_text"] for result in results])
143
 
144
- # Main app logic
145
- def main():
146
- # Check Tesseract installation first
147
- if not check_tesseract():
148
- return
149
 
150
- setup_logging()
151
- st.title("Advanced Lab Report Analyzer")
152
- st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")
153
-
154
- # Load all models
155
- translator_hi, translator_ur, summarizer = load_models()
156
-
157
- file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])
158
-
159
- if file:
160
- text = ""
161
- try:
162
- if file.type in ["image/jpeg", "image/png", "image/jpg"]:
163
- image = Image.open(file)
164
- text = extract_text_from_image(image)
165
- elif file.type == "application/pdf":
166
- text = extract_text_from_pdf(file)
167
- elif file.type == "text/plain":
168
- text = file.read().decode("utf-8")
169
-
170
- if text:
171
- with st.spinner("Analyzing the report..."):
172
- # Generate summary
173
- summary = summarizer(text, max_length=130, min_length=30)[0]["summary_text"]
174
-
175
- # Generate translations
176
- hindi_translation = process_chunks(text, translator_hi)
177
- urdu_translation = process_chunks(text, translator_ur)
178
-
179
- # Display results
180
- st.subheader("Original Text:")
181
- st.write(text)
182
-
183
- st.subheader("Analysis Summary (English):")
184
- st.write(summary)
185
-
186
- st.subheader("Hindi Translation:")
187
- st.write(hindi_translation)
188
-
189
- st.subheader("Urdu Translation:")
190
- st.write(urdu_translation)
191
- else:
192
- st.warning("No text could be extracted. Please check the file and try again.")
193
-
194
- except Exception as e:
195
- logging.error(f"Error processing the file: {e}")
196
- st.error(f"An error occurred while processing the file: {e}")
197
- else:
198
- st.info("Please upload a file to begin.")
199
 
200
  if __name__ == "__main__":
201
- main()
 
1
+ from models import initialize_models
2
+ from models.pdf_handler import parse_pdf
3
+ from models.image_handler import analyze_image
4
+ from models.summarizer import summarize_text
5
+ from models.translator import translate_text
6
+ from models.problem_checker import flag_lab_problems
7
 
8
+ def main():
9
+ # Initialize Hugging Face models
10
+ models = initialize_models()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
+ # Example 1: Parse and summarize a PDF lab report
13
+ pdf_path = "example_lab_report.pdf"
14
+ pdf_text = parse_pdf(pdf_path)
15
+ print("Extracted Text from PDF:\n", pdf_text)
16
 
17
+ summary = summarize_text(pdf_text, models["summarize_model"])
18
+ print("\nSummary:\n", summary)
 
 
 
 
 
 
19
 
20
+ # Check for problems in the lab report
21
+ problems = flag_lab_problems(summary)
22
+ print("\nDetected Problems:\n", problems)
 
 
 
 
23
 
24
+ # Example 2: Translate the summary if needed
25
+ translated_summary = translate_text(summary, models["translation_model"])
26
+ print("\nTranslated Summary:\n", translated_summary)
 
 
27
 
28
+ # Example 3: Analyze an image
29
+ image_path = "example_lab_image.jpg"
30
+ image_results = analyze_image(image_path, models["image_model"])
31
+ print("\nImage Analysis Results:\n", image_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  if __name__ == "__main__":
34
+ main()