Do recognition and decoding in batch to speed up
Browse files
app.py
CHANGED
@@ -15,9 +15,10 @@ def process(path, progress = gr.Progress()):
|
|
15 |
# Load the model and processor
|
16 |
processor = TrOCRProcessor.from_pretrained(OCR_MODEL_PATH)
|
17 |
model = VisionEncoderDecoderModel.from_pretrained(OCR_MODEL_PATH)
|
18 |
-
|
19 |
# Open an image of handwritten text
|
20 |
image = Image.open(path).convert("RGB")
|
|
|
21 |
progress(0, desc="Extracting Text Lines")
|
22 |
try:
|
23 |
# Load the trained line detection model
|
@@ -25,23 +26,29 @@ def process(path, progress = gr.Progress()):
|
|
25 |
line_model = YOLO(cached_model_path)
|
26 |
except Exception as e:
|
27 |
print('Failed to load the line detection model: %s' % e)
|
28 |
-
|
29 |
results = line_model.predict(source = image)[0]
|
30 |
-
full_text = ""
|
31 |
boxes = results.boxes.xyxy
|
32 |
indices = boxes[:,1].sort().indices
|
33 |
boxes = boxes[indices]
|
34 |
-
|
|
|
35 |
#box = box + torch.tensor([-10,0, 10, 0])
|
36 |
box = [tensor.item() for tensor in box]
|
37 |
lineImg = image.crop(tuple(list(box)))
|
38 |
|
39 |
-
# Preprocess
|
40 |
pixel_values = processor(lineImg, return_tensors="pt").pixel_values
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
44 |
|
|
|
45 |
fix_spelling = pipeline("text2text-generation",model=CORRECTOR_PATH)
|
46 |
fixed_text = fix_spelling(full_text, max_new_tokens=len(full_text)+100)
|
47 |
fixed_text = fixed_text[0]['generated_text']
|
|
|
15 |
# Load the model and processor
|
16 |
processor = TrOCRProcessor.from_pretrained(OCR_MODEL_PATH)
|
17 |
model = VisionEncoderDecoderModel.from_pretrained(OCR_MODEL_PATH)
|
18 |
+
|
19 |
# Open an image of handwritten text
|
20 |
image = Image.open(path).convert("RGB")
|
21 |
+
|
22 |
progress(0, desc="Extracting Text Lines")
|
23 |
try:
|
24 |
# Load the trained line detection model
|
|
|
26 |
line_model = YOLO(cached_model_path)
|
27 |
except Exception as e:
|
28 |
print('Failed to load the line detection model: %s' % e)
|
29 |
+
|
30 |
results = line_model.predict(source = image)[0]
|
|
|
31 |
boxes = results.boxes.xyxy
|
32 |
indices = boxes[:,1].sort().indices
|
33 |
boxes = boxes[indices]
|
34 |
+
batch = []
|
35 |
+
for box in progress.tqdm(boxes, desc="Preprocessing"):
|
36 |
#box = box + torch.tensor([-10,0, 10, 0])
|
37 |
box = [tensor.item() for tensor in box]
|
38 |
lineImg = image.crop(tuple(list(box)))
|
39 |
|
40 |
+
# Preprocess
|
41 |
pixel_values = processor(lineImg, return_tensors="pt").pixel_values
|
42 |
+
batch.append(pixel_values)
|
43 |
+
|
44 |
+
#Predict and decode the entire batch
|
45 |
+
progress(0, desc="Recognizing..")
|
46 |
+
generated_ids = model.generate(torch.cat(batch))
|
47 |
+
progress(0, desc="Decoding (token -> str)")
|
48 |
+
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
49 |
+
full_text = " ".join(generated_text)
|
50 |
|
51 |
+
progress(0, desc="Correction..")
|
52 |
fix_spelling = pipeline("text2text-generation",model=CORRECTOR_PATH)
|
53 |
fixed_text = fix_spelling(full_text, max_new_tokens=len(full_text)+100)
|
54 |
fixed_text = fixed_text[0]['generated_text']
|