File size: 5,682 Bytes
6ff22d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import sys
import argparse
from ultralytics import YOLO
from os.path import basename, splitext
import time

from yolo_script import process_yolo

from script import process_image

def process_image_description(
    input_image: str,
    weights_file: str,
    output_dir: str,
    model_to_use: str = 'llama',
    save_images: bool = False,
    icon_detection_path: str = None,
    cache_directory: str = './models_cache',
    huggingface_token: str = 'your_token',
    no_captioning: bool = False,
    output_json: bool = False,
    json_mini: bool = False,
    model_obj: YOLO = None,
    sr=None,
    reader=None,
    spell=None,
    skip_ocr=False,
    skip_spell=False,
) -> None:
    """
    Processes an image by running YOLO detection (via the imported process_yolo function)
    and then calling process_image() from script.py to do the image description work.
    
    Parameters:
      - input_image: Path to the input image.
      - weights_file: Path to the YOLO weights file.
      - output_dir: Directory for YOLO output
      - model_to_use: Which model to use for captioning ('llama' or 'blip').
      - save_images: Whether to save intermediate images.
      - icon_detection_path: Optional path to an icon detection model.
      - cache_directory: Cache directory for models.
      - huggingface_token: Hugging Face token for model downloads.
      - no_captioning: If True, disable image captioning.
      - output_json: If True, output the results in JSON format.
      - json_mini: same as output_json but has more compact json output.
      - model_obj: YOLO object that was initialized at a startup time (optional)
      - sr: Super resolution object (optional)
      - reader: EasyOCR object (optional)
      - spell: Spell checker object (optional)
    """

    base_name = splitext(basename(input_image))[0]
    
    process_yolo(input_image, weights_file, output_dir, model_obj=model_obj)
    
    labels_dir = os.path.join(output_dir, 'labels')
    label_file = os.path.join(labels_dir, base_name + '.txt')

    if not os.path.isfile(label_file):
        raise FileNotFoundError(f"Labels file not found at expected path: {label_file}")

    process_image(
        input_image_path=input_image,
        yolo_output_path=label_file,
        output_dir=output_dir,
        model_to_use=model_to_use,
        save_images=save_images,
        icon_model_path=icon_detection_path,
        cache_directory=cache_directory,
        huggingface_token=huggingface_token,
        no_captioning=no_captioning,
        output_json=output_json,
        json_mini=json_mini,
        sr=sr,
        reader=reader,
        spell=spell,
        skip_ocr=skip_ocr,
        skip_spell=skip_spell,
    )

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Wrapper script to run YOLO detection and image description in sequence.'
    )
    parser.add_argument('--input_image', required=True, help='Path to the input image.')
    parser.add_argument('--weights_file', required=True, help='Path to the YOLO weights file.')
    parser.add_argument('--output_dir', default='./output', help='Output directory for YOLO results.')
    parser.add_argument('--model_to_use', choices=['llama', 'blip'], default='llama',
                        help='Model for captioning.')
    parser.add_argument('--save_images', action='store_true',
                        help='Flag to save intermediate images.')
    parser.add_argument('--icon_detection_path', help='Path to the icon detection model.')
    parser.add_argument('--cache_directory', default='./models_cache',
                        help='Cache directory for models.')
    parser.add_argument('--huggingface_token', default='your_token',
                        help='Hugging Face token for model downloads.')
    parser.add_argument('--no-captioning', action='store_true',
                        help='Disable any image captioning')
    parser.add_argument('--json', dest='output_json', action='store_true',
                        help='Output the image data in JSON format')
    parser.add_argument('--json-mini', action='store_true',
                        help='JSON output in a more condensed format')
    parser.add_argument('--skip-ocr', action='store_true',
                        help='Disable OCR & spell-checking (faster).')
    parser.add_argument('--skip-spell', action='store_true', help='Run OCR but skip spell-check')
    
    args = parser.parse_args()
    
    try:
        print("Running YOLO detection...")
        yolo_output_dir = args.output_dir
        os.makedirs(yolo_output_dir, exist_ok=True)
        process_yolo(args.input_image, args.weights_file, yolo_output_dir)

        base_name = splitext(basename(args.input_image))[0]
        labels_dir = os.path.join(yolo_output_dir, 'labels')
        label_file = os.path.join(labels_dir, base_name + '.txt')
        if not os.path.isfile(label_file):
            raise FileNotFoundError(f"Labels file not found: {label_file}")

        print("Running image description...")
        process_image(
            input_image_path=args.input_image,
            yolo_output_path=label_file,
            model_to_use=args.model_to_use,
            save_images=args.save_images,
            icon_model_path=args.icon_detection_path,
            cache_directory=args.cache_directory,
            huggingface_token=args.huggingface_token,
            no_captioning=args.no_captioning,
            output_json=args.output_json,
            json_mini=args.json_mini,
            skip_ocr=args.skip_ocr,
            skip_spell=args.skip_spell
        )
    except Exception as e:
        print(e)
        sys.exit(1)