# menu-text-detection/app.py at main · ryanlinjui/menu-text-detection · GitHub

# (web-page line-number gutter removed; the file content starts below)
import os
import json
import base64
import requests
from io import BytesIO
from typing import List

import gradio as gr
from PIL import Image
from dotenv import load_dotenv
from pillow_heif import register_heif_opener

from menu.llm import (
    GeminiAPI,
    OpenAIAPI
)
from menu.donut import DonutFinetuned

# Created once at import time; handle() calls donut_finetuned.predict() on it
# whenever the fine-tuned model is selected.
donut_finetuned = DonutFinetuned("ryanlinjui/donut-base-finetuned-menu")

# Registers pillow_heif's opener with Pillow (presumably so that Image.open
# can read HEIF/HEIC uploads — confirm against the pillow_heif docs).
register_heif_opener()
# override=True: values from the .env file take precedence over variables
# already present in the process environment.
load_dotenv(override=True)
# Default API tokens read from the environment ("" when unset). They are used
# to pre-fill the API token textbox when an LLM model is selected.
GEMINI_API_TOKEN = os.getenv("GEMINI_API_TOKEN", "")
OPENAI_API_TOKEN = os.getenv("OPENAI_API_TOKEN", "")

# Link target and badge image for the "GitHub Code" badge at the top of the page.
SOURCE_CODE_GH_URL = "https://github.com/ryanlinjui/menu-text-detection"
BADGE_URL = "https://img.shields.io/badge/GitHub_Code-Click_Here!!-default?logo=github"

# Base URL under which the example menu images are hosted.
GITHUB_RAW_URL = "https://raw.githubusercontent.com/ryanlinjui/menu-text-detection/main"
# One entry per example; each entry is itself a list of image URLs (a "batch")
# that replaces the current image list when the example is clicked.
EXAMPLE_IMAGE_LIST = [
    [f"{GITHUB_RAW_URL}/examples/menu-hd.jpg"],
    [f"{GITHUB_RAW_URL}/examples/menu-vs.jpg"],
    [f"{GITHUB_RAW_URL}/examples/menu-si.jpg"]
]
# Locally run models offered in the dropdown; handle() compares the selected
# model against FINETUNED_MODEL_LIST[0].
FINETUNED_MODEL_LIST = [
    "Donut (Document Parsing Task) Fine-tuned Model"
]
# Remote LLM models offered in the dropdown.
# NOTE: order matters. Code in this file selects the provider by position:
# the first five entries (LLM_MODEL_LIST[:5]) are routed to GeminiAPI and the
# remaining entries to OpenAIAPI. Keep that split when adding or removing models.
LLM_MODEL_LIST = [
    "gemini-3-pro-preview",
    "gemini-3-flash-preview",
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.0-flash",
    "gpt-4.1",
    "gpt-4o",
    "o4-mini"
]
# Custom stylesheet passed to launch() in the __main__ block. The class
# selectors below correspond to the elem_classes values assigned to components
# in UserInterface() (image-panel, large-text, control-row, page-info,
# upload-btn, clear-btn, image-container).
CSS_STYLE = """
    .image-panel img {
        max-height: 500px;
        margin-top: -100px;
    }
    .large-text textarea {
        font-size: 20px !important;
        height: 600px !important;
        width: 100% !important;
    }
    .control-row {
        margin-top: -10px !important;
        margin-bottom: -10px !important;
        align-items: center !important;
        justify-content: center !important;
    }
    .page-info {
        text-align: center !important;
        font-size: 20px !important;
        display: flex !important;
        align-items: center !important;
        justify-content: center !important;
        height: 100% !important;
        font-weight: 900 !important;
        color: #374151; /* Darker gray for clarity */
    }
    .page-info p {
        margin: 0 !important;
        width: 100% !important;
        text-align: center !important;
    }
    .upload-btn {
        margin-top: 2px !important;
        background-color: #e0f2fe !important; /* Light blue background */
        color: #0369a1 !important; /* Dark blue text */
        border: 1px solid #0ea5e9 !important;
    }
    .upload-btn:hover {
        background-color: #bae6fd !important;
    }
    .clear-btn {
        margin-top: 2px !important;
    }
    .image-container {
        height: 650px !important;
        display: flex;
        flex-direction: column;
        border: 1px solid #e5e7eb;
        border-radius: 8px;
        padding: 4px;
    }
"""

def _load_image(ref: str, timeout: float = 30.0) -> "Image.Image":
    """Open a single image reference as a PIL image.

    Args:
        ref: An ``http(s)://`` URL, a ``data:image/...;base64,`` URI, or any
            other string, which is handed to ``Image.open`` as a file path.
        timeout: Seconds to wait for the remote server when ``ref`` is a URL.

    Returns:
        The opened PIL image.

    Raises:
        gr.Error: If the image cannot be downloaded, decoded, or opened.
    """
    # NOTE(review): handle() is also exposed through gr.api, so ``ref`` is
    # untrusted input. Fetching arbitrary URLs and opening arbitrary local
    # paths on the server should be restricted if this app is deployed publicly.
    if ref.startswith(("http://", "https://")):
        try:
            # Always bound the request: without a timeout a stalled remote
            # host would block this worker indefinitely.
            response = requests.get(ref, timeout=timeout)
            response.raise_for_status()
            return Image.open(BytesIO(response.content))
        except Exception as e:
            raise gr.Error(f"Failed to load image from URL: {str(e)}") from e

    if ref.startswith("data:image/") and ";base64," in ref:
        try:
            _, encoded = ref.split(";base64,", 1)
            return Image.open(BytesIO(base64.b64decode(encoded)))
        except Exception as e:
            raise gr.Error(f"Failed to decode Base64 image: {str(e)}") from e

    try:
        return Image.open(ref)
    except Exception as e:
        # Report unreadable/missing files the same way as the other sources
        # instead of leaking a raw exception to the caller.
        raise gr.Error(f"Failed to open image file: {str(e)}") from e


def handle(images: List[str], model: str, api_token: str) -> str:
    """Run menu text detection on ``images`` and return the result as JSON text.

    Args:
        images: Image references; each may be an ``http(s)://`` URL, a
            ``data:image/...;base64,`` URI, or a file path.
        model: The selected model name. Must be ``FINETUNED_MODEL_LIST[0]`` or
            an entry of ``LLM_MODEL_LIST``.
        api_token: API token for the LLM provider. Only checked when ``model``
            is an entry of ``LLM_MODEL_LIST``; it must be at least 10
            characters long.

    Returns:
        The model result serialized with ``json.dumps`` (4-space indent,
        sorted keys, non-ASCII characters kept as-is).

    Raises:
        gr.Error: If no image is given, an image cannot be loaded, the token
            is too short, the API call fails, or ``model`` is not recognized.
    """
    if not images:
        raise gr.Error("Please upload an image first.")

    # Convert every reference to a PIL image up front.
    pil_images = [_load_image(img) for img in images]

    if model == FINETUNED_MODEL_LIST[0]:
        # The fine-tuned model is given only the first image.
        result = donut_finetuned.predict(pil_images[0])

    elif model in LLM_MODEL_LIST:
        # ``or ""`` keeps a missing (None) token from raising a TypeError.
        if len(api_token or "") < 10:
            raise gr.Error(f"Please provide a valid token for {model}.")
        try:
            # The first five LLM_MODEL_LIST entries are the Gemini models;
            # the rest are OpenAI models.
            if model in LLM_MODEL_LIST[:5]:
                result = GeminiAPI.call(pil_images, model, api_token)
            else:
                result = OpenAIAPI.call(pil_images, model, api_token)
        except Exception as e:
            raise gr.Error(f"Failed to process with API model {model}: {str(e)}") from e
    else:
        raise gr.Error("Invalid model selection. Please choose a valid model.")

    return json.dumps(result, indent=4, ensure_ascii=False, sort_keys=True)

def UserInterface() -> gr.Blocks:
    """Build the Gradio Blocks app for menu text detection.

    The page has an image panel with previous/next paging, upload and remove
    buttons, a model dropdown, an API token textbox (shown only for LLM
    models), clickable example images, and a textbox for the resulting JSON.
    ``handle`` is wired to the generate button and also registered as an API
    endpoint named ``run``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # delete_cache=(frequency, age) in seconds — see the Gradio Blocks docs.
    with gr.Blocks(delete_cache=(86400, 86400)) as gradio_interface:
        gr.HTML(f'<a href="{SOURCE_CODE_GH_URL}" target="_blank"><img src="{BADGE_URL}" alt="GitHub Code"/></a>')
        gr.Markdown("# Menu Text Detection")

        # Per-session state: the list of image paths/URLs and the index of
        # the image currently shown.
        images_state = gr.State([])
        current_index_state = gr.State(0)

        with gr.Row():
            with gr.Column(scale=1, min_width=500):
                gr.Markdown("## 📷 Menu Image")

                with gr.Column(elem_classes="image-container"):
                    menu_image_display = gr.Image(
                        label="Input menu image",
                        type="filepath",
                        elem_classes="image-panel",
                        interactive=False,
                        show_label=True,
                        height="100%",
                        width="100%"
                    )
                    with gr.Row(elem_classes="control-row"):
                        prev_btn = gr.Button("◀️ Previous", variant="secondary", scale=1)
                        with gr.Column(scale=2, min_width=50):
                            page_info = gr.Markdown("Page 1 / 1", elem_classes="page-info")
                        next_btn = gr.Button("Next ▶️", variant="secondary", scale=1)

                    with gr.Row():
                        upload_btn = gr.UploadButton(
                            "📷 Upload Menu Images",
                            file_types=["image"],
                            file_count="multiple",
                            scale=3,
                            elem_classes="upload-btn",
                            variant="primary"
                        )
                        clear_btn = gr.Button("🗑️ Remove", variant="stop", scale=1, elem_classes="clear-btn")

                gr.Markdown("## 🤖 Model Selection")
                model_choice_dropdown = gr.Dropdown(
                    choices=FINETUNED_MODEL_LIST + LLM_MODEL_LIST,
                    value=FINETUNED_MODEL_LIST[0],
                    label="Select Text Detection Model"
                )

                api_token_textbox = gr.Textbox(
                    label="API Token",
                    placeholder="Enter your API token here...",
                    type="password",
                    visible=False
                )

                generate_button = gr.Button("Generate Menu Information", variant="primary")
                # Hidden image component that only serves as the input target
                # for gr.Examples; the visible panel is updated by
                # on_example_click below.
                example_receiver = gr.Image(visible=False, label="Example Preview", type="filepath")

                examples_component = gr.Examples(
                    examples=[[img_list[0]] for img_list in EXAMPLE_IMAGE_LIST],
                    inputs=example_receiver,
                    label="Example Menu Images"
                )

            with gr.Column(scale=1):
                gr.Markdown("## 🍽️ Menu Info")
                menu_json_textbox = gr.Textbox(
                    label="Output JSON",
                    interactive=True,
                    text_align="left",
                    elem_classes="large-text"
                )

        def update_display(images, index):
            """Return (image to show, page label) for ``index``, clamped to the list bounds."""
            if not images:
                return None, "Page 1 / 1"
            idx = max(0, min(index, len(images) - 1))
            return images[idx], f"Page {idx + 1} / {len(images)}"

        def on_upload(new_files, current_images):
            """Append the uploaded files and show the last image in the list."""
            # Build a new list rather than extending the incoming one in
            # place, so a list object shared with anything else (for example
            # an EXAMPLE_IMAGE_LIST entry) is never mutated.
            updated_images = list(current_images or [])
            if new_files:
                updated_images.extend(f.name for f in new_files)
            # max(0, ...) keeps the stored index valid when the list is empty.
            new_index = max(0, len(updated_images) - 1)
            img, info = update_display(updated_images, new_index)
            return updated_images, new_index, img, info

        upload_btn.upload(
            fn=on_upload,
            inputs=[upload_btn, images_state],
            outputs=[images_state, current_index_state, menu_image_display, page_info]
        )

        def on_clear(images, index):
            """Remove the image at ``index`` and show the image that takes its place."""
            if not images:
                return [], 0, None, "Page 1 / 1"

            new_images = list(images)
            if 0 <= index < len(new_images):
                new_images.pop(index)

            if not new_images:
                return [], 0, None, "Page 1 / 1"

            # If the last image was removed, step back to the new last one.
            new_index = min(index, len(new_images) - 1)

            img, info = update_display(new_images, new_index)
            return new_images, new_index, img, info

        clear_btn.click(
            fn=on_clear,
            inputs=[images_state, current_index_state],
            outputs=[images_state, current_index_state, menu_image_display, page_info]
        )

        def on_prev(images, index):
            """Move one image back, stopping at the first image."""
            if not images:
                return 0, None, "Page 1 / 1"
            new_index = max(0, index - 1)
            img, info = update_display(images, new_index)
            return new_index, img, info

        def on_next(images, index):
            """Move one image forward, stopping at the last image."""
            if not images:
                return 0, None, "Page 1 / 1"
            new_index = min(len(images) - 1, index + 1)
            img, info = update_display(images, new_index)
            return new_index, img, info

        prev_btn.click(on_prev, [images_state, current_index_state], [current_index_state, menu_image_display, page_info])
        next_btn.click(on_next, [images_state, current_index_state], [current_index_state, menu_image_display, page_info])

        def on_example_click(evt: gr.SelectData):
            """Replace the current images with the batch of the clicked example."""
            if evt.index is None:
                return [], 0, None, "Page 1 / 1"

            # Retrieve the full batch based on the clicked index. Copy it so
            # the session state never aliases the module-level
            # EXAMPLE_IMAGE_LIST entry.
            if 0 <= evt.index < len(EXAMPLE_IMAGE_LIST):
                current_images = list(EXAMPLE_IMAGE_LIST[evt.index])
            else:
                current_images = []

            new_index = 0
            img, info = update_display(current_images, new_index)
            return current_images, new_index, img, info

        examples_component.dataset.select(
            fn=on_example_click,
            inputs=None,
            outputs=[images_state, current_index_state, menu_image_display, page_info]
        )

        def update_token_visibility(choice):
            """Show the token textbox for LLM models, pre-filled from the environment; hide it otherwise."""
            if choice not in LLM_MODEL_LIST:
                return gr.update(visible=False)

            # The first five LLM_MODEL_LIST entries are the Gemini models;
            # the rest are OpenAI models.
            if choice in LLM_MODEL_LIST[:5]:
                current_token = GEMINI_API_TOKEN
            else:
                current_token = OPENAI_API_TOKEN
            # NOTE(review): this places the server-side token into the
            # textbox value, i.e. it is sent to the client. Reconsider before
            # exposing the app to untrusted users.
            return gr.update(visible=True, value=current_token)

        model_choice_dropdown.change(
            fn=update_token_visibility,
            inputs=model_choice_dropdown,
            outputs=api_token_textbox
        )

        generate_button.click(
            fn=handle,
            inputs=[images_state, model_choice_dropdown, api_token_textbox],
            outputs=menu_json_textbox
        )

        # Also expose handle() as an API endpoint named "run".
        gr.api(
            fn=handle,
            api_name="run"
        )

    return gradio_interface

if __name__ == "__main__":
    # Build the Blocks app and start serving it; the custom stylesheet is
    # supplied at launch time.
    demo = UserInterface()
    demo.launch(css=CSS_STYLE)