File size: 4,218 Bytes
52e5488
2cca7fc
 
db0e40a
944093d
2cca7fc
944093d
2cca7fc
2b74719
1736fe6
1607f5b
1736fe6
944093d
 
db0e40a
 
3cb873e
2b74719
8364b2d
2b74719
8364b2d
2b74719
8364b2d
db0e40a
4f47c49
5054d5e
4f32ea6
cf0f262
e0b5c19
4f47c49
2cca7fc
 
 
2b74719
2cca7fc
 
 
f7c1cff
2cca7fc
 
63dc99a
2cca7fc
 
8210da7
 
2cca7fc
 
49ae11d
2cca7fc
8210da7
7e82a0b
2cca7fc
3cb873e
f7c1cff
2cca7fc
 
7e82a0b
2cca7fc
 
 
 
63dc99a
7e82a0b
 
 
2b74719
7e82a0b
 
52e5488
63dc99a
 
 
53d4231
63dc99a
 
 
2cca7fc
52e5488
 
2cca7fc
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
import polars as pl

# favourite_langs = {"English": "en", "Romanian": "ro", "German": "de", "-----": "-----"}
# Supported languages: display name -> two-letter code.
favourite_langs = dict(English="en", Romanian="ro", German="de")
# Display names, in declaration order; used as the language dropdown choices.
options = [*favourite_langs]
# Dataset configuration names offered in the "Select a dataset" dropdown.
models = ["ENRO", "DERO"]
# English, Romanian
def search_text(input_text, sselected_language, tselected_language, model_name, hits, toggle_case):
    """Search one column of a parallel-text dataset for a piece of text.

    The parquet export of the ``TiberiuCristianLeon/2RO`` dataset configuration
    named by ``model_name`` is read, rows whose source column contains
    ``input_text`` are kept, and the first ``hits`` of them are returned.

    Args:
        input_text: Text to look for. It is matched literally, so characters
            such as ``(``, ``+`` or ``.`` carry no special meaning.
        sselected_language: Name of the column to search in.
        tselected_language: Name of the column returned next to the source column.
        model_name: Dataset configuration name; lower-cased to build the URL.
        hits: Maximum number of rows to return.
        toggle_case: ``True`` for a case-sensitive search, ``False`` to ignore case.

    Returns:
        A ``(table, message)`` tuple. ``table`` is a polars DataFrame with the
        selected column(s), or ``None`` when the search could not be run;
        ``message`` is a status or error text.
    """
    if not input_text:
        # An empty needle matches every row; skip the download and say so instead.
        return None, 'Please enter some text to search for'

    path_to_model = f"https://huggingface.co/api/datasets/TiberiuCristianLeon/2RO/parquet/{model_name.lower()}/train/0.parquet"
    df = pl.read_parquet(path_to_model)

    # De-duplicate (keeping order): selecting the same column twice raises in polars,
    # which happened whenever source and target language were identical.
    wanted_columns = list(dict.fromkeys([sselected_language, tselected_language]))
    missing = [name for name in wanted_columns if name not in df.columns]
    if missing:
        # Report an unavailable language in the message field rather than crashing.
        return None, f"Column(s) {', '.join(missing)} not found in dataset {model_name}"

    haystack = pl.col(sselected_language)
    needle = input_text
    if not toggle_case:
        # Case-insensitive: compare lower-cased text on both sides.
        haystack = haystack.str.to_lowercase()
        needle = needle.lower()
    # literal=True keeps user input from being interpreted as a regular expression.
    filtered = df.filter(haystack.str.contains(needle, literal=True))

    # int(): guards against the slider handing over a float.
    list_of_arrays = filtered.select(wanted_columns).head(int(hits))
    message_text = f'Done! Found {len(list_of_arrays)} entries'
    return list_of_arrays, message_text

def swap_languages(src_lang, tgt_lang):
    """Return the two dropdown values in exchanged order: ``(tgt_lang, src_lang)``."""
    swapped = (tgt_lang, src_lang)
    return swapped

def create_interface():
    """Assemble the Gradio Blocks UI for searching the dataset.

    Creates the query box, language dropdowns, swap button, case checkbox,
    dataset dropdown, search button, results table, message field and hits
    slider, then wires the swap and search events.

    Returns:
        The ``gr.Blocks`` instance; it is not launched here.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## Search Text in Dataset")

        with gr.Row():
            query_box = gr.Textbox(
                label="Enter text to search:",
                placeholder="Type your text here...",
                info="Press Enter key to start search",
            )

        with gr.Row():
            source_dropdown = gr.Dropdown(
                choices=options,
                value=options[0],
                label="Source language",
                interactive=True,
            )
            target_dropdown = gr.Dropdown(
                choices=options,
                value=options[1],
                label="Target language",
                interactive=True,
            )
            swap_btn = gr.Button("Swap Languages")
            case_checkbox = gr.Checkbox(
                info="Case sensitive search",
                label="Toggle case sensitive search",
                value=True,
                interactive=True,
                visible=True,
            )

        dataset_dropdown = gr.Dropdown(
            choices=models,
            label="Select a dataset",
            value=models[0],
            interactive=True,
        )
        search_btn = gr.Button("Search")

        results_table = gr.Dataframe(
            label="Returned entries:",
            interactive=False,
            headers=[options[0], options[1]],
            datatype=["str", "str"],
            col_count=(2, "fixed"),
            type="polars",
            wrap=True,
            show_row_numbers=False,
            show_copy_button=True,
        )
        status_box = gr.Textbox(
            label="Messages:",
            placeholder="Display field for status and error messages",
            interactive=False,
        )

        # NOTE(review): with minimum=1 and step=5 the slider grid is 1, 6, 11, ...,
        # so the default of 10 is not on it -- confirm this is intended.
        hits_slider = gr.Slider(
            minimum=1,
            maximum=100,
            value=10,
            step=5,
            label="Number of returned hits",
        )

        # Event wiring: swap first, then the two ways of starting a search.
        language_pair = [source_dropdown, target_dropdown]
        swap_btn.click(fn=swap_languages, inputs=language_pair, outputs=language_pair)

        search_inputs = [
            query_box,
            source_dropdown,
            target_dropdown,
            dataset_dropdown,
            hits_slider,
            case_checkbox,
        ]
        search_outputs = [results_table, status_box]
        # Clicking "Search" and pressing Enter in the query box run the same search.
        for register in (search_btn.click, query_box.submit):
            register(search_text, inputs=search_inputs, outputs=search_outputs)

    return demo

if __name__ == "__main__":
    # Build the UI and start the Gradio server only when this file is run as a script.
    interface = create_interface()
    interface.launch()