Spaces:

latticetower
/

keyword-embeddings-space

Sleeping

App Files Files

latticetower committed on Feb 19

Commit

b40aac1

1 Parent(s): 2d1d8cb

fix axhline in plots, use a common legend in Gradio, react to model selection and load plots on launch

Browse files

Files changed (4) hide show

app.py +70 -9
constants.py +17 -0
mpl_data_plotter.py +28 -23
plot_utils.py +1 -1

app.py CHANGED Viewed

@@ -17,11 +17,13 @@ def convert_int64_to_int32(df):
 print(f"Loading domains data...")
 single_df = pd.read_csv(SINGLE_DOMAINS_FILE, compression='gzip')
-single_df['biosyn_class_index'] = single_df.bgc_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
 single_df = convert_int64_to_int32(single_df)
 pair_df = pd.read_csv(PAIR_DOMAINS_FILE, compression='gzip')
-pair_df['biosyn_class_index'] = pair_df.bgc_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
 pair_df = convert_int64_to_int32(pair_df)
 num_domains_in_region_df = single_df.groupby('cds_region_id', as_index=False).agg({'as_domain_id': 'count'}).rename(
@@ -33,6 +35,53 @@ print(f"Initializing data plotter...")
 data_plotter = MatplotlibDataPlotter(single_df, pair_df, num_domains_in_region_df)
 def update_all_plots(frequency, split_name):
     """Redraw both figures and return them as a (single, pair) tuple.
     Each figure comes from the matching ``data_plotter`` method, called
     with ``frequency`` and ``split_name``.
     """
     single_fig = data_plotter.plot_single_domains(frequency, split_name)
     pair_fig = data_plotter.plot_pair_domains(frequency, split_name)
     return single_fig, pair_fig
@@ -43,6 +92,8 @@ with gr.Blocks(title="BGC Keyword Plotter") as demo:
     gr.Markdown("## BGC Keyword Plotter")
     gr.Markdown("Select the model name and minimal number of domains in Antismash-db subset.")
     with gr.Row():
         frequency_slider = gr.Slider(
             minimum=int(unique_domain_lengths.min()),
@@ -51,14 +102,13 @@ with gr.Blocks(title="BGC Keyword Plotter") as demo:
             value=int(unique_domain_lengths.min()),
             label="Min number of domains"
         )
     with gr.Row():
-        with gr.Column():
-            split_selector = gr.Dropdown(
-                choices=["stratified"] + BIOSYN_CLASS_NAMES,
-                value="stratified",
-                label="Split name"
-            )
         with gr.Column():
             single_domains_plot = gr.Plot(
                 label="Single domains",
@@ -80,11 +130,22 @@ with gr.Blocks(title="BGC Keyword Plotter") as demo:
     frequency_slider.release(
         fn=update_all_plots,
-        inputs=[frequency_slider, split_selector],
         outputs=[single_domains_plot, pair_domains_plot]#, cosine_plot]
     )
 print(f"Launching!...")
 demo.launch()
 # demo.load(filter_map, [min_price, max_price, boroughs], map)

 print(f"Loading domains data...")
 single_df = pd.read_csv(SINGLE_DOMAINS_FILE, compression='gzip')
+single_df.rename(columns={'bgc_class': 'biosyn_class'}, inplace=True)
+single_df['biosyn_class_index'] = single_df.biosyn_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
 single_df = convert_int64_to_int32(single_df)
 pair_df = pd.read_csv(PAIR_DOMAINS_FILE, compression='gzip')
+pair_df.rename(columns={'bgc_class': 'biosyn_class'}, inplace=True)
+pair_df['biosyn_class_index'] = pair_df.biosyn_class.apply(lambda x: BIOSYN_CLASS_NAMES.index(x))
 pair_df = convert_int64_to_int32(pair_df)
 num_domains_in_region_df = single_df.groupby('cds_region_id', as_index=False).agg({'as_domain_id': 'count'}).rename(
 data_plotter = MatplotlibDataPlotter(single_df, pair_df, num_domains_in_region_df)
+def create_color_legend(class_to_color):
+    """Build a Gradio HTML component showing one color swatch per class.
+
+    Args:
+        class_to_color: Mapping of class name to a CSS color string (e.g. a
+            hex code). Entries are rendered in the mapping's iteration order.
+
+    Returns:
+        A ``gr.HTML`` component wrapping the legend markup.
+    """
+    # Function-scope import: nothing else in this block needs it.
+    from html import escape
+
+    # Opening markup: outer card, title, and the flex row that holds entries.
+    header_html = """
+        <div style="
+            margin: 10px 0;
+            padding: 10px;
+            border: 1px solid #ddd;
+            border-radius: 4px;
+            background: white;
+        ">
+            <div style="
+                font-weight: bold;
+                margin-bottom: 8px;
+            ">Color Legend:</div>
+            <div style="
+                display: flex;
+                flex-wrap: wrap;
+                gap: 15px;
+                align-items: center;
+            ">
+    """
+    # Closes the flex row and the outer card.
+    footer_html = """
+            </div>
+        </div>
+    """
+    # Escape both interpolated values so a name or color containing
+    # <, >, & or quotes cannot break out of the surrounding markup.
+    # Collect the entries in a list and join once instead of repeated +=.
+    entries_html = [
+        f"""
+            <div style="
+                display: flex;
+                align-items: center;
+                gap: 5px;
+            ">
+                <div style="
+                    width: 20px;
+                    height: 20px;
+                    background-color: {escape(str(color))};
+                    border-radius: 3px;
+                "></div>
+                <span>{escape(str(class_name))}</span>
+            </div>
+        """
+        for class_name, color in class_to_color.items()
+    ]
+    return gr.HTML(header_html + "".join(entries_html) + footer_html)
 def update_all_plots(frequency, split_name):
     """Redraw both figures and return them as a (single, pair) tuple.
     Each figure comes from the matching ``data_plotter`` method, called
     with ``frequency`` and ``split_name``.
     """
     single_fig = data_plotter.plot_single_domains(frequency, split_name)
     pair_fig = data_plotter.plot_pair_domains(frequency, split_name)
     return single_fig, pair_fig
     gr.Markdown("## BGC Keyword Plotter")
     gr.Markdown("Select the model name and minimal number of domains in Antismash-db subset.")
+    color_legend = create_color_legend(BIOSYN_CLASS_HEX_COLORS)
     with gr.Row():
         frequency_slider = gr.Slider(
             minimum=int(unique_domain_lengths.min()),
             value=int(unique_domain_lengths.min()),
             label="Min number of domains"
         )
+        model_selector = gr.Radio(
+            choices=["stratified"] + BIOSYN_CLASS_NAMES,
+            value="stratified",
+            label="Model name"
+        )
     with gr.Row():
         with gr.Column():
             single_domains_plot = gr.Plot(
                 label="Single domains",
     frequency_slider.release(
         fn=update_all_plots,
+        inputs=[frequency_slider, model_selector],
         outputs=[single_domains_plot, pair_domains_plot]#, cosine_plot]
     )
+    demo.load(
+        fn=update_all_plots,
+        inputs=[frequency_slider, model_selector],
+        outputs=[single_domains_plot, pair_domains_plot]
+    )
+    model_selector.input(
+        fn=update_all_plots,
+        inputs=[frequency_slider, model_selector],
+        outputs=[single_domains_plot, pair_domains_plot]
+    )
 print(f"Launching!...")
 demo.launch()
 # demo.load(filter_map, [min_price, max_price, boroughs], map)

constants.py CHANGED Viewed

@@ -1,4 +1,6 @@
 POSTER_BLUE = '#01589C'
 BIOSYN_CLASS_NAMES = ['Alkaloid', 'NRP', 'Polyketide', 'RiPP', 'Saccharide', 'Terpene', "Other"]
@@ -6,3 +8,18 @@ BIOSYN_CLASS_NAMES = ['Alkaloid', 'NRP', 'Polyketide', 'RiPP', 'Saccharide', 'Te
 SINGLE_DOMAINS_FILE = 'data/single_domains.csv.gz'
 PAIR_DOMAINS_FILE = 'data/pair_domains.csv.gz'

+import seaborn as sns
 POSTER_BLUE = '#01589C'
 BIOSYN_CLASS_NAMES = ['Alkaloid', 'NRP', 'Polyketide', 'RiPP', 'Saccharide', 'Terpene', "Other"]
 SINGLE_DOMAINS_FILE = 'data/single_domains.csv.gz'
 PAIR_DOMAINS_FILE = 'data/pair_domains.csv.gz'
+BIOSYN_CLASS_HEX_COLORS = {
+    'Alkaloid': '#a1c9f4',
+    'NRP': '#ffb482',
+    'Polyketide': '#8de5a1',
+    'RiPP': '#ff9f9b',
+    'Saccharide': '#d0bbff',
+    'Terpene': '#debb9b',
+    'Other': '#cfcfcf',
+    # 'stratified': '#01589C', # just in case
+}
+COLOR_PALETTE = sns.color_palette([
+    BIOSYN_CLASS_HEX_COLORS[biosyn_class]
+    for biosyn_class in BIOSYN_CLASS_NAMES
+])

mpl_data_plotter.py CHANGED Viewed

@@ -21,7 +21,12 @@ class MatplotlibDataPlotter:
         selected_region_ids = self.num_domains_in_region_df.loc[
             self.num_domains_in_region_df.num_domains >= num_domains,
             'cds_region_id'].values
         single_df_subset = self.single_df.loc[self.single_df.cds_region_id.isin(selected_region_ids)]
         # split_name = 'stratified'
         column_name = f'cosine_similarity_{split_name}'
         # single_df_subset = single_df.loc[single_df.dom_location_len >= num_domains]
@@ -35,12 +40,8 @@ class MatplotlibDataPlotter:
         bin_width=1
         hue_group_offset=0.5
         # hue_order=BIOSYN_CLASS_NAMES
-        hue2count={}
         width=0.9
-        show_legend=True
-        print(matplotlib.get_backend())
         fig = self.single_domains_fig
         fig.clf()
@@ -48,23 +49,29 @@ class MatplotlibDataPlotter:
         plot_utils.draw_barplots(
             targets_list,
             label_list=label_list,
-            top_n=5,
-            bin_width=1,
-            hue_group_offset=0.5,
             hue_order=BIOSYN_CLASS_NAMES,
-            hue2count={},
-            width=0.9,
             ax=ax,
-            show_legend=True
         )
-        plt.tight_layout()
-        return fig # plt.gcf()
     def plot_pair_domains(self, num_domains, split_name):
         selected_region_ids = self.num_domains_in_region_df.loc[
             self.num_domains_in_region_df.num_domains >= num_domains,
             'cds_region_id'].values
         pair_df_subset = self.pair_df.loc[self.pair_df.cds_region_id.isin(selected_region_ids)]
         # split_name = 'stratified'
         column_name = f'cosine_similarity_{split_name}'
         # pair_df_subset = pair_df.loc[pair_df.dom_location_len >= num_domains]
@@ -83,27 +90,25 @@ class MatplotlibDataPlotter:
         hue2count={}
         width=0.9
-        show_legend=True
-        # fig = plt.figure(figsize=(5, 10))
         fig = self.pair_domains_fig
-        # fig = plt.gcf()
         fig.clf()
-        print(matplotlib.get_backend())
         ax = fig.gca()
         plot_utils.draw_barplots(
             targets_list,
             label_list=label_list,
-            top_n=5,
-            bin_width=1,
-            hue_group_offset=0.5,
             hue_order=BIOSYN_CLASS_NAMES,
-            hue2count={},
-            width=0.9,
             ax=ax,
-            show_legend=True
         )
-        plt.tight_layout()
         return fig  #plt.gcf()

         selected_region_ids = self.num_domains_in_region_df.loc[
             self.num_domains_in_region_df.num_domains >= num_domains,
             'cds_region_id'].values
         single_df_subset = self.single_df.loc[self.single_df.cds_region_id.isin(selected_region_ids)]
+        biosyn_counts_single = single_df_subset[['cds_region_id', 'biosyn_class']].drop_duplicates().groupby("biosyn_class", as_index=False).count()
+        hue2count_single = dict(biosyn_counts_single.values)
         # split_name = 'stratified'
         column_name = f'cosine_similarity_{split_name}'
         # single_df_subset = single_df.loc[single_df.dom_location_len >= num_domains]
         bin_width=1
         hue_group_offset=0.5
         # hue_order=BIOSYN_CLASS_NAMES
         width=0.9
         fig = self.single_domains_fig
         fig.clf()
         plot_utils.draw_barplots(
             targets_list,
             label_list=label_list,
+            top_n=top_n,
+            bin_width=bin_width,
+            hue_group_offset=hue_group_offset,
             hue_order=BIOSYN_CLASS_NAMES,
+            hue2count=hue2count_single,
+            width=width,
             ax=ax,
+            show_legend=False,
+            palette=COLOR_PALETTE
         )
+        fig.tight_layout()
+        return fig
     def plot_pair_domains(self, num_domains, split_name):
         selected_region_ids = self.num_domains_in_region_df.loc[
             self.num_domains_in_region_df.num_domains >= num_domains,
             'cds_region_id'].values
         pair_df_subset = self.pair_df.loc[self.pair_df.cds_region_id.isin(selected_region_ids)]
+        biosyn_counts_pairs = pair_df_subset[['cds_region_id', 'biosyn_class']].drop_duplicates().groupby("biosyn_class", as_index=False).count()
+        hue2count_pairs = dict(biosyn_counts_pairs.values)
         # split_name = 'stratified'
         column_name = f'cosine_similarity_{split_name}'
         # pair_df_subset = pair_df.loc[pair_df.dom_location_len >= num_domains]
         hue2count={}
         width=0.9
+        show_legend=False
         fig = self.pair_domains_fig
         fig.clf()
         ax = fig.gca()
         plot_utils.draw_barplots(
             targets_list,
             label_list=label_list,
+            top_n=top_n,
+            bin_width=bin_width,
+            hue_group_offset=hue_group_offset,
             hue_order=BIOSYN_CLASS_NAMES,
+            hue2count=hue2count_pairs,
+            width=width,
             ax=ax,
+            show_legend=show_legend,
+            palette=COLOR_PALETTE
         )
+        fig.tight_layout()
         return fig  #plt.gcf()

plot_utils.py CHANGED Viewed

@@ -76,7 +76,7 @@ def draw_barplots(targets_list, label_list=None, top_n=5, bin_width=1,
         #     if not normalize:
         #         bottom[bin_indices] += bar_offset
         line_pos = bin_indices.max() + width/2 + hue_group_offset/2
-        plt.axhline(line_pos, linewidth=1, linestyle='dashed', color=POSTER_BLUE)
     if show_legend:
         ax.legend(
             loc='upper center', bbox_to_anchor=(0.5, -0.05),

         #     if not normalize:
         #         bottom[bin_indices] += bar_offset
         line_pos = bin_indices.max() + width/2 + hue_group_offset/2
+        ax.axhline(line_pos, linewidth=1, linestyle='dashed', color=POSTER_BLUE)
     if show_legend:
         ax.legend(
             loc='upper center', bbox_to_anchor=(0.5, -0.05),