Adjusted analysis_notebook, plotting helpers, and data loaders to work with the new dataset layout

mhehr · mhehr · commit bf9bf40cd1e9 · 2023-03-14T21:54:58.000+01:00
diff --git a/analysis/analysis_notebook.ipynb b/analysis/analysis_notebook.ipynb
diff --git a/analysis/functions/beeswarm.py b/analysis/functions/beeswarm.py
@@ -14,15 +14,15 @@ def beeswarm_coordinates(
         prefix=''):
     '''small hack: don't show image, but use seaborn to calculate coordinates.
     Could not find a solution performing as well as the seaborn implementation'''
+    df = df.sort_values(by=val_col, ascending=True)
     fig, ax = plt.subplots(figsize=figsize)
     ax.set(xscale=xscale)
     fig = sns.swarmplot(data=df, x=val_col, size=pointsize)
     # get precise data coordinates
     x, y = np.array(ax.collections[0].get_offsets()).T
-    df = df.sort_values(by=val_col, ascending=True)
     df[prefix + 'x'] = x
     df[prefix + 'y'] = y
-    xlim = ax.get_xlim()
-    ylim = ax.get_ylim()
+    xlim = min(df[prefix + 'x'])*0.95, max(df[prefix + 'x'])*1.05
+    ylim = min(df[prefix + 'y'])*0.95, max(df[prefix + 'y'])*0.95
     plt.close('all')
     return df, xlim, ylim
diff --git a/analysis/functions/bokeh_wrapper.py b/analysis/functions/bokeh_wrapper.py
@@ -1,6 +1,6 @@
 from bokeh.plotting import figure, show, save, output_notebook, output_file
 from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, ColorBar, LinearColorMapper
-from bokeh.models import FuncTickFormatter, FixedTicker, Legend, BasicTickFormatter, Panel, Tabs
+from bokeh.models import FuncTickFormatter, FixedTicker, Legend, BasicTickFormatter, TabPanel, Tabs
 from bokeh.palettes import Turbo256 as palette_umap
 from bokeh.transform import linear_cmap
 import matplotlib.colors as mpt_colors
@@ -93,8 +93,8 @@ def swarmplot(df, xlim, ylim, title="Swarmplot", legend_header="", **kwargs):
     df['edgecolor'] = df['color_values'].apply(col_edge_get)
     size = 6
 
-    plot_figure = figure(title=title, plot_width=900,
-                         plot_height=500, tools=(''),
+    plot_figure = figure(title=title, width=900,
+                         height=500, tools=(''),
                          x_axis_type="log", x_range=xlim, y_range=ylim,
                          x_axis_label='Single cell attention')
     plot_figure.add_tools(HoverTool(tooltips="""
@@ -151,7 +151,7 @@ def multi_swarmplot(df, xlim, ylim, title, path_save=None, **kwargs):
         title,
         legend_header="Annotated cell type",
         **kwargs)
-    tab1 = Panel(child=swarm_regular, title="Full annotation")
+    tab1 = TabPanel(child=swarm_regular, title="Full annotation")
 
     df_simplified = df.copy()
     df_simplified['color_values'] = df_simplified['color_values'].apply(
@@ -163,7 +163,7 @@ def multi_swarmplot(df, xlim, ylim, title, path_save=None, **kwargs):
         title,
         legend_header="Annotated cell group",
         **kwargs)
-    tab2 = Panel(child=swarm_simplified, title="Reduced annotation")
+    tab2 = TabPanel(child=swarm_simplified, title="Reduced annotation")
 
     if path_save is None:
         # if no path_save is given, show
@@ -197,7 +197,7 @@ def export_swarmplot(
     ax.set_xscale('log')
     yrange = ylim[0] - ylim[1]
     ax.set_xlim(xlim[0], xlim[1])
-    ax.set_ylim(ylim[1], yrange - ylim[1])
+    ax.set_ylim(ylim[1]*0.5, ylim[0]*0.5)
     ax.spines['top'].set_visible(False)
     ax.spines['left'].set_visible(False)
     ax.spines['right'].set_visible(False)
@@ -279,74 +279,74 @@ def export_swarmplot(
         fontsize=FONTSIZE)
     leg._legend_box.align = "left"
 
-    # plot in highlighted images
-    # draw out lines and plot images
-    if highlight_idx is not None:
-        im_buffer = {}
-        for identifier in highlight_idx:
-            cell = df.loc[df['im_id'] == identifier].iloc[0]
-            x, y = cell.x, cell.y
-            class_lbl = cell.color_values
-            ax2.plot([x, x], [y, y + yrange], c='lightgray', zorder=5)
-
-            # load and display image
-            im = Image.open(cell.im_path)
-            im_buffer[x] = im
-
-            ax2.scatter(
-                x,
-                y,
-                color=col_get(class_lbl),
-                linewidth=0.5,
-                s=dotsize,
-                zorder=10,
-                marker=shape_get_matplotlib(class_lbl),
-                edgecolors=col_edge_get(class_lbl))
-
-            class_lbl = cell.color_values_pooled
-            ax2.scatter(
-                x,
-                y + yrange,
-                color=col_get(class_lbl),
-                linewidth=0.5,
-                s=dotsize,
-                zorder=10,
-                marker=shape_get_matplotlib(class_lbl),
-                edgecolors=col_edge_get(class_lbl))
-
-        # shift images a little bit to improve optics
-        global xpoints
-        xpoints = sorted(im_buffer.keys())
-
-        def log_x_dist(x1, x2):
-            if min(x1, x2) <= 0:
-                return 10000
-            return math.log10(max(x1, x2) / min(x1, x2))
-
-        def f_positions(shifts):
-            global xpoints
-
-            # calculate distances to close points
-            xpoints_shifted = [xpoints[x] * shifts[x]
-                               for x in range(len(xpoints))]
-            el_dists = np.array([log_x_dist(
-                xpoints_shifted[x], xpoints_shifted[x + 1]) for x in range(len(xpoints) - 1)])
-            mean_dist = np.mean(el_dists)
-            dist_loss = np.sum(np.square(el_dists - mean_dist))
-
-            return dist_loss
-
-        # calculate coordinates
-        shift_images = fmin(f_positions, np.array([1] * len(xpoints)))
-
-        # add images
-        for x in xpoints:
-            im = im_buffer[x]
-            ab = AnnotationBbox(OffsetImage(im, zoom=0.5), (x *
-                                                            shift_images[xpoints.index(x)], yrange +
-                                                            ylim[1]), frameon=True, pad=0.0)
-            ab.set_zorder(10)
-            ax2.add_artist(ab)
+    # # plot in highlighted images
+    # # draw out lines and plot images
+    # if highlight_idx is not None:
+    #     im_buffer = {}
+    #     for identifier in highlight_idx:
+    #         cell = df.loc[df['im_id'] == identifier].iloc[0]
+    #         x, y = cell.x, cell.y
+    #         class_lbl = cell.color_values
+    #         ax2.plot([x, x], [y, y + yrange], c='lightgray', zorder=5)
+
+    #         # load and display image
+    #         im = Image.open(cell.im_path)
+    #         im_buffer[x] = im
+
+    #         ax2.scatter(
+    #             x,
+    #             y,
+    #             color=col_get(class_lbl),
+    #             linewidth=0.5,
+    #             s=dotsize,
+    #             zorder=10,
+    #             marker=shape_get_matplotlib(class_lbl),
+    #             edgecolors=col_edge_get(class_lbl))
+
+    #         class_lbl = cell.color_values_pooled
+    #         ax2.scatter(
+    #             x,
+    #             y + yrange,
+    #             color=col_get(class_lbl),
+    #             linewidth=0.5,
+    #             s=dotsize,
+    #             zorder=10,
+    #             marker=shape_get_matplotlib(class_lbl),
+    #             edgecolors=col_edge_get(class_lbl))
+
+    #     # shift images a little bit to improve optics
+    #     global xpoints
+    #     xpoints = sorted(im_buffer.keys())
+
+    #     def log_x_dist(x1, x2):
+    #         if min(x1, x2) <= 0:
+    #             return 10000
+    #         return math.log10(max(x1, x2) / min(x1, x2))
+
+    #     def f_positions(shifts):
+    #         global xpoints
+
+    #         # calculate distances to close points
+    #         xpoints_shifted = [xpoints[x] * shifts[x]
+    #                            for x in range(len(xpoints))]
+    #         el_dists = np.array([log_x_dist(
+    #             xpoints_shifted[x], xpoints_shifted[x + 1]) for x in range(len(xpoints) - 1)])
+    #         mean_dist = np.mean(el_dists)
+    #         dist_loss = np.sum(np.square(el_dists - mean_dist))
+
+    #         return dist_loss
+
+    #     # calculate coordinates
+    #     shift_images = fmin(f_positions, np.array([1] * len(xpoints)))
+
+    #     # add images
+    #     for x in xpoints:
+    #         im = im_buffer[x]
+    #         ab = AnnotationBbox(OffsetImage(im, zoom=0.5), (x *
+    #                                                         shift_images[xpoints.index(x)], yrange +
+    #                                                         ylim[1]), frameon=True, pad=0.0)
+    #         ab.set_zorder(10)
+    #         ax2.add_artist(ab)
 
     ax.text(
         x=0.01,
@@ -597,8 +597,8 @@ def umap(
     df['info'] = df[data_column]
     size = 8
 
-    plot_figure = figure(title=title, plot_width=900,
-                         plot_height=700, tools=('pan, wheel_zoom, reset'),
+    plot_figure = figure(title=title, width=900,
+                         height=700, tools=('pan, wheel_zoom, reset'),
                          aspect_scale=2)
 
     #     plot_figure.yaxis.visible = False
diff --git a/analysis/functions/entropy_plot.py b/analysis/functions/entropy_plot.py
@@ -11,8 +11,6 @@ def entropy_plot(dataframe):
     ax = sns.swarmplot(ax=ax, data=df_tmp, x="classification", y="entropy")
     ax.set(ylabel="Entropy")
     ax.set(xlabel="Classification")
-    ax.legend(title='Percent of acceptable images',
-              bbox_to_anchor=(1, 0., 0.5, 1), loc=10)
 
 
 def entropy_vs_myb(dataframe):
@@ -29,5 +27,5 @@ def entropy_vs_myb(dataframe):
         label='False prediction')
     ax.set(ylabel="Entropy")
     ax.set(xlabel="Myeloblast percentage")
-    ax.legend(title='Percent of acceptable images',
+    ax.legend(title='Percent of myeloblasts in differential blood count',
               bbox_to_anchor=(1, 0., 0.5, 1), loc=10)
diff --git a/analysis/functions/image_bytestream.py b/analysis/functions/image_bytestream.py
@@ -32,7 +32,7 @@ def map_images_to_dataframe(df):
             patient_basepath = os.path.dirname(df.loc[row_idx].im_path)
 
             patient_images_path = os.path.join(
-                patient_basepath, 'processed', 'stacked_images.npy')
+                patient_basepath, 'stacked_images.npy')
             patient_images = np.load(patient_images_path)
             current_pat_id = pat_id
 
diff --git a/analysis/functions/image_excerpt.py b/analysis/functions/image_excerpt.py
@@ -24,7 +24,6 @@ def plot(
     im_path = os.path.join(
         os.path.dirname(
             sc_df.iloc[0].im_path),
-        'processed',
         'stacked_images.npy')
     im_ar = np.load(im_path)
 
diff --git a/analysis/functions/load_data.py b/analysis/functions/load_data.py
@@ -37,7 +37,7 @@ def load_dataframes(
         path_preload=os.path.join(
             basepath, folder_list[0], 'class_conversion.csv'))
     patient_master_dataframe = pd.read_csv(
-        '{}/mll_data_master_pseudo.csv'.format(folder_dataset)).set_index('pseudonym')
+        '{}/metadata.csv'.format(folder_dataset)).set_index('patient_id')
 
     datapoints = []
     temporary_data_cache = {}
@@ -74,8 +74,7 @@ def load_dataframes(
                 pat_entropy = entropy(
                     pat_prediction_vector,
                     base=len(pat_prediction_vector))
-                pat_quality_category = patient_master_dataframe.loc[pat_id,
-                                                                    'examine_category_quality']
+                
                 pat_myb_share = patient_master_dataframe.loc[pat_id,
                                                              'pb_myeloblast']
                 pat_pmc_share = patient_master_dataframe.loc[pat_id,
@@ -89,7 +88,6 @@ def load_dataframes(
                                  lbl_conv_obj[int(pat_prediction_argmax)],
                                  pat_loss,
                                  pat_entropy,
-                                 pat_quality_category,
                                  pat_myb_share,
                                  pat_myb_share + pat_pmc_share + pat_myc_share]
                 pat_datapoint.extend(pat_prediction_vector)
@@ -111,7 +109,6 @@ def load_dataframes(
         'pred_lbl',
         'MIL loss',
         'entropy',
-        'quality_category',
         'myb_annotated',
         'filter_annotation']
     columns_df.extend(['mil_prediction_' + lbl_conv_obj[x]
@@ -175,7 +172,7 @@ def load_dataframes(
             pat_features = np.load(
                 os.path.join(
                     pat_path,
-                    'processed/{}bn_features_layer_7.npy'.format(prefix)))
+                    '{}bn_features_layer_7.npy'.format(prefix)))
             ft_dims = pat_features.shape
             pat_features_flattened = pat_features.reshape(
                 (ft_dims[0], ft_dims[1] * ft_dims[2] * ft_dims[3]))            # keeps image dimension
diff --git a/analysis/functions/sc_occlusion.py b/analysis/functions/sc_occlusion.py
@@ -59,7 +59,6 @@ def calculate_change_on_occlusion(
             patient_feature_path = os.path.join(
                 os.path.dirname(
                     cell.im_path),
-                'processed',
                 prefix +
                 'bn_features_layer_7.npy')
             patient_feature_array = np.load(patient_feature_path)
diff --git a/ml_pipeline/dataset.py b/ml_pipeline/dataset.py
@@ -67,7 +67,7 @@ def define_dataset(
 
     # load patient data
     df_data_master = pd.read_csv(
-        '{}/mll_data_master_pseudo.csv'.format(path_data)).set_index('pseudonym')
+        '{}/metadata.csv'.format(path_data)).set_index('patient_id')
 
     print("")
     print("Filtering the dataset...")
@@ -85,28 +85,28 @@ def define_dataset(
             'pb_myelocyte']
         annotation_count = sum(row[annotations_exclude_by])
         if annotation_count < filter_diff_count and (
-                not row['bag_label'] == 'SCD'):
+                not row['bag_label'] == 'control'):
             print("Not enough malign cells, exclude: ", row.name,
                   " with ", annotation_count, " malign cells ")
             continue
 
-        # filter if manual assessment revealed major flaws. If this cell
-        # contains N/A, then we don't exclude
-        keep_row = pd.isnull(row['examine_exclude'])
+        # # filter if manual assessment revealed major flaws. If this cell
+        # # contains N/A, then we don't exclude
+        # keep_row = pd.isnull(row['examine_exclude'])
 
-        # filter if the patient has known bad sample quality
-        if not keep_row and filter_quality_major_assessment:
-            print("Major flaws in slide quality, exclude: ", row.name, " ")
-            continue
+        # # filter if the patient has known bad sample quality
+        # if not keep_row and filter_quality_major_assessment:
+        #     print("Major flaws in slide quality, exclude: ", row.name, " ")
+        #     continue
 
-        # filter if manual assessment revealed *minor* flaws. If this cell
-        # contains N/A, then we don't exclude
-        keep_row = pd.isnull(row['examine_optional_exclude'])
+        # # filter if manual assessment revealed *minor* flaws. If this cell
+        # # contains N/A, then we don't exclude
+        # keep_row = pd.isnull(row['examine_optional_exclude'])
 
-        # filter if the patient has known bad sample quality
-        if not keep_row and filter_quality_minor_assessment:
-            print("Minor flaws in slide quality, exclude: ", row.name, " ")
-            continue
+        # # filter if the patient has known bad sample quality
+        # if not keep_row and filter_quality_minor_assessment:
+        #     print("Minor flaws in slide quality, exclude: ", row.name, " ")
+        #     continue
 
         # enter patient into label converter
         label = process_label(row)
@@ -190,7 +190,6 @@ def __getitem__(self, idx):
             bag = np.load(
                 os.path.join(
                     path,
-                    'processed',
                     prefix +
                     'bn_features_layer_7.npy'))
             self.features_loaded[path] = bag
diff --git a/ml_pipeline/run_pipeline.py b/ml_pipeline/run_pipeline.py