Skip to content

Commit bf9bf40

Browse files
committed
Adjusted analysis_notebook to work with new dataset
1 parent a02e192 commit bf9bf40

10 files changed

+166
-2490
lines changed

analysis/analysis_notebook.ipynb

+61-2,370
Large diffs are not rendered by default.

analysis/functions/beeswarm.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,15 @@ def beeswarm_coordinates(
1414
prefix=''):
1515
'''small hack: don't show image, but use seaborn to calculate coordinates.
1616
Could not find a solution performing as well as the seaborn implementation'''
17+
df = df.sort_values(by=val_col, ascending=True)
1718
fig, ax = plt.subplots(figsize=figsize)
1819
ax.set(xscale=xscale)
1920
fig = sns.swarmplot(data=df, x=val_col, size=pointsize)
2021
# get precise data coordinates
2122
x, y = np.array(ax.collections[0].get_offsets()).T
22-
df = df.sort_values(by=val_col, ascending=True)
2323
df[prefix + 'x'] = x
2424
df[prefix + 'y'] = y
25-
xlim = ax.get_xlim()
26-
ylim = ax.get_ylim()
25+
xlim = min(df[prefix + 'x'])*0.95, max(df[prefix + 'x'])*1.05
26+
ylim = min(df[prefix + 'y'])*0.95, max(df[prefix + 'y'])*0.95
2727
plt.close('all')
2828
return df, xlim, ylim

analysis/functions/bokeh_wrapper.py

+76-76
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from bokeh.plotting import figure, show, save, output_notebook, output_file
22
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, ColorBar, LinearColorMapper
3-
from bokeh.models import FuncTickFormatter, FixedTicker, Legend, BasicTickFormatter, Panel, Tabs
3+
from bokeh.models import FuncTickFormatter, FixedTicker, Legend, BasicTickFormatter, TabPanel, Tabs
44
from bokeh.palettes import Turbo256 as palette_umap
55
from bokeh.transform import linear_cmap
66
import matplotlib.colors as mpt_colors
@@ -93,8 +93,8 @@ def swarmplot(df, xlim, ylim, title="Swarmplot", legend_header="", **kwargs):
9393
df['edgecolor'] = df['color_values'].apply(col_edge_get)
9494
size = 6
9595

96-
plot_figure = figure(title=title, plot_width=900,
97-
plot_height=500, tools=(''),
96+
plot_figure = figure(title=title, width=900,
97+
height=500, tools=(''),
9898
x_axis_type="log", x_range=xlim, y_range=ylim,
9999
x_axis_label='Single cell attention')
100100
plot_figure.add_tools(HoverTool(tooltips="""
@@ -151,7 +151,7 @@ def multi_swarmplot(df, xlim, ylim, title, path_save=None, **kwargs):
151151
title,
152152
legend_header="Annotated cell type",
153153
**kwargs)
154-
tab1 = Panel(child=swarm_regular, title="Full annotation")
154+
tab1 = TabPanel(child=swarm_regular, title="Full annotation")
155155

156156
df_simplified = df.copy()
157157
df_simplified['color_values'] = df_simplified['color_values'].apply(
@@ -163,7 +163,7 @@ def multi_swarmplot(df, xlim, ylim, title, path_save=None, **kwargs):
163163
title,
164164
legend_header="Annotated cell group",
165165
**kwargs)
166-
tab2 = Panel(child=swarm_simplified, title="Reduced annotation")
166+
tab2 = TabPanel(child=swarm_simplified, title="Reduced annotation")
167167

168168
if path_save is None:
169169
# if no path_save is given, show
@@ -197,7 +197,7 @@ def export_swarmplot(
197197
ax.set_xscale('log')
198198
yrange = ylim[0] - ylim[1]
199199
ax.set_xlim(xlim[0], xlim[1])
200-
ax.set_ylim(ylim[1], yrange - ylim[1])
200+
ax.set_ylim(ylim[1]*0.5, ylim[0]*0.5)
201201
ax.spines['top'].set_visible(False)
202202
ax.spines['left'].set_visible(False)
203203
ax.spines['right'].set_visible(False)
@@ -279,74 +279,74 @@ def export_swarmplot(
279279
fontsize=FONTSIZE)
280280
leg._legend_box.align = "left"
281281

282-
# plot in highlighted images
283-
# draw out lines and plot images
284-
if highlight_idx is not None:
285-
im_buffer = {}
286-
for identifier in highlight_idx:
287-
cell = df.loc[df['im_id'] == identifier].iloc[0]
288-
x, y = cell.x, cell.y
289-
class_lbl = cell.color_values
290-
ax2.plot([x, x], [y, y + yrange], c='lightgray', zorder=5)
291-
292-
# load and display image
293-
im = Image.open(cell.im_path)
294-
im_buffer[x] = im
295-
296-
ax2.scatter(
297-
x,
298-
y,
299-
color=col_get(class_lbl),
300-
linewidth=0.5,
301-
s=dotsize,
302-
zorder=10,
303-
marker=shape_get_matplotlib(class_lbl),
304-
edgecolors=col_edge_get(class_lbl))
305-
306-
class_lbl = cell.color_values_pooled
307-
ax2.scatter(
308-
x,
309-
y + yrange,
310-
color=col_get(class_lbl),
311-
linewidth=0.5,
312-
s=dotsize,
313-
zorder=10,
314-
marker=shape_get_matplotlib(class_lbl),
315-
edgecolors=col_edge_get(class_lbl))
316-
317-
# shift images a little bit to improve optics
318-
global xpoints
319-
xpoints = sorted(im_buffer.keys())
320-
321-
def log_x_dist(x1, x2):
322-
if min(x1, x2) <= 0:
323-
return 10000
324-
return math.log10(max(x1, x2) / min(x1, x2))
325-
326-
def f_positions(shifts):
327-
global xpoints
328-
329-
# calculate distances to close points
330-
xpoints_shifted = [xpoints[x] * shifts[x]
331-
for x in range(len(xpoints))]
332-
el_dists = np.array([log_x_dist(
333-
xpoints_shifted[x], xpoints_shifted[x + 1]) for x in range(len(xpoints) - 1)])
334-
mean_dist = np.mean(el_dists)
335-
dist_loss = np.sum(np.square(el_dists - mean_dist))
336-
337-
return dist_loss
338-
339-
# calculate coordinates
340-
shift_images = fmin(f_positions, np.array([1] * len(xpoints)))
341-
342-
# add images
343-
for x in xpoints:
344-
im = im_buffer[x]
345-
ab = AnnotationBbox(OffsetImage(im, zoom=0.5), (x *
346-
shift_images[xpoints.index(x)], yrange +
347-
ylim[1]), frameon=True, pad=0.0)
348-
ab.set_zorder(10)
349-
ax2.add_artist(ab)
282+
# # plot in highlighted images
283+
# # draw out lines and plot images
284+
# if highlight_idx is not None:
285+
# im_buffer = {}
286+
# for identifier in highlight_idx:
287+
# cell = df.loc[df['im_id'] == identifier].iloc[0]
288+
# x, y = cell.x, cell.y
289+
# class_lbl = cell.color_values
290+
# ax2.plot([x, x], [y, y + yrange], c='lightgray', zorder=5)
291+
292+
# # load and display image
293+
# im = Image.open(cell.im_path)
294+
# im_buffer[x] = im
295+
296+
# ax2.scatter(
297+
# x,
298+
# y,
299+
# color=col_get(class_lbl),
300+
# linewidth=0.5,
301+
# s=dotsize,
302+
# zorder=10,
303+
# marker=shape_get_matplotlib(class_lbl),
304+
# edgecolors=col_edge_get(class_lbl))
305+
306+
# class_lbl = cell.color_values_pooled
307+
# ax2.scatter(
308+
# x,
309+
# y + yrange,
310+
# color=col_get(class_lbl),
311+
# linewidth=0.5,
312+
# s=dotsize,
313+
# zorder=10,
314+
# marker=shape_get_matplotlib(class_lbl),
315+
# edgecolors=col_edge_get(class_lbl))
316+
317+
# # shift images a little bit to improve optics
318+
# global xpoints
319+
# xpoints = sorted(im_buffer.keys())
320+
321+
# def log_x_dist(x1, x2):
322+
# if min(x1, x2) <= 0:
323+
# return 10000
324+
# return math.log10(max(x1, x2) / min(x1, x2))
325+
326+
# def f_positions(shifts):
327+
# global xpoints
328+
329+
# # calculate distances to close points
330+
# xpoints_shifted = [xpoints[x] * shifts[x]
331+
# for x in range(len(xpoints))]
332+
# el_dists = np.array([log_x_dist(
333+
# xpoints_shifted[x], xpoints_shifted[x + 1]) for x in range(len(xpoints) - 1)])
334+
# mean_dist = np.mean(el_dists)
335+
# dist_loss = np.sum(np.square(el_dists - mean_dist))
336+
337+
# return dist_loss
338+
339+
# # calculate coordinates
340+
# shift_images = fmin(f_positions, np.array([1] * len(xpoints)))
341+
342+
# # add images
343+
# for x in xpoints:
344+
# im = im_buffer[x]
345+
# ab = AnnotationBbox(OffsetImage(im, zoom=0.5), (x *
346+
# shift_images[xpoints.index(x)], yrange +
347+
# ylim[1]), frameon=True, pad=0.0)
348+
# ab.set_zorder(10)
349+
# ax2.add_artist(ab)
350350

351351
ax.text(
352352
x=0.01,
@@ -597,8 +597,8 @@ def umap(
597597
df['info'] = df[data_column]
598598
size = 8
599599

600-
plot_figure = figure(title=title, plot_width=900,
601-
plot_height=700, tools=('pan, wheel_zoom, reset'),
600+
plot_figure = figure(title=title, width=900,
601+
height=700, tools=('pan, wheel_zoom, reset'),
602602
aspect_scale=2)
603603

604604
# plot_figure.yaxis.visible = False

analysis/functions/entropy_plot.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ def entropy_plot(dataframe):
1111
ax = sns.swarmplot(ax=ax, data=df_tmp, x="classification", y="entropy")
1212
ax.set(ylabel="Entropy")
1313
ax.set(xlabel="Classification")
14-
ax.legend(title='Percent of acceptable images',
15-
bbox_to_anchor=(1, 0., 0.5, 1), loc=10)
1614

1715

1816
def entropy_vs_myb(dataframe):
@@ -29,5 +27,5 @@ def entropy_vs_myb(dataframe):
2927
label='False prediction')
3028
ax.set(ylabel="Entropy")
3129
ax.set(xlabel="Myeloblast percentage")
32-
ax.legend(title='Percent of acceptable images',
30+
ax.legend(title='Percent of myeloblasts in differential blood count',
3331
bbox_to_anchor=(1, 0., 0.5, 1), loc=10)

analysis/functions/image_bytestream.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def map_images_to_dataframe(df):
3232
patient_basepath = os.path.dirname(df.loc[row_idx].im_path)
3333

3434
patient_images_path = os.path.join(
35-
patient_basepath, 'processed', 'stacked_images.npy')
35+
patient_basepath, 'stacked_images.npy')
3636
patient_images = np.load(patient_images_path)
3737
current_pat_id = pat_id
3838

analysis/functions/image_excerpt.py

-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ def plot(
2424
im_path = os.path.join(
2525
os.path.dirname(
2626
sc_df.iloc[0].im_path),
27-
'processed',
2827
'stacked_images.npy')
2928
im_ar = np.load(im_path)
3029

analysis/functions/load_data.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def load_dataframes(
3737
path_preload=os.path.join(
3838
basepath, folder_list[0], 'class_conversion.csv'))
3939
patient_master_dataframe = pd.read_csv(
40-
'{}/mll_data_master_pseudo.csv'.format(folder_dataset)).set_index('pseudonym')
40+
'{}/metadata.csv'.format(folder_dataset)).set_index('patient_id')
4141

4242
datapoints = []
4343
temporary_data_cache = {}
@@ -74,8 +74,7 @@ def load_dataframes(
7474
pat_entropy = entropy(
7575
pat_prediction_vector,
7676
base=len(pat_prediction_vector))
77-
pat_quality_category = patient_master_dataframe.loc[pat_id,
78-
'examine_category_quality']
77+
7978
pat_myb_share = patient_master_dataframe.loc[pat_id,
8079
'pb_myeloblast']
8180
pat_pmc_share = patient_master_dataframe.loc[pat_id,
@@ -89,7 +88,6 @@ def load_dataframes(
8988
lbl_conv_obj[int(pat_prediction_argmax)],
9089
pat_loss,
9190
pat_entropy,
92-
pat_quality_category,
9391
pat_myb_share,
9492
pat_myb_share + pat_pmc_share + pat_myc_share]
9593
pat_datapoint.extend(pat_prediction_vector)
@@ -111,7 +109,6 @@ def load_dataframes(
111109
'pred_lbl',
112110
'MIL loss',
113111
'entropy',
114-
'quality_category',
115112
'myb_annotated',
116113
'filter_annotation']
117114
columns_df.extend(['mil_prediction_' + lbl_conv_obj[x]
@@ -175,7 +172,7 @@ def load_dataframes(
175172
pat_features = np.load(
176173
os.path.join(
177174
pat_path,
178-
'processed/{}bn_features_layer_7.npy'.format(prefix)))
175+
'{}bn_features_layer_7.npy'.format(prefix)))
179176
ft_dims = pat_features.shape
180177
pat_features_flattened = pat_features.reshape(
181178
(ft_dims[0], ft_dims[1] * ft_dims[2] * ft_dims[3])) # keeps image dimension

analysis/functions/sc_occlusion.py

-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ def calculate_change_on_occlusion(
5959
patient_feature_path = os.path.join(
6060
os.path.dirname(
6161
cell.im_path),
62-
'processed',
6362
prefix +
6463
'bn_features_layer_7.npy')
6564
patient_feature_array = np.load(patient_feature_path)

ml_pipeline/dataset.py

+16-17
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def define_dataset(
6767

6868
# load patient data
6969
df_data_master = pd.read_csv(
70-
'{}/mll_data_master_pseudo.csv'.format(path_data)).set_index('pseudonym')
70+
'{}/metadata.csv'.format(path_data)).set_index('patient_id')
7171

7272
print("")
7373
print("Filtering the dataset...")
@@ -85,28 +85,28 @@ def define_dataset(
8585
'pb_myelocyte']
8686
annotation_count = sum(row[annotations_exclude_by])
8787
if annotation_count < filter_diff_count and (
88-
not row['bag_label'] == 'SCD'):
88+
not row['bag_label'] == 'control'):
8989
print("Not enough malign cells, exclude: ", row.name,
9090
" with ", annotation_count, " malign cells ")
9191
continue
9292

93-
# filter if manual assessment revealed major flaws. If this cell
94-
# contains N/A, then we don't exclude
95-
keep_row = pd.isnull(row['examine_exclude'])
93+
# # filter if manual assessment revealed major flaws. If this cell
94+
# # contains N/A, then we don't exclude
95+
# keep_row = pd.isnull(row['examine_exclude'])
9696

97-
# filter if the patient has known bad sample quality
98-
if not keep_row and filter_quality_major_assessment:
99-
print("Major flaws in slide quality, exclude: ", row.name, " ")
100-
continue
97+
# # filter if the patient has known bad sample quality
98+
# if not keep_row and filter_quality_major_assessment:
99+
# print("Major flaws in slide quality, exclude: ", row.name, " ")
100+
# continue
101101

102-
# filter if manual assessment revealed *minor* flaws. If this cell
103-
# contains N/A, then we don't exclude
104-
keep_row = pd.isnull(row['examine_optional_exclude'])
102+
# # filter if manual assessment revealed *minor* flaws. If this cell
103+
# # contains N/A, then we don't exclude
104+
# keep_row = pd.isnull(row['examine_optional_exclude'])
105105

106-
# filter if the patient has known bad sample quality
107-
if not keep_row and filter_quality_minor_assessment:
108-
print("Minor flaws in slide quality, exclude: ", row.name, " ")
109-
continue
106+
# # filter if the patient has known bad sample quality
107+
# if not keep_row and filter_quality_minor_assessment:
108+
# print("Minor flaws in slide quality, exclude: ", row.name, " ")
109+
# continue
110110

111111
# enter patient into label converter
112112
label = process_label(row)
@@ -190,7 +190,6 @@ def __getitem__(self, idx):
190190
bag = np.load(
191191
os.path.join(
192192
path,
193-
'processed',
194193
prefix +
195194
'bn_features_layer_7.npy'))
196195
self.features_loaded[path] = bag

0 commit comments

Comments
 (0)