comparison SingleCellDataExtraction.py @ 1:aba3655fdef0 draft

"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit 897a7dc7cb43e45d6f0fdfe2b2970e59f20f8853"
author watsocam
date Fri, 11 Mar 2022 23:35:52 +0000
parents 928db0f952e3
children
comparison
equal deleted inserted replaced
0:928db0f952e3 1:aba3655fdef0
6 import h5py 6 import h5py
7 import pandas as pd 7 import pandas as pd
8 import numpy as np 8 import numpy as np
9 import os 9 import os
10 import skimage.measure as measure 10 import skimage.measure as measure
11 import tifffile
12
11 from pathlib import Path 13 from pathlib import Path
12 import csv
13 14
14 import sys 15 import sys
15 16
16 17
17 def MaskChannel(mask_loaded,image_loaded_z): 18 def gini_index(mask, intensity):
19 x = intensity[mask]
20 sorted_x = np.sort(x)
21 n = len(x)
22 cumx = np.cumsum(sorted_x, dtype=float)
23 return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n
24
25 def intensity_median(mask, intensity):
26 return np.median(intensity[mask])
27
28 def MaskChannel(mask_loaded, image_loaded_z, intensity_props=["intensity_mean"]):
18 """Function for quantifying a single channel image 29 """Function for quantifying a single channel image
19 30
20 Returns a table with CellID according to the mask and the mean pixel intensity 31 Returns a table with CellID according to the mask and the mean pixel intensity
21 for the given channel for each cell""" 32 for the given channel for each cell"""
22 print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr) 33 # Look for regionprops in skimage
23 print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr) 34 builtin_props = set(intensity_props).intersection(measure._regionprops.PROP_VALS)
24 dat = measure.regionprops(mask_loaded, image_loaded_z) 35 # Otherwise look for them in this module
25 n = len(dat) 36 extra_props = set(intensity_props).difference(measure._regionprops.PROP_VALS)
26 intensity_z = np.empty(n) 37 dat = measure.regionprops_table(
27 for i in range(n): 38 mask_loaded, image_loaded_z,
28 intensity_z[i] = dat[i].mean_intensity 39 properties = tuple(builtin_props),
29 # Clear reference to avoid memory leak -- see MaskIDs for explanation. 40 extra_properties = [globals()[n] for n in extra_props]
30 dat[i] = None 41 )
31 return intensity_z 42 return dat
32 43
33 44
34 def MaskIDs(mask): 45 def MaskIDs(mask, mask_props=None):
35 """This function will extract the CellIDs and the XY positions for each 46 """This function will extract the CellIDs and the XY positions for each
36 cell based on that cells centroid 47 cell based on that cells centroid
37 48
38 Returns a dictionary object""" 49 Returns a dictionary object"""
39 50
40 dat = measure.regionprops(mask) 51 all_mask_props = set(["label", "centroid", "area", "major_axis_length", "minor_axis_length", "eccentricity", "solidity", "extent", "orientation"])
41 n = len(dat) 52 if mask_props is not None:
42 53 all_mask_props = all_mask_props.union(mask_props)
43 # Pre-allocate numpy arrays for all properties we'll calculate. 54
44 labels = np.empty(n, int) 55 dat = measure.regionprops_table(
45 xcoords = np.empty(n) 56 mask,
46 ycoords = np.empty(n) 57 properties=all_mask_props
47 area = np.empty(n, int) 58 )
48 minor_axis_length = np.empty(n) 59
49 major_axis_length = np.empty(n) 60 name_map = {
50 eccentricity = np.empty(n) 61 "CellID": "label",
51 solidity = np.empty(n) 62 "X_centroid": "centroid-1",
52 extent = np.empty(n) 63 "Y_centroid": "centroid-0",
53 orientation = np.empty(n) 64 "Area": "area",
54 65 "MajorAxisLength": "major_axis_length",
55 for i in range(n): 66 "MinorAxisLength": "minor_axis_length",
56 labels[i] = dat[i].label 67 "Eccentricity": "eccentricity",
57 xcoords[i] = dat[i].centroid[1] 68 "Solidity": "solidity",
58 ycoords[i] = dat[i].centroid[0] 69 "Extent": "extent",
59 area[i] = dat[i].area 70 "Orientation": "orientation",
60 major_axis_length[i] = dat[i].major_axis_length
61 minor_axis_length[i] = dat[i].minor_axis_length
62 eccentricity[i] = dat[i].eccentricity
63 solidity[i] = dat[i].solidity
64 extent[i] = dat[i].extent
65 orientation[i] = dat[i].orientation
66 # By clearing the reference to each RegionProperties object, we allow it
67 # and its cache to be garbage collected immediately. Otherwise memory
68 # usage creeps up needlessly while this function is executing.
69 dat[i] = None
70
71 IDs = {
72 "CellID": labels,
73 "X_centroid": xcoords,
74 "Y_centroid": ycoords,
75 "column_centroid": xcoords,
76 "row_centroid": ycoords,
77 "Area": area,
78 "MajorAxisLength": major_axis_length,
79 "MinorAxisLength": minor_axis_length,
80 "Eccentricity": eccentricity,
81 "Solidity": solidity,
82 "Extent": extent,
83 "Orientation": orientation,
84 } 71 }
85 72 for new_name, old_name in name_map.items():
86 return IDs 73 dat[new_name] = dat[old_name]
87 74 for old_name in set(name_map.values()):
75 del dat[old_name]
76
77 return dat
78
79 def n_channels(image):
80 """Returns the number of channel in the input image. Supports [OME]TIFF and HDF5."""
81
82 image_path = Path(image)
83
84 if image_path.suffix in ['.tiff', '.tif', '.btf']:
85 s = tifffile.TiffFile(image).series[0]
86 ndim = len(s.shape)
87 if ndim == 2: return 1
88 elif ndim == 3: return min(s.shape)
89 else: raise Exception('mcquant supports only 2D/3D images.')
90
91 elif image_path.suffix in ['.h5', '.hdf5']:
92 f = h5py.File(image, 'r')
93 dat_name = list(f.keys())[0]
94 return f[dat_name].shape[3]
95
96 else:
97 raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only')
88 98
89 def PrepareData(image,z): 99 def PrepareData(image,z):
90 """Function for preparing input for maskzstack function. Connecting function 100 """Function for preparing input for maskzstack function. Connecting function
91 to use with mc micro ilastik pipeline""" 101 to use with mc micro ilastik pipeline"""
92 102
93 image_path = Path(image) 103 image_path = Path(image)
94 print(f'{image_path} at {z}', file=sys.stderr) 104 print(f'{image_path} at {z}', file=sys.stderr)
95 105
96 #Check to see if image tif(f) 106 #Check to see if image tif(f)
97 if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf': 107 if image_path.suffix in ['.tiff', '.tif', '.btf']:
98 #Check to see if the image is ome.tif(f) 108 image_loaded_z = tifffile.imread(image, key=z)
99 if image.endswith(('.ome.tif','.ome.tiff')):
100 #Read the image
101 image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
102 #print('OME TIF(F) found')
103 else:
104 #Read the image
105 image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
106 #print('TIF(F) found')
107 # Remove extra axis
108 #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4]))
109 109
110 #Check to see if image is hdf5 110 #Check to see if image is hdf5
111 elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5': 111 elif image_path.suffix in ['.h5', '.hdf5']:
112 #Read the image 112 #Read the image
113 f = h5py.File(image,'r+') 113 f = h5py.File(image,'r')
114 #Get the dataset name from the h5 file 114 #Get the dataset name from the h5 file
115 dat_name = list(f.keys())[0] 115 dat_name = list(f.keys())[0]
116 ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data' 116 #Retrieve the z^th channel
117 #Get the image data 117 image_loaded_z = f[dat_name][0,:,:,z]
118 image_loaded = np.array(f[dat_name]) 118
119 #Remove the first axis (ilastik convention) 119 else:
120 image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3])) 120 raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only')
121 ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be
122 ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1)
123 121
124 #Return the objects 122 #Return the objects
125 return image_loaded_z 123 return image_loaded_z
126 124
127 125
128 def MaskZstack(masks_loaded,image,channel_names_loaded): 126 def MaskZstack(masks_loaded,image,channel_names_loaded, mask_props=None, intensity_props=["intensity_mean"]):
129 """This function will extract the stats for each cell mask through each channel 127 """This function will extract the stats for each cell mask through each channel
130 in the input image 128 in the input image
131 129
132 mask_loaded: dictionary containing Tiff masks that represents the cells in your image. 130 mask_loaded: dictionary containing Tiff masks that represents the cells in your image.
133 131
134 z_stack: Multichannel z stack image""" 132 z_stack: Multichannel z stack image"""
135 133
136 #Get the names of the keys for the masks dictionary 134 #Get the names of the keys for the masks dictionary
137 mask_names = list(masks_loaded.keys()) 135 mask_names = list(masks_loaded.keys())
138 #Get the CellIDs for this dataset by using only a single mask (first mask) 136
139 IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]]))
140 #Create empty dictionary to store channel results per mask 137 #Create empty dictionary to store channel results per mask
141 dict_of_chan = {m_name: [] for m_name in mask_names} 138 dict_of_chan = {m_name: [] for m_name in mask_names}
142 #Get the z channel and the associated channel name from list of channel names 139 #Get the z channel and the associated channel name from list of channel names
143 print(f'channels: {channel_names_loaded}', file=sys.stderr) 140 print(f'channels: {channel_names_loaded}', file=sys.stderr)
144 print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr) 141 print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr)
147 image_loaded_z = PrepareData(image,z) 144 image_loaded_z = PrepareData(image,z)
148 145
149 #Iterate through number of masks to extract single cell data 146 #Iterate through number of masks to extract single cell data
150 for nm in range(len(mask_names)): 147 for nm in range(len(mask_names)):
151 #Use the above information to mask z stack 148 #Use the above information to mask z stack
152 dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z)) 149 dict_of_chan[mask_names[nm]].append(
150 MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z, intensity_props=intensity_props)
151 )
153 #Print progress 152 #Print progress
154 print("Finished "+str(z)) 153 print("Finished "+str(z))
155 154
156 #Iterate through the rest of the masks to modify names of channels and convert to data table 155 # Column order according to histoCAT convention (Move xy position to end with spatial information)
156 last_cols = (
157 "X_centroid",
158 "Y_centroid",
159 "column_centroid",
160 "row_centroid",
161 "Area",
162 "MajorAxisLength",
163 "MinorAxisLength",
164 "Eccentricity",
165 "Solidity",
166 "Extent",
167 "Orientation",
168 )
169 def col_sort(x):
170 if x == "CellID":
171 return -2
172 try:
173 return last_cols.index(x)
174 except ValueError:
175 return -1
176
177 #Iterate through the masks and format quantifications for each mask and property
157 for nm in mask_names: 178 for nm in mask_names:
158 #Check if this is the first mask 179 mask_dict = {}
159 if nm == mask_names[0]: 180 # Mean intensity is default property, stored without suffix
160 #Create channel names for this mask 181 mask_dict.update(
161 new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] 182 zip(channel_names_loaded, [x["intensity_mean"] for x in dict_of_chan[nm]])
162 #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY 183 )
163 dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1) 184 # All other properties are suffixed with their names
164 #Get the name of the columns in the dataframe so we can reorder to histoCAT convention 185 for prop_n in set(dict_of_chan[nm][0].keys()).difference(["intensity_mean"]):
165 cols = list(dict_of_chan[nm].columns.values) 186 mask_dict.update(
166 #Reorder the list (Move xy position to end with spatial information) 187 zip([f"{n}_{prop_n}" for n in channel_names_loaded], [x[prop_n] for x in dict_of_chan[nm]])
167 cols.append(cols.pop(cols.index("X_centroid"))) 188 )
168 cols.append(cols.pop(cols.index("Y_centroid"))) 189 # Get the cell IDs and mask properties
169 cols.append(cols.pop(cols.index("column_centroid"))) 190 mask_properties = pd.DataFrame(MaskIDs(masks_loaded[nm], mask_props=mask_props))
170 cols.append(cols.pop(cols.index("row_centroid"))) 191 mask_dict.update(mask_properties)
171 cols.append(cols.pop(cols.index("Area"))) 192 dict_of_chan[nm] = pd.DataFrame(mask_dict).reindex(columns=sorted(mask_dict.keys(), key=col_sort))
172 cols.append(cols.pop(cols.index("MajorAxisLength"))) 193
173 cols.append(cols.pop(cols.index("MinorAxisLength"))) 194 # Return the dict of dataframes for each mask
174 cols.append(cols.pop(cols.index("Eccentricity"))) 195 return dict_of_chan
175 cols.append(cols.pop(cols.index("Solidity"))) 196
176 cols.append(cols.pop(cols.index("Extent"))) 197 def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]):
177 cols.append(cols.pop(cols.index("Orientation")))
178 #Reindex the dataframe with new order
179 dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols)
180 #Otherwise, add no spatial information
181 else:
182 #Create channel names for this mask
183 new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
184 #Use the above information to mask z stack
185 dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))
186
187 #Concatenate all data from all masks to return
188 dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1)
189
190 #Return the dataframe
191 return dat
192
193
194 def ExtractSingleCells(masks,image,channel_names,output):
195 """Function for extracting single cell information from input 198 """Function for extracting single cell information from input
196 path containing single-cell masks, z_stack path, and channel_names path.""" 199 path containing single-cell masks, z_stack path, and channel_names path."""
197 200
198 #Create pathlib object for output 201 #Create pathlib object for output
199 output = Path(output) 202 output = Path(output)
200 203
201 #Check if header available
202 #sniffer = csv.Sniffer()
203 #sniffer.has_header(open(channel_names).readline())
204 #If header not available
205 #if not sniffer:
206 #If header available
207 #channel_names_loaded = pd.read_csv(channel_names)
208 #channel_names_loaded_list = list(channel_names_loaded.marker_name)
209 #else:
210 #print("negative")
211 #old one column version
212 #channel_names_loaded = pd.read_csv(channel_names,header=None)
213 #Add a column index for ease
214 #channel_names_loaded.columns = ["marker"]
215 #channel_names_loaded = list(channel_names_loaded.marker.values)
216
217 #Read csv channel names 204 #Read csv channel names
218 channel_names_loaded = pd.read_csv(channel_names) 205 channel_names_loaded = pd.read_csv(channel_names)
219 #Check for size of columns 206 #Check for the presence of `marker_name` column
220 if channel_names_loaded.shape[1] > 1: 207 if 'marker_name' in channel_names_loaded:
221 #Get the marker_name column if more than one column (CyCIF structure) 208 #Get the marker_name column if more than one column (CyCIF structure)
222 channel_names_loaded_list = list(channel_names_loaded.marker_name) 209 channel_names_loaded_list = list(channel_names_loaded.marker_name)
210 #Consider the old one-marker-per-line plain text format
211 elif channel_names_loaded.shape[1] == 1:
212 #re-read the csv file and add column name
213 channel_names_loaded = pd.read_csv(channel_names, header = None)
214 channel_names_loaded_list = list(channel_names_loaded.iloc[:,0])
223 else: 215 else:
224 #old one column version -- re-read the csv file and add column name 216 raise Exception('%s must contain the marker_name column'%channel_names)
225 channel_names_loaded = pd.read_csv(channel_names, header = None) 217
226 #Add a column index for ease and for standardization 218 #Contrast against the number of markers in the image
227 channel_names_loaded.columns = ["marker"] 219 if len(channel_names_loaded_list) != n_channels(image):
228 channel_names_loaded_list = list(channel_names_loaded.marker) 220 raise Exception("The number of channels in %s doesn't match the image"%channel_names)
229 221
230 #Check for unique marker names -- create new list to store new names 222 #Check for unique marker names -- create new list to store new names
231 channel_names_loaded_checked = [] 223 channel_names_loaded_checked = []
232 for idx,val in enumerate(channel_names_loaded_list): 224 for idx,val in enumerate(channel_names_loaded_list):
233 #Check for unique value 225 #Check for unique value
234 if channel_names_loaded_list.count(val) > 1: 226 if channel_names_loaded_list.count(val) > 1:
236 channel_names_loaded_checked.append(val + "_"+ str(channel_names_loaded_list[:idx].count(val) + 1)) 228 channel_names_loaded_checked.append(val + "_"+ str(channel_names_loaded_list[:idx].count(val) + 1))
237 else: 229 else:
238 #Otherwise, leave channel name 230 #Otherwise, leave channel name
239 channel_names_loaded_checked.append(val) 231 channel_names_loaded_checked.append(val)
240 232
241 #Clear small memory amount by clearing old channel names
242 channel_names_loaded, channel_names_loaded_list = None, None
243
244 #Read the masks 233 #Read the masks
245 masks_loaded = {} 234 masks_loaded = {}
246 #iterate through mask paths and read images to add to dictionary object 235 #iterate through mask paths and read images to add to dictionary object
247 for m in masks: 236 for m in masks:
248 m_full_name = os.path.basename(m) 237 m_full_name = os.path.basename(m)
249 m_name = m_full_name.split('.')[0] 238 m_name = m_full_name.split('.')[0]
250 masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')}) 239 masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')})
251 240
252 scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked) 241 scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked, mask_props=mask_props, intensity_props=intensity_props)
253 #Write the singe cell data to a csv file using the image name 242 #Write the singe cell data to a csv file using the image name
254 243
255 im_full_name = os.path.basename(image) 244 im_full_name = os.path.basename(image)
256 im_name = im_full_name.split('.')[0] 245 im_name = im_full_name.split('.')[0]
257 scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False) 246
258 247 # iterate through each mask and export csv with mask name as suffix
259 248 for k,v in scdata_z.items():
260 def MultiExtractSingleCells(masks,image,channel_names,output): 249 # export the csv for this mask name
250 scdata_z[k].to_csv(
251 str(Path(os.path.join(str(output),
252 str(im_name+"_{}"+".csv").format(k)))),
253 index=False
254 )
255
256
257 def MultiExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]):
261 """Function for iterating over a list of z_stacks and output locations to 258 """Function for iterating over a list of z_stacks and output locations to
262 export single-cell data from image masks""" 259 export single-cell data from image masks"""
263 260
264 print("Extracting single-cell data for "+str(image)+'...') 261 print("Extracting single-cell data for "+str(image)+'...')
265 262
266 #Run the ExtractSingleCells function for this image 263 #Run the ExtractSingleCells function for this image
267 ExtractSingleCells(masks,image,channel_names,output) 264 ExtractSingleCells(masks,image,channel_names,output, mask_props=mask_props, intensity_props=intensity_props)
268 265
269 #Print update 266 #Print update
270 im_full_name = os.path.basename(image) 267 im_full_name = os.path.basename(image)
271 im_name = im_full_name.split('.')[0] 268 im_name = im_full_name.split('.')[0]
272 print("Finished "+str(im_name)) 269 print("Finished "+str(im_name))