Mercurial > repos > perssond > quantification
comparison SingleCellDataExtraction.py @ 1:aba3655fdef0 draft
"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit 897a7dc7cb43e45d6f0fdfe2b2970e59f20f8853"
author | watsocam |
---|---|
date | Fri, 11 Mar 2022 23:35:52 +0000 |
parents | 928db0f952e3 |
children |
comparison
equal
deleted
inserted
replaced
0:928db0f952e3 | 1:aba3655fdef0 |
---|---|
6 import h5py | 6 import h5py |
7 import pandas as pd | 7 import pandas as pd |
8 import numpy as np | 8 import numpy as np |
9 import os | 9 import os |
10 import skimage.measure as measure | 10 import skimage.measure as measure |
11 import tifffile | |
12 | |
11 from pathlib import Path | 13 from pathlib import Path |
12 import csv | |
13 | 14 |
14 import sys | 15 import sys |
15 | 16 |
16 | 17 |
17 def MaskChannel(mask_loaded,image_loaded_z): | 18 def gini_index(mask, intensity): |
19 x = intensity[mask] | |
20 sorted_x = np.sort(x) | |
21 n = len(x) | |
22 cumx = np.cumsum(sorted_x, dtype=float) | |
23 return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n | |
24 | |
25 def intensity_median(mask, intensity): | |
26 return np.median(intensity[mask]) | |
27 | |
28 def MaskChannel(mask_loaded, image_loaded_z, intensity_props=["intensity_mean"]): | |
18 """Function for quantifying a single channel image | 29 """Function for quantifying a single channel image |
19 | 30 |
20 Returns a table with CellID according to the mask and the mean pixel intensity | 31 Returns a table with CellID according to the mask and the mean pixel intensity |
21 for the given channel for each cell""" | 32 for the given channel for each cell""" |
22 print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr) | 33 # Look for regionprops in skimage |
23 print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr) | 34 builtin_props = set(intensity_props).intersection(measure._regionprops.PROP_VALS) |
24 dat = measure.regionprops(mask_loaded, image_loaded_z) | 35 # Otherwise look for them in this module |
25 n = len(dat) | 36 extra_props = set(intensity_props).difference(measure._regionprops.PROP_VALS) |
26 intensity_z = np.empty(n) | 37 dat = measure.regionprops_table( |
27 for i in range(n): | 38 mask_loaded, image_loaded_z, |
28 intensity_z[i] = dat[i].mean_intensity | 39 properties = tuple(builtin_props), |
29 # Clear reference to avoid memory leak -- see MaskIDs for explanation. | 40 extra_properties = [globals()[n] for n in extra_props] |
30 dat[i] = None | 41 ) |
31 return intensity_z | 42 return dat |
32 | 43 |
33 | 44 |
34 def MaskIDs(mask): | 45 def MaskIDs(mask, mask_props=None): |
35 """This function will extract the CellIDs and the XY positions for each | 46 """This function will extract the CellIDs and the XY positions for each |
36 cell based on that cells centroid | 47 cell based on that cells centroid |
37 | 48 |
38 Returns a dictionary object""" | 49 Returns a dictionary object""" |
39 | 50 |
40 dat = measure.regionprops(mask) | 51 all_mask_props = set(["label", "centroid", "area", "major_axis_length", "minor_axis_length", "eccentricity", "solidity", "extent", "orientation"]) |
41 n = len(dat) | 52 if mask_props is not None: |
42 | 53 all_mask_props = all_mask_props.union(mask_props) |
43 # Pre-allocate numpy arrays for all properties we'll calculate. | 54 |
44 labels = np.empty(n, int) | 55 dat = measure.regionprops_table( |
45 xcoords = np.empty(n) | 56 mask, |
46 ycoords = np.empty(n) | 57 properties=all_mask_props |
47 area = np.empty(n, int) | 58 ) |
48 minor_axis_length = np.empty(n) | 59 |
49 major_axis_length = np.empty(n) | 60 name_map = { |
50 eccentricity = np.empty(n) | 61 "CellID": "label", |
51 solidity = np.empty(n) | 62 "X_centroid": "centroid-1", |
52 extent = np.empty(n) | 63 "Y_centroid": "centroid-0", |
53 orientation = np.empty(n) | 64 "Area": "area", |
54 | 65 "MajorAxisLength": "major_axis_length", |
55 for i in range(n): | 66 "MinorAxisLength": "minor_axis_length", |
56 labels[i] = dat[i].label | 67 "Eccentricity": "eccentricity", |
57 xcoords[i] = dat[i].centroid[1] | 68 "Solidity": "solidity", |
58 ycoords[i] = dat[i].centroid[0] | 69 "Extent": "extent", |
59 area[i] = dat[i].area | 70 "Orientation": "orientation", |
60 major_axis_length[i] = dat[i].major_axis_length | |
61 minor_axis_length[i] = dat[i].minor_axis_length | |
62 eccentricity[i] = dat[i].eccentricity | |
63 solidity[i] = dat[i].solidity | |
64 extent[i] = dat[i].extent | |
65 orientation[i] = dat[i].orientation | |
66 # By clearing the reference to each RegionProperties object, we allow it | |
67 # and its cache to be garbage collected immediately. Otherwise memory | |
68 # usage creeps up needlessly while this function is executing. | |
69 dat[i] = None | |
70 | |
71 IDs = { | |
72 "CellID": labels, | |
73 "X_centroid": xcoords, | |
74 "Y_centroid": ycoords, | |
75 "column_centroid": xcoords, | |
76 "row_centroid": ycoords, | |
77 "Area": area, | |
78 "MajorAxisLength": major_axis_length, | |
79 "MinorAxisLength": minor_axis_length, | |
80 "Eccentricity": eccentricity, | |
81 "Solidity": solidity, | |
82 "Extent": extent, | |
83 "Orientation": orientation, | |
84 } | 71 } |
85 | 72 for new_name, old_name in name_map.items(): |
86 return IDs | 73 dat[new_name] = dat[old_name] |
87 | 74 for old_name in set(name_map.values()): |
75 del dat[old_name] | |
76 | |
77 return dat | |
78 | |
79 def n_channels(image): | |
80 """Returns the number of channel in the input image. Supports [OME]TIFF and HDF5.""" | |
81 | |
82 image_path = Path(image) | |
83 | |
84 if image_path.suffix in ['.tiff', '.tif', '.btf']: | |
85 s = tifffile.TiffFile(image).series[0] | |
86 ndim = len(s.shape) | |
87 if ndim == 2: return 1 | |
88 elif ndim == 3: return min(s.shape) | |
89 else: raise Exception('mcquant supports only 2D/3D images.') | |
90 | |
91 elif image_path.suffix in ['.h5', '.hdf5']: | |
92 f = h5py.File(image, 'r') | |
93 dat_name = list(f.keys())[0] | |
94 return f[dat_name].shape[3] | |
95 | |
96 else: | |
97 raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only') | |
88 | 98 |
89 def PrepareData(image,z): | 99 def PrepareData(image,z): |
90 """Function for preparing input for maskzstack function. Connecting function | 100 """Function for preparing input for maskzstack function. Connecting function |
91 to use with mc micro ilastik pipeline""" | 101 to use with mc micro ilastik pipeline""" |
92 | 102 |
93 image_path = Path(image) | 103 image_path = Path(image) |
94 print(f'{image_path} at {z}', file=sys.stderr) | 104 print(f'{image_path} at {z}', file=sys.stderr) |
95 | 105 |
96 #Check to see if image tif(f) | 106 #Check to see if image tif(f) |
97 if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf': | 107 if image_path.suffix in ['.tiff', '.tif', '.btf']: |
98 #Check to see if the image is ome.tif(f) | 108 image_loaded_z = tifffile.imread(image, key=z) |
99 if image.endswith(('.ome.tif','.ome.tiff')): | |
100 #Read the image | |
101 image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') | |
102 #print('OME TIF(F) found') | |
103 else: | |
104 #Read the image | |
105 image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') | |
106 #print('TIF(F) found') | |
107 # Remove extra axis | |
108 #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4])) | |
109 | 109 |
110 #Check to see if image is hdf5 | 110 #Check to see if image is hdf5 |
111 elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5': | 111 elif image_path.suffix in ['.h5', '.hdf5']: |
112 #Read the image | 112 #Read the image |
113 f = h5py.File(image,'r+') | 113 f = h5py.File(image,'r') |
114 #Get the dataset name from the h5 file | 114 #Get the dataset name from the h5 file |
115 dat_name = list(f.keys())[0] | 115 dat_name = list(f.keys())[0] |
116 ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data' | 116 #Retrieve the z^th channel |
117 #Get the image data | 117 image_loaded_z = f[dat_name][0,:,:,z] |
118 image_loaded = np.array(f[dat_name]) | 118 |
119 #Remove the first axis (ilastik convention) | 119 else: |
120 image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3])) | 120 raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only') |
121 ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be | |
122 ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1) | |
123 | 121 |
124 #Return the objects | 122 #Return the objects |
125 return image_loaded_z | 123 return image_loaded_z |
126 | 124 |
127 | 125 |
128 def MaskZstack(masks_loaded,image,channel_names_loaded): | 126 def MaskZstack(masks_loaded,image,channel_names_loaded, mask_props=None, intensity_props=["intensity_mean"]): |
129 """This function will extract the stats for each cell mask through each channel | 127 """This function will extract the stats for each cell mask through each channel |
130 in the input image | 128 in the input image |
131 | 129 |
132 mask_loaded: dictionary containing Tiff masks that represents the cells in your image. | 130 mask_loaded: dictionary containing Tiff masks that represents the cells in your image. |
133 | 131 |
134 z_stack: Multichannel z stack image""" | 132 z_stack: Multichannel z stack image""" |
135 | 133 |
136 #Get the names of the keys for the masks dictionary | 134 #Get the names of the keys for the masks dictionary |
137 mask_names = list(masks_loaded.keys()) | 135 mask_names = list(masks_loaded.keys()) |
138 #Get the CellIDs for this dataset by using only a single mask (first mask) | 136 |
139 IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]])) | |
140 #Create empty dictionary to store channel results per mask | 137 #Create empty dictionary to store channel results per mask |
141 dict_of_chan = {m_name: [] for m_name in mask_names} | 138 dict_of_chan = {m_name: [] for m_name in mask_names} |
142 #Get the z channel and the associated channel name from list of channel names | 139 #Get the z channel and the associated channel name from list of channel names |
143 print(f'channels: {channel_names_loaded}', file=sys.stderr) | 140 print(f'channels: {channel_names_loaded}', file=sys.stderr) |
144 print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr) | 141 print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr) |
147 image_loaded_z = PrepareData(image,z) | 144 image_loaded_z = PrepareData(image,z) |
148 | 145 |
149 #Iterate through number of masks to extract single cell data | 146 #Iterate through number of masks to extract single cell data |
150 for nm in range(len(mask_names)): | 147 for nm in range(len(mask_names)): |
151 #Use the above information to mask z stack | 148 #Use the above information to mask z stack |
152 dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z)) | 149 dict_of_chan[mask_names[nm]].append( |
150 MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z, intensity_props=intensity_props) | |
151 ) | |
153 #Print progress | 152 #Print progress |
154 print("Finished "+str(z)) | 153 print("Finished "+str(z)) |
155 | 154 |
156 #Iterate through the rest of the masks to modify names of channels and convert to data table | 155 # Column order according to histoCAT convention (Move xy position to end with spatial information) |
156 last_cols = ( | |
157 "X_centroid", | |
158 "Y_centroid", | |
159 "column_centroid", | |
160 "row_centroid", | |
161 "Area", | |
162 "MajorAxisLength", | |
163 "MinorAxisLength", | |
164 "Eccentricity", | |
165 "Solidity", | |
166 "Extent", | |
167 "Orientation", | |
168 ) | |
169 def col_sort(x): | |
170 if x == "CellID": | |
171 return -2 | |
172 try: | |
173 return last_cols.index(x) | |
174 except ValueError: | |
175 return -1 | |
176 | |
177 #Iterate through the masks and format quantifications for each mask and property | |
157 for nm in mask_names: | 178 for nm in mask_names: |
158 #Check if this is the first mask | 179 mask_dict = {} |
159 if nm == mask_names[0]: | 180 # Mean intensity is default property, stored without suffix |
160 #Create channel names for this mask | 181 mask_dict.update( |
161 new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] | 182 zip(channel_names_loaded, [x["intensity_mean"] for x in dict_of_chan[nm]]) |
162 #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY | 183 ) |
163 dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1) | 184 # All other properties are suffixed with their names |
164 #Get the name of the columns in the dataframe so we can reorder to histoCAT convention | 185 for prop_n in set(dict_of_chan[nm][0].keys()).difference(["intensity_mean"]): |
165 cols = list(dict_of_chan[nm].columns.values) | 186 mask_dict.update( |
166 #Reorder the list (Move xy position to end with spatial information) | 187 zip([f"{n}_{prop_n}" for n in channel_names_loaded], [x[prop_n] for x in dict_of_chan[nm]]) |
167 cols.append(cols.pop(cols.index("X_centroid"))) | 188 ) |
168 cols.append(cols.pop(cols.index("Y_centroid"))) | 189 # Get the cell IDs and mask properties |
169 cols.append(cols.pop(cols.index("column_centroid"))) | 190 mask_properties = pd.DataFrame(MaskIDs(masks_loaded[nm], mask_props=mask_props)) |
170 cols.append(cols.pop(cols.index("row_centroid"))) | 191 mask_dict.update(mask_properties) |
171 cols.append(cols.pop(cols.index("Area"))) | 192 dict_of_chan[nm] = pd.DataFrame(mask_dict).reindex(columns=sorted(mask_dict.keys(), key=col_sort)) |
172 cols.append(cols.pop(cols.index("MajorAxisLength"))) | 193 |
173 cols.append(cols.pop(cols.index("MinorAxisLength"))) | 194 # Return the dict of dataframes for each mask |
174 cols.append(cols.pop(cols.index("Eccentricity"))) | 195 return dict_of_chan |
175 cols.append(cols.pop(cols.index("Solidity"))) | 196 |
176 cols.append(cols.pop(cols.index("Extent"))) | 197 def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]): |
177 cols.append(cols.pop(cols.index("Orientation"))) | |
178 #Reindex the dataframe with new order | |
179 dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols) | |
180 #Otherwise, add no spatial information | |
181 else: | |
182 #Create channel names for this mask | |
183 new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] | |
184 #Use the above information to mask z stack | |
185 dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm]))) | |
186 | |
187 #Concatenate all data from all masks to return | |
188 dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1) | |
189 | |
190 #Return the dataframe | |
191 return dat | |
192 | |
193 | |
194 def ExtractSingleCells(masks,image,channel_names,output): | |
195 """Function for extracting single cell information from input | 198 """Function for extracting single cell information from input |
196 path containing single-cell masks, z_stack path, and channel_names path.""" | 199 path containing single-cell masks, z_stack path, and channel_names path.""" |
197 | 200 |
198 #Create pathlib object for output | 201 #Create pathlib object for output |
199 output = Path(output) | 202 output = Path(output) |
200 | 203 |
201 #Check if header available | |
202 #sniffer = csv.Sniffer() | |
203 #sniffer.has_header(open(channel_names).readline()) | |
204 #If header not available | |
205 #if not sniffer: | |
206 #If header available | |
207 #channel_names_loaded = pd.read_csv(channel_names) | |
208 #channel_names_loaded_list = list(channel_names_loaded.marker_name) | |
209 #else: | |
210 #print("negative") | |
211 #old one column version | |
212 #channel_names_loaded = pd.read_csv(channel_names,header=None) | |
213 #Add a column index for ease | |
214 #channel_names_loaded.columns = ["marker"] | |
215 #channel_names_loaded = list(channel_names_loaded.marker.values) | |
216 | |
217 #Read csv channel names | 204 #Read csv channel names |
218 channel_names_loaded = pd.read_csv(channel_names) | 205 channel_names_loaded = pd.read_csv(channel_names) |
219 #Check for size of columns | 206 #Check for the presence of `marker_name` column |
220 if channel_names_loaded.shape[1] > 1: | 207 if 'marker_name' in channel_names_loaded: |
221 #Get the marker_name column if more than one column (CyCIF structure) | 208 #Get the marker_name column if more than one column (CyCIF structure) |
222 channel_names_loaded_list = list(channel_names_loaded.marker_name) | 209 channel_names_loaded_list = list(channel_names_loaded.marker_name) |
210 #Consider the old one-marker-per-line plain text format | |
211 elif channel_names_loaded.shape[1] == 1: | |
212 #re-read the csv file and add column name | |
213 channel_names_loaded = pd.read_csv(channel_names, header = None) | |
214 channel_names_loaded_list = list(channel_names_loaded.iloc[:,0]) | |
223 else: | 215 else: |
224 #old one column version -- re-read the csv file and add column name | 216 raise Exception('%s must contain the marker_name column'%channel_names) |
225 channel_names_loaded = pd.read_csv(channel_names, header = None) | 217 |
226 #Add a column index for ease and for standardization | 218 #Contrast against the number of markers in the image |
227 channel_names_loaded.columns = ["marker"] | 219 if len(channel_names_loaded_list) != n_channels(image): |
228 channel_names_loaded_list = list(channel_names_loaded.marker) | 220 raise Exception("The number of channels in %s doesn't match the image"%channel_names) |
229 | 221 |
230 #Check for unique marker names -- create new list to store new names | 222 #Check for unique marker names -- create new list to store new names |
231 channel_names_loaded_checked = [] | 223 channel_names_loaded_checked = [] |
232 for idx,val in enumerate(channel_names_loaded_list): | 224 for idx,val in enumerate(channel_names_loaded_list): |
233 #Check for unique value | 225 #Check for unique value |
234 if channel_names_loaded_list.count(val) > 1: | 226 if channel_names_loaded_list.count(val) > 1: |
236 channel_names_loaded_checked.append(val + "_"+ str(channel_names_loaded_list[:idx].count(val) + 1)) | 228 channel_names_loaded_checked.append(val + "_"+ str(channel_names_loaded_list[:idx].count(val) + 1)) |
237 else: | 229 else: |
238 #Otherwise, leave channel name | 230 #Otherwise, leave channel name |
239 channel_names_loaded_checked.append(val) | 231 channel_names_loaded_checked.append(val) |
240 | 232 |
241 #Clear small memory amount by clearing old channel names | |
242 channel_names_loaded, channel_names_loaded_list = None, None | |
243 | |
244 #Read the masks | 233 #Read the masks |
245 masks_loaded = {} | 234 masks_loaded = {} |
246 #iterate through mask paths and read images to add to dictionary object | 235 #iterate through mask paths and read images to add to dictionary object |
247 for m in masks: | 236 for m in masks: |
248 m_full_name = os.path.basename(m) | 237 m_full_name = os.path.basename(m) |
249 m_name = m_full_name.split('.')[0] | 238 m_name = m_full_name.split('.')[0] |
250 masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')}) | 239 masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')}) |
251 | 240 |
252 scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked) | 241 scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked, mask_props=mask_props, intensity_props=intensity_props) |
253 #Write the singe cell data to a csv file using the image name | 242 #Write the singe cell data to a csv file using the image name |
254 | 243 |
255 im_full_name = os.path.basename(image) | 244 im_full_name = os.path.basename(image) |
256 im_name = im_full_name.split('.')[0] | 245 im_name = im_full_name.split('.')[0] |
257 scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False) | 246 |
258 | 247 # iterate through each mask and export csv with mask name as suffix |
259 | 248 for k,v in scdata_z.items(): |
260 def MultiExtractSingleCells(masks,image,channel_names,output): | 249 # export the csv for this mask name |
250 scdata_z[k].to_csv( | |
251 str(Path(os.path.join(str(output), | |
252 str(im_name+"_{}"+".csv").format(k)))), | |
253 index=False | |
254 ) | |
255 | |
256 | |
257 def MultiExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]): | |
261 """Function for iterating over a list of z_stacks and output locations to | 258 """Function for iterating over a list of z_stacks and output locations to |
262 export single-cell data from image masks""" | 259 export single-cell data from image masks""" |
263 | 260 |
264 print("Extracting single-cell data for "+str(image)+'...') | 261 print("Extracting single-cell data for "+str(image)+'...') |
265 | 262 |
266 #Run the ExtractSingleCells function for this image | 263 #Run the ExtractSingleCells function for this image |
267 ExtractSingleCells(masks,image,channel_names,output) | 264 ExtractSingleCells(masks,image,channel_names,output, mask_props=mask_props, intensity_props=intensity_props) |
268 | 265 |
269 #Print update | 266 #Print update |
270 im_full_name = os.path.basename(image) | 267 im_full_name = os.path.basename(image) |
271 im_name = im_full_name.split('.')[0] | 268 im_name = im_full_name.split('.')[0] |
272 print("Finished "+str(im_name)) | 269 print("Finished "+str(im_name)) |