quantification: SingleCellDataExtraction.py comparison

comparison SingleCellDataExtraction.py @ 1:aba3655fdef0 draft

"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit 897a7dc7cb43e45d6f0fdfe2b2970e59f20f8853"

author	watsocam
date	Fri, 11 Mar 2022 23:35:52 +0000
parents	928db0f952e3
children

comparison

equal deleted inserted replaced

-:928db0f952e3
+:aba3655fdef0
 import h5py
 import pandas as pd
 import numpy as np
 import os
 import skimage.measure as measure
+import tifffile
 from pathlib import Path
-import csv
 import sys
-def MaskChannel(mask_loaded,image_loaded_z):
+def gini_index(mask, intensity):
+x = intensity[mask]
+sorted_x = np.sort(x)
+n = len(x)
+cumx = np.cumsum(sorted_x, dtype=float)
+return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n
+def intensity_median(mask, intensity):
+return np.median(intensity[mask])
+def MaskChannel(mask_loaded, image_loaded_z, intensity_props=["intensity_mean"]):
 """Function for quantifying a single channel image
 Returns a table with CellID according to the mask and the mean pixel intensity
 for the given channel for each cell"""
-print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr)
+# Look for regionprops in skimage
-print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr)
+builtin_props = set(intensity_props).intersection(measure._regionprops.PROP_VALS)
-dat = measure.regionprops(mask_loaded, image_loaded_z)
+# Otherwise look for them in this module
-n = len(dat)
+extra_props = set(intensity_props).difference(measure._regionprops.PROP_VALS)
-intensity_z = np.empty(n)
+dat = measure.regionprops_table(
-for i in range(n):
+mask_loaded, image_loaded_z,
-intensity_z[i] = dat[i].mean_intensity
+properties = tuple(builtin_props),
-# Clear reference to avoid memory leak -- see MaskIDs for explanation.
+extra_properties = [globals()[n] for n in extra_props]
-dat[i] = None
+)
-return intensity_z
+return dat
-def MaskIDs(mask):
+def MaskIDs(mask, mask_props=None):
 """This function will extract the CellIDs and the XY positions for each
 cell based on that cell's centroid
 Returns a dictionary object"""
-dat = measure.regionprops(mask)
+all_mask_props = set(["label", "centroid", "area", "major_axis_length", "minor_axis_length", "eccentricity", "solidity", "extent", "orientation"])
-n = len(dat)
+if mask_props is not None:
+all_mask_props = all_mask_props.union(mask_props)
-# Pre-allocate numpy arrays for all properties we'll calculate.
-labels = np.empty(n, int)
+dat = measure.regionprops_table(
-xcoords = np.empty(n)
+mask,
-ycoords = np.empty(n)
+properties=all_mask_props
-area = np.empty(n, int)
+)
-minor_axis_length = np.empty(n)
-major_axis_length = np.empty(n)
+name_map = {
-eccentricity = np.empty(n)
+"CellID": "label",
-solidity = np.empty(n)
+"X_centroid": "centroid-1",
-extent = np.empty(n)
+"Y_centroid": "centroid-0",
-orientation = np.empty(n)
+"Area": "area",
+"MajorAxisLength": "major_axis_length",
-for i in range(n):
+"MinorAxisLength": "minor_axis_length",
-labels[i] = dat[i].label
+"Eccentricity": "eccentricity",
-xcoords[i] = dat[i].centroid[1]
+"Solidity": "solidity",
-ycoords[i] = dat[i].centroid[0]
+"Extent": "extent",
-area[i] = dat[i].area
+"Orientation": "orientation",
-major_axis_length[i] = dat[i].major_axis_length
-minor_axis_length[i] = dat[i].minor_axis_length
-eccentricity[i] = dat[i].eccentricity
-solidity[i] = dat[i].solidity
-extent[i] = dat[i].extent
-orientation[i] = dat[i].orientation
-# By clearing the reference to each RegionProperties object, we allow it
-# and its cache to be garbage collected immediately. Otherwise memory
-# usage creeps up needlessly while this function is executing.
-dat[i] = None
-IDs = {
-"CellID": labels,
-"X_centroid": xcoords,
-"Y_centroid": ycoords,
-"column_centroid": xcoords,
-"row_centroid": ycoords,
-"Area": area,
-"MajorAxisLength": major_axis_length,
-"MinorAxisLength": minor_axis_length,
-"Eccentricity": eccentricity,
-"Solidity": solidity,
-"Extent": extent,
-"Orientation": orientation,
 }
+for new_name, old_name in name_map.items():
-return IDs
+dat[new_name] = dat[old_name]
+for old_name in set(name_map.values()):
+del dat[old_name]
+return dat
+def n_channels(image):
+"""Returns the number of channels in the input image. Supports [OME]TIFF and HDF5."""
+image_path = Path(image)
+if image_path.suffix in ['.tiff', '.tif', '.btf']:
+s = tifffile.TiffFile(image).series[0]
+ndim = len(s.shape)
+if ndim == 2: return 1
+elif ndim == 3: return min(s.shape)
+else: raise Exception('mcquant supports only 2D/3D images.')
+elif image_path.suffix in ['.h5', '.hdf5']:
+f = h5py.File(image, 'r')
+dat_name = list(f.keys())[0]
+return f[dat_name].shape[3]
+else:
+raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only')
 def PrepareData(image,z):
 """Function for preparing input for maskzstack function. Connecting function
 to use with mc micro ilastik pipeline"""
 image_path = Path(image)
 print(f'{image_path} at {z}', file=sys.stderr)
 #Check to see if image tif(f)
-if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf':
+if image_path.suffix in ['.tiff', '.tif', '.btf']:
-#Check to see if the image is ome.tif(f)
+image_loaded_z = tifffile.imread(image, key=z)
-if  image.endswith(('.ome.tif','.ome.tiff')):
-#Read the image
-image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
-#print('OME TIF(F) found')
-else:
-#Read the image
-image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
-#print('TIF(F) found')
-# Remove extra axis
-#image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4]))
 #Check to see if image is hdf5
-elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5':
+elif image_path.suffix in ['.h5', '.hdf5']:
 #Read the image
-f = h5py.File(image,'r+')
+f = h5py.File(image,'r')
 #Get the dataset name from the h5 file
 dat_name = list(f.keys())[0]
-###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data'
+#Retrieve the z^th channel
-#Get the image data
+image_loaded_z = f[dat_name][0,:,:,z]
-image_loaded = np.array(f[dat_name])
-#Remove the first axis (ilastik convention)
+else:
-image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3]))
+raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only')
-###If the hdf5 is exported from ilastik fiji plugin, the order will need to be
-###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1)
 #Return the objects
 return image_loaded_z
-def MaskZstack(masks_loaded,image,channel_names_loaded):
+def MaskZstack(masks_loaded,image,channel_names_loaded, mask_props=None, intensity_props=["intensity_mean"]):
 """This function will extract the stats for each cell mask through each channel
 in the input image
 mask_loaded: dictionary containing Tiff masks that represents the cells in your image.
 z_stack: Multichannel z stack image"""
 #Get the names of the keys for the masks dictionary
 mask_names = list(masks_loaded.keys())
-#Get the CellIDs for this dataset by using only a single mask (first mask)
-IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]]))
 #Create empty dictionary to store channel results per mask
 dict_of_chan = {m_name: [] for m_name in mask_names}
 #Get the z channel and the associated channel name from list of channel names
 print(f'channels: {channel_names_loaded}', file=sys.stderr)
 print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr)
 image_loaded_z = PrepareData(image,z)
 #Iterate through number of masks to extract single cell data
 for nm in range(len(mask_names)):
 #Use the above information to mask z stack
-dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z))
+dict_of_chan[mask_names[nm]].append(
+MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z, intensity_props=intensity_props)
+)
 #Print progress
 print("Finished "+str(z))
-#Iterate through the rest of the masks to modify names of channels and convert to data table
+# Column order according to histoCAT convention (Move xy position to end with spatial information)
+last_cols = (
+"X_centroid",
+"Y_centroid",
+"column_centroid",
+"row_centroid",
+"Area",
+"MajorAxisLength",
+"MinorAxisLength",
+"Eccentricity",
+"Solidity",
+"Extent",
+"Orientation",
+)
+def col_sort(x):
+if x == "CellID":
+return -2
+try:
+return last_cols.index(x)
+except ValueError:
+return -1
+#Iterate through the masks and format quantifications for each mask and property
 for nm in mask_names:
-#Check if this is the first mask
+mask_dict = {}
-if nm == mask_names[0]:
+# Mean intensity is default property, stored without suffix
-#Create channel names for this mask
+mask_dict.update(
-new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
+zip(channel_names_loaded, [x["intensity_mean"] for x in dict_of_chan[nm]])
-#Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY
+)
-dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1)
+# All other properties are suffixed with their names
-#Get the name of the columns in the dataframe so we can reorder to histoCAT convention
+for prop_n in set(dict_of_chan[nm][0].keys()).difference(["intensity_mean"]):
-cols = list(dict_of_chan[nm].columns.values)
+mask_dict.update(
-#Reorder the list (Move xy position to end with spatial information)
+zip([f"{n}_{prop_n}" for n in channel_names_loaded], [x[prop_n] for x in dict_of_chan[nm]])
-cols.append(cols.pop(cols.index("X_centroid")))
+)
-cols.append(cols.pop(cols.index("Y_centroid")))
+# Get the cell IDs and mask properties
-cols.append(cols.pop(cols.index("column_centroid")))
+mask_properties = pd.DataFrame(MaskIDs(masks_loaded[nm], mask_props=mask_props))
-cols.append(cols.pop(cols.index("row_centroid")))
+mask_dict.update(mask_properties)
-cols.append(cols.pop(cols.index("Area")))
+dict_of_chan[nm] = pd.DataFrame(mask_dict).reindex(columns=sorted(mask_dict.keys(), key=col_sort))
-cols.append(cols.pop(cols.index("MajorAxisLength")))
-cols.append(cols.pop(cols.index("MinorAxisLength")))
+# Return the dict of dataframes for each mask
-cols.append(cols.pop(cols.index("Eccentricity")))
+return dict_of_chan
-cols.append(cols.pop(cols.index("Solidity")))
-cols.append(cols.pop(cols.index("Extent")))
+def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]):
-cols.append(cols.pop(cols.index("Orientation")))
-#Reindex the dataframe with new order
-dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols)
-#Otherwise, add no spatial information
-else:
-#Create channel names for this mask
-new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
-#Use the above information to mask z stack
-dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))
-#Concatenate all data from all masks to return
-dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1)
-#Return the dataframe
-return dat
-def ExtractSingleCells(masks,image,channel_names,output):
 """Function for extracting single cell information from input
 path containing single-cell masks, z_stack path, and channel_names path."""
 #Create pathlib object for output
 output = Path(output)
-#Check if header available
-#sniffer = csv.Sniffer()
-#sniffer.has_header(open(channel_names).readline())
-#If header not available
-#if not sniffer:
-#If header available
-#channel_names_loaded = pd.read_csv(channel_names)
-#channel_names_loaded_list = list(channel_names_loaded.marker_name)
-#else:
-#print("negative")
-#old one column version
-#channel_names_loaded = pd.read_csv(channel_names,header=None)
-#Add a column index for ease
-#channel_names_loaded.columns = ["marker"]
-#channel_names_loaded = list(channel_names_loaded.marker.values)
 #Read csv channel names
 channel_names_loaded = pd.read_csv(channel_names)
-#Check for size of columns
+#Check for the presence of `marker_name` column
-if channel_names_loaded.shape[1] > 1:
+if 'marker_name' in channel_names_loaded:
 #Get the marker_name column if more than one column (CyCIF structure)
 channel_names_loaded_list = list(channel_names_loaded.marker_name)
+#Consider the old one-marker-per-line plain text format
+elif channel_names_loaded.shape[1] == 1:
+#re-read the csv file and add column name
+channel_names_loaded = pd.read_csv(channel_names, header = None)
+channel_names_loaded_list = list(channel_names_loaded.iloc[:,0])
 else:
-#old one column version -- re-read the csv file and add column name
+raise Exception('%s must contain the marker_name column'%channel_names)
-channel_names_loaded = pd.read_csv(channel_names, header = None)
-#Add a column index for ease and for standardization
+#Contrast against the number of markers in the image
-channel_names_loaded.columns = ["marker"]
+if len(channel_names_loaded_list) != n_channels(image):
-channel_names_loaded_list = list(channel_names_loaded.marker)
+raise Exception("The number of channels in %s doesn't match the image"%channel_names)
 #Check for unique marker names -- create new list to store new names
 channel_names_loaded_checked = []
 for idx,val in enumerate(channel_names_loaded_list):
 #Check for unique value
 if channel_names_loaded_list.count(val) > 1:
 channel_names_loaded_checked.append(val + "_"+ str(channel_names_loaded_list[:idx].count(val) + 1))
 else:
 #Otherwise, leave channel name
 channel_names_loaded_checked.append(val)
-#Clear small memory amount by clearing old channel names
-channel_names_loaded, channel_names_loaded_list = None, None
 #Read the masks
 masks_loaded = {}
 #iterate through mask paths and read images to add to dictionary object
 for m in masks:
 m_full_name = os.path.basename(m)
 m_name = m_full_name.split('.')[0]
 masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')})
-scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked)
+scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked, mask_props=mask_props, intensity_props=intensity_props)
 #Write the single cell data to a csv file using the image name
 im_full_name = os.path.basename(image)
 im_name = im_full_name.split('.')[0]
-scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False)
+# iterate through each mask and export csv with mask name as suffix
+for k,v in scdata_z.items():
-def MultiExtractSingleCells(masks,image,channel_names,output):
+# export the csv for this mask name
+scdata_z[k].to_csv(
+str(Path(os.path.join(str(output),
+str(im_name+"_{}"+".csv").format(k)))),
+index=False
+)
+def MultiExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]):
 """Function for iterating over a list of z_stacks and output locations to
 export single-cell data from image masks"""
 print("Extracting single-cell data for "+str(image)+'...')
 #Run the ExtractSingleCells function for this image
-ExtractSingleCells(masks,image,channel_names,output)
+ExtractSingleCells(masks,image,channel_names,output, mask_props=mask_props, intensity_props=intensity_props)
 #Print update
 im_full_name = os.path.basename(image)
 im_name = im_full_name.split('.')[0]
 print("Finished "+str(im_name))

Mercurial > repos > perssond > quantification

comparison SingleCellDataExtraction.py @ 1:aba3655fdef0 draft