# HG changeset patch
# User watsocam
# Date 1647041752 0
# Node ID aba3655fdef07071e2e7822aaaa7bc8694300786
# Parent 928db0f952e328116fbd9893d669f84e199c915d
"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit 897a7dc7cb43e45d6f0fdfe2b2970e59f20f8853"
diff -r 928db0f952e3 -r aba3655fdef0 ParseInput.py
--- a/ParseInput.py Fri Mar 12 00:19:24 2021 +0000
+++ b/ParseInput.py Fri Mar 11 23:35:52 2022 +0000
@@ -8,15 +8,39 @@
#if __name__ == '__main__':
parser = argparse.ArgumentParser()
- parser.add_argument('--masks',nargs='*')
- parser.add_argument('--image')
- parser.add_argument('--channel_names')
- parser.add_argument('--output')
+ parser.add_argument('--masks',nargs='+', required=True)
+ parser.add_argument('--image', required=True)
+ parser.add_argument('--channel_names', required=True)
+ parser.add_argument('--output', required=True)
+ parser.add_argument(
+ '--mask_props', nargs="+",
+ help="""
+ Space-separated list of additional metrics to be calculated for every mask.
+ This is for metrics that depend only on the cell mask. If the metric depends
+ on signal intensity, use --intensity_props instead.
+ See list at https://scikit-image.org/docs/dev/api/skimage.measure.html#regionprops
+ """
+ )
+ parser.add_argument(
+ '--intensity_props', nargs="+",
+ help="""
+ Space-separated list of additional metrics to be calculated for every marker separately.
+ By default, only mean intensity is calculated.
+ If the metric doesn't depend on signal intensity, use --mask_props instead.
+ See list at https://scikit-image.org/docs/dev/api/skimage.measure.html#regionprops
+ Additionally available is gini_index, which calculates a single number
+ between 0 and 1 representing how unequally the signal is distributed within each region.
+ See https://en.wikipedia.org/wiki/Gini_coefficient
+ """
+ )
#parser.add_argument('--suffix')
args = parser.parse_args()
#Create a dictionary object to pass to the next function
dict = {'masks': args.masks, 'image': args.image,\
- 'channel_names': args.channel_names,'output':args.output}
+ 'channel_names': args.channel_names,'output':args.output,
+ 'intensity_props': set(args.intensity_props if args.intensity_props is not None else []).union(["intensity_mean"]),
+ 'mask_props': args.mask_props,
+ }
#Print the dictionary object
print(dict)
#Return the dictionary
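For reference, a minimal sketch of how the new flags end up in the dictionary built above (the file names in the argument list are hypothetical; only the flag names come from the parser):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--masks', nargs='+', required=True)
parser.add_argument('--image', required=True)
parser.add_argument('--channel_names', required=True)
parser.add_argument('--output', required=True)
parser.add_argument('--mask_props', nargs='+')
parser.add_argument('--intensity_props', nargs='+')

args = parser.parse_args([
    '--masks', 'cellMasks.tif',
    '--image', 'sample.ome.tif',
    '--channel_names', 'markers.csv',
    '--output', './tool_out',
    '--intensity_props', 'gini_index', 'intensity_median',
])

# intensity_mean is always added, even when --intensity_props is omitted
intensity_props = set(args.intensity_props or []).union(['intensity_mean'])
print(sorted(intensity_props))  # ['gini_index', 'intensity_mean', 'intensity_median']
```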
diff -r 928db0f952e3 -r aba3655fdef0 SingleCellDataExtraction.py
--- a/SingleCellDataExtraction.py Fri Mar 12 00:19:24 2021 +0000
+++ b/SingleCellDataExtraction.py Fri Mar 11 23:35:52 2022 +0000
@@ -8,83 +8,93 @@
import numpy as np
import os
import skimage.measure as measure
+import tifffile
+
from pathlib import Path
-import csv
import sys
-def MaskChannel(mask_loaded,image_loaded_z):
+def gini_index(mask, intensity):
+ x = intensity[mask]
+ sorted_x = np.sort(x)
+ n = len(x)
+ cumx = np.cumsum(sorted_x, dtype=float)
+ return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n
+
+def intensity_median(mask, intensity):
+ return np.median(intensity[mask])
+
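A quick sanity check of the Gini formula above, assuming SingleCellDataExtraction.py is importable (toy arrays, not part of the patch):

```python
import numpy as np
from SingleCellDataExtraction import gini_index

mask = np.ones((4, 4), dtype=bool)      # one region covering every pixel
uniform = np.full((4, 4), 7.0)          # evenly distributed signal
concentrated = np.zeros((4, 4))
concentrated[0, 0] = 100.0              # all signal in a single pixel

print(gini_index(mask, uniform))        # 0.0 -- perfect equality
print(gini_index(mask, concentrated))   # 0.9375 (15/16), tending to 1 as n grows
```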
+def MaskChannel(mask_loaded, image_loaded_z, intensity_props=["intensity_mean"]):
"""Function for quantifying a single channel image
Returns a table of the requested intensity properties (mean intensity by default)
for the given channel for each cell"""
- print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr)
- print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr)
- dat = measure.regionprops(mask_loaded, image_loaded_z)
- n = len(dat)
- intensity_z = np.empty(n)
- for i in range(n):
- intensity_z[i] = dat[i].mean_intensity
- # Clear reference to avoid memory leak -- see MaskIDs for explanation.
- dat[i] = None
- return intensity_z
+ # Look for regionprops in skimage
+ builtin_props = set(intensity_props).intersection(measure._regionprops.PROP_VALS)
+ # Otherwise look for them in this module
+ extra_props = set(intensity_props).difference(measure._regionprops.PROP_VALS)
+ dat = measure.regionprops_table(
+ mask_loaded, image_loaded_z,
+ properties=tuple(builtin_props),
+ extra_properties=[globals()[n] for n in extra_props]
+ )
+ return dat
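The rewrite relies on skimage's regionprops_table, which returns a dict of per-region arrays and accepts plain callables (keyed by function name) for metrics it doesn't ship. A standalone sketch of that call pattern on toy arrays; note the "intensity_mean" property name requires scikit-image >= 0.19:

```python
import numpy as np
import skimage.measure as measure

label_image = np.array([[1, 1, 0],
                        [1, 1, 0],
                        [0, 0, 2]])
intensity = np.array([[2.0, 4.0, 0.0],
                      [6.0, 8.0, 0.0],
                      [0.0, 0.0, 9.0]])

def intensity_median(mask, intensity):
    # extra_properties callables receive (region_mask, region_intensity)
    return np.median(intensity[mask])

table = measure.regionprops_table(
    label_image, intensity,
    properties=("intensity_mean",),
    extra_properties=[intensity_median],
)
print(table)  # {'intensity_mean': array([5., 9.]), 'intensity_median': array([5., 9.])}
```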
-def MaskIDs(mask):
+def MaskIDs(mask, mask_props=None):
"""This function will extract the CellIDs and the XY positions for each
cell based on that cell's centroid
Returns a dictionary object"""
- dat = measure.regionprops(mask)
- n = len(dat)
+ all_mask_props = set(["label", "centroid", "area", "major_axis_length", "minor_axis_length", "eccentricity", "solidity", "extent", "orientation"])
+ if mask_props is not None:
+ all_mask_props = all_mask_props.union(mask_props)
- # Pre-allocate numpy arrays for all properties we'll calculate.
- labels = np.empty(n, int)
- xcoords = np.empty(n)
- ycoords = np.empty(n)
- area = np.empty(n, int)
- minor_axis_length = np.empty(n)
- major_axis_length = np.empty(n)
- eccentricity = np.empty(n)
- solidity = np.empty(n)
- extent = np.empty(n)
- orientation = np.empty(n)
+ dat = measure.regionprops_table(
+ mask,
+ properties=all_mask_props
+ )
- for i in range(n):
- labels[i] = dat[i].label
- xcoords[i] = dat[i].centroid[1]
- ycoords[i] = dat[i].centroid[0]
- area[i] = dat[i].area
- major_axis_length[i] = dat[i].major_axis_length
- minor_axis_length[i] = dat[i].minor_axis_length
- eccentricity[i] = dat[i].eccentricity
- solidity[i] = dat[i].solidity
- extent[i] = dat[i].extent
- orientation[i] = dat[i].orientation
- # By clearing the reference to each RegionProperties object, we allow it
- # and its cache to be garbage collected immediately. Otherwise memory
- # usage creeps up needlessly while this function is executing.
- dat[i] = None
+ name_map = {
+ "CellID": "label",
+ "X_centroid": "centroid-1",
+ "Y_centroid": "centroid-0",
+ "Area": "area",
+ "MajorAxisLength": "major_axis_length",
+ "MinorAxisLength": "minor_axis_length",
+ "Eccentricity": "eccentricity",
+ "Solidity": "solidity",
+ "Extent": "extent",
+ "Orientation": "orientation",
+ }
+ for new_name, old_name in name_map.items():
+ dat[new_name] = dat[old_name]
+ for old_name in set(name_map.values()):
+ del dat[old_name]
+
+ return dat
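For orientation, regionprops_table splits the centroid into "centroid-0" (row) and "centroid-1" (column) keys, which the name_map above turns into the histoCAT-style Y_centroid / X_centroid columns. A toy illustration (not part of the patch):

```python
import numpy as np
import skimage.measure as measure

mask = np.zeros((5, 5), dtype=int)
mask[1:3, 2:5] = 1                        # one cell spanning rows 1-2, columns 2-4

dat = measure.regionprops_table(mask, properties=("label", "centroid", "area"))
print(sorted(dat))                        # ['area', 'centroid-0', 'centroid-1', 'label']

renamed = {
    "CellID": dat["label"],
    "Y_centroid": dat["centroid-0"],      # row coordinate
    "X_centroid": dat["centroid-1"],      # column coordinate
    "Area": dat["area"],
}
print(renamed["X_centroid"], renamed["Y_centroid"])   # [3.] [1.5]
```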
- IDs = {
- "CellID": labels,
- "X_centroid": xcoords,
- "Y_centroid": ycoords,
- "column_centroid": xcoords,
- "row_centroid": ycoords,
- "Area": area,
- "MajorAxisLength": major_axis_length,
- "MinorAxisLength": minor_axis_length,
- "Eccentricity": eccentricity,
- "Solidity": solidity,
- "Extent": extent,
- "Orientation": orientation,
- }
+def n_channels(image):
+ """Returns the number of channel in the input image. Supports [OME]TIFF and HDF5."""
+
+ image_path = Path(image)
- return IDs
+ if image_path.suffix in ['.tiff', '.tif', '.btf']:
+ s = tifffile.TiffFile(image).series[0]
+ ndim = len(s.shape)
+ if ndim == 2: return 1
+ elif ndim == 3: return min(s.shape)
+ else: raise Exception('mcquant supports only 2D/3D images.')
+ elif image_path.suffix in ['.h5', '.hdf5']:
+ f = h5py.File(image, 'r')
+ dat_name = list(f.keys())[0]
+ return f[dat_name].shape[3]
+
+ else:
+ raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only')
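A sketch of the same channel-count probe outside the function, with hypothetical file names; the axis conventions ((C, Y, X) for 3-D TIFF stacks, (1, Y, X, C) for the HDF5 export) are the ones the code above assumes:

```python
import tifffile
import h5py

# TIFF / OME-TIFF: take the smallest axis of the first series as the channel count
with tifffile.TiffFile("sample.ome.tif") as tif:
    shape = tif.series[0].shape
    n_chan = 1 if len(shape) == 2 else min(shape)

# HDF5: channels are expected on the last axis of the first dataset
with h5py.File("sample.h5", "r") as f:
    dataset = f[list(f.keys())[0]]
    n_chan_h5 = dataset.shape[3]
```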
def PrepareData(image,z):
"""Function for preparing input for maskzstack function. Connecting function
@@ -94,38 +104,26 @@
print(f'{image_path} at {z}', file=sys.stderr)
#Check to see if image tif(f)
- if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf':
- #Check to see if the image is ome.tif(f)
- if image.endswith(('.ome.tif','.ome.tiff')):
- #Read the image
- image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
- #print('OME TIF(F) found')
- else:
- #Read the image
- image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
- #print('TIF(F) found')
- # Remove extra axis
- #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4]))
+ if image_path.suffix in ['.tiff', '.tif', '.btf']:
+ image_loaded_z = tifffile.imread(image, key=z)
#Check to see if image is hdf5
- elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5':
+ elif image_path.suffix in ['.h5', '.hdf5']:
#Read the image
- f = h5py.File(image,'r+')
+ f = h5py.File(image,'r')
#Get the dataset name from the h5 file
dat_name = list(f.keys())[0]
- ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data'
- #Get the image data
- image_loaded = np.array(f[dat_name])
- #Remove the first axis (ilastik convention)
- image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3]))
- ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be
- ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1)
+ #Retrieve the z^th channel
+ image_loaded_z = f[dat_name][0,:,:,z]
+
+ else:
+ raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only')
#Return the objects
return image_loaded_z
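Both branches now read a single channel plane instead of the whole stack: tifffile's key argument decodes only the selected page, and slicing an h5py dataset pulls only the requested hyperslab from disk. A minimal sketch with hypothetical file names:

```python
import tifffile
import h5py

z = 3                                         # zero-based channel index

# TIFF: decode only page z
plane = tifffile.imread("sample.ome.tif", key=z)

# HDF5: read only the (0, :, :, z) slice of the first dataset
with h5py.File("sample.h5", "r") as f:
    dat_name = list(f.keys())[0]
    plane_h5 = f[dat_name][0, :, :, z]
```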
-def MaskZstack(masks_loaded,image,channel_names_loaded):
+def MaskZstack(masks_loaded,image,channel_names_loaded, mask_props=None, intensity_props=["intensity_mean"]):
"""This function will extract the stats for each cell mask through each channel
in the input image
@@ -135,8 +133,7 @@
#Get the names of the keys for the masks dictionary
mask_names = list(masks_loaded.keys())
- #Get the CellIDs for this dataset by using only a single mask (first mask)
- IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]]))
+
#Create empty dictionary to store channel results per mask
dict_of_chan = {m_name: [] for m_name in mask_names}
#Get the z channel and the associated channel name from list of channel names
@@ -149,84 +146,79 @@
#Iterate through number of masks to extract single cell data
for nm in range(len(mask_names)):
#Use the above information to mask z stack
- dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z))
+ dict_of_chan[mask_names[nm]].append(
+ MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z, intensity_props=intensity_props)
+ )
#Print progress
print("Finished "+str(z))
- #Iterate through the rest of the masks to modify names of channels and convert to data table
+ # Column order according to histoCAT convention (move XY position to the end along with the other spatial information)
+ last_cols = (
+ "X_centroid",
+ "Y_centroid",
+ "column_centroid",
+ "row_centroid",
+ "Area",
+ "MajorAxisLength",
+ "MinorAxisLength",
+ "Eccentricity",
+ "Solidity",
+ "Extent",
+ "Orientation",
+ )
+ def col_sort(x):
+ if x == "CellID":
+ return -2
+ try:
+ return last_cols.index(x)
+ except ValueError:
+ return -1
+
+ #Iterate through the masks and format quantifications for each mask and property
for nm in mask_names:
- #Check if this is the first mask
- if nm == mask_names[0]:
- #Create channel names for this mask
- new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
- #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY
- dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1)
- #Get the name of the columns in the dataframe so we can reorder to histoCAT convention
- cols = list(dict_of_chan[nm].columns.values)
- #Reorder the list (Move xy position to end with spatial information)
- cols.append(cols.pop(cols.index("X_centroid")))
- cols.append(cols.pop(cols.index("Y_centroid")))
- cols.append(cols.pop(cols.index("column_centroid")))
- cols.append(cols.pop(cols.index("row_centroid")))
- cols.append(cols.pop(cols.index("Area")))
- cols.append(cols.pop(cols.index("MajorAxisLength")))
- cols.append(cols.pop(cols.index("MinorAxisLength")))
- cols.append(cols.pop(cols.index("Eccentricity")))
- cols.append(cols.pop(cols.index("Solidity")))
- cols.append(cols.pop(cols.index("Extent")))
- cols.append(cols.pop(cols.index("Orientation")))
- #Reindex the dataframe with new order
- dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols)
- #Otherwise, add no spatial information
- else:
- #Create channel names for this mask
- new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
- #Use the above information to mask z stack
- dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))
+ mask_dict = {}
+ # Mean intensity is the default property and is stored without a suffix
+ mask_dict.update(
+ zip(channel_names_loaded, [x["intensity_mean"] for x in dict_of_chan[nm]])
+ )
+ # All other properties are suffixed with their names
+ for prop_n in set(dict_of_chan[nm][0].keys()).difference(["intensity_mean"]):
+ mask_dict.update(
+ zip([f"{n}_{prop_n}" for n in channel_names_loaded], [x[prop_n] for x in dict_of_chan[nm]])
+ )
+ # Get the cell IDs and mask properties
+ mask_properties = pd.DataFrame(MaskIDs(masks_loaded[nm], mask_props=mask_props))
+ mask_dict.update(mask_properties)
+ dict_of_chan[nm] = pd.DataFrame(mask_dict).reindex(columns=sorted(mask_dict.keys(), key=col_sort))
- #Concatenate all data from all masks to return
- dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1)
+ # Return the dict of dataframes for each mask
+ return dict_of_chan
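Since Python's sort is stable, col_sort leaves marker columns in their original order while pinning CellID first and the spatial/morphology columns last. A small demonstration with made-up marker names:

```python
last_cols = ("X_centroid", "Y_centroid", "column_centroid", "row_centroid",
             "Area", "MajorAxisLength", "MinorAxisLength", "Eccentricity",
             "Solidity", "Extent", "Orientation")

def col_sort(x):
    if x == "CellID":
        return -2
    try:
        return last_cols.index(x)
    except ValueError:
        return -1                      # marker columns keep their relative order

cols = ["DNA_1", "CD45", "X_centroid", "Y_centroid", "Area", "CellID"]
print(sorted(cols, key=col_sort))
# ['CellID', 'DNA_1', 'CD45', 'X_centroid', 'Y_centroid', 'Area']
```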
- #Return the dataframe
- return dat
-
-
-def ExtractSingleCells(masks,image,channel_names,output):
+def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]):
"""Function for extracting single cell information from input
path containing single-cell masks, z_stack path, and channel_names path."""
#Create pathlib object for output
output = Path(output)
- #Check if header available
- #sniffer = csv.Sniffer()
- #sniffer.has_header(open(channel_names).readline())
- #If header not available
- #if not sniffer:
- #If header available
- #channel_names_loaded = pd.read_csv(channel_names)
- #channel_names_loaded_list = list(channel_names_loaded.marker_name)
- #else:
- #print("negative")
- #old one column version
- #channel_names_loaded = pd.read_csv(channel_names,header=None)
- #Add a column index for ease
- #channel_names_loaded.columns = ["marker"]
- #channel_names_loaded = list(channel_names_loaded.marker.values)
-
#Read csv channel names
channel_names_loaded = pd.read_csv(channel_names)
- #Check for size of columns
- if channel_names_loaded.shape[1] > 1:
+ #Check for the presence of `marker_name` column
+ if 'marker_name' in channel_names_loaded:
#Get the marker_name column if more than one column (CyCIF structure)
channel_names_loaded_list = list(channel_names_loaded.marker_name)
- else:
- #old one column version -- re-read the csv file and add column name
+ #Consider the old one-marker-per-line plain text format
+ elif channel_names_loaded.shape[1] == 1:
+ #re-read the csv file and add column name
channel_names_loaded = pd.read_csv(channel_names, header = None)
- #Add a column index for ease and for standardization
- channel_names_loaded.columns = ["marker"]
- channel_names_loaded_list = list(channel_names_loaded.marker)
+ channel_names_loaded_list = list(channel_names_loaded.iloc[:,0])
+ else:
+ raise Exception('%s must contain the marker_name column'%channel_names)
+ #Check that the number of markers matches the number of channels in the image
+ if len(channel_names_loaded_list) != n_channels(image):
+ raise Exception("The number of channels in %s doesn't match the image"%channel_names)
+
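The two accepted channel-name formats, mirrored with toy file contents (the channel_number column in the first example is illustrative; only marker_name is required):

```python
import io
import pandas as pd

# csv with a header: the marker_name column is used
cycif_csv = io.StringIO("channel_number,marker_name\n1,DNA_1\n2,CD45\n")
names = list(pd.read_csv(cycif_csv).marker_name)           # ['DNA_1', 'CD45']

# old one-marker-per-line format without a header
plain = io.StringIO("DNA_1\nCD45\n")
names = list(pd.read_csv(plain, header=None).iloc[:, 0])   # ['DNA_1', 'CD45']
```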
#Check for unique marker names -- create new list to store new names
channel_names_loaded_checked = []
for idx,val in enumerate(channel_names_loaded_list):
@@ -238,9 +230,6 @@
#Otherwise, leave channel name
channel_names_loaded_checked.append(val)
- #Clear small memory amount by clearing old channel names
- channel_names_loaded, channel_names_loaded_list = None, None
-
#Read the masks
masks_loaded = {}
#iterate through mask paths and read images to add to dictionary object
@@ -249,22 +238,30 @@
m_name = m_full_name.split('.')[0]
masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')})
- scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked)
+ scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked, mask_props=mask_props, intensity_props=intensity_props)
#Write the single cell data to a csv file using the image name
im_full_name = os.path.basename(image)
im_name = im_full_name.split('.')[0]
- scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False)
+
+ # iterate through each mask and export csv with mask name as suffix
+ for mask_name, mask_data in scdata_z.items():
+ # export the csv for this mask name
+ mask_data.to_csv(
+ str(Path(os.path.join(str(output), im_name + "_" + mask_name + ".csv"))),
+ index=False
+ )
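The export writes one csv per mask, suffixed with the mask's base name, e.g. image sample.ome.tif quantified with cellMasks.tif lands in tool_out/sample_cellMasks.csv; this is why the Galaxy wrapper below copies tool_out/*cellMasks.csv. A sketch of the naming rule (hypothetical file names):

```python
import os

image = "sample.ome.tif"
mask = "cellMasks.tif"

im_name = os.path.basename(image).split('.')[0]   # "sample"
m_name = os.path.basename(mask).split('.')[0]     # "cellMasks"
print(f"{im_name}_{m_name}.csv")                  # sample_cellMasks.csv
```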
-def MultiExtractSingleCells(masks,image,channel_names,output):
+def MultiExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]):
"""Function for iterating over a list of z_stacks and output locations to
export single-cell data from image masks"""
print("Extracting single-cell data for "+str(image)+'...')
#Run the ExtractSingleCells function for this image
- ExtractSingleCells(masks,image,channel_names,output)
+ ExtractSingleCells(masks,image,channel_names,output, mask_props=mask_props, intensity_props=intensity_props)
#Print update
im_full_name = os.path.basename(image)
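An end-to-end sketch of the Python entry point with hypothetical paths; the keyword arguments match what ParseInput.py forwards:

```python
from SingleCellDataExtraction import MultiExtractSingleCells

MultiExtractSingleCells(
    masks=["cellMasks.tif", "nucleiMasks.tif"],        # one csv is written per mask
    image="sample.ome.tif",
    channel_names="markers.csv",                       # must list one name per channel
    output="./tool_out",
    mask_props=["perimeter"],                          # extra regionprops on the mask alone
    intensity_props=["intensity_mean", "gini_index"],  # per-marker metrics
)
```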
diff -r 928db0f952e3 -r aba3655fdef0 macros.xml
--- a/macros.xml Fri Mar 12 00:19:24 2021 +0000
+++ b/macros.xml Fri Mar 11 23:35:52 2022 +0000
@@ -2,12 +2,13 @@
- python
- scikit-image
- h5py
- pandas
- numpy
- pathlib
+ labsyspharm/quantification:@VERSION@
+ python
+ scikit-image
+ h5py
+ pandas
+ numpy
+ pathlib
@@ -19,6 +20,13 @@
- 1.3.1
- python ${__tool_directory__}/CommandSingleCellExtraction.py
+ 1.5.1
+
diff -r 928db0f952e3 -r aba3655fdef0 quantification.xml
--- a/quantification.xml Fri Mar 12 00:19:24 2021 +0000
+++ b/quantification.xml Fri Mar 11 23:35:52 2022 +0000
@@ -18,6 +18,7 @@
@CMD_BEGIN@
+ python \$QUANT_PATH
--masks
'${primary_mask.name}'.ome.tiff
#if $supp_masks
@@ -28,9 +29,17 @@
--image '${image.name}'.ome.tiff
--output ./tool_out
+
+ #if $mask_props
+ --mask_props $mask_props
+ #end if
+ #if $intensity_props
+ --intensity_props $intensity_props
+ #end if
+
--channel_names '$channel_names';
- mv ./tool_out/*.csv ./tool_out/quantified.csv;
+ cp tool_out/*cellMasks.csv cellMasks.csv
]]>