Mercurial > repos > perssond > quantification
changeset 1:aba3655fdef0 draft
"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit 897a7dc7cb43e45d6f0fdfe2b2970e59f20f8853"
author | watsocam |
---|---|
date | Fri, 11 Mar 2022 23:35:52 +0000 |
parents | 928db0f952e3 |
children | 46b897eb2c8e |
files | ParseInput.py SingleCellDataExtraction.py macros.xml quantification.xml |
diffstat | 4 files changed, 206 insertions(+), 163 deletions(-) [+] |
line wrap: on
line diff
--- a/ParseInput.py Fri Mar 12 00:19:24 2021 +0000 +++ b/ParseInput.py Fri Mar 11 23:35:52 2022 +0000 @@ -8,15 +8,39 @@ #if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--masks',nargs='*') - parser.add_argument('--image') - parser.add_argument('--channel_names') - parser.add_argument('--output') + parser.add_argument('--masks',nargs='+', required=True) + parser.add_argument('--image', required=True) + parser.add_argument('--channel_names', required=True) + parser.add_argument('--output', required=True) + parser.add_argument( + '--mask_props', nargs = "+", + help=""" + Space separated list of additional metrics to be calculated for every mask. + This is for metrics that depend only on the cell mask. If the metric depends + on signal intensity, use --intensity-props instead. + See list at https://scikit-image.org/docs/dev/api/skimage.measure.html#regionprops + """ + ) + parser.add_argument( + '--intensity_props', nargs = "+", + help=""" + Space separated list of additional metrics to be calculated for every marker separately. + By default only mean intensity is calculated. + If the metric doesn't depend on signal intensity, use --mask-props instead. + See list at https://scikit-image.org/docs/dev/api/skimage.measure.html#regionprops + Additionally available is gini_index, which calculates a single number + between 0 and 1, representing how unequal the signal is distributed in each region. + See https://en.wikipedia.org/wiki/Gini_coefficient + """ + ) #parser.add_argument('--suffix') args = parser.parse_args() #Create a dictionary object to pass to the next function dict = {'masks': args.masks, 'image': args.image,\ - 'channel_names': args.channel_names,'output':args.output} + 'channel_names': args.channel_names,'output':args.output, + 'intensity_props': set(args.intensity_props if args.intensity_props is not None else []).union(["intensity_mean"]), + 'mask_props': args.mask_props, + } #Print the dictionary object print(dict) #Return the dictionary
--- a/SingleCellDataExtraction.py Fri Mar 12 00:19:24 2021 +0000 +++ b/SingleCellDataExtraction.py Fri Mar 11 23:35:52 2022 +0000 @@ -8,83 +8,93 @@ import numpy as np import os import skimage.measure as measure +import tifffile + from pathlib import Path -import csv import sys -def MaskChannel(mask_loaded,image_loaded_z): +def gini_index(mask, intensity): + x = intensity[mask] + sorted_x = np.sort(x) + n = len(x) + cumx = np.cumsum(sorted_x, dtype=float) + return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n + +def intensity_median(mask, intensity): + return np.median(intensity[mask]) + +def MaskChannel(mask_loaded, image_loaded_z, intensity_props=["intensity_mean"]): """Function for quantifying a single channel image Returns a table with CellID according to the mask and the mean pixel intensity for the given channel for each cell""" - print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr) - print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr) - dat = measure.regionprops(mask_loaded, image_loaded_z) - n = len(dat) - intensity_z = np.empty(n) - for i in range(n): - intensity_z[i] = dat[i].mean_intensity - # Clear reference to avoid memory leak -- see MaskIDs for explanation. - dat[i] = None - return intensity_z + # Look for regionprops in skimage + builtin_props = set(intensity_props).intersection(measure._regionprops.PROP_VALS) + # Otherwise look for them in this module + extra_props = set(intensity_props).difference(measure._regionprops.PROP_VALS) + dat = measure.regionprops_table( + mask_loaded, image_loaded_z, + properties = tuple(builtin_props), + extra_properties = [globals()[n] for n in extra_props] + ) + return dat -def MaskIDs(mask): +def MaskIDs(mask, mask_props=None): """This function will extract the CellIDs and the XY positions for each cell based on that cells centroid Returns a dictionary object""" - dat = measure.regionprops(mask) - n = len(dat) + all_mask_props = set(["label", "centroid", "area", "major_axis_length", "minor_axis_length", "eccentricity", "solidity", "extent", "orientation"]) + if mask_props is not None: + all_mask_props = all_mask_props.union(mask_props) - # Pre-allocate numpy arrays for all properties we'll calculate. - labels = np.empty(n, int) - xcoords = np.empty(n) - ycoords = np.empty(n) - area = np.empty(n, int) - minor_axis_length = np.empty(n) - major_axis_length = np.empty(n) - eccentricity = np.empty(n) - solidity = np.empty(n) - extent = np.empty(n) - orientation = np.empty(n) + dat = measure.regionprops_table( + mask, + properties=all_mask_props + ) - for i in range(n): - labels[i] = dat[i].label - xcoords[i] = dat[i].centroid[1] - ycoords[i] = dat[i].centroid[0] - area[i] = dat[i].area - major_axis_length[i] = dat[i].major_axis_length - minor_axis_length[i] = dat[i].minor_axis_length - eccentricity[i] = dat[i].eccentricity - solidity[i] = dat[i].solidity - extent[i] = dat[i].extent - orientation[i] = dat[i].orientation - # By clearing the reference to each RegionProperties object, we allow it - # and its cache to be garbage collected immediately. Otherwise memory - # usage creeps up needlessly while this function is executing. - dat[i] = None + name_map = { + "CellID": "label", + "X_centroid": "centroid-1", + "Y_centroid": "centroid-0", + "Area": "area", + "MajorAxisLength": "major_axis_length", + "MinorAxisLength": "minor_axis_length", + "Eccentricity": "eccentricity", + "Solidity": "solidity", + "Extent": "extent", + "Orientation": "orientation", + } + for new_name, old_name in name_map.items(): + dat[new_name] = dat[old_name] + for old_name in set(name_map.values()): + del dat[old_name] + + return dat - IDs = { - "CellID": labels, - "X_centroid": xcoords, - "Y_centroid": ycoords, - "column_centroid": xcoords, - "row_centroid": ycoords, - "Area": area, - "MajorAxisLength": major_axis_length, - "MinorAxisLength": minor_axis_length, - "Eccentricity": eccentricity, - "Solidity": solidity, - "Extent": extent, - "Orientation": orientation, - } +def n_channels(image): + """Returns the number of channel in the input image. Supports [OME]TIFF and HDF5.""" + + image_path = Path(image) - return IDs + if image_path.suffix in ['.tiff', '.tif', '.btf']: + s = tifffile.TiffFile(image).series[0] + ndim = len(s.shape) + if ndim == 2: return 1 + elif ndim == 3: return min(s.shape) + else: raise Exception('mcquant supports only 2D/3D images.') + elif image_path.suffix in ['.h5', '.hdf5']: + f = h5py.File(image, 'r') + dat_name = list(f.keys())[0] + return f[dat_name].shape[3] + + else: + raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only') def PrepareData(image,z): """Function for preparing input for maskzstack function. Connecting function @@ -94,38 +104,26 @@ print(f'{image_path} at {z}', file=sys.stderr) #Check to see if image tif(f) - if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf': - #Check to see if the image is ome.tif(f) - if image.endswith(('.ome.tif','.ome.tiff')): - #Read the image - image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') - #print('OME TIF(F) found') - else: - #Read the image - image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') - #print('TIF(F) found') - # Remove extra axis - #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4])) + if image_path.suffix in ['.tiff', '.tif', '.btf']: + image_loaded_z = tifffile.imread(image, key=z) #Check to see if image is hdf5 - elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5': + elif image_path.suffix in ['.h5', '.hdf5']: #Read the image - f = h5py.File(image,'r+') + f = h5py.File(image,'r') #Get the dataset name from the h5 file dat_name = list(f.keys())[0] - ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data' - #Get the image data - image_loaded = np.array(f[dat_name]) - #Remove the first axis (ilastik convention) - image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3])) - ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be - ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1) + #Retrieve the z^th channel + image_loaded_z = f[dat_name][0,:,:,z] + + else: + raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only') #Return the objects return image_loaded_z -def MaskZstack(masks_loaded,image,channel_names_loaded): +def MaskZstack(masks_loaded,image,channel_names_loaded, mask_props=None, intensity_props=["intensity_mean"]): """This function will extract the stats for each cell mask through each channel in the input image @@ -135,8 +133,7 @@ #Get the names of the keys for the masks dictionary mask_names = list(masks_loaded.keys()) - #Get the CellIDs for this dataset by using only a single mask (first mask) - IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]])) + #Create empty dictionary to store channel results per mask dict_of_chan = {m_name: [] for m_name in mask_names} #Get the z channel and the associated channel name from list of channel names @@ -149,84 +146,79 @@ #Iterate through number of masks to extract single cell data for nm in range(len(mask_names)): #Use the above information to mask z stack - dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z)) + dict_of_chan[mask_names[nm]].append( + MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z, intensity_props=intensity_props) + ) #Print progress print("Finished "+str(z)) - #Iterate through the rest of the masks to modify names of channels and convert to data table + # Column order according to histoCAT convention (Move xy position to end with spatial information) + last_cols = ( + "X_centroid", + "Y_centroid", + "column_centroid", + "row_centroid", + "Area", + "MajorAxisLength", + "MinorAxisLength", + "Eccentricity", + "Solidity", + "Extent", + "Orientation", + ) + def col_sort(x): + if x == "CellID": + return -2 + try: + return last_cols.index(x) + except ValueError: + return -1 + + #Iterate through the masks and format quantifications for each mask and property for nm in mask_names: - #Check if this is the first mask - if nm == mask_names[0]: - #Create channel names for this mask - new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] - #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY - dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1) - #Get the name of the columns in the dataframe so we can reorder to histoCAT convention - cols = list(dict_of_chan[nm].columns.values) - #Reorder the list (Move xy position to end with spatial information) - cols.append(cols.pop(cols.index("X_centroid"))) - cols.append(cols.pop(cols.index("Y_centroid"))) - cols.append(cols.pop(cols.index("column_centroid"))) - cols.append(cols.pop(cols.index("row_centroid"))) - cols.append(cols.pop(cols.index("Area"))) - cols.append(cols.pop(cols.index("MajorAxisLength"))) - cols.append(cols.pop(cols.index("MinorAxisLength"))) - cols.append(cols.pop(cols.index("Eccentricity"))) - cols.append(cols.pop(cols.index("Solidity"))) - cols.append(cols.pop(cols.index("Extent"))) - cols.append(cols.pop(cols.index("Orientation"))) - #Reindex the dataframe with new order - dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols) - #Otherwise, add no spatial information - else: - #Create channel names for this mask - new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] - #Use the above information to mask z stack - dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm]))) + mask_dict = {} + # Mean intensity is default property, stored without suffix + mask_dict.update( + zip(channel_names_loaded, [x["intensity_mean"] for x in dict_of_chan[nm]]) + ) + # All other properties are suffixed with their names + for prop_n in set(dict_of_chan[nm][0].keys()).difference(["intensity_mean"]): + mask_dict.update( + zip([f"{n}_{prop_n}" for n in channel_names_loaded], [x[prop_n] for x in dict_of_chan[nm]]) + ) + # Get the cell IDs and mask properties + mask_properties = pd.DataFrame(MaskIDs(masks_loaded[nm], mask_props=mask_props)) + mask_dict.update(mask_properties) + dict_of_chan[nm] = pd.DataFrame(mask_dict).reindex(columns=sorted(mask_dict.keys(), key=col_sort)) - #Concatenate all data from all masks to return - dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1) + # Return the dict of dataframes for each mask + return dict_of_chan - #Return the dataframe - return dat - - -def ExtractSingleCells(masks,image,channel_names,output): +def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]): """Function for extracting single cell information from input path containing single-cell masks, z_stack path, and channel_names path.""" #Create pathlib object for output output = Path(output) - #Check if header available - #sniffer = csv.Sniffer() - #sniffer.has_header(open(channel_names).readline()) - #If header not available - #if not sniffer: - #If header available - #channel_names_loaded = pd.read_csv(channel_names) - #channel_names_loaded_list = list(channel_names_loaded.marker_name) - #else: - #print("negative") - #old one column version - #channel_names_loaded = pd.read_csv(channel_names,header=None) - #Add a column index for ease - #channel_names_loaded.columns = ["marker"] - #channel_names_loaded = list(channel_names_loaded.marker.values) - #Read csv channel names channel_names_loaded = pd.read_csv(channel_names) - #Check for size of columns - if channel_names_loaded.shape[1] > 1: + #Check for the presence of `marker_name` column + if 'marker_name' in channel_names_loaded: #Get the marker_name column if more than one column (CyCIF structure) channel_names_loaded_list = list(channel_names_loaded.marker_name) - else: - #old one column version -- re-read the csv file and add column name + #Consider the old one-marker-per-line plain text format + elif channel_names_loaded.shape[1] == 1: + #re-read the csv file and add column name channel_names_loaded = pd.read_csv(channel_names, header = None) - #Add a column index for ease and for standardization - channel_names_loaded.columns = ["marker"] - channel_names_loaded_list = list(channel_names_loaded.marker) + channel_names_loaded_list = list(channel_names_loaded.iloc[:,0]) + else: + raise Exception('%s must contain the marker_name column'%channel_names) + #Contrast against the number of markers in the image + if len(channel_names_loaded_list) != n_channels(image): + raise Exception("The number of channels in %s doesn't match the image"%channel_names) + #Check for unique marker names -- create new list to store new names channel_names_loaded_checked = [] for idx,val in enumerate(channel_names_loaded_list): @@ -238,9 +230,6 @@ #Otherwise, leave channel name channel_names_loaded_checked.append(val) - #Clear small memory amount by clearing old channel names - channel_names_loaded, channel_names_loaded_list = None, None - #Read the masks masks_loaded = {} #iterate through mask paths and read images to add to dictionary object @@ -249,22 +238,30 @@ m_name = m_full_name.split('.')[0] masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')}) - scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked) + scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked, mask_props=mask_props, intensity_props=intensity_props) #Write the singe cell data to a csv file using the image name im_full_name = os.path.basename(image) im_name = im_full_name.split('.')[0] - scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False) + + # iterate through each mask and export csv with mask name as suffix + for k,v in scdata_z.items(): + # export the csv for this mask name + scdata_z[k].to_csv( + str(Path(os.path.join(str(output), + str(im_name+"_{}"+".csv").format(k)))), + index=False + ) -def MultiExtractSingleCells(masks,image,channel_names,output): +def MultiExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]): """Function for iterating over a list of z_stacks and output locations to export single-cell data from image masks""" print("Extracting single-cell data for "+str(image)+'...') #Run the ExtractSingleCells function for this image - ExtractSingleCells(masks,image,channel_names,output) + ExtractSingleCells(masks,image,channel_names,output, mask_props=mask_props, intensity_props=intensity_props) #Print update im_full_name = os.path.basename(image)
--- a/macros.xml Fri Mar 12 00:19:24 2021 +0000 +++ b/macros.xml Fri Mar 11 23:35:52 2022 +0000 @@ -2,12 +2,13 @@ <macros> <xml name="requirements"> <requirements> - <requirement type="package" version="3.6.10">python</requirement> - <requirement type="package" version="0.17.2">scikit-image</requirement> - <requirement type="package" version="2.10.0">h5py</requirement> - <requirement type="package" version="1.0.4">pandas</requirement> - <requirement type="package" version="1.18.5">numpy</requirement> - <requirement type="package" version="1.0.1">pathlib</requirement> + <container type="docker">labsyspharm/quantification:@VERSION@</container> + <requirement type="package" version="3.9">python</requirement> + <requirement type="package" version="0.18.0">scikit-image</requirement> + <requirement type="package">h5py</requirement> + <requirement type="package">pandas</requirement> + <requirement type="package">numpy</requirement> + <requirement type="package">pathlib</requirement> </requirements> </xml> @@ -19,6 +20,13 @@ </citations> </xml> - <token name="@VERSION@">1.3.1</token> - <token name="@CMD_BEGIN@">python ${__tool_directory__}/CommandSingleCellExtraction.py</token> + <token name="@VERSION@">1.5.1</token> + <token name="@CMD_BEGIN@"><![CDATA[ + QUANT_PATH=""; + if [ -f "/app/CommandSingleCellExtraction.py" ]; then + export QUANT_PATH="/app/CommandSingleCellExtraction.py"; + else + export QUANT_PATH="${__tool_directory__}/CommandSingleCellExtraction.py"; + fi; + ]]></token> </macros>
--- a/quantification.xml Fri Mar 12 00:19:24 2021 +0000 +++ b/quantification.xml Fri Mar 11 23:35:52 2022 +0000 @@ -18,6 +18,7 @@ @CMD_BEGIN@ + python \$QUANT_PATH --masks '${primary_mask.name}'.ome.tiff #if $supp_masks @@ -28,9 +29,17 @@ --image '${image.name}'.ome.tiff --output ./tool_out + + #if $mask_props + --mask_props $mask_props + #end if + #if $intensity_props + --intensity_props $intensity_props + #end if + --channel_names '$channel_names'; - mv ./tool_out/*.csv ./tool_out/quantified.csv; + cp tool_out/*cellMasks.csv cellMasks.csv ]]></command> <inputs> @@ -38,11 +47,16 @@ <param name="primary_mask" type="data" format="tiff" label="Primary Cell Mask"/> <param name="supp_masks" type="data" multiple="true" optional="true" format="tiff" label="Additional Cell Masks"/> <param name="channel_names" type="data" format="csv" label="Marker Channels"/> + <param name="mask_props" type="text" label="Mask Metrics" help="Space separated list of additional metrics to be calculated for every mask."/> + <param name="intensity_props" type="text" label="Intensity Metrics" help="Space separated list of additional metrics to be calculated for every marker separately."/> </inputs> <outputs> - <data format="csv" name="quant_out" from_work_dir="./tool_out/quantified.csv" label="${tool.name} on ${on_string}"/> - </outputs> + <data format="csv" name="cellmask" from_work_dir="cellMasks.csv" label="CellMaskQuant"/> + <collection type="list" name="quantification" label="${tool.name} on ${on_string}"> + <discover_datasets pattern="__designation_and_ext__" format="csv" directory="tool_out/" visible="true"/> + </collection> + </outputs> <help><![CDATA[ # Single cell quantification Module for single-cell data extraction given a segmentation mask and multi-channel image. The CSV structure is aligned with histoCAT output.