Mercurial > repos > perssond > quantification
changeset 0:928db0f952e3 draft
"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit a4349062e9177b5e60fb7c49115c57299e0d648d-dirty"
author | perssond |
---|---|
date | Fri, 12 Mar 2021 00:19:24 +0000 |
parents | |
children | aba3655fdef0 |
files | CommandSingleCellExtraction.py ParseInput.py SingleCellDataExtraction.py macros.xml quantification.xml |
diffstat | 5 files changed, 401 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CommandSingleCellExtraction.py Fri Mar 12 00:19:24 2021 +0000 @@ -0,0 +1,11 @@ +#Script for parsing command line arguments and running single-cell +#data extraction functions +#Joshua Hess +import ParseInput +import SingleCellDataExtraction + +#Parse the command line arguments +args = ParseInput.ParseInputDataExtract() + +#Run the MultiExtractSingleCells function +SingleCellDataExtraction.MultiExtractSingleCells(**args)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ParseInput.py Fri Mar 12 00:19:24 2021 +0000 @@ -0,0 +1,23 @@ +#Functions for parsing command line arguments for ome ilastik prep +import argparse + + +def ParseInputDataExtract(): + """Function for parsing command line arguments for input to single-cell + data extraction""" + +#if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--masks',nargs='*') + parser.add_argument('--image') + parser.add_argument('--channel_names') + parser.add_argument('--output') + #parser.add_argument('--suffix') + args = parser.parse_args() + #Create a dictionary object to pass to the next function + dict = {'masks': args.masks, 'image': args.image,\ + 'channel_names': args.channel_names,'output':args.output} + #Print the dictionary object + print(dict) + #Return the dictionary + return dict
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SingleCellDataExtraction.py Fri Mar 12 00:19:24 2021 +0000 @@ -0,0 +1,272 @@ +#Functions for reading in single cell imaging data +#Joshua Hess + +#Import necessary modules +import skimage.io +import h5py +import pandas as pd +import numpy as np +import os +import skimage.measure as measure +from pathlib import Path +import csv + +import sys + + +def MaskChannel(mask_loaded,image_loaded_z): + """Function for quantifying a single channel image + + Returns a table with CellID according to the mask and the mean pixel intensity + for the given channel for each cell""" + print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr) + print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr) + dat = measure.regionprops(mask_loaded, image_loaded_z) + n = len(dat) + intensity_z = np.empty(n) + for i in range(n): + intensity_z[i] = dat[i].mean_intensity + # Clear reference to avoid memory leak -- see MaskIDs for explanation. + dat[i] = None + return intensity_z + + +def MaskIDs(mask): + """This function will extract the CellIDs and the XY positions for each + cell based on that cells centroid + + Returns a dictionary object""" + + dat = measure.regionprops(mask) + n = len(dat) + + # Pre-allocate numpy arrays for all properties we'll calculate. + labels = np.empty(n, int) + xcoords = np.empty(n) + ycoords = np.empty(n) + area = np.empty(n, int) + minor_axis_length = np.empty(n) + major_axis_length = np.empty(n) + eccentricity = np.empty(n) + solidity = np.empty(n) + extent = np.empty(n) + orientation = np.empty(n) + + for i in range(n): + labels[i] = dat[i].label + xcoords[i] = dat[i].centroid[1] + ycoords[i] = dat[i].centroid[0] + area[i] = dat[i].area + major_axis_length[i] = dat[i].major_axis_length + minor_axis_length[i] = dat[i].minor_axis_length + eccentricity[i] = dat[i].eccentricity + solidity[i] = dat[i].solidity + extent[i] = dat[i].extent + orientation[i] = dat[i].orientation + # By clearing the reference to each RegionProperties object, we allow it + # and its cache to be garbage collected immediately. Otherwise memory + # usage creeps up needlessly while this function is executing. + dat[i] = None + + IDs = { + "CellID": labels, + "X_centroid": xcoords, + "Y_centroid": ycoords, + "column_centroid": xcoords, + "row_centroid": ycoords, + "Area": area, + "MajorAxisLength": major_axis_length, + "MinorAxisLength": minor_axis_length, + "Eccentricity": eccentricity, + "Solidity": solidity, + "Extent": extent, + "Orientation": orientation, + } + + return IDs + + +def PrepareData(image,z): + """Function for preparing input for maskzstack function. Connecting function + to use with mc micro ilastik pipeline""" + + image_path = Path(image) + print(f'{image_path} at {z}', file=sys.stderr) + + #Check to see if image tif(f) + if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf': + #Check to see if the image is ome.tif(f) + if image.endswith(('.ome.tif','.ome.tiff')): + #Read the image + image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') + #print('OME TIF(F) found') + else: + #Read the image + image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') + #print('TIF(F) found') + # Remove extra axis + #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4])) + + #Check to see if image is hdf5 + elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5': + #Read the image + f = h5py.File(image,'r+') + #Get the dataset name from the h5 file + dat_name = list(f.keys())[0] + ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data' + #Get the image data + image_loaded = np.array(f[dat_name]) + #Remove the first axis (ilastik convention) + image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3])) + ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be + ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1) + + #Return the objects + return image_loaded_z + + +def MaskZstack(masks_loaded,image,channel_names_loaded): + """This function will extract the stats for each cell mask through each channel + in the input image + + mask_loaded: dictionary containing Tiff masks that represents the cells in your image. + + z_stack: Multichannel z stack image""" + + #Get the names of the keys for the masks dictionary + mask_names = list(masks_loaded.keys()) + #Get the CellIDs for this dataset by using only a single mask (first mask) + IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]])) + #Create empty dictionary to store channel results per mask + dict_of_chan = {m_name: [] for m_name in mask_names} + #Get the z channel and the associated channel name from list of channel names + print(f'channels: {channel_names_loaded}', file=sys.stderr) + print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr) + for z in range(len(channel_names_loaded)): + #Run the data Prep function + image_loaded_z = PrepareData(image,z) + + #Iterate through number of masks to extract single cell data + for nm in range(len(mask_names)): + #Use the above information to mask z stack + dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z)) + #Print progress + print("Finished "+str(z)) + + #Iterate through the rest of the masks to modify names of channels and convert to data table + for nm in mask_names: + #Check if this is the first mask + if nm == mask_names[0]: + #Create channel names for this mask + new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] + #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY + dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1) + #Get the name of the columns in the dataframe so we can reorder to histoCAT convention + cols = list(dict_of_chan[nm].columns.values) + #Reorder the list (Move xy position to end with spatial information) + cols.append(cols.pop(cols.index("X_centroid"))) + cols.append(cols.pop(cols.index("Y_centroid"))) + cols.append(cols.pop(cols.index("column_centroid"))) + cols.append(cols.pop(cols.index("row_centroid"))) + cols.append(cols.pop(cols.index("Area"))) + cols.append(cols.pop(cols.index("MajorAxisLength"))) + cols.append(cols.pop(cols.index("MinorAxisLength"))) + cols.append(cols.pop(cols.index("Eccentricity"))) + cols.append(cols.pop(cols.index("Solidity"))) + cols.append(cols.pop(cols.index("Extent"))) + cols.append(cols.pop(cols.index("Orientation"))) + #Reindex the dataframe with new order + dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols) + #Otherwise, add no spatial information + else: + #Create channel names for this mask + new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] + #Use the above information to mask z stack + dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm]))) + + #Concatenate all data from all masks to return + dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1) + + #Return the dataframe + return dat + + +def ExtractSingleCells(masks,image,channel_names,output): + """Function for extracting single cell information from input + path containing single-cell masks, z_stack path, and channel_names path.""" + + #Create pathlib object for output + output = Path(output) + + #Check if header available + #sniffer = csv.Sniffer() + #sniffer.has_header(open(channel_names).readline()) + #If header not available + #if not sniffer: + #If header available + #channel_names_loaded = pd.read_csv(channel_names) + #channel_names_loaded_list = list(channel_names_loaded.marker_name) + #else: + #print("negative") + #old one column version + #channel_names_loaded = pd.read_csv(channel_names,header=None) + #Add a column index for ease + #channel_names_loaded.columns = ["marker"] + #channel_names_loaded = list(channel_names_loaded.marker.values) + + #Read csv channel names + channel_names_loaded = pd.read_csv(channel_names) + #Check for size of columns + if channel_names_loaded.shape[1] > 1: + #Get the marker_name column if more than one column (CyCIF structure) + channel_names_loaded_list = list(channel_names_loaded.marker_name) + else: + #old one column version -- re-read the csv file and add column name + channel_names_loaded = pd.read_csv(channel_names, header = None) + #Add a column index for ease and for standardization + channel_names_loaded.columns = ["marker"] + channel_names_loaded_list = list(channel_names_loaded.marker) + + #Check for unique marker names -- create new list to store new names + channel_names_loaded_checked = [] + for idx,val in enumerate(channel_names_loaded_list): + #Check for unique value + if channel_names_loaded_list.count(val) > 1: + #If unique count greater than one, add suffix + channel_names_loaded_checked.append(val + "_"+ str(channel_names_loaded_list[:idx].count(val) + 1)) + else: + #Otherwise, leave channel name + channel_names_loaded_checked.append(val) + + #Clear small memory amount by clearing old channel names + channel_names_loaded, channel_names_loaded_list = None, None + + #Read the masks + masks_loaded = {} + #iterate through mask paths and read images to add to dictionary object + for m in masks: + m_full_name = os.path.basename(m) + m_name = m_full_name.split('.')[0] + masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')}) + + scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked) + #Write the singe cell data to a csv file using the image name + + im_full_name = os.path.basename(image) + im_name = im_full_name.split('.')[0] + scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False) + + +def MultiExtractSingleCells(masks,image,channel_names,output): + """Function for iterating over a list of z_stacks and output locations to + export single-cell data from image masks""" + + print("Extracting single-cell data for "+str(image)+'...') + + #Run the ExtractSingleCells function for this image + ExtractSingleCells(masks,image,channel_names,output) + + #Print update + im_full_name = os.path.basename(image) + im_name = im_full_name.split('.')[0] + print("Finished "+str(im_name))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Fri Mar 12 00:19:24 2021 +0000 @@ -0,0 +1,24 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="3.6.10">python</requirement> + <requirement type="package" version="0.17.2">scikit-image</requirement> + <requirement type="package" version="2.10.0">h5py</requirement> + <requirement type="package" version="1.0.4">pandas</requirement> + <requirement type="package" version="1.18.5">numpy</requirement> + <requirement type="package" version="1.0.1">pathlib</requirement> + </requirements> + </xml> + + <xml name="version_cmd"> + <version_command>echo @VERSION@</version_command> + </xml> + <xml name="citations"> + <citations> + </citations> + </xml> + + <token name="@VERSION@">1.3.1</token> + <token name="@CMD_BEGIN@">python ${__tool_directory__}/CommandSingleCellExtraction.py</token> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quantification.xml Fri Mar 12 00:19:24 2021 +0000 @@ -0,0 +1,71 @@ +<tool id="quantification" name="Quantification" version="@VERSION@.5" profile="17.09"> + <description>Single cell quantification, a module for single-cell data extraction given a segmentation mask and multi-channel image.</description> + <macros> + <import>macros.xml</import> + </macros> + + <expand macro="requirements"/> + @VERSION_CMD@ + + <command detect_errors="exit_code"><![CDATA[ + ln -s '$image' '${image.name}'.ome.tiff; + ln -s '$primary_mask' '${primary_mask.name}'.ome.tiff; + #for $mask in $supp_masks: + ln -s '$mask' '${mask.name}'.ome.tiff; + #end for + + mkdir ./tool_out; + + @CMD_BEGIN@ + + --masks + '${primary_mask.name}'.ome.tiff + #if $supp_masks + #for $mask in $supp_masks: + '${mask.name}'.ome.tiff + #end for + #end if + + --image '${image.name}'.ome.tiff + --output ./tool_out + --channel_names '$channel_names'; + + mv ./tool_out/*.csv ./tool_out/quantified.csv; + ]]></command> + + <inputs> + <param name="image" type="data" format="tiff" label="Registered TIFF"/> + <param name="primary_mask" type="data" format="tiff" label="Primary Cell Mask"/> + <param name="supp_masks" type="data" multiple="true" optional="true" format="tiff" label="Additional Cell Masks"/> + <param name="channel_names" type="data" format="csv" label="Marker Channels"/> + </inputs> + + <outputs> + <data format="csv" name="quant_out" from_work_dir="./tool_out/quantified.csv" label="${tool.name} on ${on_string}"/> + </outputs> + <help><![CDATA[ +# Single cell quantification +Module for single-cell data extraction given a segmentation mask and multi-channel image. The CSV structure is aligned with histoCAT output. + +**CommandSingleCellExtraction.py**: + +* `--masks` Paths to where masks are stored (Ex: ./segmentation/cellMask.tif) -> If multiple masks are selected the first mask will be used for spatial feature extraction but all will be quantified + +* `--image` Path to image(s) for quantification. (Ex: ./registration/*.h5) -> works with .h(df)5 or .tif(f) + +* `--output` Path to output directory. (Ex: ./feature_extraction) + +* `--channel_names` csv file containing the channel names for the z-stack (Ex: ./my_channels.csv) + +# Run script +`python CommandSingleCellExtraction.py --masks ./segmentation/cellMask.tif ./segmentation/membraneMask.tif --image ./registration/Exemplar_001.h5 --output ./feature_extraction --channel_names ./my_channels.csv` + +# Main developer +Denis Schapiro (https://github.com/DenisSch) + +Joshua Hess (https://github.com/JoshuaHess12) + +Jeremy Muhlich (https://github.com/jmuhlich) + ]]></help> + <expand macro="citations" /> +</tool>