Mercurial > repos > bgruening > music_deconvolution
changeset 4:56371b5a2da9 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
author | bgruening |
---|---|
date | Thu, 10 Feb 2022 12:52:31 +0000 |
parents | fd7a16d073c5 |
children | 2ba99a52bd44 |
files | macros.xml music-deconvolution.xml music_deconvolution.xml scripts/compare.R scripts/estimateprops.R test-data/default_output_no_disease_nnls.pdf test-data/out_filt1.pdf test-data/out_heat2.pdf |
diffstat | 8 files changed, 837 insertions(+), 357 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Sat Jan 29 12:51:49 2022 +0000 +++ b/macros.xml Thu Feb 10 12:52:31 2022 +0000 @@ -1,5 +1,5 @@ <macros> - <token name="@VERSION_SUFFIX@">2</token> + <token name="@VERSION_SUFFIX@">3</token> <!-- The ESet inspector/constructor and MuSiC tool can have independent Galaxy versions but should reference the same package version always. -->
--- a/music-deconvolution.xml Sat Jan 29 12:51:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,298 +0,0 @@ -<tool id="music_deconvolution" name="MuSiC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" - profile="21.09" license="GPL-3.0-or-later" > - <description>estimate cell type proportions in bulk RNA-seq data</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements" /> - <command detect_errors="exit_code" ><![CDATA[ -mkdir report_data && -Rscript --vanilla '$__tool_directory__/scripts/${do.method}.R' '$conf' -]]></command> - <configfiles> - <configfile name="conf" > - -null_str_vec = function(gstr){ - tokens = unlist(as.vector(strsplit(gstr, split=","))) - if (length(tokens) == 0){ - return(NULL) - } - if (length(tokens) == 1){ - return(tokens[[1]]) - } - return(tokens) -} - -bulk_eset = readRDS('$bulk_eset') -scrna_eset = readRDS('$scrna_eset') -use_disease_factor = FALSE -maxyscale = NA - -#if str($do.method) == "estimateprops": - -maxyscale = as.numeric('$do.maxyscale') ## yields "NA" if blank -phenotype_factors = null_str_vec('$do.phenotype_factors') -phenotype_factors_always_exclude = null_str_vec('$do.phenotype_factors_always_exclude') -celltypes_label = null_str_vec('$do.celltypes_label') -samples_label = null_str_vec('$do.samples_label') -celltypes = null_str_vec('$do.celltypes') -methods = c("MuSiC", "NNLS") - - #if str($do.disease_factor.use) == "yes": -use_disease_factor = TRUE -phenotype_scrna_target = null_str_vec('$do.disease_factor.phenotype_scrna_target') -phenotype_target = null_str_vec('$do.disease_factor.phenotype_target') -phenotype_target_threshold = as.numeric('$do.disease_factor.phenotype_target_threshold') -sample_disease_group = null_str_vec('$do.disease_factor.sample_disease_group') -sample_disease_group_scale = as.integer('$do.disease_factor.sample_disease_group_scale') - #end if - -outfile_pdf='$out_pdf' - -#elif str($do.method) == "dendrogram": - -celltypes_label = null_str_vec('$do.celltypes_label') -clustertype_label = null_str_vec('$do.clustertype_label') -samples_label = null_str_vec('$do.samples_label') -celltypes = null_str_vec('$do.celltypes') - -data.to.use = list( - #for $i, $repeat in enumerate( $do.cluster_groups ) - #if $i == 0: - $repeat.cluster_id = list(cell.types = null_str_vec('$repeat.celltypes'), - marker.names = null_str_vec('$repeat.marker_name'), - marker.list = read_list('$repeat.marker_list')) - #else - , $repeat.cluster_id = list(cell.types = null_str_vec('$repeat.celltypes'), - marker.names = null_str_vec('$repeat.marker_name'), - marker.list = read_list('$repeat.marker_list')) - #end if - #end for -) - -outfile_pdf='$out_pdf' -outfile_tab='$out_tab' - -#else - stop("No such option") -#end if - - </configfile> - </configfiles> - <inputs> - <param name="scrna_eset" label="scRNA Dataset" type="data" format="@RDATATYPE@" /> - <param name="bulk_eset" label="Bulk RNA Dataset" type="data" format="@RDATATYPE@" /> - <conditional name="do" > - <param name="method" type="select" label="Purpose" > - <!-- The values here correspond to script names in the scripts folder - and must remain so --> - <option value="estimateprops">Estimate Proportions</option> - <option value="dendrogram">Compute Dendrogram</option> - </param> - <when value="estimateprops" > - <param name="celltypes_label" type="text" value="cellType" - label="Cell Types Label from scRNA dataset" > - <expand macro="validator_text" /> - </param> - <param name="samples_label" type="text" value="sampleID" - label="Samples Identifier from scRNA dataset" > - <expand macro="validator_text" /> - </param> - <expand macro="celltypes_macro" /> - <param name="phenotype_factors" type="text" - label="Phenotype factors" - help="List of phenotypes factors to be used in the linear regression. Please make sure that each factor has more than one unique value. Names correspond to column names in the bulk RNA dataset phenotype table. If blank, then treat all bulk phenotype columns as factors." > - <expand macro="validator_index_identifiers" /> - </param> - <param name="phenotype_factors_always_exclude" type="text" - label="Excluded phenotype factors" - help="List of phenotype factors to always exclude in the analysis" - value="sampleID,SubjectName" > - <expand macro="validator_index_identifiers" /> - </param> - <conditional name="disease_factor" > - <param name="use" type="select" label="Show proportions of a disease factor?" > - <option value="no" selected="true" >No</option> - <option value="yes" >Yes</option> - </param> - <when value="no" ></when> - <when value="yes" > - <param name="phenotype_scrna_target" type="text" label="scRNA Phenotype Cell Target" - help="The name of a target scRNA cell type to select in the phenotype comparison." > - <expand macro="validator_text" /> - </param> - <param name="phenotype_target" type="text" label="Bulk Phenotype Target" - help="MUST exist in the bulk RNA datasets phenotype factors as above." > - <expand macro="validator_text" /> - </param> - <param name="phenotype_target_threshold" type="float" label="Bulk Phenotype Target Threshold" - value="-99" - help="The (%) threshold at which the phenotype target manifests. Leave at -99 to select all." > - </param> - <param name="sample_disease_group" type="text" label="scRNA Sample Disease Group" - help="Name for target disease group, ideally a value from the scRNA phenotype factor data" > - <expand macro="validator_text" /> - </param> - <param name="sample_disease_group_scale" type="integer" - label="scRNA Sample Disease Group (Scale)" value="5" - help="Used to accentutate certain features in the plots. Increase this number to reduce the effect." /> - </when> - </conditional> - <param name="maxyscale" type="float" min="0" value="" optional="true" - label="Scale all Y-axes to max limit" help="Leave blank to autoscale each plot."/> - </when> - <when value="dendrogram" > - <param name="celltypes_label" type="text" value="cellType" - label="Cell Types Label from scRNA dataset" > - <expand macro="validator_text" /> - </param> - <param name="clustertype_label" type="text" value="clusterType" - label="Cluster Types Label from scRNA dataset" > - <expand macro="validator_text" /> - </param> - <param name="samples_label" type="text" value="sampleID" - label="Samples Identifier from scRNA dataset" > - <expand macro="validator_text" /> - </param> - <expand macro="celltypes_macro" /> - <repeat name="cluster_groups" title="Cluster Groups" min="0" - help="Insert cell cluster groups based on a previous clustering." > - <param name="cluster_id" label="Cluster ID" type="text" value="" - help="e.g. C1 or Cluster1, etc." /> - <expand macro="celltypes_macro" /> - <param name="marker_name" label="Marker Gene Group Name" type="text" - optional="true" value="" - help="Name of the list of gene markers used to describe the marker list supplied below." > - <expand macro="validator_text" /> - </param> - <param name="marker_list" label="List of Gene Markers" type="data" format="txt,tabular" - optional="true" - help="A single column of marker genes" /> - </repeat> - </when> - </conditional> - </inputs> - <outputs> - <data name="out_pdf" format="pdf" label="${tool.name} on ${on_string}: PDF Plots" /> - <data name="out_tab" format="tabular" label="${tool.name} on ${on_string}: Cell Proportions by Sample" > - <filter>do["method"] == "dendrogram" and len(do["cluster_groups"]) >0</filter> - </data> - <collection name="props" type="list" label="${tool.name} on ${on_string}: Proportion Matrices" > - <filter>do["method"] == "estimateprops"</filter> - <discover_datasets pattern="prop_(?P<designation>.+)\.tabular" format="tabular" directory="report_data" /> - </collection> - <collection name="summaries" type="list" label="${tool.name} on ${on_string}: Summaries and Logs"> - <filter>do["method"] == "estimateprops" and do["disease_factor"]["use"] == "yes"</filter> - <discover_datasets pattern="summ_(?P<designation>.+)\.txt" format="txt" directory="report_data" /> - <discover_datasets pattern="varprop_(?P<designation>.+)\.tabular" format="tabular" directory="report_data" /> - <discover_datasets pattern="rsquared_(?P<designation>.+)\.tabular" format="tabular" directory="report_data" /> - <discover_datasets pattern="weightgene_(?P<designation>.+)\.tabular" format="tabular" directory="report_data" /> - </collection> - </outputs> - <tests> - <test expect_num_outputs="1" > - <!-- Dendrogram test 1 --> - <param name="bulk_eset" value="Mousebulkeset.rds" /> - <param name="scrna_eset" value="Mousesubeset.degenesonly2.half.rds" /> - <conditional name="do" > - <param name="method" value="dendrogram" /> - <param name="celltypes_label" value="cellType" /> - <param name="samples_label" value="sampleID" /> - <param name="celltypes" value="Endo,Podo,PT,LOH,DCT,CD-PC,CD-IC,Fib,Macro,Neutro,B lymph,T lymph,NK" /> - </conditional> - <output name="out_pdf" value="dendro_1.pdf" compare="sim_size" /> - </test> - <test expect_num_outputs="2" > - <!-- Dendrogram test 2 --> - <param name="bulk_eset" value="Mousebulkeset.rds" /> - <param name="scrna_eset" value="Mousesubeset.degenesonly2.half.rds" /> - <conditional name="do" > - <param name="method" value="dendrogram" /> - <param name="celltypes_label" value="cellType" /> - <param name="samples_label" value="sampleID" /> - <param name="celltypes" value="Endo,Podo,PT,LOH,DCT,CD-PC,CD-IC,Fib,Macro,Neutro,B lymph,T lymph,NK" /> - <repeat name="cluster_groups" > - <param name="cluster_id" value="C1" /> - <param name="celltypes" value="Neutro" /> - </repeat> - <repeat name="cluster_groups" > - <param name="cluster_id" value="C2" /> - <param name="celltypes" value="Podo" /> - </repeat> - <repeat name="cluster_groups" > - <param name="cluster_id" value="C3" /> - <param name="celltypes" value="Endo,CD-PC,LOH,CD-IC,DCT,PT" /> - <param name="marker_name" value="Epithelial" /> - <param name="marker_list" value="epith.markers" /> - </repeat> - <repeat name="cluster_groups" > - <param name="cluster_id" value="C4" /> - <param name="celltypes" value="Macro,Fib,B lymph,NK,T lymph" /> - <param name="marker_name" value="Immune" /> - <param name="marker_list" value="immune.markers" /> - </repeat> - </conditional> - <output name="out_pdf" value="dendro.pdf" compare="sim_size" /> - <output name="out_tab"> - <assert_contents> - <has_text_matching expression="^\s+Neutro\s+Podo\s+Endo" /> - <has_text text="APOL1.GNA78M"/> - </assert_contents> - </output> - </test> - <test expect_num_outputs="2" > - <!-- Estimate Proportions: no disease factor, no fitting reports --> - <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" /> - <param name="scrna_eset" value="EMTABesethealthy.subset.rds" /> - <conditional name="do" > - <param name="method" value="estimateprops" /> - <param name="celltypes_label" value="cellType" /> - <param name="samples_label" value="sampleID" /> - <param name="disease_factor" value="no" /> - </conditional> - <output name="out_pdf" value="default_output_no_disease.pdf" compare="sim_size" /> - </test> - <test expect_num_outputs="3" > - <!-- Estimate Proportions test --> - <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" /> - <param name="scrna_eset" value="EMTABesethealthy.subset.rds" /> - <conditional name="do" > - <param name="method" value="estimateprops" /> - <param name="celltypes_label" value="cellType" /> - <param name="samples_label" value="sampleID" /> - <param name="celltypes" value="alpha,beta,delta,gamma,acinar,ductal" /> - <conditional name="disease_factor" > - <param name="use" value="yes" /> - <param name="phenotype_scrna_target" value="beta" /> - <param name="phenotype_factors" value="age,bmi,hba1c,gender" /> - <param name="phenotype_target" value="hba1c" /> - <param name="phenotype_target_threshold" value="6.5" /> - <param name="sample_disease_group" value="T2D" /> - <param name="sample_disease_group_scale" value="5" /> - </conditional> - </conditional> - <output name="out_pdf" value="default_output.pdf" compare="sim_size" /> - <output_collection name="summaries" count="5"> - <element name="Log of MuSiC fitting" ftype="txt"> - <assert_contents> - <has_text text="Residual standard error: 0.1704 on 72 degrees of freedom"/> - </assert_contents> - </element> - <element name="Log of NNLS fitting" ftype="txt"> - <assert_contents> - <has_text text="Residual standard error: 0.0645 on 72 degrees of freedom"/> - </assert_contents> - </element> - </output_collection> - </test> - </tests> - <help><![CDATA[ -MuSiC utilizes cell-type specific gene expression from single-cell RNA sequencing (RNA-seq) data to characterize cell type compositions from bulk RNA-seq data in complex tissues. By appropriate weighting of genes showing cross-subject and cross-cell consistency, MuSiC enables the transfer of cell type-specific gene expression information from one dataset to another. - -Solid tissues often contain closely related cell types which leads to collinearity. To deal with collinearity, MuSiC employs a tree-guided procedure that recursively zooms in on closely related cell types. Briefly, we first group similar cell types into the same cluster and estimate cluster proportions, then recursively repeat this procedure within each cluster. - - ]]></help> - <citations> - <citation type="doi">https://doi.org/10.1038/s41467-018-08023-x</citation> - </citations> -</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/music_deconvolution.xml Thu Feb 10 12:52:31 2022 +0000 @@ -0,0 +1,315 @@ +<tool id="music_deconvolution" name="MuSiC Deconvolution" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" + profile="21.09" license="GPL-3.0-or-later" > + <description>estimate cell type proportions in bulk RNA-seq data</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code" ><![CDATA[ +mkdir report_data && +Rscript --vanilla '$__tool_directory__/scripts/${do.method}.R' '$conf' +]]></command> + <configfiles> + <configfile name="conf" > + +null_str_vec = function(gstr){ + tokens = unlist(as.vector(strsplit(gstr, split=","))) + if (length(tokens) == 0){ + return(NULL) + } + if (length(tokens) == 1){ + return(tokens[[1]]) + } + return(tokens) +} + +bulk_eset = readRDS('$bulk_eset') +scrna_eset = readRDS('$scrna_eset') +use_disease_factor = FALSE +maxyscale = NA + +#if str($do.method) == "estimateprops": + +maxyscale = as.numeric('$do.maxyscale') ## yields "NA" if blank +phenotype_factors = null_str_vec('$do.phenotype_factors') +phenotype_factors_always_exclude = null_str_vec('$do.phenotype_factors_always_exclude') +celltypes_label = null_str_vec('$do.celltypes_label') +samples_label = null_str_vec('$do.samples_label') +celltypes = null_str_vec('$do.celltypes') +methods = c(null_str_vec('$do.est_methods')) + + #if str($do.disease_factor.use) == "yes": +use_disease_factor = TRUE +phenotype_scrna_target = null_str_vec('$do.disease_factor.phenotype_scrna_target') +phenotype_target = null_str_vec('$do.disease_factor.phenotype_target') +phenotype_target_threshold = as.numeric('$do.disease_factor.phenotype_target_threshold') +sample_disease_group = null_str_vec('$do.disease_factor.sample_disease_group') +sample_disease_group_scale = as.integer('$do.disease_factor.sample_disease_group_scale') + #end if + +outfile_pdf='$out_pdf' + +#elif str($do.method) == "dendrogram": + +celltypes_label = null_str_vec('$do.celltypes_label') +clustertype_label = null_str_vec('$do.clustertype_label') +samples_label = null_str_vec('$do.samples_label') +celltypes = null_str_vec('$do.celltypes') + +data.to.use = list( + #for $i, $repeat in enumerate( $do.cluster_groups ) + #if $i == 0: + $repeat.cluster_id = list(cell.types = null_str_vec('$repeat.celltypes'), + marker.names = null_str_vec('$repeat.marker_name'), + marker.list = read_list('$repeat.marker_list')) + #else + , $repeat.cluster_id = list(cell.types = null_str_vec('$repeat.celltypes'), + marker.names = null_str_vec('$repeat.marker_name'), + marker.list = read_list('$repeat.marker_list')) + #end if + #end for +) + +outfile_pdf='$out_pdf' +outfile_tab='$out_tab' + +#else + stop("No such option") +#end if + + </configfile> + </configfiles> + <inputs> + <param name="scrna_eset" label="scRNA Dataset" type="data" format="@RDATATYPE@" /> + <param name="bulk_eset" label="Bulk RNA Dataset" type="data" format="@RDATATYPE@" /> + <conditional name="do" > + <param name="method" type="select" label="Purpose" > + <!-- The values here correspond to script names in the scripts folder + and must remain so --> + <option value="estimateprops">Estimate Proportions</option> + <option value="dendrogram">Compute Dendrogram</option> + </param> + <when value="estimateprops" > + <param name="est_methods" type="select" multiple="true" label="Methods to use" min="1" > + <option value="MuSiC" selected="true" >MuSiC</option> + <option value="NNLS" selected="true" >NNLS</option> + </param> + <param name="celltypes_label" type="text" value="cellType" + label="Cell Types Label from scRNA dataset" > + <expand macro="validator_text" /> + </param> + <param name="samples_label" type="text" value="sampleID" + label="Samples Identifier from scRNA dataset" > + <expand macro="validator_text" /> + </param> + <expand macro="celltypes_macro" /> + <param name="phenotype_factors" type="text" + label="Phenotype factors" + help="List of phenotypes factors to be used in the linear regression. Please make sure that each factor has more than one unique value. Names correspond to column names in the bulk RNA dataset phenotype table. If blank, then treat all bulk phenotype columns as factors." > + <expand macro="validator_index_identifiers" /> + </param> + <param name="phenotype_factors_always_exclude" type="text" + label="Excluded phenotype factors" + help="List of phenotype factors to always exclude in the analysis" + value="sampleID,SubjectName" > + <expand macro="validator_index_identifiers" /> + </param> + <conditional name="disease_factor" > + <param name="use" type="select" label="Show proportions of a disease factor?" > + <option value="no" selected="true" >No</option> + <option value="yes" >Yes</option> + </param> + <when value="no" ></when> + <when value="yes" > + <param name="phenotype_scrna_target" type="text" label="scRNA Phenotype Cell Target" + help="The name of a target scRNA cell type to select in the phenotype comparison." > + <expand macro="validator_text" /> + </param> + <param name="phenotype_target" type="text" label="Bulk Phenotype Target" + help="MUST exist in the bulk RNA datasets phenotype factors as above." > + <expand macro="validator_text" /> + </param> + <param name="phenotype_target_threshold" type="float" label="Bulk Phenotype Target Threshold" + value="-99" + help="The (%) threshold at which the phenotype target manifests. Leave at -99 to select all." > + </param> + <param name="sample_disease_group" type="text" label="scRNA Sample Disease Group" + help="Name for target disease group, ideally a value from the scRNA phenotype factor data" > + <expand macro="validator_text" /> + </param> + <param name="sample_disease_group_scale" type="integer" + label="scRNA Sample Disease Group (Scale)" value="5" + help="Used to accentutate certain features in the plots. Increase this number to reduce the effect." /> + </when> + </conditional> + <param name="maxyscale" type="float" min="0" value="" optional="true" + label="Scale all Y-axes to max limit" help="Leave blank to autoscale each plot."/> + </when> + <when value="dendrogram" > + <param name="celltypes_label" type="text" value="cellType" + label="Cell Types Label from scRNA dataset" > + <expand macro="validator_text" /> + </param> + <param name="clustertype_label" type="text" value="clusterType" + label="Cluster Types Label from scRNA dataset" > + <expand macro="validator_text" /> + </param> + <param name="samples_label" type="text" value="sampleID" + label="Samples Identifier from scRNA dataset" > + <expand macro="validator_text" /> + </param> + <expand macro="celltypes_macro" /> + <repeat name="cluster_groups" title="Cluster Groups" min="0" + help="Insert cell cluster groups based on a previous clustering." > + <param name="cluster_id" label="Cluster ID" type="text" value="" + help="e.g. C1 or Cluster1, etc." /> + <expand macro="celltypes_macro" /> + <param name="marker_name" label="Marker Gene Group Name" type="text" + optional="true" value="" + help="Name of the list of gene markers used to describe the marker list supplied below." > + <expand macro="validator_text" /> + </param> + <param name="marker_list" label="List of Gene Markers" type="data" format="txt,tabular" + optional="true" + help="A single column of marker genes" /> + </repeat> + </when> + </conditional> + </inputs> + <outputs> + <data name="out_pdf" format="pdf" label="${tool.name} on ${on_string}: PDF Plots" /> + <data name="out_tab" format="tabular" label="${tool.name} on ${on_string}: Cell Proportions by Sample" > + <filter>do["method"] == "dendrogram" and len(do["cluster_groups"]) >0</filter> + </data> + <collection name="props" type="list" label="${tool.name} on ${on_string}: Proportion Matrices" > + <filter>do["method"] == "estimateprops"</filter> + <discover_datasets pattern="prop_(?P<designation>.+)\.tabular" format="tabular" directory="report_data" /> + </collection> + <collection name="summaries" type="list" label="${tool.name} on ${on_string}: Summaries and Logs"> + <filter>do["method"] == "estimateprops" and do["disease_factor"]["use"] == "yes"</filter> + <discover_datasets pattern="summ_(?P<designation>.+)\.txt" format="txt" directory="report_data" /> + <discover_datasets pattern="varprop_(?P<designation>.+)\.tabular" format="tabular" directory="report_data" /> + <discover_datasets pattern="rsquared_(?P<designation>.+)\.tabular" format="tabular" directory="report_data" /> + <discover_datasets pattern="weightgene_(?P<designation>.+)\.tabular" format="tabular" directory="report_data" /> + </collection> + </outputs> + <tests> + <test expect_num_outputs="1" > + <!-- Dendrogram test 1 --> + <param name="bulk_eset" value="Mousebulkeset.rds" /> + <param name="scrna_eset" value="Mousesubeset.degenesonly2.half.rds" /> + <conditional name="do" > + <param name="method" value="dendrogram" /> + <param name="celltypes_label" value="cellType" /> + <param name="samples_label" value="sampleID" /> + <param name="celltypes" value="Endo,Podo,PT,LOH,DCT,CD-PC,CD-IC,Fib,Macro,Neutro,B lymph,T lymph,NK" /> + </conditional> + <output name="out_pdf" value="dendro_1.pdf" compare="sim_size" /> + </test> + <test expect_num_outputs="2" > + <!-- Dendrogram test 2 --> + <param name="bulk_eset" value="Mousebulkeset.rds" /> + <param name="scrna_eset" value="Mousesubeset.degenesonly2.half.rds" /> + <conditional name="do" > + <param name="method" value="dendrogram" /> + <param name="celltypes_label" value="cellType" /> + <param name="samples_label" value="sampleID" /> + <param name="celltypes" value="Endo,Podo,PT,LOH,DCT,CD-PC,CD-IC,Fib,Macro,Neutro,B lymph,T lymph,NK" /> + <repeat name="cluster_groups" > + <param name="cluster_id" value="C1" /> + <param name="celltypes" value="Neutro" /> + </repeat> + <repeat name="cluster_groups" > + <param name="cluster_id" value="C2" /> + <param name="celltypes" value="Podo" /> + </repeat> + <repeat name="cluster_groups" > + <param name="cluster_id" value="C3" /> + <param name="celltypes" value="Endo,CD-PC,LOH,CD-IC,DCT,PT" /> + <param name="marker_name" value="Epithelial" /> + <param name="marker_list" value="epith.markers" /> + </repeat> + <repeat name="cluster_groups" > + <param name="cluster_id" value="C4" /> + <param name="celltypes" value="Macro,Fib,B lymph,NK,T lymph" /> + <param name="marker_name" value="Immune" /> + <param name="marker_list" value="immune.markers" /> + </repeat> + </conditional> + <output name="out_pdf" value="dendro.pdf" compare="sim_size" /> + <output name="out_tab"> + <assert_contents> + <has_text_matching expression="^\s+Neutro\s+Podo\s+Endo" /> + <has_text text="APOL1.GNA78M"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="2" > + <!-- Estimate Proportions: no disease factor, no fitting reports, only NNLS --> + <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" /> + <param name="scrna_eset" value="EMTABesethealthy.subset.rds" /> + <conditional name="do" > + <param name="method" value="estimateprops" /> + <param name="est_methods" value="NNLS" /> + <param name="celltypes_label" value="cellType" /> + <param name="samples_label" value="sampleID" /> + <param name="disease_factor" value="no" /> + </conditional> + <output name="out_pdf" value="default_output_no_disease_nnls.pdf" compare="sim_size" /> + </test> + <test expect_num_outputs="2" > + <!-- Estimate Proportions: no disease factor, no fitting reports --> + <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" /> + <param name="scrna_eset" value="EMTABesethealthy.subset.rds" /> + <conditional name="do" > + <param name="method" value="estimateprops" /> + <param name="celltypes_label" value="cellType" /> + <param name="samples_label" value="sampleID" /> + <param name="disease_factor" value="no" /> + </conditional> + <output name="out_pdf" value="default_output_no_disease.pdf" compare="sim_size" /> + </test> + <test expect_num_outputs="3" > + <!-- Estimate Proportions test --> + <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" /> + <param name="scrna_eset" value="EMTABesethealthy.subset.rds" /> + <conditional name="do" > + <param name="method" value="estimateprops" /> + <param name="celltypes_label" value="cellType" /> + <param name="samples_label" value="sampleID" /> + <param name="celltypes" value="alpha,beta,delta,gamma,acinar,ductal" /> + <conditional name="disease_factor" > + <param name="use" value="yes" /> + <param name="phenotype_scrna_target" value="beta" /> + <param name="phenotype_factors" value="age,bmi,hba1c,gender" /> + <param name="phenotype_target" value="hba1c" /> + <param name="phenotype_target_threshold" value="6.5" /> + <param name="sample_disease_group" value="T2D" /> + <param name="sample_disease_group_scale" value="5" /> + </conditional> + </conditional> + <output name="out_pdf" value="default_output.pdf" compare="sim_size" /> + <output_collection name="summaries" count="5"> + <element name="Log of MuSiC fitting" ftype="txt"> + <assert_contents> + <has_text text="Residual standard error: 0.1704 on 72 degrees of freedom"/> + </assert_contents> + </element> + <element name="Log of NNLS fitting" ftype="txt"> + <assert_contents> + <has_text text="Residual standard error: 0.0645 on 72 degrees of freedom"/> + </assert_contents> + </element> + </output_collection> + </test> + </tests> + <help><![CDATA[ +MuSiC utilizes cell-type specific gene expression from single-cell RNA sequencing (RNA-seq) data to characterize cell type compositions from bulk RNA-seq data in complex tissues. By appropriate weighting of genes showing cross-subject and cross-cell consistency, MuSiC enables the transfer of cell type-specific gene expression information from one dataset to another. + +Solid tissues often contain closely related cell types which leads to collinearity. To deal with collinearity, MuSiC employs a tree-guided procedure that recursively zooms in on closely related cell types. Briefly, we first group similar cell types into the same cluster and estimate cluster proportions, then recursively repeat this procedure within each cluster. + + ]]></help> + <citations> + <citation type="doi">https://doi.org/10.1038/s41467-018-08023-x</citation> + </citations> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/compare.R Thu Feb 10 12:52:31 2022 +0000 @@ -0,0 +1,440 @@ +suppressWarnings(suppressPackageStartupMessages(library(xbioc))) +suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) +suppressWarnings(suppressPackageStartupMessages(library(reshape2))) +suppressWarnings(suppressPackageStartupMessages(library(cowplot))) +## We use this script to estimate the effectiveness of proportion methods + +## Load Conf +args <- commandArgs(trailingOnly = TRUE) +source(args[1]) + +method_key <- list("MuSiC" = "est_music", + "NNLS" = "est_nnls")[[est_method]] + + +scale_yaxes <- function(gplot, value) { + if (is.na(value)) { + gplot + } else { + gplot + scale_y_continuous(lim = c(0, value)) + } +} + + +set_factor_data <- function(bulk_data, factor_name = NULL) { + if (is.null(factor_name)) { + factor_name <- "None" ## change to something plottable + } + pdat <- pData(bulk_data) + sam_fact <- NULL + if (factor_name %in% colnames(pdat)) { + sam_fact <- cbind(rownames(pdat), + as.character(pdat[[factor_name]])) + cat(paste0(" - factor: ", factor_name, + " found in phenotypes\n")) + } else { + ## We assign this as the factor for the entire dataset + sam_fact <- cbind(rownames(pdat), + factor_name) + cat(paste0(" - factor: assigning \"", factor_name, + "\" to whole dataset\n")) + } + colnames(sam_fact) <- c("Samples", "Factors") + return(as.data.frame(sam_fact)) +} + +## Due to limiting sizes, we need to load and unload +## possibly very large datasets. +process_pair <- function(sc_data, bulk_data, + ctypes_label, samples_label, ctypes, + factor_group) { + ## - Generate + est_prop <- music_prop( + bulk.eset = bulk_data, sc.eset = sc_data, + clusters = ctypes_label, + samples = samples_label, select.ct = ctypes, verbose = T) + ## - + estimated_music_props <- est_prop$Est.prop.weighted + estimated_nnls_props <- est_prop$Est.prop.allgene + ## - + fact_data <- set_factor_data(bulk_data, factor_group) + ## - + return(list(est_music = estimated_music_props, + est_nnls = estimated_nnls_props, + bulk_sample_totals = colSums(exprs(bulk_data)), + plot_groups = fact_data)) +} + +music_on_all <- function(files) { + results <- list() + for (sc_name in names(files)) { + cat(paste0("sc-group:", sc_name, "\n")) + scgroup <- files[[sc_name]] + ## - sc Data + sc_est <- readRDS(scgroup$dataset) + ## - params + celltypes_label <- scgroup$label_cell + samples_label <- scgroup$label_sample + celltypes <- scgroup$celltype + + results[[sc_name]] <- list() + for (bulk_name in names(scgroup$bulk)) { + cat(paste0(" - bulk-group:", bulk_name, "\n")) + bulkgroup <- scgroup$bulk[[bulk_name]] + ## - bulk Data + bulk_est <- readRDS(bulkgroup$dataset) + ## - bulk params + pheno_facts <- bulkgroup$pheno_facts + pheno_excl <- bulkgroup$pheno_excl + ## + results[[sc_name]][[bulk_name]] <- process_pair( + sc_est, bulk_est, + celltypes_label, samples_label, + celltypes, bulkgroup$factor_group) + ## + rm(bulk_est) ## unload + } + rm(sc_est) ## unload + } + return(results) +} + +plot_all_individual_heatmaps <- function(results) { + pdf(out_heatmulti_pdf, width = 8, height = 8) + for (sc_name in names(results)) { + for (bk_name in names(results[[sc_name]])) { + res <- results[[sc_name]][[bk_name]] + plot_hmap <- Prop_heat_Est( + data.matrix(res[[method_key]]), method.name = est_method) + + ggtitle(paste0("[", est_method, "Cell type ", + "proportions in ", + bk_name, " (Bulk) based on ", + sc_name, " (scRNA)")) + + xlab("Cell Types (scRNA)") + + ylab("Samples (Bulk)") + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 6)) + print(plot_hmap) + } + } + dev.off() +} + +merge_factors_spread <- function(grudat_spread, factor_groups) { + ## Generated + merge_it <- function(matr, plot_groups, valname) { + ren <- melt(lapply(matr, function(mat) { + mat["ct"] <- rownames(mat); return(mat)})) + ## - Grab factors and merge into list + ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples") + colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors") + return(ren_new) + } + tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"), + merge_it(grudat$spread$scale, factor_groups, "value.scale"), + by = c("Sample", "Cell", "Bulk", "Factors")) + return(tab) +} + + +plot_grouped_heatmaps <- function(results) { + pdf(out_heatmulti_pdf, width = 8, height = 8) + for (sc_name in names(results)) { + named_list <- sapply( + names(results[[sc_name]]), + function(n) { + ## We transpose the data here, because + ## the plotting function omits by default + ## the Y-axis which are the samples. + ## Since the celltypes are the common factor + ## these should be the Y-axis instead. + t(data.matrix(results[[sc_name]][[n]][[method_key]])) + }, simplify = F, USE.NAMES = T) + named_methods <- names(results[[sc_name]]) + ## + plot_hmap <- Prop_heat_Est( + named_list, + method.name = named_methods) + + ggtitle(paste0("[", est_method, "] Cell type ", + "proportions of ", + "Bulk Datasets based on ", + sc_name, " (scRNA)")) + + xlab("Samples (Bulk)") + + ylab("Cell Types (scRNA)") + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 6)) + print(plot_hmap) + } + dev.off() +} + +## Desired plots +## 1. Pie chart: +## - Per Bulk dataset (using just normalised proportions) +## - Per Bulk dataset (multiplying proportions by nreads) + +unlist_names <- function(results, method, prepend_bkname=FALSE) { + unique(sort( + unlist(lapply(names(results), function(scname) { + lapply(names(results[[scname]]), function(bkname) { + res <- get(method)(results[[scname]][[bkname]][[method_key]]) + if (prepend_bkname) { + ## We *do not* assume unique bulk sample names + ## across different bulk datasets. + res <- paste0(bkname, "::", res) + } + return(res) + }) + })) + )) +} + +summarized_matrix <- function(results) { # nolint + ## We assume that cell types MUST be unique, but that sample + ## names do not need to be. For this reason, we must prepend + ## the bulk dataset name to the individual sample names. + all_celltypes <- unlist_names(results, "colnames") + all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE) + + ## Iterate through all possible samples and populate a table. + ddff <- data.frame() + ddff_scale <- data.frame() + for (cell in all_celltypes) { + for (sample in all_samples) { + group_sname <- unlist(strsplit(sample, split = "::")) + bulk <- group_sname[1] + id_sample <- group_sname[2] + for (scgroup in names(results)) { + if (bulk %in% names(results[[scgroup]])) { + mat_prop <- results[[scgroup]][[bulk]][[method_key]] + vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals + ## - We use sample instead of id_sample because we need to + ## extract bulk sets from the complete matrix later. It's + ## messy, yes. + if (cell %in% colnames(mat_prop)) { + ddff[cell, sample] <- mat_prop[id_sample, cell] + ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint + } else { + ddff[cell, sample] <- 0 + ddff_scale[cell, sample] <- 0 + } + } + } + } + } + return(list(prop = ddff, scaled = ddff_scale)) +} + +flatten_factor_list <- function(results) { + ## Get a 2d DF of all factors across all bulk samples. + res <- c() + for (scgroup in names(results)) { + for (bulkgroup in names(results[[scgroup]])) { + dat <- results[[scgroup]][[bulkgroup]]$plot_groups + dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint + res <- rbind(res, dat) + } + } + return(res) +} + +group_by_dataset <- function(summat) { + bulk_names <- unlist( + lapply(names(files), function(x) names(files[[x]]$bulk))) + mat_names <- colnames(summat$prop) + bd <- list() + bd_scale <- list() + bd_spread_scale <- list() + bd_spread_prop <- list() + for (bname in bulk_names) { + subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))] + ## - + bd[[bname]] <- rowSums(summat$prop[, subs]) + bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) + bd_spread_scale[[bname]] <- summat$scaled[, subs] + bd_spread_prop[[bname]] <- summat$prop[, subs] + } + return(list(prop = as.data.frame(bd), + scaled = as.data.frame(bd_scale), + spread = list(scale = bd_spread_scale, + prop = bd_spread_prop))) +} + +summarize_heatmaps <- function(grudat_spread_melt, do_factors) { + ## - + do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, + ylabs = element_blank(), xlabs = element_blank(), + use_log = TRUE, size = 11) { + ## Convert from matrix to long format + melted <- grudat_melted ## copy? + if (use_log) { + melted[[fillval]] <- log10(melted[[fillval]] + 1) + } + return(ggplot(melted) + + geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), + colour = "white") + + scale_fill_gradient2(low = "steelblue", high = "red", + mid = "white", name = element_blank()) + + theme(axis.text.x = element_text(angle = -90, hjust = 0, + size = size)) + + ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) + } + + do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { + do_logged <- (plot %in% c("log", "both")) + do_normal <- (plot %in% c("normal", "both")) + plist <- list() + if (do_logged) { + plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar, + "value.scale", "Reads (log10+1)", + size = size) + plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar, + "value.prop", "Sample (log10+1)", + size = size) + } + if (do_normal) { + plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar, + "value.scale", "Reads", use_log = F, + size = size) + plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar, + "value.prop", "Sample", use_log = F, + size = size) + } + return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), + plot_grid(plotlist = plist, ncol = ncol), + ncol = 1, rel_heights = c(0.05, 0.95))) + + } + p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", ) + p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1, + size = 8) + p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1, + size = 8) + p3 <- ggplot + theme_void() + if (do_factors) { + p3 <- do_gridplot("Cell Types against Factors", "Factors", "both") + } + return(list(bulk = p1, + samples = list(log = p2b, normal = p2a), + factors = p3)) +} + +summarize_boxplots <- function(grudat_spread, do_factors) { + common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") + + xlab(element_blank()) + ylab(element_blank()) + common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") + + xlab(element_blank()) + ylab(element_blank()) + + A <- B <- list() #nolint + ## Cell type by sample + A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk)) + A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk)) + ## Sample by Cell type + B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) + + ylab("Bulk Dataset") + B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) + + ylab("Bulk Dataset") + ## -- Factor plots are optional + A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void() + + if (do_factors) { + A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors)) + A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors)) + B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) + + ylab("Bulk Dataset") + B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + + ylab("Bulk Dataset") + } + + title_a <- "Cell Types against Bulk" + title_b <- "Bulk Datasets against Cells" + if (do_factors) { + title_a <- paste0(title_a, " and Factors") + title_b <- paste0(title_b, " and Factors") + } + + a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"), + plot_grid(plotlist = A, ncol = 2), + ncol = 1, rel_heights = c(0.05, 0.95)) + b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"), + plot_grid(plotlist = B, ncol = 2), + ncol = 1, rel_heights = c(0.05, 0.95)) + return(list(cell = a_all, bulk = b_all)) +} + +filter_output <- function(grudat_spread_melt, out_filt) { + print_red <- function(comment, red_list) { + cat(paste(comment, paste(red_list, collapse = ", "), "\n")) + } + grudat_filt <- grudat_spread_melt + print_red("Total Cell types:", unique(grudat_filt$Cell)) + if (!is.null(out_filt$cells)) { + grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ] + print_red(" - selecting:", out_filt$cells) + } + print_red("Total Factors:", unique(grudat_spread_melt$Factors)) + if (!is.null(out_filt$facts)) { + grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ] + print_red(" - selecting:", out_filt$facts) + } + return(grudat_filt) +} + + +results <- music_on_all(files) + +if (heat_grouped_p) { + plot_grouped_heatmaps(results) +} else { + plot_all_individual_heatmaps(results) +} + +save.image("/tmp/sesh.rds") + +summat <- summarized_matrix(results) +grudat <- group_by_dataset(summat) +grudat_spread_melt <- merge_factors_spread(grudat$spread, + flatten_factor_list(results)) + + + +## The output filters ONLY apply to boxplots, since these take +do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) + +grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt) + +heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors) +box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) + +pdf(out_heatsumm_pdf, width = 14, height = 14) +print(heat_maps) +print(box_plots) +dev.off() + +## Generate output tables +stats_prop <- lapply(grudat$spread$prop, function(x) { + t(apply(x, 1, summary))}) +stats_scale <- lapply(grudat$spread$scale, function(x) { + t(apply(x, 1, summary))}) + +writable2 <- function(obj, prefix, title) { + write.table(obj, + file = paste0("report_data/", prefix, "_", + title, ".tabular"), + quote = F, sep = "\t", col.names = NA) +} +## Make the value table printable +grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint +colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", + "CT Prop in Sample", "Number of Reads") + +writable2(grudat_spread_melt, "values", "Data Table") +writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions") +writable2({ + aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa +}, "values", "Matrix of Cell Type Read Counts") + +for (bname in names(stats_prop)) { + writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props")) + writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props")) +}
--- a/scripts/estimateprops.R Sat Jan 29 12:51:49 2022 +0000 +++ b/scripts/estimateprops.R Thu Feb 10 12:52:31 2022 +0000 @@ -14,8 +14,12 @@ clusters = celltypes_label, samples = samples_label, select.ct = celltypes, verbose = T) + estimated_music_props <- est_prop$Est.prop.weighted estimated_nnls_props <- est_prop$Est.prop.allgene +## +estimated_music_props_flat <- melt(estimated_music_props) +estimated_nnls_props_flat <- melt(estimated_nnls_props) scale_yaxes <- function(gplot, value) { if (is.na(value)) { @@ -25,23 +29,36 @@ } } +sieve_data <- function(func, music_data, nnls_data) { + if (func == "list") { + res <- list(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + res[lengths(res) > 0] ## filter out NULL elements + } else if (func == "rbind") { + rbind(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + } else if (func == "c") { + c(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + } +} + + ## Show different in estimation methods ## Jitter plot of estimated cell type proportions jitter_fig <- scale_yaxes(Jitter_Est( - list(data.matrix(estimated_music_props), - data.matrix(estimated_nnls_props)), + sieve_data("list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), method.name = methods, title = "Jitter plot of Est Proportions", size = 2, alpha = 0.7) + theme_minimal(), maxyscale) - ## Make a Plot ## A more sophisticated jitter plot is provided as below. We separated ## the T2D subjects and normal subjects by their disease factor levels. -estimated_music_props_flat <- melt(estimated_music_props) -estimated_nnls_props_flat <- melt(estimated_nnls_props) - -m_prop <- rbind(estimated_music_props_flat, - estimated_nnls_props_flat) +m_prop <- sieve_data("rbind", + estimated_music_props_flat, + estimated_nnls_props_flat) colnames(m_prop) <- c("Sub", "CellType", "Prop") if (is.null(celltypes)) { @@ -69,7 +86,7 @@ phenotype_target_threshold <- -Inf message("phenotype target threshold set to -Inf") } - + ## the "2" here is to do with the sample groups, not number of methods m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 @@ -84,8 +101,10 @@ m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), subset(m_prop, Disease == sample_disease_group)) - jitter_new <- scale_yaxes(ggplot(m_prop, aes(Method, Prop)) + - geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), + jitter_new <- scale_yaxes( + ggplot(m_prop, aes(Method, Prop)) + + geom_point(aes(fill = Method, color = Disease, + stroke = D, shape = Disease), size = 2, alpha = 0.7, position = position_jitter(width = 0.25, height = 0)) + facet_wrap(~ CellType, scales = "free") + @@ -100,21 +119,23 @@ ## Create dataframe for beta cell proportions and Disease_factor levels ## - Ugly code. Essentially, doubles the cell type proportions for each ## set of MuSiC and NNLS methods - m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint - phenotype_factors], - ## get proportions of target cell type - ct.prop = c(estimated_music_props[, phenotype_scrna_target], - estimated_nnls_props[, phenotype_scrna_target]), - ## - Method = factor(rep(methods, - each = nrow(estimated_music_props)), - levels = methods)) + m_prop_ana <- data.frame( + pData(bulk_eset)[rep(1:nrow(estimated_music_props), length(methods)), #nolint + phenotype_factors], + ## get proportions of target cell type + ct.prop = sieve_data("c", + estimated_music_props[, phenotype_scrna_target], + estimated_nnls_props[, phenotype_scrna_target]), + ## + Method = factor(rep(methods, + each = nrow(estimated_music_props)), + levels = methods)) ## - fix headers colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint ## - drop NA for target phenotype (e.g. hba1c) m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) m_prop_ana$Disease <- factor( # nolint - ## - Here we set Normal/Disease assignments across the two MuSiC and NNLS methods + ## - Here we set Normal/Disease assignments across the methods sample_groups[( m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1 ], @@ -123,12 +144,15 @@ m_prop_ana$D <- (m_prop_ana$Disease == # nolint sample_disease_group) / sample_disease_group_scale - jitt_compare <- scale_yaxes(ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + + jitt_compare <- scale_yaxes( + ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + - geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), + geom_point(aes(fill = Method, color = Disease, + stroke = D, shape = Disease), size = 2, alpha = 0.7) + facet_wrap(~ Method) + ggtitle(paste0(toupper(phenotype_target), " vs. ", - toupper(phenotype_scrna_target), " Cell Type Proportion")) + + toupper(phenotype_scrna_target), + " Cell Type Proportion")) + theme_minimal() + ylab(paste0("Proportion of ", phenotype_scrna_target, " cells")) + @@ -138,19 +162,22 @@ } ## BoxPlot -plot_box <- scale_yaxes(Boxplot_Est(list( - data.matrix(estimated_music_props), - data.matrix(estimated_nnls_props)), - method.name = c("MuSiC", "NNLS")) + +plot_box <- scale_yaxes(Boxplot_Est( + sieve_data("list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods) + theme(axis.text.x = element_text(angle = -90), axis.text.y = element_text(size = 8)) + ggtitle(element_blank()) + theme_minimal(), maxyscale) ## Heatmap -plot_hmap <- Prop_heat_Est(list( - data.matrix(estimated_music_props), - data.matrix(estimated_nnls_props)), - method.name = c("MuSiC", "NNLS")) + +plot_hmap <- Prop_heat_Est( + sieve_data( + "list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods) + theme(axis.text.x = element_text(angle = -90), axis.text.y = element_text(size = 6)) @@ -167,33 +194,29 @@ plot_hmap message(dev.off()) -## Output Proportions +writable <- function(obj, prefix, title) { + write.table(obj, + file = paste0("report_data/", prefix, "_", + title, ".tabular"), + quote = F, sep = "\t", col.names = NA) +} -write.table(est_prop$Est.prop.weighted, - file = paste0("report_data/prop_", - "Music Estimated Proportions of Cell Types", - ".tabular"), - quote = F, sep = "\t", col.names = NA) -write.table(est_prop$Est.prop.allgene, - file = paste0("report_data/prop_", - "NNLS Estimated Proportions of Cell Types", - ".tabular"), - quote = F, sep = "\t", col.names = NA) -write.table(est_prop$Weight.gene, - file = paste0("report_data/weightgene_", - "Music Estimated Proportions of Cell Types (by Gene)", - ".tabular"), - quote = F, sep = "\t", col.names = NA) -write.table(est_prop$r.squared.full, - file = paste0("report_data/rsquared_", - "Music R-sqr Estimated Proportions of Each Subject", - ".tabular"), - quote = F, sep = "\t", col.names = NA) -write.table(est_prop$Var.prop, - file = paste0("report_data/varprop_", - "Matrix of Variance of MuSiC Estimates", - ".tabular"), - quote = F, sep = "\t", col.names = NA) +## Output Proportions +if ("NNLS" %in% methods) { + writable(est_prop$Est.prop.allgene, "prop", + "NNLS Estimated Proportions of Cell Types") +} + +if ("MuSiC" %in% methods) { + writable(est_prop$Est.prop.weighted, "prop", + "Music Estimated Proportions of Cell Types") + writable(est_prop$Weight.gene, "weightgene", + "Music Estimated Proportions of Cell Types (by Gene)") + writable(est_prop$r.squared.full, "rsquared", + "Music R-sqr Estimated Proportions of Each Subject") + writable(est_prop$Var.prop, "varprop", + "Matrix of Variance of MuSiC Estimates") +} if (use_disease_factor) {