view oncoenrichr_wrapper.xml @ 8:dc08c98bb28f draft

Uploaded
author sigven
date Fri, 02 Aug 2024 18:36:59 +0000
parents 023155e2e66c
children e69867fb65fe
line wrap: on
line source

<tool id="oncoenrichr_wrapper" name="oncoEnrichR" version="1.5.0">
  <description>Cancer-dedicated gene set interpretation</description>
    <!-- Runtime environment: the tool executes inside the oncoEnrichR Docker image pinned below. -->
    <requirements>
        <container type="docker">sigven/oncoenrichr:1.5.0</container>
    </requirements>
  <command detect_errors="aggressive"><![CDATA[
      ## ---------------------------------------------------------------------
      ## Step 1: stage the query gene set as a CSV file in the job directory.
      ## Text input arrives with newlines encoded as __cn__; sed restores them.
      ## The value is single-quoted so the shell neither globs nor word-splits it.
      ## File input is linked to a fixed name so that the dataset's display name
      ## never ends up inside the shell/R quoting below.
      ## ---------------------------------------------------------------------
      #if $query_set.query_choice.query_input == "text"
        echo '$query_set.query_choice.query_text' | sed 's/__cn__/\n/g' > query_text.csv &&
        #set input_file = './query_text.csv'
      #else if $query_set.query_choice.query_input == "file"
        ln -s '$query_set.query_choice.query_file' query_file.csv &&
        #set input_file = './query_file.csv'
      #end if

      ## ---------------------------------------------------------------------
      ## Step 2: optionally stage a custom background set.
      ## background_file stays empty unless a usable text/file source was given;
      ## an empty value makes the R call below pass bgset = NULL.
      ## Both branches write/link the SAME file name that is later read.
      ## ---------------------------------------------------------------------
      #set background_file = ''
      #if $fun_enrich.custom_bgset.def_background
        #if $fun_enrich.custom_bgset.bg_choice.bg_source == "text"
          echo '$fun_enrich.custom_bgset.bg_choice.bg_enrich_text' | sed 's/__cn__/\n/g' > custom_bgset.csv &&
          #set background_file = './custom_bgset.csv'
        #else if $fun_enrich.custom_bgset.bg_choice.bg_source == "file" and $fun_enrich.custom_bgset.bg_choice.bg_enrich_file
          ln -s '$fun_enrich.custom_bgset.bg_choice.bg_enrich_file' custom_bgset.csv &&
          #set background_file = './custom_bgset.csv'
        #end if
      #end if

      ## ---------------------------------------------------------------------
      ## Step 3: run oncoEnrichR and write the HTML and Excel reports.
      ## NOTE(review): read.csv() defaults to header = TRUE, so the first line of
      ## the query/background input is consumed as a column name, not as an
      ## identifier. Confirm that inputs are expected to carry a header line.
      ## NOTE(review): the ppi_network_type input is not forwarded to
      ## onco_enrich() here; confirm the matching argument name before wiring it.
      ## ---------------------------------------------------------------------
      R -e 'suppressPackageStartupMessages(library(oncoEnrichR));
      suppressWarnings(load(system.file("internal_db", "oedb.rda", package = "oncoEnrichR")));
      gene_data <- read.csv("$input_file", strip.white = TRUE);
      oe_report <- oncoEnrichR::onco_enrich(
        query = gene_data[[1]],
        oeDB = oedb,
      #if $query_set.query_id_type
        query_id_type = "$query_set.query_id_type",
      #end if
      ignore_id_err = $query_set.ignore_id_err,

      #if $report_metadata.project_title
          project_title = "$report_metadata.project_title",
      #end if
      #if $report_metadata.project_owner
          project_owner = "$report_metadata.project_owner",
      #end if
      #if $report_metadata.project_description
          project_description = "$report_metadata.project_description",
      #end if

      show_enrichment = $modules.show_enrichment,
      show_ppi = $modules.show_ppi,
      show_disease = $modules.show_disease,
      show_cancer_hallmarks = $modules.show_cancer_hallmarks,
      show_drug = $modules.show_drug,
      show_aberration = $modules.show_aberration,
      show_coexpression = $modules.show_coexpression,
      show_subcell_comp = $modules.show_subcell_comp,
      show_complex = $modules.show_complex,
      show_domain = $modules.show_domain,
      show_fitness = $modules.show_fitness,
      show_ligand_receptor = $modules.show_ligand_receptor,
      show_regulatory = $modules.show_regulatory,
      show_prognostic = $modules.show_prognostic,
      show_unknown_function = $modules.show_unknown_function,
      show_synleth = $modules.show_synleth,

      #if $background_file
          bgset = read.csv("$background_file", strip.white = TRUE)[[1]],
          #if $fun_enrich.custom_bgset.bg_enrich_id_type
            bgset_id_type = "$fun_enrich.custom_bgset.bg_enrich_id_type",
          #end if
          #if $fun_enrich.custom_bgset.bg_enrich_description
            bgset_description = "$fun_enrich.custom_bgset.bg_enrich_description",
          #end if
      #else
          bgset = NULL,
      #end if

      #if $fun_enrich.enrichment_p_value_cutoff
          enrichment_p_value_cutoff = $fun_enrich.enrichment_p_value_cutoff,
      #end if
      #if $fun_enrich.enrichment_p_value_adj
          enrichment_p_value_adj = "$fun_enrich.enrichment_p_value_adj",
      #end if
      #if $fun_enrich.enrichment_q_value_cutoff
          enrichment_q_value_cutoff = $fun_enrich.enrichment_q_value_cutoff,
      #end if
      #if $fun_enrich.enrichment_min_geneset_size
          enrichment_min_geneset_size = $fun_enrich.enrichment_min_geneset_size,
      #end if
      #if $fun_enrich.enrichment_max_geneset_size
          enrichment_max_geneset_size = $fun_enrich.enrichment_max_geneset_size,
      #end if
      enrichment_plot_num_terms = $fun_enrich.enrichment_plot_num_terms,
      enrichment_simplify_go = $fun_enrich.enrichment_simplify_go,

      ## ppi_add_nodes accepts 0 (min="0"); it is passed unconditionally because
      ## a truthiness test would drop a legitimate 0 and fall back to the R default.
      ppi_add_nodes = $protein_interactions.ppi_add_nodes,
      #if $protein_interactions.ppi_string_min_score
          ppi_string_min_score = $protein_interactions.ppi_string_min_score,
      #end if
      #if $protein_interactions.ppi_biogrid_min_evidence
          ppi_biogrid_min_evidence = $protein_interactions.ppi_biogrid_min_evidence,
      #end if
      ppi_show_drugs = $protein_interactions.ppi_show_drugs,
      ppi_show_isolated_nodes = $protein_interactions.ppi_show_isolated_nodes,
      ppi_node_shadow = $protein_interactions.ppi_node_shadow,

      #if $subcellular_compartments.subcellcomp_min_confidence
          subcellcomp_min_confidence = $subcellular_compartments.subcellcomp_min_confidence,
      #end if
      #if $subcellular_compartments.subcellcomp_min_channels
          subcellcomp_min_channels = $subcellular_compartments.subcellcomp_min_channels,
      #end if
      ## fitness_max_score accepts 0 (max="0"); passed unconditionally for the
      ## same reason as ppi_add_nodes.
      fitness_max_score = $fitness.fitness_max_score,
      subcellcomp_show_cytosol = $subcellular_compartments.subcellcomp_show_cytosol,
      ## Boolean flag: always emit T/F so that unchecking the box really
      ## disables the option instead of silently using the R default.
      show_top_diseases_only = $disease.show_top_diseases_only,

      regulatory_min_confidence = "$regulatory.regulatory_min_confidence",
      galaxy = TRUE
      );

      oncoEnrichR::write(report = oe_report, oeDB = oedb, file = "$report1", format = "html", embed_resources = FALSE, extra_files_path = "$report1.extra_files_path", overwrite = TRUE, ignore_file_extension = TRUE);
      oncoEnrichR::write(report = oe_report, oeDB = oedb, file = "$report2", format = "excel", overwrite = TRUE, ignore_file_extension = TRUE)' 2>&1

  ]]></command>
  <inputs>
      <section title="" name=""/>
      <!-- Query gene set: identifiers are supplied either as pasted text (one per line)
           or as an uploaded dataset; query_id_type states which identifier namespace
           they belong to, and ignore_id_err is handed to onco_enrich as T/F. -->
      <section name="query_set" title="Query gene set" expanded="true">
          <conditional name="query_choice">
              <param name="query_input" type="select" multiple="false" display="radio"
                     label="Query gene set: do you want to upload a file OR paste into a text box?">
                  <option value="text">Text field</option>
                  <option value="file">From file</option>
              </param>
              <when value="text">
                  <param type="text" name="query_text" label="Query gene set identifiers (one per line)" area="true"/>
              </when>
              <when value="file">
                  <param name="query_file" type="data" format="txt" label="Query gene set identifiers" multiple="false"/>
              </when>
          </conditional>
          <param name="query_id_type" type="select" label="Query identifier type" display="radio" multiple="false">
              <option value="symbol">Primary gene symbol (HGNC) - e.g. KRAS</option>
              <option value="uniprot_acc">UniProt accession - e.g. P01116</option>
              <option value="entrezgene">NCBI Entrez gene identifier - e.g. 3845</option>
              <option value="ensembl_gene">Ensembl gene identifier - e.g. ENSG00000133703</option>
              <option value="ensembl_mrna">Ensembl transcript identifier - e.g. ENST00000311936</option>
              <option value="ensembl_protein">Ensembl protein identifier - e.g. ENSP00000308495</option>
              <option value="refseq_transcript_id">RefSeq mRNA identifier - e.g. NM_004985</option>
              <option value="refseq_protein">RefSeq protein identifier - e.g. NP_004976</option>
          </param>
          <param name="ignore_id_err" type="boolean" label="Ignore erroneous identifiers" truevalue="T" falsevalue="F" checked="true"/>
      </section>

      <section title="" name=""/>
      <!-- Report metadata: report_name is used as the prefix of both output dataset labels;
           the project_* fields are passed to onco_enrich only when they are non-empty. -->
      <section name="report_metadata" title="Project metadata and output settings" expanded="true">
          <param type="text" name="report_name" label="Output filename (prefix)" value="Report"/>
          <param type="text" name="project_title" label="Project title" />
          <param type="text" name="project_owner" label="Project owner" />
          <param type="text" name="project_description" label="Project description" area="true"/>
      </section>

      <section title="" name=""/>
      <!-- Analysis module switches. Each boolean renders as T or F and is passed to the
           onco_enrich argument of the same name (show_*). -->
      <section name="modules" title="Analysis modules included in the report" expanded="true">
          <param name="show_disease" type="boolean" label="Gene-cancer associations" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_enrichment" type="boolean" label="Gene functional enrichment" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_ppi" type="boolean" label="Protein-protein interaction network" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_regulatory" type="boolean" label="Regulatory (TF-target) interactions" truevalue="T" falsevalue="F" checked="false"/>
          <param name="show_ligand_receptor" type="boolean" label="Ligand-receptor interactions" truevalue="T" falsevalue="F" checked="false"/>
          <param name="show_cancer_hallmarks" type="boolean" label="Cancer hallmark associations" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_drug" type="boolean" label="Drug-target associations" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_aberration" type="boolean" label="Tumor aberration frequencies" truevalue="T" falsevalue="F" checked="false"/>
          <param name="show_coexpression" type="boolean" label="Tumor co-expression patterns" truevalue="T" falsevalue="F" checked="false"/>
          <param name="show_subcell_comp" type="boolean" label="Subcellular localizations" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_complex" type="boolean" label="Protein complex memberships" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_domain" type="boolean" label="Protein domain frequencies" truevalue="T" falsevalue="F" checked="false"/>
          <param name="show_fitness" type="boolean" label="Gene fitness effects" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_synleth" type="boolean" label="Predicted synthetic lethality interactions" truevalue="T" falsevalue="F" checked="true"/>
          <param name="show_unknown_function" type="boolean" label="Genes of poorly defined function" truevalue="T" falsevalue="F" checked="false"/>
          <param name="show_prognostic" type="boolean" label="Prognostic cancer associations" truevalue="T" falsevalue="F" checked="true"/>
      </section>

      <section title="" name=""/>
      <!-- Functional enrichment options. The custom_bgset conditional only exposes the
           background-set inputs when def_background is ticked (value T); the remaining
           parameters are thresholds and display settings read by the command template. -->
      <section name="fun_enrich" title="Options - gene functional enrichment">
          <conditional name="custom_bgset">
              <param name="def_background" type="boolean" label="Define custom background set (all annotated protein-coding genes by default)" truevalue="T" falsevalue="F" checked="false"/>
              <when value="T">
                  <!-- Background identifiers come from a text box or from a dataset. -->
                  <conditional name="bg_choice">
                      <param name="bg_source" type="select" display="radio"
                             label="Custom background gene set: do you want to upload a file OR paste into a text box?">
                          <option value="text">Text field</option>
                          <option value="file">From file</option>
                      </param>
                      <when value="text">
                          <param type="text" name="bg_enrich_text" label="Custom background gene set identifiers (one per line):" area="true"/>
                      </when>
                      <when value="file">
                          <param type="data" format="txt" name="bg_enrich_file" label="Custom background gene set" optional="true" multiple="false"/>
                      </when>
                  </conditional>

                  <param type="select" name="bg_enrich_id_type" label="Custom background identifier type" display="radio" multiple="false">
                      <option value="symbol">Primary gene symbol (HGNC) - e.g. KRAS</option>
                      <option value="uniprot_acc">UniProt accession - e.g. P01116</option>
                      <option value="entrezgene">NCBI Entrez gene identifier - e.g. 3845</option>
                      <option value="ensembl_gene">Ensembl gene identifier - e.g. ENSG00000133703</option>
                      <option value="ensembl_mrna">Ensembl transcript identifier - e.g. ENST00000311936</option>
                      <option value="ensembl_protein">Ensembl protein identifier - e.g. ENSP00000308495</option>
                      <option value="refseq_transcript_id">RefSeq mRNA identifier - e.g. NM_004985</option>
                      <option value="refseq_protein">RefSeq protein identifier - e.g. NP_004976</option>
                  </param>
                  <param type="text" name="bg_enrich_description" label="Custom background gene set description" value="Custom background description"/>
              </when>
          </conditional>

          <param name="enrichment_p_value_cutoff" type="float" label="P-value cutoff for enrichment tests (clusterProfiler)" value="0.05"/>
          <param name="enrichment_p_value_adj" type="select" label="P-value adjustment method (clusterProfiler)">
              <option value="BH">Benjamini-Hochberg</option>
              <option value="holm">Holm</option>
              <option value="hochberg">Hochberg</option>
              <option value="hommel">Hommel</option>
              <option value="bonferroni">Bonferroni</option>
              <option value="BY">Benjamini-Yekutieli</option>
              <option value="fdr">fdr</option>
              <option value="none">none</option>
          </param>
          <param name="enrichment_q_value_cutoff" type="float" label="Q-value cutoff for enrichment tests to report as significant (clusterProfiler)" value="0.2"/>
          <param name="enrichment_min_geneset_size" type="integer" label="Minimum number of genes annotated by ontology term for testing (clusterProfiler)" value="10"/>
          <param name="enrichment_max_geneset_size" type="integer" label="Maximum number of genes annotated by ontology term for testing (clusterProfiler)" value="500"/>
          <param name="enrichment_simplify_go" type="boolean" label="Simplify GO enrichment results by removal of redundant terms (recommended)" truevalue="T" falsevalue="F" checked="true"/>
          <param name="enrichment_plot_num_terms" type="integer" label="Number of top enriched Gene Ontology terms (max) to show in enrichment barplot" min="10" max="30" value="20"/>
      </section>

      <section title="" name=""/>
      <!-- Gene fitness options: a single score threshold, constrained to the range -5..0. -->
      <section name="fitness" title="Options - gene fitness scores">
          <param name="fitness_max_score" type="float" label="Maximum loss-of-fitness score (Bayes Factor from BAGEL) for genes retrieved from Project Score" value="-2" min="-5" max="0"/>
      </section>
      <section title="" name=""/>
      <!-- Protein-protein interaction network options (STRING / BioGRID).
           NOTE(review): ppi_network_type is collected here but is not referenced by the
           command template, so the selection currently has no effect on the run. Confirm
           whether it should be forwarded to onco_enrich. -->
      <section name="protein_interactions" title="Options - protein-protein interaction network">
          <param name="ppi_network_type" type="select" label="STRING: type of retrieved network interactions">
              <option value="functional">functional</option>
              <option value="physical">physical</option>
          </param>
          <param name="ppi_string_min_score" type="float" label="STRING: minimum confidence score for interactions to be included in network" value="0.9" min="0.4" max="1"/>
          <param name="ppi_biogrid_min_evidence" type="integer" label="BioGRID: Minimum number of evidence support for interactions to be included in network" value="3" min="2" max="10"/>
          <param name="ppi_add_nodes" type="integer" label="Addition of interacting non-queryset proteins to the protein-protein interaction network (STRING/BioGRID)" value="30" min="0" max="50"/>
          <param name="ppi_show_drugs" type="boolean" label="Attach anti-cancer drugs in protein-protein interaction network (STRING/BioGRID)" truevalue="T" falsevalue="F" checked="false"/>
          <param name="ppi_show_isolated_nodes" type="boolean" label="Show isolated nodes in protein-protein interaction network (STRING/BioGRID)" truevalue="T" falsevalue="F" checked="false"/>
          <param name="ppi_node_shadow" type="boolean" label="Add shadow to nodes in protein-protein interaction network" truevalue="T" falsevalue="F" checked="true"/>
      </section>
      <section title="" name=""/>
      <!-- Regulatory interaction options: the selected letter is passed to onco_enrich
           as the quoted string regulatory_min_confidence. D is listed first and is
           therefore the default selection. -->
      <section name="regulatory" title="Options - regulatory interactions">
          <param name="regulatory_min_confidence" type="select" label="Minimum confidence level of regulatory interactions included (DoRothEA - A:highest, D:lowest)">
              <option value="D">D</option>
              <option value="C">C</option>
              <option value="B">B</option>
              <option value="A">A</option>
          </param>
      </section>
     <section title="" name=""/>

	 <section name="subcellular_compartments" title="Options - Subcellular compartment annotations">
        <param name="subcellcomp_min_confidence" type="integer" label="Minimum confidence level for subcellular compartment annotations" value="3" min="3" max="5"/>
        <param name="subcellcomp_min_channels" type="integer" label="Minimum number of channel (Text Mining, Experimental, Knowledge) support for annotations" value="1" min="1" max="3"/>
        <param name="subcellcomp_show_cytosol" type="boolean" label="Show cytosol annotations (very common localization) in subcellular heatmap " truevalue="T" falsevalue="F" checked="false"/>
      </section>
      <section title="" name=""/>

      <!-- Disease association options: a single T/F switch read by the command
           template as show_top_diseases_only. -->
      <section name="disease" title="Options - Disease associations">
          <param type="boolean" name="show_top_diseases_only" label="Show top disease associations only" truevalue="T" falsevalue="F" checked="true"/>
      </section>

  </inputs>
    <!-- Output datasets. report1 (HTML) and report2 (Excel workbook) are the file targets of
         the two oncoEnrichR::write calls in the command; both labels are prefixed with the
         user-supplied report_name. -->
    <outputs>
        <data format="xlsx" name="report2" label="$report_metadata.report_name - xlsx"/>
        <data format="html" name="report1" label="$report_metadata.report_name - html"/>
    </outputs>


  <help><![CDATA[
.. class:: infomark

The query gene set is limited to n = 200 identifiers. Running with more identifiers can be done through the stand-alone R package. A very limited query gene set (e.g. n < 5) will in general reduce the relevance and significance of many oncoEnrichR report modules (i.e. protein-protein interaction networks, functional enrichment etc.).

-----

**Dataset formats**

The input dataset is in tabular_ format. The two output datasets are html_ and xlsx.

.. _tabular: ${static_path}/formatHelp.html#tab
.. _html: ${static_path}/formatHelp.html#html

-----

**What it does**

*OncoEnrichR* is intended for exploratory analysis and prioritization of candidate hits (referred to as *query set* below) from high-throughput cancer biology experiments. The tool queries a number of high-quality data resources in order to interpret the query gene set along various dimensions, examples being cancer aberration frequencies, protein-protein interactions, pathway enrichment, subcellular compartment localization, target druggability, gene fitness scores, and regulatory interactions.

The results from the various analysis modules are provided in an interactive HTML report where the user can interrogate the results further. A multisheet Excel workbook is also provided for convenience. The following resources are currently utilized for annotation and analysis:

-  `Open Targets Platform <https://targetvalidation.org/>`_ - disease associations, drug-target associations, cancer hallmarks, and druggability/tractability rankings

-  `The Cancer Genome Atlas <https://portal.gdc.cancer.gov/>`_ - gene aberration frequencies and co-expression patterns in approximately 10,000 primary tumor samples

-  `The Human Protein Atlas <https://www.proteinatlas.org/>`_ - expression data for healthy human tissues (`GTex <https://gtexportal.org/home/>`_)/cell types, and prognostic gene expression associations in cancer (`The Pathology Atlas <https://www.proteinatlas.org/humanproteome/pathology/>`_)

-  `Molecular Signatures Database (MSigDB) <http://software.broadinstitute.org/gsea/msigdb/index.jsp/>`_ - collection of annotated (e.g. towards pathways) gene sets for enrichment/overrepresentation analysis. This includes gene sets from `Gene Ontology <http://geneontology.org/>`_, `Reactome <https://reactome.org/>`_, `KEGG <https://www.genome.jp/kegg/pathway.html/>`_, `WikiPathways <https://www.wikipathways.org/index.php/WikiPathways/>`_, `BIOCARTA <https://maayanlab.cloud/Harmonizome/dataset/Biocarta+Pathways/>`_, as well as curated `immunologic <https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp#C7/>`_ and `cancer-specific <https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp#C6/>`_ signatures.

-  `NetPath <http://www.netpath.org/>`_ - manually curated resource of signal transduction pathways in humans

-  `UniProt <https://uniprot.org>`_ - Comprehensive resource of protein sequence and functional information

-  `STRING <https://string-db.org/>`_ - protein-protein interaction database

-  `InterPro/PFAM <https://www.ebi.ac.uk/interpro/>`_ - Collection of protein families/domains

-  `BIOGRID <http://thebiogrid.org>`_ - Database of Protein, Genetic and Chemical Interactions

-  `CellChatDB <http://www.cellchat.org/>`_ - database on ligand-receptor interactions

-  `DoRothEA <https://saezlab.github.io/dorothea/>`_ - gene set resource containing signed transcription factor (TF) - target interactions

-  `CORUM <https://mips.helmholtz-muenchen.de/corum/>`_ - protein complex database

-  `Compleat <https://fgr.hms.harvard.edu/compleat>`_ - protein complex resource

-  `ComplexPortal <https://www.ebi.ac.uk/complexportal/home/>`_ - manually curated, encyclopaedic resource of macromolecular complexes

-  `hu.MAP2 <http://humap2.proteincomplexes.org/>`_ - human protein complex map

-  `COMPARTMENTS <https://compartments.jensenlab.org/Search/>`_ - subcellular compartment annotation database

-  `CancerMine <http://bionlp.bcgsc.ca/cancermine/>`_ - literature-mined resource on cancer drivers, oncogenes and tumor suppressor genes

-  `Cancer Gene Census <https://cancer.sanger.ac.uk/census/>`_ - Curated high-confidence list of genes with substantial published evidence in oncology

-  `Network of Cancer Genes <http://ncg.kcl.ac.uk/>`_ - manually curated collection of cancer genes, healthy drivers and their properties

-  `DepMap/Project Score <https://score.depmap.sanger.ac.uk/>`_ - database on the effects on cancer cell line viability elicited by CRISPR-Cas9 mediated gene inactivation

-  `Genetic determinants of survival in cancer <http://survival.cshl.edu/>`_ - resource on the prognostic impact of genetic aberrations (methylation, CNA, mutation, expression) in human cancers (TCGA)

-  `Predicted synthetic lethality interactions <https://pubmed.ncbi.nlm.nih.gov/34529928/>`_ - comprehensive prediction of synthetic lethality interactions in human cancer cell lines

The contents of the gene set analysis report attempt to answer the following questions related to the query set:

-  Which diseases/tumor types are known to be associated with genes in the query set, and to what extent? Which genes show evidence of oncogenic and/or tumor suppressive roles?

-  Which query genes have been linked (through literature) to the various hallmarks of cancer?

-  Which genes in the query set are poorly characterized or have an unknown function?

-  Which proteins in the query set can be targeted by inhibitors for different cancer conditions (early and late clinical development phases)? What is the tractability/druggability status for other targets in the query set?

-  Which cancer-relevant protein complexes are involved for proteins in the query set?

-  Are there known cancer-relevant regulatory interactions (transcription factor (TF) - target) found in the query set?

-  Are there known ligand-receptor interactions in the query set?

-  Which subcellular compartments (nucleus, cytosol, plasma membrane etc.) are dominant localizations for members of the query set?

-  Which protein-protein interactions are known within the query set? Are there interactions between members of the query set and other cancer-relevant proteins (e.g. proto-oncogenes, tumor-suppressors or predicted cancer drivers)? Which proteins constitute hubs in the protein-protein interaction network?

-  Are there specific pathways, biological processes or molecular functions that are enriched within the query set, as compared to a reference/background set?

-  Which members of the query set are frequently mutated in tumor sample cohorts (TCGA - SNVs/InDels / homozygous deletions / copy number amplifications)? What are the most frequent recurrent somatic variants (SNVs/InDels) in the query set genes?

-  Which members of the query set are co-expressed (strong negative or positive correlations) with cancer-relevant genes (i.e. proto-oncogenes or tumor suppressors) in tumor sample cohorts (TCGA)?

-  Which members of the query set are associated with better/worse survival in different cancers, considering mutation, expression, methylation or copy number levels in tumors?

-  Which members of the query set are predicted as partners of synthetic lethality interactions?

-  Which members of the query set are associated with cellular loss-of-fitness in CRISPR/Cas9 whole-genome drop out screens of cancer cell lines (i.e. reduction of cell viability elicited by a gene inactivation)? Which genes should be prioritized considering genomic biomarkers and fitness scores in combination?


]]>
  </help>

 <citations>
     <!-- Publication to cite when using this tool, referenced by DOI. -->
     <citation type="doi">10.1002/ijc.34666</citation>
     <!-- No BibTeX citation is defined for this tool. -->
  </citations>
</tool>