Mercurial > repos > galaxyp > proteomics_rnaseq_reduced_db_workflow
changeset 1:20d9fb1ba210 default tip
Replace several tabular manipulations with regex_replace tool
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Thu, 20 Mar 2014 21:50:05 -0500 |
parents | 9d5e59373c84 |
children | |
files | README.rst proteomics_rnaseq_reduced_db_workflow_v2.ga repository_dependencies.xml |
diffstat | 3 files changed, 471 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/README.rst Mon Mar 17 16:03:12 2014 -0500 +++ b/README.rst Thu Mar 20 21:50:05 2014 -0500 @@ -47,6 +47,7 @@ Version Changes ------- ---------------------------------------------------------------------- v0.0.1 - Initial release to Tool Shed (March, 2014) +v0.0.2 - Use regex_replace tool for tabular file manipulation ======= ======================================================================
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/proteomics_rnaseq_reduced_db_workflow_v2.ga Thu Mar 20 21:50:05 2014 -0500 @@ -0,0 +1,469 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "Filter out proteins that have a transcript expression level, as quantified by RNA-Seq data, below a certain threshold.", + "format-version": "0.1", + "name": "Proteomics Reduced DB v2", + "steps": { + "0": { + "annotation": "ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.73.pep.all.fa.gz", + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.73.pep.all.fa.gz", + "name": "Ensembl Protein FASTA (reference proteome)" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 208, + "top": 200 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"Ensembl Protein FASTA (reference proteome)\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "1": { + "annotation": "Ensembl reference fasta with only chromosome assigned sequences. For example: ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.73.dna.toplevel.fa.gz", + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "Ensembl reference fasta with only chromosome assigned sequences. 
For example: ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.73.dna.toplevel.fa.gz", + "name": "Ensembl Genome Reference Fasta" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 209, + "top": 292 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"Ensembl Genome Reference Fasta\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "2": { + "annotation": "For example: \nftp://ftp.ensembl.org/pub/release-73/gtf/homo_sapiens/Homo_sapiens.GRCh37.73.gtf.gz", + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "For example: \nftp://ftp.ensembl.org/pub/release-73/gtf/homo_sapiens/Homo_sapiens.GRCh37.73.gtf.gz", + "name": "Ensembl GTF File (gene models)" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 213, + "top": 456 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"Ensembl GTF File (gene models)\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "3": { + "annotation": "RNA-Seq left mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)", + "id": 3, + "input_connections": {}, + "inputs": [ + { + "description": "RNA-Seq left mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)", + "name": "RNA-Seq left paired-end fastq" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 220, + "top": 563 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"RNA-Seq left paired-end fastq\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "4": { + "annotation": "RNA-Seq right mate pair fastq (These should be in fastqsanger format. 
If not, convert with \"Fastq Groomer\" tool.)", + "id": 4, + "input_connections": {}, + "inputs": [ + { + "description": "RNA-Seq right mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)", + "name": "RNA-Seq right paired-end fastq" + } + ], + "name": "Input dataset", + "outputs": [], + "position": { + "left": 221, + "top": 673 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"RNA-Seq right paired-end fastq\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [] + }, + "5": { + "annotation": "Convert peptide fasta to a 2-column tabular file. Keep all the header info.", + "id": 5, + "input_connections": { + "input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "name": "FASTA-to-Tabular", + "outputs": [ + { + "name": "output", + "type": "tabular" + } + ], + "position": { + "left": 538, + "top": 267 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "fasta2tab", + "tool_state": "{\"__page__\": 0, \"keep_first\": \"\\\"0\\\"\", \"descr_columns\": \"\\\"1\\\"\", \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"__rerun_remap_job_id__\": null}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "6": { + "annotation": "Given a GTF file and the reference genome, this tool constructs a synthetic transcriptome that will be used for isoform quantification during \"-calculate expression\".", + "id": 6, + "input_connections": { + "reference|gtf": { + "id": 2, + "output_name": "output" + }, + "reference|reference_fasta_file": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [], + "name": "RSEM prepare reference", + "outputs": [ + { + "name": "reference_file", + "type": "rsem_ref" + } + ], + "position": { + "left": 419, + "top": 388 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id":
"toolshed.g2.bx.psu.edu/repos/jjohnson/rsem/rsem_prepare_reference/1.1.17", + "tool_state": "{\"__page__\": 0, \"reference\": \"{\\\"ref_type\\\": \\\"genomic\\\", \\\"gtf\\\": null, \\\"reference_fasta_file\\\": null, \\\"__current_case__\\\": 1}\", \"reference_name\": \"\\\"primaryEnsemblGtfRef\\\"\", \"__rerun_remap_job_id__\": null, \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"polya\": \"{\\\"polya_use\\\": \\\"add\\\", \\\"polya_length\\\": \\\"125\\\", \\\"__current_case__\\\": 0}\", \"transcript_to_gene_map\": \"null\", \"ntog\": \"\\\"False\\\"\"}", + "tool_version": "1.1.17", + "type": "tool", + "user_outputs": [] + }, + "7": { + "annotation": "", + "id": 7, + "input_connections": { + "infile": { + "id": 5, + "output_name": "output" + } + }, + "inputs": [], + "name": "Regex Replace", + "outputs": [ + { + "name": "outfile", + "type": "txt" + } + ], + "position": { + "left": 802, + "top": 281 + }, + "post_job_actions": { + "ChangeDatatypeActionoutfile": { + "action_arguments": { + "newtype": "tabular" + }, + "action_type": "ChangeDatatypeAction", + "output_name": "outfile" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/kellrott/regex_replace/regex_replace/1.0.0", + "tool_state": "{\"__page__\": 0, \"ignore_case\": \"\\\"False\\\"\", \"search_str\": \"\\\"^(.* transcript:)(ENST\\\\\\\\d+)(.*)$\\\"\", \"__rerun_remap_job_id__\": null, \"replace_str\": \"\\\"\\\\\\\\1\\\\\\\\2\\\\\\\\3\\\\\\\\t\\\\\\\\2\\\"\", \"replace_count\": \"\\\"0\\\"\", \"multiline\": \"\\\"False\\\"\", \"infile\": \"null\", \"dot_all\": \"\\\"False\\\"\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "8": { + "annotation": "Given the RNA-Seq reads (fastq) and synthetic transcriptome (from \"-prepare reference\"), this tool quantifies the abundances of each mRNA transcript within the GTF file.", + "id": 8, + "input_connections": { + "input|fastq|fastq1": { +
"id": 3, + "output_name": "output" + }, + "input|fastq|fastq2": { + "id": 4, + "output_name": "output" + }, + "reference|rsem_ref": { + "id": 6, + "output_name": "reference_file" + } + }, + "inputs": [], + "name": "RSEM calculate expression", + "outputs": [ + { + "name": "gene_abundances", + "type": "tabular" + }, + { + "name": "isoform_abundances", + "type": "tabular" + }, + { + "name": "transcript_bam", + "type": "bam" + }, + { + "name": "transcript_sorted_bam", + "type": "bam" + }, + { + "name": "genome_bam", + "type": "bam" + }, + { + "name": "genome_sorted_bam", + "type": "bam" + }, + { + "name": "log", + "type": "txt" + } + ], + "position": { + "left": 719, + "top": 523 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/jjohnson/rsem/rsem_calculate_expression/1.1.17", + "tool_state": "{\"__page__\": 0, \"reference\": \"{\\\"rsem_ref\\\": null, \\\"refSrc\\\": \\\"history\\\", \\\"__current_case__\\\": 1}\", \"rsem_options\": \"{\\\"fullparams\\\": \\\"default\\\", \\\"__current_case__\\\": 0}\", \"rsem_outputs\": \"{\\\"result_bams\\\": \\\"none\\\", \\\"__current_case__\\\": 0}\", \"__rerun_remap_job_id__\": null, \"seedlength\": \"\\\"25\\\"\", \"sample\": \"\\\"rsem_sample\\\"\", \"forward_prob\": \"\\\"0.5\\\"\", \"input\": \"{\\\"fastq\\\": {\\\"fastq2\\\": null, \\\"fastq1\\\": null, \\\"matepair\\\": \\\"paired\\\", \\\"__current_case__\\\": 1}, \\\"bowtie_options\\\": {\\\"fullparams\\\": \\\"default\\\", \\\"__current_case__\\\": 0}, \\\"fastq_select\\\": \\\"--phred33-quals\\\", \\\"__current_case__\\\": 0, \\\"format\\\": \\\"fastq\\\"}\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}", + "tool_version": "1.1.17", + "type": "tool", + "user_outputs": [] + }, + "9": { + "annotation": "Selection of lower threshold of transcriptional abundance in TPM required for inclusion of the corresponding protein in the reduced database.", + "id": 9, + 
"input_connections": { + "input": { + "id": 8, + "output_name": "isoform_abundances" + } + }, + "inputs": [], + "name": "Filter", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 991, + "top": 591 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "Filter1", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"cond\": \"\\\"c3>0.000001\\\"\", \"input\": \"null\", \"header_lines\": \"\\\"0\\\"\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "10": { + "annotation": "Add a column with the RSEM TPM times a million.", + "id": 10, + "input_connections": { + "input": { + "id": 9, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Compute", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 1199, + "top": 574 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "Add_a_column1", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"cond\": \"\\\"c3*1000000\\\"\", \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"round\": \"\\\"no\\\"\"}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "11": { + "annotation": "", + "id": 11, + "input_connections": { + "input1": { + "id": 7, + "output_name": "outfile" + }, + "input2": { + "id": 10, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Join two Datasets", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 1350, + "top": 419 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "join1", + "tool_state": "{\"input2\": \"null\", \"__page__\": 0, \"field1\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"3\\\"}\", \"partial\": 
\"\\\"\\\"\", \"field2\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"1\\\"}\", \"__rerun_remap_job_id__\": null, \"fill_empty_columns\": \"{\\\"fill_empty_columns_switch\\\": \\\"no_fill\\\", \\\"__current_case__\\\": 0}\", \"unmatched\": \"\\\"\\\"\", \"input1\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}", + "tool_version": "2.0.2", + "type": "tool", + "user_outputs": [] + }, + "12": { + "annotation": "", + "id": 12, + "input_connections": { + "infile": { + "id": 11, + "output_name": "out_file1" + } + }, + "inputs": [], + "name": "Regex Replace", + "outputs": [ + { + "name": "outfile", + "type": "txt" + } + ], + "position": { + "left": 1545, + "top": 546 + }, + "post_job_actions": { + "ChangeDatatypeActionoutfile": { + "action_arguments": { + "newtype": "tabular" + }, + "action_type": "ChangeDatatypeAction", + "output_name": "outfile" + } + }, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/kellrott/regex_replace/regex_replace/1.0.0", + "tool_state": "{\"__page__\": 0, \"ignore_case\": \"\\\"False\\\"\", \"search_str\": \"\\\"^(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)$\\\"\", \"__rerun_remap_job_id__\": null, \"replace_str\": \"\\\"\\\\\\\\1 tmp:\\\\\\\\8\\\\\\\\t\\\\\\\\2\\\"\", \"replace_count\": \"\\\"0\\\"\", \"multiline\": \"\\\"False\\\"\", \"infile\": \"null\", \"dot_all\": \"\\\"False\\\"\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "13": { + "annotation": "Final reduced database after application of a TPM cut-off.", + "id": 13, + "input_connections": { + "input": { + "id": 12, + "output_name": "outfile" + } + }, + "inputs": [], + "name": "Tabular-to-FASTA", + "outputs": [ + { + "name": "output", + "type": "fasta" + } + ], + "position": { + "left": 1743, + "top": 484 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "tab2fasta", + 
"tool_state": "{\"title_col\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": [\\\"1\\\"]}\", \"__page__\": 0, \"seq_col\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"2\\\"}\", \"__rerun_remap_job_id__\": null, \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}", + "tool_version": "1.1.0", + "type": "tool", + "user_outputs": [] + }, + "14": { + "annotation": "Format FASTA to desired width.", + "id": 14, + "input_connections": { + "input": { + "id": 13, + "output_name": "output" + } + }, + "inputs": [], + "name": "FASTA Width", + "outputs": [ + { + "name": "output", + "type": "input" + } + ], + "position": { + "left": 1939, + "top": 569 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/fasta_formatter/cshl_fasta_formatter/1.0.0", + "tool_state": "{\"__page__\": 0, \"input\": \"null\", \"__rerun_remap_job_id__\": null, \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"width\": \"\\\"80\\\"\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + } + } +} \ No newline at end of file
--- a/repository_dependencies.xml Mon Mar 17 16:03:12 2014 -0500 +++ b/repository_dependencies.xml Thu Mar 20 21:50:05 2014 -0500 @@ -1,5 +1,6 @@ <?xml version="1.0"?> <repositories description="Required tools for proteomics_rnaseq_reduced_db_workflow"> <repository name="fasta_formatter" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="8f0ae92440b8" /> + <repository name="regex_replace" owner="kellrott" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="9a77d5fca67c" /> <repository name="rsem" owner="jjohnson" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="59459de65740" /> </repositories>