Mercurial > repos > theo.collard > ballgown_wrapper
comparison ballgown/ballgown.xml @ 3:896cdffe06ff draft
first upload
author | theo.collard |
---|---|
date | Wed, 26 Apr 2017 08:42:01 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:eb1206832359 | 3:896cdffe06ff |
---|---|
1 <tool id="ballgown" name="Ballgown" version="0.5.0" workflow_compatible="true"> | |
2 <description>Flexible, isoform-level differential expression analysis</description> | |
3 <requirements> | |
4 <requirement type="package" version="2.2.0">bioconductor-ballgown</requirement> | |
5 <requirement type="package" version="0.5.0">r-dplyr</requirement> | |
6 <requirement type="package" version="1.3.2">r-optparse</requirement> | |
7 | |
8 </requirements> | |
9 <command interpreter="Rscript" detect_errors="aggressive"> | |
10 ##------------------------------------------------------------------------------------ | |
11 ## Parses the tab-separated file that maps input file names to samples; returns None if reading it fails | |
12 ## Every field of a row (the first one included) becomes a key pointing to that row's first field, e.g.: | |
13 ## mapping = { | |
14 ## "e2t.ctab" : "sample1", | |
15 ## "other.ctab" : "sample2", | |
16 ## "i2t.ctab" : "sample1", | |
17 ## "t_data.ctab": "sample1" | |
18 ## ... | |
19 ## } | |
20 ##------------------------------------------------------------------------------------ | |
21 #def read_sample_mapping_file(sample_mapping_file): | |
22 #try | |
23 #set lookup = {} | |
24 #set handle = open($sample_mapping_file.dataset.dataset.get_file_name(),'r') | |
25 #for $row in $handle: | |
26 #set fields = $row.strip().split('\t') | |
27 #for $field in $fields: | |
28 #set lookup[$field] = $fields[0] | |
29 #end for | |
30 #end for | |
31 #return $lookup | |
32 #except | |
33 #return None | |
34 #end try | |
35 #end def | |
36 | |
37 ##------------------------------------------------------------------------------------ | |
38 ## This function returns the name of the sample associated to a given file | |
39 ##------------------------------------------------------------------------------------ | |
40 #def get_sample_name($dataset, $sample_mapping): | |
41 ## Without a samples mapping, the sample name is the element identifier of the dataset | |
42 #if $sample_mapping == None: | |
43 #return str($dataset.element_identifier) | |
44 ## Otherwise we look the dataset name up in the mapping (None when it is not listed) | |
45 #else: | |
46 #return $sample_mapping.get($dataset.name) | |
47 #end if | |
48 #end def | |
49 | |
50 ##------------------------------------------------------------------------------------ | |
51 ## Registers the file(s) given for one input parameter in $result, grouped by sample name, and returns $result | |
52 ## A sample that is not in $result yet is added only when $create_if_empty is True; otherwise a ValueError is raised | |
53 ## e.g. of result | |
54 ##'sample1' : { | |
55 ## 'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat' | |
56 ## 'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat', | |
57 ## 't_data': '/export/galaxy-central/database/files/000/dataset_12.dat', | |
58 ## 'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat', | |
59 ## 'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat' | |
60 ## }, | |
61 ##------------------------------------------------------------------------------------ | |
62 #def read_input_files($param_name, $param_value, $result, $sample_mapping, $create_if_empty): | |
63 ## If input is a data collection | |
64 #if isinstance($param_value, list): | |
65 ## For each dataset | |
66 #for $dataset in $param_value: | |
67 ## Get the sample name | |
68 #set sample_name = $get_sample_name($dataset, $sample_mapping) | |
69 ## Check if sample is already registered (the name is None when the mapping does not list the file) | |
70 #if $sample_name not in $result: | |
71 #if ($create_if_empty == True): | |
72 #set result[$sample_name] = {} | |
73 #else: | |
74 #raise ValueError("Error in input. Please check that input contains all the required files for sample " + str($sample_name)) | |
75 #end if | |
76 #end if | |
77 ## Register the file to the sample | |
78 #set result[$sample_name][$param_name] = str($dataset.dataset.dataset.get_file_name()) | |
79 #end for | |
80 #else: | |
81 ## A single dataset is registered under the default sample "sample_1" | |
82 #silent $result.setdefault("sample_1", {}) | |
83 ## The path comes from the dataset ($param_value); $param_name is only the file name used as key | |
84 #set result["sample_1"][$param_name] = str($param_value.dataset.dataset.get_file_name()) | |
85 #end if | |
86 #return $result | |
87 #end def | |
88 | |
89 ##------------------------------------------------------------------------------------ | |
90 ## Main body of the tool | |
91 ##------------------------------------------------------------------------------------ | |
92 ## Accumulators for the steps below: files per sample (result) and the optional file-name-to-sample mapping | |
93 #set result={} | |
94 #set sample_mapping=None | |
95 | |
96 ## If the samples mapping file was provided, parse its content; otherwise sample_mapping stays None | |
97 #if $samples_names != None and not(isinstance($samples_names, list) and (None in $samples_names)): | |
98 #set sample_mapping = $read_sample_mapping_file($samples_names) | |
99 #end if | |
100 | |
101 ## READ THE CONTENT FOR EACH INPUT AND STORE THE FILES | |
102 ## INDEXED BY THEIR SAMPLE NAME (e_data goes first: only that call may create samples, see the last argument) | |
103 ## e.g. 'HBR_Rep1' : { | |
104 ## 'e_data': '/export/galaxy-central/database/files/000/dataset_13.dat' | |
105 ## 'i_data': '/export/galaxy-central/database/files/000/dataset_10.dat', | |
106 ## 't_data': '/export/galaxy-central/database/files/000/dataset_12.dat', | |
107 ## 'e2t': '/export/galaxy-central/database/files/000/dataset_9.dat', | |
108 ## 'i2t': '/export/galaxy-central/database/files/000/dataset_11.dat' | |
109 ## }, | |
110 ## 'HBR_Rep2' : {...} | |
111 #set $result = $read_input_files("e_data.ctab", $e_data, $result, $sample_mapping, True) | |
112 #set $result = $read_input_files("i_data.ctab", $i_data, $result, $sample_mapping, False) | |
113 #set $result = $read_input_files("t_data.ctab", $t_data, $result, $sample_mapping, False) | |
114 #set $result = $read_input_files("e2t.ctab", $e2t, $result, $sample_mapping, False) | |
115 #set $result = $read_input_files("i2t.ctab", $i2t, $result, $sample_mapping, False) | |
116 | |
117 ## For each input sample, create a directory and link the input files for ballgown | |
118 #import os | |
119 ## NOTE: the directories and links are created while this template is rendered, i.e. before ballgown.R runs | |
120 #for $key, $value in $result.items(): | |
121 #set dir_name = str($output.files_path) + "/" + $key + "/" | |
122 ## #silent runs a call for its side effect only, so nothing is written into the command line | |
123 #silent $os.makedirs($dir_name) | |
124 #for $file_name, $file_path in $value.items(): | |
125 #silent $os.symlink($file_path, $dir_name + $file_name) | |
126 #end for | |
127 #end for | |
128 | |
129 ## Run the R script with the location of the linked files and the names for the output files | |
130 ballgown.R --directory $output.files_path --outputtranscript $output --outputgenes $outputgn --texpression $trexpression --phendat $phendata --bgout $bgo | |
131 </command> | |
132 <inputs> | |
133 <param name="e_data" type="data" multiple="true" format="tabular" label="Exon-level expression measurements" help="One row per exon. See below for more details."/> | |
134 <param name="i_data" type="data" multiple="true" format="tabular" label="Intron- (i.e., junction-) level expression measurements" help="One row per intron. See below for more details."/> | |
135 <param name="t_data" type="data" multiple="true" format="tabular" label="Transcript-level expression measurements" help="One row per transcript. See below for more details."/> | |
136 <param name="e2t" type="data" multiple="true" format="tabular" label="Exons-transcripts mapping" help="Table with two columns, e_id and t_id, denoting which exons belong to which transcripts. See below for more details."/> | |
137 <param name="i2t" type="data" multiple="true" format="tabular" label="Introns-transcripts mapping" help="Table with two columns, i_id and t_id, denoting which introns belong to which transcripts. See below for more details."/> | |
138 <param name="samples_names" type="data" optional="true" multiple="false" format="tabular" label="File names for samples" help="Optional. Use in case that the names for the analysed samples cannot be extracted from the filenames."/> | |
139 <param argument="--phendat" name="phendata" type="data" format="csv" label="phenotype data" /> | |
140 <param argument="--texpression" name="trexpression" type="float" value="0.5" label="minimal transcript expression to appear in the results"/> | |
141 </inputs> | |
142 <outputs> | |
143 <data name="bgo" format="rda" file="ballgown_object.rda" label="${tool.name} on ${on_string}: ballgown object (R data file)"/> | |
144 <data name="output" format="csv" file="output_transcript.csv" label="${tool.name} on ${on_string}: transcripts expression (tabular)"/> | |
145 <data name="outputgn" format="csv" file="output_genes.csv" label="${tool.name} on ${on_string}: genes expression (tabular)"/> | |
146 </outputs> | |
147 <tests> | |
148 </tests> | |
149 <help> | |
150 | |
151 ======================= | |
152 Ballgown | |
153 ======================= | |
154 ----------------------- | |
155 **What it does** | |
156 ----------------------- | |
157 | |
158 Ballgown is a software package designed to facilitate flexible differential expression analysis of RNA-seq data. | |
159 The Ballgown package provides functions to organize, visualize, and analyze the expression measurements for your transcriptome assembly. | |
160 | |
161 ---- | |
162 | |
163 ----------------------- | |
164 **How to use** | |
165 ----------------------- | |
166 The input for this tool consists of 5 files for each sample in your experiment: | |
167 | |
168 - **e_data**: exon-level expression measurements. Tab file or collection of tab files. One row per exon. Columns are e_id (numeric exon id), chr, strand, start, end (genomic location of the exon), and the following expression measurements for each sample: | |
169 * rcount: reads overlapping the exon | |
170 * ucount: uniquely mapped reads overlapping the exon | |
171 * mrcount: multi-map-corrected number of reads overlapping the exon | |
172 * cov: average per-base read coverage | |
173 * cov_sd: standard deviation of per-base read coverage | |
174 * mcov: multi-map-corrected average per-base read coverage | |
175 * mcov_sd: standard deviation of multi-map-corrected per-base coverage | |
176 - **i_data**: intron- (i.e., junction-) level expression measurements. Tab file or collection of tab files. One row per intron. Columns are i_id (numeric intron id), chr, strand, start, end (genomic location of the intron), and the following expression measurements for each sample: | |
177 * rcount: number of reads supporting the intron | |
178 * ucount: number of uniquely mapped reads supporting the intron | |
179 * mrcount: multi-map-corrected number of reads supporting the intron | |
180 - **t_data**: transcript-level expression measurements. Tab file or collection of tab files. One row per transcript. Columns are: | |
181 * t_id: numeric transcript id | |
182 * chr, strand, start, end: genomic location of the transcript | |
183 * t_name: Cufflinks-generated transcript id | |
184 * num_exons: number of exons comprising the transcript | |
185 * length: transcript length, including both exons and introns | |
186 * gene_id: gene the transcript belongs to | |
187 * gene_name: HUGO gene name for the transcript, if known | |
188 * cov: per-base coverage for the transcript (available for each sample) | |
189 * FPKM: Cufflinks-estimated FPKM for the transcript (available for each sample) | |
190 - **e2t**: Tab file or collection of tab files. Table with two columns, e_id and t_id, denoting which exons belong to which transcripts. These ids match the ids in the e_data and t_data tables. | |
191 - **i2t**: Tab file or collection of tab files. Table with two columns, i_id and t_id, denoting which introns belong to which transcripts. These ids match the ids in the i_data and t_data tables. | |
192 - samples_names: (optional) Tab file. Table with five columns, one row per sample. Defines which files from the input belong to each sample in the experiment. | |
193 | |
194 .. class:: infomark | |
195 | |
196 **TIP** *Note* Here's an example of a good phenotype data file for your experiment. | |
197 | |
198 +--------------+-------------------------+-------------------------+---+ | |
199 |ids |experimental variable 1 |experimental variable 2 |...| | |
200 +==============+=========================+=========================+===+ | |
201 |sample 1 |value 1 |value 2 |...| | |
202 +--------------+-------------------------+-------------------------+---+ | |
203 |sample 2 |value 2 |value 1 |...| | |
204 +--------------+-------------------------+-------------------------+---+ | |
205 |sample 3 |value 1 |value 2 |...| | |
206 +--------------+-------------------------+-------------------------+---+ | |
207 |sample 4 |value 2 |value 1 |...| | |
208 +--------------+-------------------------+-------------------------+---+ | |
209 |... |value 1 |value 2 |...| | |
210 +--------------+-------------------------+-------------------------+---+ | |
211 | |
212 | |
213 .. class:: infomark | |
214 | |
215 *Note* The minimal transcript expression is a threshold used to filter out the transcripts that | |
216 are weakly expressed or not expressed at all in our samples when compared to the genome | |
217 | |
218 ----------------------- | |
219 **Outputs** | |
220 ----------------------- | |
221 | |
222 This tool has 3 outputs: | |
223 | |
224 - **transcripts expression** : this is a csv file containing all the transcripts that are expressed above the transcripts expression value | |
225 - **genes expression** : this is a csv file containing all the genes that are expressed above the transcripts expression value | |
226 - **Ballgown object** : this is the ballgown object created during the process. This file can be re-used later for further analysis in a R console. | |
227 | |
228 ---- | |
229 | |
230 **Authors**: Théo Collard [SLU Global Bioinformatics Centre], Rafael Hernández de Diego [SLU Global Bioinformatics Centre], and Tomas Klingström [SLU Global Bioinformatics Centre] | |
231 | |
232 Sources are available at https://github.com/CollardT/Ballgown-Wrapper | |
233 | |
234 </help> | |
235 </tool> |