<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.2">
    <!--
    Galaxy tool wrapper around seq_filter_by_mapping.py.

    Takes one FASTA, FASTQ or SFF sequence file plus one or more SAM/BAM
    mapping files, and writes the sequences (or read pairs) that map, the
    ones that do not map, or both, depending on the output choice.
    -->
    <description>from SAM/BAM file</description>
    <requirements>
        <requirement type="package" version="1.64">biopython</requirement>
        <requirement type="python-module">Bio</requirement>
        <requirement type="binary">samtools</requirement>
        <requirement type="package" version="0.1.19">samtools</requirement>
    </requirements>
    <version_command interpreter="python">seq_filter_by_mapping.py --version</version_command>
    <command interpreter="python">
seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode
## Only pass the output options matching the outputs the user asked for
#if $output_choice_cond.output_choice=="both"
 -p "$output_pos" -n "$output_neg"
#elif $output_choice_cond.output_choice=="pos"
 -p "$output_pos"
#elif $output_choice_cond.output_choice=="neg"
 -n "$output_neg"
#end if
## Now loop over all the mapping files
#for i in $mapping_file#"${i}" #end for#
    </command>
    <stdio>
        <!-- Anything other than zero is an error -->
        <exit_code range="1:" />
        <exit_code range=":-1" />
    </stdio>
    <inputs>
        <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." />
        <param name="mapping_file" type="data" format="sam,bam" multiple="true" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." />
        <conditional name="output_choice_cond">
            <param name="output_choice" type="select" label="Output mapped reads, unmapped reads, or both?">
                <option value="both">Both mapped and unmapped reads, as two files</option>
                <option value="pos">Just mapped reads, as a single file</option>
                <option value="neg">Just unmapped reads, as a single file</option>
            </param>
            <!-- These empty "when" entries add no extra parameters, but they seem to be needed here; compare this to indels/indel_sam2interval.xml -->
            <when value="both" />
            <when value="pos" />
            <when value="neg" />
        </conditional>
        <param name="pair_mode" type="select" label="Paired read treatment">
            <option value="lax" selected="true">Treat as a pair, allow either read to be mapped</option>
            <option value="strict">Treat as a pair, require both reads to be mapped</option>
            <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter...
            <option value="solo">Treat independently (will split partners when only one maps)</option>
            -->
        </param>
    </inputs>
    <outputs>
        <!--
        The datatype is taken explicitly from input_file (format_source), in
        the same way as the metadata. This tool also has mapping_file as a
        data input, so the generic format="input" setting would not state
        which input the output format should be copied from.
        -->
        <data name="output_pos" format_source="input_file" metadata_source="input_file" label="${input_file.name} (mapped)">
            <!-- Produced unless only the unmapped reads were requested -->
            <filter>output_choice_cond["output_choice"] != "neg"</filter>
        </data>
        <data name="output_neg" format_source="input_file" metadata_source="input_file" label="${input_file.name} (unmapped)">
            <!-- Produced unless only the mapped reads were requested -->
            <filter>output_choice_cond["output_choice"] != "pos"</filter>
        </data>
    </outputs>
    <tests>
        <!-- Both tests request only the mapped reads, once for each pair_mode value -->
        <test>
            <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" />
            <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" />
            <param name="pair_mode" value="lax" />
            <param name="output_choice" value="pos" />
            <output name="output_pos" file="SRR639755_sample_lax.fastq" ftype="fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="SRR639755_mito_pairs.fastq.gz" ftype="fastqsanger" />
            <param name="mapping_file" value="SRR639755_sample_by_coord.sam" ftype="sam" />
            <param name="pair_mode" value="strict" />
            <param name="output_choice" value="pos" />
            <output name="output_pos" file="SRR639755_sample_strict.fastq" ftype="fastqsanger" />
        </test>
    </tests>
    <help>
**What it does**

By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in
two, those sequences (or read pairs) which do or don't map in the provided
SAM/BAM file. You can opt to have a single output file of just the mapping reads,
or just the non-mapping ones.

**Example Usage**

You might wish to perform a contamination screen by mapping your reads against
known contaminant reference sequences, then use this tool to select only the
unmapped reads for further analysis (e.g. *de novo* assembly).

Similarly you might wish to map your reads against a known bacterial reference,
then take the non-mapping sequences forward for analysis if looking for novel
plasmids.


**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite:

Peter J.A. Cock (2014), Galaxy tool for filtering reads by mapping
http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping

This tool uses Biopython to read and write SFF files, so you may also wish to
cite the Biopython application note (and Galaxy too of course):

Cock et al (2009). Biopython: freely available Python tools for computational
molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

This tool is available to install into other Galaxy Instances via the Galaxy
Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping
    </help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/btp163</citation>
    </citations>
</tool>