Mercurial > repos > peterjc > fastq_paired_unpaired
changeset 0:72e9fcaec61f
Migrated tool version 0.0.4 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 17:21:17 -0400 |
parents | |
children | 7ed81e36fc1c |
files | tools/fastq/fastq_paired_unpaired.py tools/fastq/fastq_paired_unpaired.txt tools/fastq/fastq_paired_unpaired.xml |
diffstat | 3 files changed, 372 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_paired_unpaired.py Tue Jun 07 17:21:17 2011 -0400 @@ -0,0 +1,212 @@ +#!/usr/bin/env python +"""Divides a FASTQ into paired and single (orphan reads) as separate files. + +The input file should be a valid FASTQ file which has been sorted so that +any partner forward+reverse reads are consecutive. The output files all +preserve this sort order. Pairing are recognised based on standard name +suffices. See below or run the tool with no arguments for more details. + +Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even +Color Space should all work equally well). + +This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. +See accompanying text file for licence details (MIT/BSD style). + +This is version 0.0.4 of the script. +""" +import os +import sys +import re +from galaxy_utils.sequence.fastq import fastqReader, fastqWriter + +def stop_err(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +msg = """Expect either 3 or 4 arguments, all FASTQ filenames. + +If you want two output files, use four arguments: + - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) + - Sorted input FASTQ filename, + - Output paired FASTQ filename (forward then reverse interleaved), + - Output singles FASTQ filename (orphan reads) + +If you want three output files, use five arguments: + - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) + - Sorted input FASTQ filename, + - Output forward paired FASTQ filename, + - Output reverse paired FASTQ filename, + - Output singles FASTQ filename (orphan reads) + +The input file should be a valid FASTQ file which has been sorted so that +any partner forward+reverse reads are consecutive. The output files all +preserve this sort order. + +Any reads where the forward/reverse naming suffix used is not recognised +are treated as orphan reads. The tool supports the /1 and /2 convention +used by Illumina, the .f and .r convention, and the Sanger convention +(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). + +Note that this does support multiple forward and reverse reads per template +(which is quite common with Sanger sequencing), e.g. this which is sorted +alphabetically: + +WTSI_1055_4p17.p1kapIBF +WTSI_1055_4p17.p1kpIBF +WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.q1kpIBR + +or this where the reads already come in pairs: + +WTSI_1055_4p17.p1kapIBF +WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.p1kpIBF +WTSI_1055_4p17.q1kpIBR + +both become: + +WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR +""" + +if len(sys.argv) == 5: + format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] +elif len(sys.argv) == 6: + pairs_fastq = None + format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] +else: + stop_err(msg) + +format = format.replace("fastq", "").lower() +if not format: + format="sanger" #safe default +elif format not in ["sanger","solexa","illumina","cssanger"]: + stop_err("Unrecognised format %s" % format) + +def f_match(name): + if name.endswith("/1") or name.endswith(".f"): + return True + +#Cope with three widely used suffix naming convensions, +#Illumina: /1 or /2 +#Forward/revered: .f or .r +#Sanger, e.g. .p1k and .q1k +#See http://staden.sourceforge.net/manual/pregap4_unix_50.html +re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") +re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") + +#assert re_f.match("demo/1") +assert re_f.search("demo.f") +assert re_f.search("demo.s1") +assert re_f.search("demo.f1k") +assert re_f.search("demo.p1") +assert re_f.search("demo.p1k") +assert re_f.search("demo.p1lk") +assert re_r.search("demo/2") +assert re_r.search("demo.r") +assert re_r.search("demo.q1") +assert re_r.search("demo.q1lk") +assert not re_r.search("demo/1") +assert not re_r.search("demo.f") +assert not re_r.search("demo.p") +assert not re_f.search("demo/2") +assert not re_f.search("demo.r") +assert not re_f.search("demo.q") + +count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 +in_handle = open(input_fastq) +if pairs_fastq: + pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) + pairs_r_writer = pairs_f_writer +else: + pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format) + pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format) +singles_writer = fastqWriter(open(singles_fastq, "w"), format) +last_template, buffered_reads = None, [] + +for record in fastqReader(in_handle, format): + count += 1 + name = record.identifier.split(None,1)[0] + assert name[0]=="@", record.identifier #Quirk of the Galaxy parser + suffix = re_f.search(name) + if suffix: + #============ + #Forward read + #============ + template = name[:suffix.start()] + #print name, "forward", template + forward += 1 + if last_template == template: + buffered_reads.append(record) + else: + #Any old buffered reads are orphans + for old in buffered_reads: + singles_writer.write(old) + singles += 1 + #Save this read in buffer + buffered_reads = [record] + last_template = template + else: + suffix = re_r.search(name) + if suffix: + #============ + #Reverse read + #============ + template = name[:suffix.start()] + #print name, "reverse", template + reverse += 1 + if last_template == template and buffered_reads: + #We have a pair! + #If there are multiple buffered forward reads, want to pick + #the first one (although we could try and do something more + #clever looking at the suffix to match them up...) + old = buffered_reads.pop(0) + pairs_f_writer.write(old) + pairs_r_writer.write(record) + pairs += 2 + else: + #As this is a reverse read, this and any buffered read(s) are + #all orphans + for old in buffered_reads: + singles_writer.write(old) + singles += 1 + buffered_reads = [] + singles_writer.write(record) + singles += 1 + last_template = None + else: + #=========================== + #Neither forward nor reverse + #=========================== + singles_writer.write(record) + singles += 1 + neither += 1 + for old in buffered_reads: + singles_writer.write(old) + singles += 1 + buffered_reads = [] + last_template = None +if last_template: + #Left over singles... + for old in buffered_reads: + singles_writer.write(old) + singles += 1 +in_handle.close +singles_writer.close() +if pairs_fastq: + pairs_f_writer.close() + assert pairs_r_writer.file.closed +else: + pairs_f_writer.close() + pairs_r_writer.close() + +if neither: + print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \ + % (count, forward, reverse, neither, pairs, singles) +else: + print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ + % (count, forward, reverse, pairs, singles) + +assert count == pairs + singles == forward + reverse + neither, \ + "%i vs %i+%i=%i vs %i+%i=%i" \ + % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_paired_unpaired.txt Tue Jun 07 17:21:17 2011 -0400 @@ -0,0 +1,80 @@ +Galaxy tool to divide FASTQ files into paired and unpaired reads +================================================================ + +This tool is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. +See the licence text below. + +This tool is a short Python script (using the Biopython library functions) which +divides a FASTQ file into paired reads, and single or orphan reads. You can have +separate files for the forward/reverse reads, or have them interleaved in a +single file. + +Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even +Color Space should all work equally well). + +There are just two files to install: + +* fastq_paired_unpaired.py (the Python script) +* fastq_paired_unpaired.xml (the Galaxy tool definition) + +The suggested location is in the Galaxy folder tools/fastq next to other FASTQ +tools provided with Galaxy. + +You will also need to modify the tools_conf.xml file to tell Galaxy to offer +the tool. One suggested location is next to the fastq_filter.xml entry. Simply +add the line: + +<tool file="fastq/fastq_paired_unpaired.xml" /> + +That's it. + + +History +======= + +v0.0.1 - Initial version, using Biopython +v0.0.2 - Help text; cope with multiple pairs per template +v0.0.3 - Galaxy XML wrappers added +v0.0.4 - Use Galaxy library to handle FASTQ files (avoid Biopython dependency) + + +Developers +========== + +This script and other tools for filtering FASTA, FASTQ and SFF files are +currently being developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/fasta_filter + +For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder: + +tar -czf fastq_paired_unpaired.tar.gz tools/fastq/fastq_paired_unpaired.* + +Check this worked: + +$ tar -tzf fastq_paired_unpaired.tar.gz +fastq/fastq_paired_unpaired.py +fastq/fastq_paired_unpaired.txt +fastq/fastq_paired_unpaired.xml + + +Licence (MIT/BSD style) +======================= + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_paired_unpaired.xml Tue Jun 07 17:21:17 2011 -0400 @@ -0,0 +1,80 @@ +<tool id="fastq_paired_unpaired" name="Divide FASTQ file into paired and unpaired reads" version="0.0.4"> + <description>using the read name suffices</description> + <command interpreter="python"> +fastq_paired_unpaired.py $input_fastq.extension $input_fastq +#if $output_choice_cond.output_choice=="separate" + $output_forward $output_reverse +#elif $output_choice_cond.output_choice=="interleaved" + $output_paired +#end if +$output_singles + </command> + <inputs> + <param name="input_fastq" type="data" format="fastq" label="FASTQ file to divide into paired and unpaired reads"/> + <conditional name="output_choice_cond"> + <param name="output_choice" type="select" label="How to output paired reads?"> + <option value="separate">Separate (two FASTQ files, for the forward and reverse reads, in matching order).</option> + <option value="interleaved">Interleaved (one FASTQ file, alternating forward read then partner reverse read).</option> + </param> + <!-- Seems need these dummy entries here, compare this to indels/indel_sam2interval.xml --> + <when value="separate" /> + <when value="interleaved" /> + </conditional> + </inputs> + <outputs> + <data name="output_singles" format="input" label="Orphan or single reads"/> + <data name="output_forward" format="input" label="Forward paired reads"> + <filter>output_choice_cond["output_choice"] == "separate"</filter> + </data> + <data name="output_reverse" format="input" label="Reverse paired reads"> + <filter>output_choice_cond["output_choice"] == "separate"</filter> + </data> + <data name="output_paired" format="input" label="Interleaved paired reads"> + <filter>output_choice_cond["output_choice"] == "interleaved"</filter> + </data> + </outputs> + <tests> + </tests> + <requirements> + <requirement type="python-module">Bio</requirement> + </requirements> + <help> + +**What it does** + +Using the common read name suffix conventions, it divides a FASTQ file into +paired reads, and orphan or single reads. + +The input file should be a valid FASTQ file which has been sorted so that +any partner forward+reverse reads are consecutive. The output files all +preserve this sort order. Pairing are recognised based on standard name +suffices. See below or run the tool with no arguments for more details. + +Any reads where the forward/reverse naming suffix used is not recognised +are treated as orphan reads. The tool supports the /1 and /2 convention +used by Illumina, the .f and .r convention, and the Sanger convention +(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). + +Note that this does support multiple forward and reverse reads per template +(which is quite common with Sanger sequencing), e.g. this which is sorted +alphabetically: + +WTSI_1055_4p17.p1kapIBF +WTSI_1055_4p17.p1kpIBF +WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.q1kpIBR + +or this where the reads already come in pairs: + +WTSI_1055_4p17.p1kapIBF +WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.p1kpIBF +WTSI_1055_4p17.q1kpIBR + +both become: + +WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR +WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR + + </help> +</tool>