# HG changeset patch
# User peterjc
# Date 1307481677 14400
# Node ID 72e9fcaec61f3e90e964acb5a4438a195d8249f4
Migrated tool version 0.0.4 from old tool shed archive to new tool shed repository
diff -r 000000000000 -r 72e9fcaec61f tools/fastq/fastq_paired_unpaired.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_unpaired.py Tue Jun 07 17:21:17 2011 -0400
@@ -0,0 +1,212 @@
+#!/usr/bin/env python
+"""Divides a FASTQ into paired and single (orphan reads) as separate files.
+
+The input file should be a valid FASTQ file which has been sorted so that
+any partner forward+reverse reads are consecutive. The output files all
+preserve this sort order. Pairing are recognised based on standard name
+suffices. See below or run the tool with no arguments for more details.
+
+Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even
+Color Space should all work equally well).
+
+This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved.
+See accompanying text file for licence details (MIT/BSD style).
+
+This is version 0.0.4 of the script.
+"""
+import os
+import sys
+import re
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+def stop_err(msg, err=1):
+ sys.stderr.write(msg.rstrip() + "\n")
+ sys.exit(err)
+
+msg = """Expect either 3 or 4 arguments, all FASTQ filenames.
+
+If you want two output files, use four arguments:
+ - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
+ - Sorted input FASTQ filename,
+ - Output paired FASTQ filename (forward then reverse interleaved),
+ - Output singles FASTQ filename (orphan reads)
+
+If you want three output files, use five arguments:
+ - FASTQ variant (e.g. sanger, solexa, illumina or cssanger)
+ - Sorted input FASTQ filename,
+ - Output forward paired FASTQ filename,
+ - Output reverse paired FASTQ filename,
+ - Output singles FASTQ filename (orphan reads)
+
+The input file should be a valid FASTQ file which has been sorted so that
+any partner forward+reverse reads are consecutive. The output files all
+preserve this sort order.
+
+Any reads where the forward/reverse naming suffix used is not recognised
+are treated as orphan reads. The tool supports the /1 and /2 convention
+used by Illumina, the .f and .r convention, and the Sanger convention
+(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details).
+
+Note that this does support multiple forward and reverse reads per template
+(which is quite common with Sanger sequencing), e.g. this which is sorted
+alphabetically:
+
+WTSI_1055_4p17.p1kapIBF
+WTSI_1055_4p17.p1kpIBF
+WTSI_1055_4p17.q1kapIBR
+WTSI_1055_4p17.q1kpIBR
+
+or this where the reads already come in pairs:
+
+WTSI_1055_4p17.p1kapIBF
+WTSI_1055_4p17.q1kapIBR
+WTSI_1055_4p17.p1kpIBF
+WTSI_1055_4p17.q1kpIBR
+
+both become:
+
+WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR
+WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR
+"""
+
+if len(sys.argv) == 5:
+ format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:]
+elif len(sys.argv) == 6:
+ pairs_fastq = None
+ format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:]
+else:
+ stop_err(msg)
+
+format = format.replace("fastq", "").lower()
+if not format:
+ format="sanger" #safe default
+elif format not in ["sanger","solexa","illumina","cssanger"]:
+ stop_err("Unrecognised format %s" % format)
+
+def f_match(name):
+ if name.endswith("/1") or name.endswith(".f"):
+ return True
+
+#Cope with three widely used suffix naming convensions,
+#Illumina: /1 or /2
+#Forward/revered: .f or .r
+#Sanger, e.g. .p1k and .q1k
+#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
+re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
+re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
+
+#assert re_f.match("demo/1")
+assert re_f.search("demo.f")
+assert re_f.search("demo.s1")
+assert re_f.search("demo.f1k")
+assert re_f.search("demo.p1")
+assert re_f.search("demo.p1k")
+assert re_f.search("demo.p1lk")
+assert re_r.search("demo/2")
+assert re_r.search("demo.r")
+assert re_r.search("demo.q1")
+assert re_r.search("demo.q1lk")
+assert not re_r.search("demo/1")
+assert not re_r.search("demo.f")
+assert not re_r.search("demo.p")
+assert not re_f.search("demo/2")
+assert not re_f.search("demo.r")
+assert not re_f.search("demo.q")
+
+count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0
+in_handle = open(input_fastq)
+if pairs_fastq:
+ pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format)
+ pairs_r_writer = pairs_f_writer
+else:
+ pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format)
+ pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format)
+singles_writer = fastqWriter(open(singles_fastq, "w"), format)
+last_template, buffered_reads = None, []
+
+for record in fastqReader(in_handle, format):
+ count += 1
+ name = record.identifier.split(None,1)[0]
+ assert name[0]=="@", record.identifier #Quirk of the Galaxy parser
+ suffix = re_f.search(name)
+ if suffix:
+ #============
+ #Forward read
+ #============
+ template = name[:suffix.start()]
+ #print name, "forward", template
+ forward += 1
+ if last_template == template:
+ buffered_reads.append(record)
+ else:
+ #Any old buffered reads are orphans
+ for old in buffered_reads:
+ singles_writer.write(old)
+ singles += 1
+ #Save this read in buffer
+ buffered_reads = [record]
+ last_template = template
+ else:
+ suffix = re_r.search(name)
+ if suffix:
+ #============
+ #Reverse read
+ #============
+ template = name[:suffix.start()]
+ #print name, "reverse", template
+ reverse += 1
+ if last_template == template and buffered_reads:
+ #We have a pair!
+ #If there are multiple buffered forward reads, want to pick
+ #the first one (although we could try and do something more
+ #clever looking at the suffix to match them up...)
+ old = buffered_reads.pop(0)
+ pairs_f_writer.write(old)
+ pairs_r_writer.write(record)
+ pairs += 2
+ else:
+ #As this is a reverse read, this and any buffered read(s) are
+ #all orphans
+ for old in buffered_reads:
+ singles_writer.write(old)
+ singles += 1
+ buffered_reads = []
+ singles_writer.write(record)
+ singles += 1
+ last_template = None
+ else:
+ #===========================
+ #Neither forward nor reverse
+ #===========================
+ singles_writer.write(record)
+ singles += 1
+ neither += 1
+ for old in buffered_reads:
+ singles_writer.write(old)
+ singles += 1
+ buffered_reads = []
+ last_template = None
+if last_template:
+ #Left over singles...
+ for old in buffered_reads:
+ singles_writer.write(old)
+ singles += 1
+in_handle.close
+singles_writer.close()
+if pairs_fastq:
+ pairs_f_writer.close()
+ assert pairs_r_writer.file.closed
+else:
+ pairs_f_writer.close()
+ pairs_r_writer.close()
+
+if neither:
+ print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \
+ % (count, forward, reverse, neither, pairs, singles)
+else:
+ print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \
+ % (count, forward, reverse, pairs, singles)
+
+assert count == pairs + singles == forward + reverse + neither, \
+ "%i vs %i+%i=%i vs %i+%i=%i" \
+ % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)
diff -r 000000000000 -r 72e9fcaec61f tools/fastq/fastq_paired_unpaired.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_unpaired.txt Tue Jun 07 17:21:17 2011 -0400
@@ -0,0 +1,80 @@
+Galaxy tool to divide FASTQ files into paired and unpaired reads
+================================================================
+
+This tool is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved.
+See the licence text below.
+
+This tool is a short Python script (using the Biopython library functions) which
+divides a FASTQ file into paired reads, and single or orphan reads. You can have
+separate files for the forward/reverse reads, or have them interleaved in a
+single file.
+
+Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even
+Color Space should all work equally well).
+
+There are just two files to install:
+
+* fastq_paired_unpaired.py (the Python script)
+* fastq_paired_unpaired.xml (the Galaxy tool definition)
+
+The suggested location is in the Galaxy folder tools/fastq next to other FASTQ
+tools provided with Galaxy.
+
+You will also need to modify the tools_conf.xml file to tell Galaxy to offer
+the tool. One suggested location is next to the fastq_filter.xml entry. Simply
+add the line:
+
+
+
+That's it.
+
+
+History
+=======
+
+v0.0.1 - Initial version, using Biopython
+v0.0.2 - Help text; cope with multiple pairs per template
+v0.0.3 - Galaxy XML wrappers added
+v0.0.4 - Use Galaxy library to handle FASTQ files (avoid Biopython dependency)
+
+
+Developers
+==========
+
+This script and other tools for filtering FASTA, FASTQ and SFF files are
+currently being developed on the following hg branch:
+http://bitbucket.org/peterjc/galaxy-central/src/fasta_filter
+
+For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder:
+
+tar -czf fastq_paired_unpaired.tar.gz tools/fastq/fastq_paired_unpaired.*
+
+Check this worked:
+
+$ tar -tzf fastq_paired_unpaired.tar.gz
+fastq/fastq_paired_unpaired.py
+fastq/fastq_paired_unpaired.txt
+fastq/fastq_paired_unpaired.xml
+
+
+Licence (MIT/BSD style)
+=======================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
diff -r 000000000000 -r 72e9fcaec61f tools/fastq/fastq_paired_unpaired.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_paired_unpaired.xml Tue Jun 07 17:21:17 2011 -0400
@@ -0,0 +1,80 @@
+
+ using the read name suffices
+
+fastq_paired_unpaired.py $input_fastq.extension $input_fastq
+#if $output_choice_cond.output_choice=="separate"
+ $output_forward $output_reverse
+#elif $output_choice_cond.output_choice=="interleaved"
+ $output_paired
+#end if
+$output_singles
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ output_choice_cond["output_choice"] == "separate"
+
+
+ output_choice_cond["output_choice"] == "separate"
+
+
+ output_choice_cond["output_choice"] == "interleaved"
+
+
+
+
+
+ Bio
+
+
+
+**What it does**
+
+Using the common read name suffix conventions, it divides a FASTQ file into
+paired reads, and orphan or single reads.
+
+The input file should be a valid FASTQ file which has been sorted so that
+any partner forward+reverse reads are consecutive. The output files all
+preserve this sort order. Pairing are recognised based on standard name
+suffices. See below or run the tool with no arguments for more details.
+
+Any reads where the forward/reverse naming suffix used is not recognised
+are treated as orphan reads. The tool supports the /1 and /2 convention
+used by Illumina, the .f and .r convention, and the Sanger convention
+(see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details).
+
+Note that this does support multiple forward and reverse reads per template
+(which is quite common with Sanger sequencing), e.g. this which is sorted
+alphabetically:
+
+WTSI_1055_4p17.p1kapIBF
+WTSI_1055_4p17.p1kpIBF
+WTSI_1055_4p17.q1kapIBR
+WTSI_1055_4p17.q1kpIBR
+
+or this where the reads already come in pairs:
+
+WTSI_1055_4p17.p1kapIBF
+WTSI_1055_4p17.q1kapIBR
+WTSI_1055_4p17.p1kpIBF
+WTSI_1055_4p17.q1kpIBR
+
+both become:
+
+WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR
+WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR
+
+
+