Mercurial > repos > peterjc > fastq_paired_unpaired
view tools/fastq/fastq_paired_unpaired.py @ 2:95a632a71951 draft
Uploaded v0.0.6, adds unit test
author | peterjc |
---|---|
date | Tue, 30 Apr 2013 14:08:37 -0400 |
parents | 7ed81e36fc1c |
children | 528ba9c896e0 |
line wrap: on
line source
#!/usr/bin/env python """Divides a FASTQ into paired and single (orphan reads) as separate files. The input file should be a valid FASTQ file which has been sorted so that any partner forward+reverse reads are consecutive. The output files all preserve this sort order. Pairing are recognised based on standard name suffices. See below or run the tool with no arguments for more details. Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even Color Space should all work equally well). This script is copyright 2010-2011 by Peter Cock, The James Hutton Institute (formerly SCRI), Scotland, UK. All rights reserved. See accompanying text file for licence details (MIT/BSD style). """ import os import sys import re from galaxy_utils.sequence.fastq import fastqReader, fastqWriter if "-v" in sys.argv or "--version" in sys.argv: print "Version 0.0.6" sys.exit(0) def stop_err(msg, err=1): sys.stderr.write(msg.rstrip() + "\n") sys.exit(err) msg = """Expect either 3 or 4 arguments, all FASTQ filenames. If you want two output files, use four arguments: - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) - Sorted input FASTQ filename, - Output paired FASTQ filename (forward then reverse interleaved), - Output singles FASTQ filename (orphan reads) If you want three output files, use five arguments: - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) - Sorted input FASTQ filename, - Output forward paired FASTQ filename, - Output reverse paired FASTQ filename, - Output singles FASTQ filename (orphan reads) The input file should be a valid FASTQ file which has been sorted so that any partner forward+reverse reads are consecutive. The output files all preserve this sort order. Any reads where the forward/reverse naming suffix used is not recognised are treated as orphan reads. The tool supports the /1 and /2 convention originally used by Illumina, the .f and .r convention, and the Sanger convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details), and the new Illumina convention where the reads have the same identifier with the fragment at the start of the description, e.g. @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA @HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA Note that this does support multiple forward and reverse reads per template (which is quite common with Sanger sequencing), e.g. this which is sorted alphabetically: WTSI_1055_4p17.p1kapIBF WTSI_1055_4p17.p1kpIBF WTSI_1055_4p17.q1kapIBR WTSI_1055_4p17.q1kpIBR or this where the reads already come in pairs: WTSI_1055_4p17.p1kapIBF WTSI_1055_4p17.q1kapIBR WTSI_1055_4p17.p1kpIBF WTSI_1055_4p17.q1kpIBR both become: WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR """ if len(sys.argv) == 5: format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] elif len(sys.argv) == 6: pairs_fastq = None format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] else: stop_err(msg) format = format.replace("fastq", "").lower() if not format: format="sanger" #safe default elif format not in ["sanger","solexa","illumina","cssanger"]: stop_err("Unrecognised format %s" % format) def f_match(name): if name.endswith("/1") or name.endswith(".f"): return True #Cope with three widely used suffix naming convensions, #Illumina: /1 or /2 #Forward/revered: .f or .r #Sanger, e.g. .p1k and .q1k #See http://staden.sourceforge.net/manual/pregap4_unix_50.html re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") #assert re_f.match("demo/1") assert re_f.search("demo.f") assert re_f.search("demo.s1") assert re_f.search("demo.f1k") assert re_f.search("demo.p1") assert re_f.search("demo.p1k") assert re_f.search("demo.p1lk") assert re_r.search("demo/2") assert re_r.search("demo.r") assert re_r.search("demo.q1") assert re_r.search("demo.q1lk") assert not re_r.search("demo/1") assert not re_r.search("demo.f") assert not re_r.search("demo.p") assert not re_f.search("demo/2") assert not re_f.search("demo.r") assert not re_f.search("demo.q") re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$") re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$") assert re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") assert re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") assert not re_illumina_f.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") assert not re_illumina_r.match("@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 in_handle = open(input_fastq) if pairs_fastq: pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) pairs_r_writer = pairs_f_writer else: pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format) pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format) singles_writer = fastqWriter(open(singles_fastq, "w"), format) last_template, buffered_reads = None, [] for record in fastqReader(in_handle, format): count += 1 name = record.identifier.split(None,1)[0] assert name[0]=="@", record.identifier #Quirk of the Galaxy parser is_forward = False suffix = re_f.search(name) if suffix: #============ #Forward read #============ template = name[:suffix.start()] is_forward = True elif re_illumina_f.match(record.identifier): template = name #No suffix is_forward = True if is_forward: #print name, "forward", template forward += 1 if last_template == template: buffered_reads.append(record) else: #Any old buffered reads are orphans for old in buffered_reads: singles_writer.write(old) singles += 1 #Save this read in buffer buffered_reads = [record] last_template = template else: is_reverse = False suffix = re_r.search(name) if suffix: #============ #Reverse read #============ template = name[:suffix.start()] is_reverse = True elif re_illumina_r.match(record.identifier): template = name #No suffix is_reverse = True if is_reverse: #print name, "reverse", template reverse += 1 if last_template == template and buffered_reads: #We have a pair! #If there are multiple buffered forward reads, want to pick #the first one (although we could try and do something more #clever looking at the suffix to match them up...) old = buffered_reads.pop(0) pairs_f_writer.write(old) pairs_r_writer.write(record) pairs += 2 else: #As this is a reverse read, this and any buffered read(s) are #all orphans for old in buffered_reads: singles_writer.write(old) singles += 1 buffered_reads = [] singles_writer.write(record) singles += 1 last_template = None else: #=========================== #Neither forward nor reverse #=========================== singles_writer.write(record) singles += 1 neither += 1 for old in buffered_reads: singles_writer.write(old) singles += 1 buffered_reads = [] last_template = None if last_template: #Left over singles... for old in buffered_reads: singles_writer.write(old) singles += 1 in_handle.close singles_writer.close() if pairs_fastq: pairs_f_writer.close() assert pairs_r_writer.file.closed else: pairs_f_writer.close() pairs_r_writer.close() if neither: print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \ % (count, forward, reverse, neither, pairs, singles) else: print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ % (count, forward, reverse, pairs, singles) assert count == pairs + singles == forward + reverse + neither, \ "%i vs %i+%i=%i vs %i+%i+%i=%i" \ % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)