Mercurial > repos > peterjc > fastq_paired_unpaired
view tools/fastq/fastq_paired_unpaired.py @ 0:72e9fcaec61f
Migrated tool version 0.0.4 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 17:21:17 -0400 |
parents | |
children | 7ed81e36fc1c |
line wrap: on
line source
#!/usr/bin/env python """Divides a FASTQ into paired and single (orphan reads) as separate files. The input file should be a valid FASTQ file which has been sorted so that any partner forward+reverse reads are consecutive. The output files all preserve this sort order. Pairing are recognised based on standard name suffices. See below or run the tool with no arguments for more details. Note that the FASTQ variant is unimportant (Sanger, Solexa, Illumina, or even Color Space should all work equally well). This script is copyright 2010 by Peter Cock, SCRI, UK. All rights reserved. See accompanying text file for licence details (MIT/BSD style). This is version 0.0.4 of the script. """ import os import sys import re from galaxy_utils.sequence.fastq import fastqReader, fastqWriter def stop_err(msg, err=1): sys.stderr.write(msg.rstrip() + "\n") sys.exit(err) msg = """Expect either 3 or 4 arguments, all FASTQ filenames. If you want two output files, use four arguments: - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) - Sorted input FASTQ filename, - Output paired FASTQ filename (forward then reverse interleaved), - Output singles FASTQ filename (orphan reads) If you want three output files, use five arguments: - FASTQ variant (e.g. sanger, solexa, illumina or cssanger) - Sorted input FASTQ filename, - Output forward paired FASTQ filename, - Output reverse paired FASTQ filename, - Output singles FASTQ filename (orphan reads) The input file should be a valid FASTQ file which has been sorted so that any partner forward+reverse reads are consecutive. The output files all preserve this sort order. Any reads where the forward/reverse naming suffix used is not recognised are treated as orphan reads. The tool supports the /1 and /2 convention used by Illumina, the .f and .r convention, and the Sanger convention (see http://staden.sourceforge.net/manual/pregap4_unix_50.html for details). Note that this does support multiple forward and reverse reads per template (which is quite common with Sanger sequencing), e.g. this which is sorted alphabetically: WTSI_1055_4p17.p1kapIBF WTSI_1055_4p17.p1kpIBF WTSI_1055_4p17.q1kapIBR WTSI_1055_4p17.q1kpIBR or this where the reads already come in pairs: WTSI_1055_4p17.p1kapIBF WTSI_1055_4p17.q1kapIBR WTSI_1055_4p17.p1kpIBF WTSI_1055_4p17.q1kpIBR both become: WTSI_1055_4p17.p1kapIBF paired with WTSI_1055_4p17.q1kapIBR WTSI_1055_4p17.p1kpIBF paired with WTSI_1055_4p17.q1kpIBR """ if len(sys.argv) == 5: format, input_fastq, pairs_fastq, singles_fastq = sys.argv[1:] elif len(sys.argv) == 6: pairs_fastq = None format, input_fastq, pairs_f_fastq, pairs_r_fastq, singles_fastq = sys.argv[1:] else: stop_err(msg) format = format.replace("fastq", "").lower() if not format: format="sanger" #safe default elif format not in ["sanger","solexa","illumina","cssanger"]: stop_err("Unrecognised format %s" % format) def f_match(name): if name.endswith("/1") or name.endswith(".f"): return True #Cope with three widely used suffix naming convensions, #Illumina: /1 or /2 #Forward/revered: .f or .r #Sanger, e.g. .p1k and .q1k #See http://staden.sourceforge.net/manual/pregap4_unix_50.html re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") #assert re_f.match("demo/1") assert re_f.search("demo.f") assert re_f.search("demo.s1") assert re_f.search("demo.f1k") assert re_f.search("demo.p1") assert re_f.search("demo.p1k") assert re_f.search("demo.p1lk") assert re_r.search("demo/2") assert re_r.search("demo.r") assert re_r.search("demo.q1") assert re_r.search("demo.q1lk") assert not re_r.search("demo/1") assert not re_r.search("demo.f") assert not re_r.search("demo.p") assert not re_f.search("demo/2") assert not re_f.search("demo.r") assert not re_f.search("demo.q") count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 in_handle = open(input_fastq) if pairs_fastq: pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) pairs_r_writer = pairs_f_writer else: pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format) pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format) singles_writer = fastqWriter(open(singles_fastq, "w"), format) last_template, buffered_reads = None, [] for record in fastqReader(in_handle, format): count += 1 name = record.identifier.split(None,1)[0] assert name[0]=="@", record.identifier #Quirk of the Galaxy parser suffix = re_f.search(name) if suffix: #============ #Forward read #============ template = name[:suffix.start()] #print name, "forward", template forward += 1 if last_template == template: buffered_reads.append(record) else: #Any old buffered reads are orphans for old in buffered_reads: singles_writer.write(old) singles += 1 #Save this read in buffer buffered_reads = [record] last_template = template else: suffix = re_r.search(name) if suffix: #============ #Reverse read #============ template = name[:suffix.start()] #print name, "reverse", template reverse += 1 if last_template == template and buffered_reads: #We have a pair! #If there are multiple buffered forward reads, want to pick #the first one (although we could try and do something more #clever looking at the suffix to match them up...) old = buffered_reads.pop(0) pairs_f_writer.write(old) pairs_r_writer.write(record) pairs += 2 else: #As this is a reverse read, this and any buffered read(s) are #all orphans for old in buffered_reads: singles_writer.write(old) singles += 1 buffered_reads = [] singles_writer.write(record) singles += 1 last_template = None else: #=========================== #Neither forward nor reverse #=========================== singles_writer.write(record) singles += 1 neither += 1 for old in buffered_reads: singles_writer.write(old) singles += 1 buffered_reads = [] last_template = None if last_template: #Left over singles... for old in buffered_reads: singles_writer.write(old) singles += 1 in_handle.close singles_writer.close() if pairs_fastq: pairs_f_writer.close() assert pairs_r_writer.file.closed else: pairs_f_writer.close() pairs_r_writer.close() if neither: print "%i reads (%i forward, %i reverse, %i neither), %i in pairs, %i as singles" \ % (count, forward, reverse, neither, pairs, singles) else: print "%i reads (%i forward, %i reverse), %i in pairs, %i as singles" \ % (count, forward, reverse, pairs, singles) assert count == pairs + singles == forward + reverse + neither, \ "%i vs %i+%i=%i vs %i+%i=%i" \ % (count,pairs,singles,pairs+singles,forward,reverse,neither,forward+reverse+neither)