diff FilterUncorrectabledPEfastq.py @ 0:9a0b65ad3c84 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/rcorrector commit eba2cb86379f6ee6899a1141226e191c63fadfa6
author iuc
date Thu, 13 Sep 2018 07:00:00 -0400
parents
children 6703b98884a2
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FilterUncorrectabledPEfastq.py	Thu Sep 13 07:00:00 2018 -0400
@@ -0,0 +1,85 @@
+"""
+author: adam h freedman
+afreedman405 at gmail.com
+date: Fri Aug 26 10:55:18 EDT 2016
+This script takes as an input Rcorrector error corrected Illumina paired-reads
+in fastq format and:
+1. Removes any reads that Rcorrector identifies as containing an error,
+but can't be corrected, typically low complexity sequences. For these,
+the header contains 'unfixable'.
+2. Strips the ' cor' from headers of reads that Rcorrector fixed, to avoid
+issues created by certain header formats for downstream tools.
+3. Write a log with counts of (a) read pairs that were removed because one end
+was unfixable, (b) corrected left and right reads, (c) total number of
+read pairs containing at least one corrected read.
+Currently, this script only handles paired-end data, and handles either unzipped
+or gzipped files on the fly, so long as the gzipped files end with 'gz'.
+"""
+
+# import sys
+import argparse
+import gzip
+from itertools import izip_longest
+# izip
+from os.path import basename
+
+
def get_input_streams(r1file, r2file):
    """Open the paired R1/R2 fastq files, transparently handling gzip input.

    The suffix of *r1file* alone decides how BOTH files are opened (the two
    mates are expected to be compressed the same way): names ending in 'gz'
    are opened with gzip in binary mode, anything else as plain text.

    Returns a (r1_handle, r2_handle) tuple of open file objects.
    """
    gzipped = r1file[-2:] == 'gz'
    opener = gzip.open if gzipped else open
    mode = 'rb' if gzipped else 'r'
    return opener(r1file, mode), opener(r2file, mode)
+
+
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx

    Returns an iterator of n-tuples; the last tuple is padded with
    *fillvalue* when the iterable's length is not a multiple of n.

    Fix: the module-level `from itertools import izip_longest` is
    Python-2-only and makes this script crash outright under Python 3.
    Import the zip-longest implementation locally with a fallback so the
    function works on both interpreters.
    """
    try:
        from itertools import izip_longest as zip_impl  # Python 2
    except ImportError:
        from itertools import zip_longest as zip_impl   # Python 3
    # The same iterator repeated n times: each tuple pulls n consecutive items.
    args = [iter(iterable)] * n
    return zip_impl(fillvalue=fillvalue, *args)
+
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="options for filtering and logging rCorrector fastq outputs")
    parser.add_argument('-1', '--left_reads', dest='leftreads', type=str, help='R1 fastq file')
    parser.add_argument('-2', '--right_reads', dest='rightreads', type=str, help='R2 fastq file')
    parser.add_argument('-o', '--out_prefix', dest='outprefix', type=str, help="prefix for filtered fastq output")
    opts = parser.parse_args()

    # Output names: prefix + input basename with any '.gz' stripped, because
    # the filtered output is always written uncompressed.
    r1_out_path = opts.outprefix + '_' + basename(opts.leftreads).replace('.gz', '')
    r2_out_path = opts.outprefix + '_' + basename(opts.rightreads).replace('.gz', '')

    r1_cor_count = 0    # R1 reads Rcorrector marked as corrected
    r2_cor_count = 0    # R2 reads Rcorrector marked as corrected
    pair_cor_count = 0  # pairs with at least one corrected mate
    unfix_count = 0     # pairs dropped because either mate is 'unfixable'
    counter = 0         # total read pairs seen

    r1_stream, r2_stream = get_input_streams(opts.leftreads, opts.rightreads)
    # Fix: use 'with' for the output handles too — the original opened
    # r1out/r2out bare (leaking them on any exception), never closed the
    # log file, and had a stray dedented r2out.close() outside this guard.
    with r1_stream as f1, r2_stream as f2, \
            open(r1_out_path, 'w') as r1out, open(r2_out_path, 'w') as r2out:
        R1 = grouper(f1, 4)  # fastq records are 4-line groups
        R2 = grouper(f2, 4)
        for entry in R1:
            counter += 1
            if counter % 100000 == 0:
                print("%s reads processed" % counter)
            head1, seq1, placeholder1, qual1 = [i.strip() for i in entry]
            # Fix: built-in next() instead of Python-2-only R2.next();
            # keeps R2 advancing in lockstep with R1 on Python 2 and 3.
            head2, seq2, placeholder2, qual2 = [j.strip() for j in next(R2)]
            if 'unfixable' in head1 or 'unfixable' in head2:
                # Drop the whole pair if either mate is uncorrectable.
                unfix_count += 1
            else:
                # NOTE(review): a bare 'cor' substring also matches read names
                # that happen to contain 'cor'; Rcorrector's tag is ' cor' —
                # confirm before trusting these counts.
                if 'cor' in head1:
                    r1_cor_count += 1
                if 'cor' in head2:
                    r2_cor_count += 1
                if 'cor' in head1 or 'cor' in head2:
                    pair_cor_count += 1
                # Keep everything before the 'l:' low-kmer-count statistic and
                # remove the trailing whitespace character.
                head1 = head1.split('l:')[0][:-1]
                head2 = head2.split('l:')[0][:-1]
                r1out.write('%s\n' % '\n'.join([head1, seq1, placeholder1, qual1]))
                r2out.write('%s\n' % '\n'.join([head2, seq2, placeholder2, qual2]))

    # Fix: the log file was never closed in the original; 'with' guarantees it.
    with open('rmunfixable.log', 'w') as unfix_log:
        unfix_log.write('total PE reads:%s\nremoved PE reads:%s\nretained PE reads:%s\nR1 corrected:%s\nR2 corrected:%s\npairs corrected:%s\n' % (counter, unfix_count, counter - unfix_count, r1_cor_count, r2_cor_count, pair_cor_count))