comparison FilterUncorrectabledPEfastq.py @ 0:9a0b65ad3c84 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/rcorrector commit eba2cb86379f6ee6899a1141226e191c63fadfa6
author iuc
date Thu, 13 Sep 2018 07:00:00 -0400
parents
children 6703b98884a2
comparison
equal deleted inserted replaced
-1:000000000000 0:9a0b65ad3c84
1 """
2 author: adam h freedman
3 afreedman405 at gmail.com
4 date: Fri Aug 26 10:55:18 EDT 2016
5 This script takes as an input Rcorrector error corrected Illumina paired-reads
6 in fastq format and:
7 1. Removes any reads that Rcorrector identifies as containing an error,
8 but can't be corrected, typically low complexity sequences. For these,
9 the header contains 'unfixable'.
10 2. Strips the ' cor' from headers of reads that Rcorrector fixed, to avoid
11 issues created by certain header formats for downstream tools.
12 3. Write a log with counts of (a) read pairs that were removed because one end
13 was unfixable, (b) corrected left and right reads, (c) total number of
14 read pairs containing at least one corrected read.
15 Currently, this script only handles paired-end data, and handles either unzipped
16 or gzipped files on the fly, so long as the gzipped files end with 'gz'.
17 """
18
19 # import sys
20 import argparse
21 import gzip
22 from itertools import izip_longest
23 # izip
24 from os.path import basename
25
26
def get_input_streams(r1file, r2file):
    """Open the R1/R2 read files, transparently handling gzip input.

    Filenames ending in 'gz' are opened through the gzip module (binary
    mode); anything else is opened as plain text.

    Returns a (r1_handle, r2_handle) tuple of open file objects.
    """
    if r1file.endswith('gz'):
        # gzipped input: only R1's name is checked; R2 is assumed to match
        return gzip.open(r1file, 'rb'), gzip.open(r2file, 'rb')
    return open(r1file, 'r'), open(r2file, 'r')
35
36
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx

    The short final chunk, if any, is padded with *fillvalue*.  The
    zip-longest implementation is resolved locally so the helper works
    under both Python 2 (izip_longest) and Python 3 (zip_longest)
    regardless of the module-level import.
    """
    try:
        from itertools import izip_longest as _zip_longest  # Python 2
    except ImportError:
        from itertools import zip_longest as _zip_longest  # Python 3
    # n references to the same iterator: each tuple drains n items
    args = [iter(iterable)] * n
    return _zip_longest(*args, fillvalue=fillvalue)
42
43
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="options for filtering and logging rCorrector fastq outputs")
    parser.add_argument('-1', '--left_reads', dest='leftreads', type=str, help='R1 fastq file')
    parser.add_argument('-2', '--right_reads', dest='rightreads', type=str, help='R2 fastq file')
    parser.add_argument('-o', '--out_prefix', dest='outprefix', type=str, help="prefix for filtered fastq output")
    opts = parser.parse_args()
    # Output keeps the input basename (minus any '.gz') under the requested
    # prefix; filtered output is always written uncompressed.
    r1out = open(opts.outprefix + '_' + basename(opts.leftreads).replace('.gz', ''), 'w')
    r2out = open(opts.outprefix + '_' + basename(opts.rightreads).replace('.gz', ''), 'w')
    r1_cor_count = 0    # R1 reads marked corrected (within retained pairs)
    r2_cor_count = 0    # R2 reads marked corrected (within retained pairs)
    pair_cor_count = 0  # retained pairs with at least one corrected mate
    unfix_count = 0     # pairs dropped because either mate is 'unfixable'
    r1_stream, r2_stream = get_input_streams(opts.leftreads, opts.rightreads)
    counter = 0
    with r1_stream as f1, r2_stream as f2:
        # iterate both files in lockstep, 4 lines (one fastq record) at a time
        R1 = grouper(f1, 4)
        R2 = grouper(f2, 4)
        for entry in R1:
            counter += 1
            if counter % 100000 == 0:
                # print(...) with a single argument is valid Python 2 and 3
                print("%s reads processed" % counter)
            head1, seq1, placeholder1, qual1 = [i.strip() for i in entry]
            # next() builtin rather than .next(), which is Python-2-only
            head2, seq2, placeholder2, qual2 = [j.strip() for j in next(R2)]
            if 'unfixable' in head1 or 'unfixable' in head2:
                # drop the whole pair if either mate could not be corrected
                unfix_count += 1
            else:
                if 'cor' in head1:
                    r1_cor_count += 1
                if 'cor' in head2:
                    r2_cor_count += 1
                if 'cor' in head1 or 'cor' in head2:
                    pair_cor_count += 1
                # keep everything before the low-kmer-count statistic and
                # remove the trailing whitespace character
                head1 = head1.split('l:')[0][:-1]
                head2 = head2.split('l:')[0][:-1]
                r1out.write('%s\n' % '\n'.join([head1, seq1, placeholder1, qual1]))
                r2out.write('%s\n' % '\n'.join([head2, seq2, placeholder2, qual2]))
    # 'with' guarantees the log is flushed and closed (the original never
    # closed this handle)
    with open('rmunfixable.log', 'w') as unfix_log:
        unfix_log.write('total PE reads:%s\nremoved PE reads:%s\nretained PE reads:%s\nR1 corrected:%s\nR2 corrected:%s\npairs corrected:%s\n' % (counter, unfix_count, counter - unfix_count, r1_cor_count, r2_cor_count, pair_cor_count))
    r1out.close()
    r2out.close()