annotate tools/fastq/fastq_paired_end_deinterlacer.py @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #Florent Angly
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def main():
    """De-interlace a paired-end FASTQ file into mate and singleton files.

    Command-line arguments (read from ``sys.argv``):
        1. input FASTQ filename (interlaced reads)
        2. FASTQ quality format; an empty string falls back to ``'sanger'``
        3. output filename for first mates of complete pairs
        4. output filename for second mates of complete pairs
        5. output filename for unpaired first mates
        6. output filename for unpaired second mates

    The input is scanned once sequentially; for every read the partner is
    looked up by identifier through a ``fastqNamedReader`` opened on the same
    file.  A short summary is printed to standard output when the scan ends.
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    # Everything registered here is closed in the ``finally`` clause, so no
    # handle is leaked when parsing or writing fails part-way through.
    opened = []

    def _track(resource):
        opened.append(resource)
        return resource

    try:
        # Random-access reader used to fetch the partner of the current read.
        named_reader = _track(fastqNamedReader(open(input_filename, 'rb'), format=input_type))
        mate1_out = _track(fastqWriter(open(mate1_filename, 'wb'), format=input_type))
        mate2_out = _track(fastqWriter(open(mate2_filename, 'wb'), format=input_type))
        single1_out = _track(fastqWriter(open(single1_filename, 'wb'), format=input_type))
        single2_out = _track(fastqWriter(open(single2_filename, 'wb'), format=input_type))
        # Second, independent handle on the input for the sequential scan.
        # It is tracked directly because the original never closed it.
        scan_handle = _track(open(input_filename, 'rb'))
        joiner = fastqJoiner(input_type)

        last_index = None  # stays None when the input yields no reads
        skip_count = 0     # number of reads without a partner
        already_written = set()  # identifiers emitted as the partner of an earlier read

        for last_index, read in enumerate(fastqReader(scan_handle, format=input_type)):
            if read.identifier in already_written:
                # This read was written together with its partner earlier.
                already_written.remove(read.identifier)
                continue

            # NOTE(review): get_paired_identifier presumably returns the
            # identifier of the other mate - confirm against galaxy_utils.
            partner = named_reader.get(joiner.get_paired_identifier(read))

            if partner:
                # This is a mate pair: route each read to its own output.
                already_written.add(partner.identifier)
                if joiner.is_first_mate(read):
                    mate1_out.write(read)
                    mate2_out.write(partner)
                else:
                    mate1_out.write(partner)
                    mate2_out.write(read)
            else:
                # This is a single.
                skip_count += 1
                if joiner.is_first_mate(read):
                    single1_out.write(read)
                else:
                    single2_out.write(read)

        # Single-argument print(...) behaves identically on Python 2 and 3.
        if last_index is None:
            print("Your input file contained no valid FASTQ sequences.")
        else:
            if skip_count:
                print('There were %i reads with no mate.' % skip_count)
            # last_index + 1 is the total read count; floor division keeps
            # the integer result the original Python 2 '/' produced.
            print('De-interlaced %s pairs of sequences.' % ((last_index - skip_count + 1) // 2))
    finally:
        for resource in opened:
            resource.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Script entry point: run the de-interlacer only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()