annotate tools/fastq/fastq_trimmer_by_quality.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #Dan Blankenberg
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 from optparse import OptionParser
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 def mean( score_list ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 return float( sum( score_list ) ) / float( len( score_list ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 ACTION_METHODS = { 'min':min, 'max':max, 'sum':sum, 'mean':mean }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 def compare( aggregated_value, operator, threshold_value ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 if operator == '>':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 return aggregated_value > threshold_value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 elif operator == '>=':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 return aggregated_value >= threshold_value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 elif operator == '==':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 return aggregated_value == threshold_value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 elif operator == '<':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 return aggregated_value < threshold_value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 elif operator == '<=':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 return aggregated_value <= threshold_value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 elif operator == '!=':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 return aggregated_value != threshold_value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 def exclude( value_list, exclude_indexes ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 rval = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 for i, val in enumerate( value_list ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 if i not in exclude_indexes:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 rval.append( val )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 return rval
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 def exclude_and_compare( aggregate_action, aggregate_list, operator, threshold_value, exclude_indexes = None ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 if not aggregate_list or compare( aggregate_action( aggregate_list ), operator, threshold_value ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 if exclude_indexes:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 for exclude_index in exclude_indexes:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 excluded_list = exclude( aggregate_list, exclude_index )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 if not excluded_list or compare( aggregate_action( excluded_list ), operator, threshold_value ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 return True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 return False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 def main():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 usage = "usage: %prog [options] input_file output_file"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 parser = OptionParser( usage=usage )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 parser.add_option( '-s', '--window_size', type="int", dest='window_size', default='1', help='Window size' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 parser.add_option( '-t', '--window_step', type="int", dest='window_step', default='1', help='Window step' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 parser.add_option( '-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5','3','53','35' ), help='Ends to Trim' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 parser.add_option( '-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min','max','sum','mean' ), help='Aggregate action for window' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 parser.add_option( '-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>','>=','==','<', '<=', '!=' ), help='Keep read when aggregate score is' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 parser.add_option( '-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 parser.add_option( "-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 ( options, args ) = parser.parse_args()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 if len ( args ) != 2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 parser.error( "Need to specify an input file and an output file" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 if options.window_size < 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 parser.error( 'You must specify a strictly positive window size' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 if options.window_step < 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 parser.error( 'You must specify a strictly positive step size' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 #determine an exhaustive list of window indexes that can be excluded from aggregation
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 exclude_window_indexes = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 last_exclude_indexes = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 for exclude_count in range( min( options.exclude_count, options.window_size ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 if last_exclude_indexes:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 new_exclude_indexes = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 for exclude_list in last_exclude_indexes:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 for window_index in range( options.window_size ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 if window_index not in exclude_list:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 new_exclude = sorted( exclude_list + [ window_index ] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 if new_exclude not in exclude_window_indexes + new_exclude_indexes:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 new_exclude_indexes.append( new_exclude )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 exclude_window_indexes += new_exclude_indexes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 last_exclude_indexes = new_exclude_indexes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 for window_index in range( options.window_size ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 last_exclude_indexes.append( [ window_index ] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 exclude_window_indexes = list( last_exclude_indexes )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 out = fastqWriter( open( args[1], 'wb' ), format = options.format )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 action = ACTION_METHODS[ options.aggregation_action ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 num_reads = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 num_reads_excluded = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 for trim_end in options.trim_ends:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 quality_list = fastq_read.get_decimal_quality_scores()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 if trim_end == '5':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 lwindow_position = 0 #left position of window
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 if lwindow_position >= len( quality_list ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 fastq_read.sequence = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 fastq_read.quality = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 if exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + options.window_size ], options.score_comparison, options.quality_score, exclude_window_indexes ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 fastq_read = fastq_read.slice( lwindow_position, None )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 lwindow_position += options.window_step
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 rwindow_position = len( quality_list ) #right position of window
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 lwindow_position = rwindow_position - options.window_size #left position of window
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 if rwindow_position <= 0 or lwindow_position < 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 fastq_read.sequence = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 fastq_read.quality = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 if exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], options.score_comparison, options.quality_score, exclude_window_indexes ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 fastq_read = fastq_read.slice( None, rwindow_position )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 rwindow_position -= options.window_step
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 if options.keep_zero_length or len( fastq_read ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 out.write( fastq_read )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 num_reads_excluded += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 out.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 if num_reads is None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 print "No valid FASTQ reads could be processed."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 print "%i FASTQ reads were processed." % ( num_reads + 1 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 if num_reads_excluded:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 print "%i reads of zero length were excluded from the output." % num_reads_excluded
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 if __name__ == "__main__": main()