Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_seeker2-align.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 from optparse import OptionParser, OptionGroup | |
4 import re | |
5 import tempfile | |
6 from bs_align import output | |
7 from bs_align.bs_pair_end import * | |
8 from bs_align.bs_single_end import * | |
9 from bs_align.bs_rrbs import * | |
10 from bs_utils.utils import * | |
11 | |
12 | |
13 if __name__ == '__main__': | |
14 | |
15 parser = OptionParser() | |
16 # option group 1 | |
17 opt_group = OptionGroup(parser, "For single end reads") | |
18 opt_group.add_option("-i", "--input", type="string", dest="infilename",help="Input your read file name (FORMAT: sequences, fastq, qseq,fasta)", metavar="INFILE") | |
19 parser.add_option_group(opt_group) | |
20 | |
21 # option group 2 | |
22 opt_group = OptionGroup(parser, "For pair end reads") | |
23 opt_group.add_option("-1", "--input_1", type="string", dest="infilename_1",help="Input your read file end 1 (FORMAT: sequences, qseq, fasta, fastq)", metavar="FILE") | |
24 opt_group.add_option("-2", "--input_2", type="string", dest="infilename_2",help="Input your read file end 2 (FORMAT: sequences, qseq, fasta, fastq)", metavar="FILE") | |
25 opt_group.add_option("--minins",type = "int",dest = "min_insert_size", help="The minimum insert size for valid paired-end alignments [Default: %default]", default = -1) | |
26 opt_group.add_option("--maxins",type = "int",dest = "max_insert_size", help="The maximum insert size for valid paired-end alignments [Default: %default]", default = 400) | |
27 parser.add_option_group(opt_group) | |
28 | |
29 # option group 3 | |
30 opt_group = OptionGroup(parser, "Reduced Representation Bisulfite Sequencing Options") | |
31 opt_group.add_option("-r", "--rrbs", action="store_true", dest="rrbs", default = False, help = 'Process reads from Reduced Representation Bisulfite Sequencing experiments') | |
32 opt_group.add_option("-c", "--cut-site", type="string",dest="cut_format", help="Cutting sites of restriction enzyme. Ex: MspI(C-CGG), Mael:(C-TAG), double-enzyme MspI&Mael:(C-CGG,C-TAG). [Default: %default]", metavar="pattern", default = "C-CGG") | |
33 opt_group.add_option("-L", "--low", type = "int", dest="rrbs_low_bound", help="lower bound of fragment length (excluding C-CGG ends) [Default: %default]", default = 40) | |
34 opt_group.add_option("-U", "--up", type = "int", dest="rrbs_up_bound", help="upper bound of fragment length (excluding C-CGG ends) [Default: %default]", default = 500) | |
35 parser.add_option_group(opt_group) | |
36 | |
37 # option group 4 | |
38 opt_group = OptionGroup(parser, "General options") | |
39 opt_group.add_option("-t", "--tag", type="string", dest="taginfo",help="[Y]es for undirectional lib, [N]o for directional [Default: %default]", metavar="TAG", default = 'N') | |
40 opt_group.add_option("-s","--start_base",type = "int",dest = "cutnumber1", help="The first base of your read to be mapped [Default: %default]", default = 1) | |
41 opt_group.add_option("-e","--end_base",type = "int",dest = "cutnumber2", help="The last cycle number of your read to be mapped [Default: %default]", default = 200) | |
42 opt_group.add_option("-a", "--adapter", type="string", dest="adapter_file",help="Input text file of your adaptor sequences (to be trimed from the 3'end of the reads). Input 1 seq for dir. lib., 2 seqs for undir. lib. One line per sequence", metavar="FILE", default = '') | |
43 opt_group.add_option("--am",type = "int",dest = "adapter_mismatch", help="Number of mismatches allowed in adaptor [Default: %default]", default = 0) | |
44 opt_group.add_option("-g", "--genome", type="string", dest="genome",help="Name of the reference genome (the same as the reference genome file in the preprocessing step) [ex. chr21_hg18.fa]") | |
45 opt_group.add_option("-m", "--mismatches",type = "int", dest="int_no_mismatches",help="Number of mismatches in one read [Default: %default]", default = 4) | |
46 opt_group.add_option("--aligner", dest="aligner",help="Aligner program to perform the analisys: " + ', '.join(supported_aligners) + " [Default: %default]", metavar="ALIGNER", default = BOWTIE2) | |
47 opt_group.add_option("-p", "--path", dest="aligner_path", help="Path to the aligner program. Defaults: " +' '*70+ '\t'.join(('%s: %s '+' '*70) % (al, aligner_path[al]) for al in sorted(supported_aligners)), | |
48 metavar="PATH" | |
49 ) | |
50 opt_group.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path) | |
51 opt_group.add_option("-l", "--split_line",type = "int", dest="no_split",help="Number of lines per split (the read file will be split into small files for mapping. The result will be merged. [Default: %default]", default = 4000000) | |
52 opt_group.add_option("-o", "--output", type="string", dest="outfilename",help="The name of output file [INFILE.bs(se|pe|rrbs)]", metavar="OUTFILE") | |
53 opt_group.add_option("-f", "--output-format", type="string", dest="output_format",help="Output format: "+', '.join(output.formats)+" [Default: %default]", metavar="FORMAT", default = output.BAM) | |
54 opt_group.add_option("--no-header", action="store_true", dest="no_SAM_header",help="Suppress SAM header lines [Default: %default]", default = False) | |
55 opt_group.add_option("--temp_dir", type="string", dest="temp_dir",help="The path to your temporary directory [Default: %default]", metavar="PATH", default = tempfile.gettempdir()) | |
56 opt_group.add_option("--XS",type = "string", dest="XS_filter",help="Filter definition for tag XS, format X,Y. X=0.8 and y=5 indicate that for one read, if #(mCH sites)/#(all CH sites)>0.8 and #(mCH sites)>5, then tag XS=1; or else tag XS=0. [Default: %default]", default = "0.5,5") # added by weilong | |
57 opt_group.add_option("--multiple-hit", action="store_true", dest="Output_multiple_hit", default = False, help = 'Output reads with multiple hits to file\"Multiple_hit.fa\"') | |
58 | |
59 opt_group.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False) | |
60 | |
61 parser.add_option_group(opt_group) | |
62 | |
63 # option group 5 | |
64 opt_group = OptionGroup(parser, "Aligner Options", | |
65 "You may specify any additional options for the aligner. You just have to prefix them with " + | |
66 ', '.join('%s for %s' % (aligner_options_prefixes[aligner], aligner) for aligner in supported_aligners)+ | |
67 ', and BS Seeker will pass them on. For example: --bt-p 4 will increase the number of threads for bowtie to 4, ' | |
68 '--bt--tryhard will instruct bowtie to try as hard as possible to find valid alignments when they exist, and so on. ' | |
69 'Be sure that you know what you are doing when using these options! Also, we don\'t do any validation on the values.') | |
70 parser.add_option_group(opt_group) | |
71 | |
72 | |
73 #---------------------------------------------------------------- | |
74 # separate aligner options from BS Seeker options | |
75 aligner_options = {} | |
76 bs_seeker_options = [] | |
77 i = 1 | |
78 while i < len(sys.argv): | |
79 arg = sys.argv[i] | |
80 m = re.match(r'^%s' % '|'.join('(%s)'% aligner_options_prefixes[al] for al in supported_aligners), arg) | |
81 if m: | |
82 a_opt = arg.replace(m.group(0),'-',1) | |
83 aligner_options[a_opt] = [] | |
84 while i + 1 < len(sys.argv) and sys.argv[i+1][0] != '-': | |
85 aligner_options[a_opt].append(sys.argv[i+1]) | |
86 i += 1 | |
87 if len(aligner_options[a_opt]) == 0: # if it is a key-only option | |
88 aligner_options[a_opt] = True | |
89 else: | |
90 bs_seeker_options.append(arg) | |
91 i += 1 | |
92 | |
93 | |
94 (options, args) = parser.parse_args(args = bs_seeker_options) | |
95 | |
96 | |
97 # if no options were given by the user, print help and exit | |
98 if len(sys.argv) == 1: | |
99 print parser.print_help() | |
100 exit(0) | |
101 | |
102 if options.version : | |
103 show_version() | |
104 exit (-1) | |
105 else : | |
106 show_version() | |
107 | |
108 # check parameters | |
109 # input read files | |
110 if options.infilename and (options.infilename_1 or options.infilename_2): | |
111 error('-i and [-1|-2] options are exclusive. You should use only one of them.') | |
112 | |
113 if not (options.infilename or (options.infilename_1 and options.infilename_2)): | |
114 error('You should set either -i or -1 and -2 options.') | |
115 # -t, directional / un-directional library | |
116 asktag=str(options.taginfo).upper() | |
117 if asktag not in 'YN': | |
118 error('-t option should be either Y or N, not %s' % asktag) | |
119 # -a | |
120 if options.aligner not in supported_aligners: | |
121 error('-a option should be: %s' % ' ,'.join(supported_aligners)+'.') | |
122 # path for aligner | |
123 aligner_exec = os.path.expanduser( os.path.join(options.aligner_path or aligner_path[options.aligner], options.aligner) ) | |
124 # mismatch allowed: bowtie 1,build-in parameter '-m'; bowtie 2, post-filter paramter | |
125 # mismatch should no greater than the read length | |
126 int_no_mismatches=min(options.int_no_mismatches, options.cutnumber2-options.cutnumber1) | |
127 str_no_mismatches=str(int_no_mismatches) | |
128 # -g | |
129 if options.genome is None: | |
130 error('-g is a required option') | |
131 genome = os.path.split(options.genome)[1] | |
132 genome_subdir = genome | |
133 | |
134 # try to guess the location of the reference genome for RRBS | |
135 if options.rrbs: | |
136 if options.rrbs_low_bound and options.rrbs_up_bound: | |
137 if options.cut_format == "C-CGG" : | |
138 genome_subdir += '_rrbs_%d_%d' % (options.rrbs_low_bound, options.rrbs_up_bound) | |
139 else : | |
140 genome_subdir += '_rrbs_%s_%d_%d' % ( re.sub(",","-",re.sub("-", "", options.cut_format)), options.rrbs_low_bound, options.rrbs_up_bound) | |
141 else: | |
142 possible_refs = filter(lambda dir: dir.startswith(genome+'_rrbs_'), os.listdir(options.dbpath)) | |
143 if len(possible_refs) == 1: | |
144 genome_subdir = possible_refs[0] | |
145 else: | |
146 error('Cannot localize unambiguously the reference genome for RRBS. ' | |
147 'Please, specify the options \"--low\" and \"--up\" that you used at the index-building step.\n' | |
148 'Possible choices are:\n' + '\n'.join([pr.split('_rrbs_')[-1].replace('_',', ') for pr in possible_refs])) | |
149 | |
150 db_path = os.path.expanduser(os.path.join(options.dbpath, genome_subdir + '_' + options.aligner)) | |
151 | |
152 if not os.path.isdir(db_path): | |
153 error('Index DIR \"' + genome_subdir + '..\" cannot be found in ' + options.dbpath +'.\n\tPlease run the bs_seeker2-build.py ' | |
154 'to create it with the correct parameters for -g, -r, --low, --up and --aligner.') | |
155 | |
156 # handle aligner options | |
157 # | |
158 | |
159 # default aligner options | |
160 aligner_options_defaults = { | |
161 BOWTIE : { '-e' : 40*int_no_mismatches, | |
162 '--nomaqround' : True, | |
163 '--norc' : True, | |
164 '-k' : 2, | |
165 # -k=2; report two best hits, and filter by error rates | |
166 '--quiet' : True, | |
167 '--best' : True, | |
168 # '--suppress' : '2,5,6', | |
169 '--sam' : True, | |
170 '--sam-nohead' : True, | |
171 '-p' : 2 | |
172 }, | |
173 BOWTIE2 : { | |
174 #'-M' : 5, | |
175 '--norc' : True, | |
176 '--quiet' : True, | |
177 '-p' : 2, | |
178 '--sam-nohead' : True, | |
179 # run bowtie2 in local mode by default | |
180 '--local' : '--end-to-end' not in aligner_options, | |
181 #'--mm' : True, | |
182 '-k' : 2 | |
183 }, | |
184 SOAP : { '-v' : int_no_mismatches, | |
185 '-p' : 2, | |
186 '-r' : 2, | |
187 '-M' : 4 | |
188 }, | |
189 RMAP : { '-M' : 2 | |
190 # to do # control for only mapping on + strand | |
191 } | |
192 | |
193 } | |
194 | |
195 if '--end-to-end' not in aligner_options: | |
196 aligner_options_defaults[BOWTIE2].update({'-D' : 50}) | |
197 #aligner_options_defaults[BOWTIE2].update({'-D' : 50, '-R': 3, '-N': 0, '-L': 15, '-i' : 'S,1,0.50'}) | |
198 else: | |
199 aligner_options_defaults[BOWTIE2].update({'-D' : 50, '-L': 15, '--score-min': 'L,-0.6,-0.6' }) | |
200 | |
201 aligner_options = dict(aligner_options_defaults[options.aligner], **aligner_options) | |
202 | |
203 aligner_options_string = lambda : ' %s ' % (' '.join(opt_key + | |
204 (' ' + ' '.join(map(str,opt_val)) # join all values if the value is an array | |
205 if type(opt_val) is list else | |
206 ('' if type(opt_val) is bool and opt_val # output an empty string if it is a key-only option | |
207 else ' ' +str(opt_val)) # output the value if it is a single value | |
208 ) | |
209 for opt_key, opt_val in aligner_options.iteritems() if opt_val not in [None, False])) | |
210 | |
211 | |
212 # tmp_path = (options.outfilename or options.infilename or options.infilename_1) +'-'+ options.aligner+ '-TMP' | |
213 # clear_dir(tmp_path) | |
214 | |
215 if options.output_format not in output.formats: | |
216 error('Output format should be one of: ' + ', '.join(output.formats)) | |
217 | |
218 if options.outfilename: | |
219 outfilename = options.outfilename | |
220 logfilename = outfilename | |
221 elif options.infilename is not None: | |
222 logfilename = options.infilename+'_'+ ('rr' if options.rrbs else '') + 'bsse' | |
223 outfilename = logfilename + '.' + options.output_format | |
224 else: | |
225 logfilename = options.infilename_1+'_'+ ('rr' if options.rrbs else '') + 'bspe' | |
226 outfilename = logfilename + '.' + options.output_format | |
227 | |
228 outfilename = os.path.expanduser(outfilename) | |
229 logfilename = os.path.expanduser(logfilename) | |
230 outfile = output.outfile(outfilename, options.output_format, deserialize(os.path.join(db_path, 'refname')), ' '.join(sys.argv), options.no_SAM_header) | |
231 | |
232 open_log(logfilename+'.bs_seeker2_log') | |
233 | |
234 aligner_title = options.aligner | |
235 if options.aligner == BOWTIE2 : | |
236 if '--end-to-end' in aligner_options : | |
237 aligner_title = aligner_title + "-e2e" | |
238 else: | |
239 aligner_title = aligner_title + "-local" | |
240 | |
241 tmp_path = tempfile.mkdtemp(prefix='bs_seeker2_%s_-%s-TMP-' % (os.path.split(outfilename)[1], aligner_title ), dir = options.temp_dir) | |
242 | |
243 | |
244 (XS_x, XS_y) = options.XS_filter.split(",") | |
245 XS_pct = float(XS_x) | |
246 XS_count = int(XS_y) | |
247 logm('Filter for tag XS: #(mCH)/#(all CH)>%f and #(mCH)>%d' % (XS_pct, XS_count)) | |
248 | |
249 | |
250 logm('Temporary directory: %s' % tmp_path) | |
251 logm('Reduced Representation Bisulfite Sequencing: %s' % str(options.rrbs)) | |
252 if options.infilename is not None: | |
253 logm('Single end') | |
254 | |
255 aligner_command = aligner_exec + aligner_options_string() + \ | |
256 { BOWTIE : ' %(reference_genome)s -f %(input_file)s %(output_file)s', | |
257 BOWTIE2 : ' -x %(reference_genome)s -f -U %(input_file)s -S %(output_file)s', | |
258 SOAP : ' -D %(reference_genome)s.fa.index -o %(output_file)s -a %(input_file)s', | |
259 RMAP : ' -c %(reference_genome)s.fa -o %(output_file)s %(input_file)s' | |
260 }[options.aligner] | |
261 logm ('Aligner command: %s' % aligner_command) | |
262 # single end reads | |
263 if options.rrbs: # RRBS scan | |
264 bs_rrbs(options.infilename, | |
265 asktag, | |
266 # options.rrbs_taginfo, | |
267 options.adapter_file, | |
268 options.cutnumber1, | |
269 options.cutnumber2, | |
270 options.no_split, | |
271 str_no_mismatches, | |
272 aligner_command, | |
273 db_path, | |
274 tmp_path, | |
275 outfile, | |
276 XS_pct, | |
277 XS_count, | |
278 options.adapter_mismatch, | |
279 options.cut_format, | |
280 options.Output_multiple_hit | |
281 ) | |
282 else: # Normal single end scan | |
283 bs_single_end( options.infilename, | |
284 asktag, | |
285 options.adapter_file, | |
286 options.cutnumber1, | |
287 options.cutnumber2, | |
288 options.no_split, | |
289 str_no_mismatches, | |
290 aligner_command, | |
291 db_path, | |
292 tmp_path, | |
293 outfile, | |
294 XS_pct, | |
295 XS_count, | |
296 options.adapter_mismatch, | |
297 options.Output_multiple_hit | |
298 ) | |
299 else: | |
300 logm('Pair end') | |
301 # pair end specific default options | |
302 aligner_options = dict({BOWTIE: {'--ff' : asktag == 'N', | |
303 '--fr' : asktag == 'Y', | |
304 '-X' : options.max_insert_size, | |
305 '-I' : options.min_insert_size if options.min_insert_size > 0 else None | |
306 }, | |
307 BOWTIE2 : { | |
308 '--ff' : asktag == 'N', | |
309 '--fr' : asktag == 'Y', | |
310 '-X' : options.max_insert_size, | |
311 '-I' : options.min_insert_size if options.min_insert_size > 0 else None, | |
312 '--no-discordant' : True, | |
313 '--no-mixed' : True | |
314 }, | |
315 SOAP: { | |
316 '-x' : options.max_insert_size, | |
317 '-m' : options.min_insert_size if options.min_insert_size > 0 else 100 | |
318 }}[options.aligner], | |
319 # integrating 'rmappe' is different from others | |
320 **aligner_options) | |
321 | |
322 aligner_command = aligner_exec + aligner_options_string() + \ | |
323 { BOWTIE : ' %(reference_genome)s -f -1 %(input_file_1)s -2 %(input_file_2)s %(output_file)s', | |
324 BOWTIE2 : ' -x %(reference_genome)s -f -1 %(input_file_1)s -2 %(input_file_2)s -S %(output_file)s', | |
325 SOAP : ' -D %(reference_genome)s.fa.index -o %(output_file)s -a %(input_file_1)s -b %(input_file_2)s -2 %(output_file)s.unpaired' #, | |
326 # RMAP : # rmappe, also paste two inputs into one file. | |
327 }[options.aligner] | |
328 | |
329 logm('Aligner command: %s' % aligner_command) | |
330 | |
331 bs_pair_end(options.infilename_1, | |
332 options.infilename_2, | |
333 asktag, | |
334 options.adapter_file, | |
335 options.cutnumber1, | |
336 options.cutnumber2, | |
337 options.no_split, | |
338 str_no_mismatches, | |
339 aligner_command, | |
340 db_path, | |
341 tmp_path, | |
342 outfile, | |
343 XS_pct, | |
344 XS_count, | |
345 options.Output_multiple_hit | |
346 ) | |
347 | |
348 outfile.close() | |
349 |