comparison BSseeker2/bs_utils/utils.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
comparison
equal deleted inserted replaced
-1:000000000000 0:e6df770c0e58
1 import gzip
2 import os
3
4 import datetime
5 import re
6 import shlex
7 import shutil
8 import string
9 import subprocess
10 import types
11 #from itertools import izip
12
13 import marshal
14 import sys
15
16
17
18 _rc_trans = string.maketrans('ACGT', 'TGCA')
19 def reverse_compl_seq(strseq):
20 return strseq.translate(_rc_trans)[::-1]
21
22
23
24 def show_version() :
25 print ""
26 print " BS-Seeker2 v2.0.3 - May 19, 2013 "
27 print ""
28
29
30
31 """
32 IUPAC nucleotide code Base
33 A Adenine
34 C Cytosine
35 G Guanine
36 T (or U) Thymine (or Uracil)
37 R A or G
38 Y C or T
39 S G or C
40 W A or T
41 K G or T
42 M A or C
43 B C or G or T
44 D A or G or T
45 H A or C or T
46 V A or C or G
47 N any base
48 . or - gap
49 """
50
51
52 def IUPAC ( nuc ) :
53 if nuc == 'R' :
54 return ('A','G')
55 elif nuc == 'Y' :
56 return ('C', 'T')
57 elif nuc == 'S' :
58 return ('G', 'C')
59 elif nuc == 'W' :
60 return ('A', 'T')
61 elif nuc == 'K' :
62 return ('G','T')
63 elif nuc == 'M' :
64 return ('A','C')
65 elif nuc == 'B' :
66 return ('C', 'G', 'T')
67 elif nuc == 'D' :
68 return ('A', 'G', 'T')
69 elif nuc == 'H' :
70 return ('A', 'C', 'T')
71 elif nuc == 'V' :
72 return ('A', 'C', 'G')
73 elif nuc == 'N' :
74 return ('A', 'C', 'G', 'T')
75 else :
76 return (nuc)
77
78
79 def uniq(inlist):
80 # order preserving
81 uniques = []
82 for item in inlist:
83 if item not in uniques:
84 uniques.append(item)
85 return uniques
86
87 from itertools import product
88
89 def EnumerateIUPAC ( context_lst ) :
90 tag_list = []
91 # context_lst = [context]
92 for one_context in context_lst :
93 for m in product(*[ IUPAC(i) for i in list(one_context)]) :
94 tag_list.append(''.join(m))
95 return uniq(tag_list)
96
97 from itertools import product
98
99 # example: cut3_context="CGG"
100 # return generator for : ["CGG","TGG"]
101 # wild-card C to both C and T
102 def Enumerate_C_to_CT ( cut3_context_lst ) :
103 tag_list = []
104 for context in cut3_context_lst :
105 for m in product(*[i if (i is not 'C') else ('C','T') for i in context]) :
106 tag_list.append(''.join(m))
107 return uniq(tag_list)
108
109 #-------------------------------------------------------------------------------------
110
111 # set a reasonable defaults
112 def find_location(program):
113 def is_exe(fpath):
114 return os.path.exists(fpath) and os.access(fpath, os.X_OK)
115 for path in os.environ["PATH"].split(os.pathsep):
116 if is_exe(os.path.join(path, program)):
117 return path
118
119 return None
120
121 BOWTIE = 'bowtie'
122 BOWTIE2 = 'bowtie2'
123 SOAP = 'soap'
124 RMAP = 'rmap'
125
126 supported_aligners = [
127 BOWTIE,
128 BOWTIE2,
129 SOAP,
130 RMAP
131 ]
132
133 aligner_options_prefixes = { BOWTIE : '--bt-',
134 BOWTIE2 : '--bt2-',
135 SOAP : '--soap-',
136 RMAP : '--rmap-' }
137
138 aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or default_path))
139 for aligner, default_path in
140 [(BOWTIE,'~/bowtie-0.12.7/'),
141 (BOWTIE2, '~/bowtie-0.12.7/'),
142 (SOAP, '~/soap2.21release/'),
143 (RMAP, '~/rmap_v2.05/bin')
144 ])
145
146
147 reference_genome_path = os.path.join(os.path.split(globals()['__file__'])[0],'reference_genomes')
148
149
150
151 def error(msg):
152 print >> sys.stderr, 'ERROR: %s' % msg
153 exit(1)
154
155
156 global_stime = datetime.datetime.now()
157 def elapsed(msg = None):
158 print "[%s]" % msg if msg is not None else "+", "Last:" , datetime.datetime.now() - elapsed.stime, '\tTotal:', datetime.datetime.now() - global_stime
159
160 elapsed.stime = datetime.datetime.now()
161
162 elapsed.stime = datetime.datetime.now()
163
164
165 def open_log(fname):
166 open_log.logfile = open(fname, 'w', 1)
167
168 def logm(message):
169 log_message = "[%s] %s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message)
170 print log_message,
171 open_log.logfile.write(log_message)
172
173 def close_log():
174 open_log.logfile.close()
175
176
177
178
179 def clear_dir(path):
180 """ If path does not exist, it creates a new directory.
181 If path points to a directory, it deletes all of its content.
182 If path points to a file, it raises an exception."""
183
184 if os.path.exists(path):
185 if not os.path.isdir(path):
186 error("%s is a file. Please, delete it manually!" % path)
187 else:
188 for the_file in os.listdir(path):
189 file_path = os.path.join(path, the_file)
190 try:
191 if os.path.isfile(file_path):
192 os.unlink(file_path)
193 elif os.path.isdir(file_path):
194 shutil.rmtree(file_path)
195 except Exception, e:
196 print e
197 else:
198 os.mkdir(path)
199
200
201 def delete_files(*filenames):
202 # return
203 """ Deletes a number of files. filenames can contain generator expressions and/or lists, too"""
204
205 for fname in filenames:
206 if type(fname) in [list, types.GeneratorType]:
207 delete_files(*list(fname))
208 elif os.path.isdir(fname):
209 shutil.rmtree(fname)
210 else:
211 os.remove(fname)
212
213 def split_file(filename, output_prefix, nlines):
214 """ Splits a file (equivalend to UNIX split -l ) """
215 fno = 0
216 lno = 0
217 INPUT = open(filename, 'r')
218 output = None
219 for l in INPUT:
220 if lno == 0:
221 fno += 1
222 if output is not None: output.close()
223 output = open('%s%d' % (output_prefix, fno), 'w')
224 lno = nlines
225 output.write(l)
226 lno -= 1
227 output.close()
228 INPUT.close()
229
230 def isplit_file(filename, output_prefix, nlines):
231 """ Splits a file (equivalend to UNIX split -l ) """
232 fno = 0
233 lno = 0
234 try :
235 input = (gzip.open if filename.endswith('.gz') else open)(filename, 'r')
236 except IOError :
237 print "[Error] Cannot find file : %s !" % filename
238 exit(-1)
239 output = None
240 output_fname = None
241 for l in input:
242 if lno == 0:
243
244 if output is not None:
245 output.close()
246 yield output_fname
247
248 fno += 1
249 output_fname = '%s%d' % (output_prefix, fno)
250 # output_fname = '%s_0' % output_prefix
251 output = open(output_fname, 'w')
252 lno = nlines
253 output.write(l)
254 lno -= 1
255 if output is not None:
256 output.close()
257 yield output_fname
258
259 input.close()
260
261 def read_fasta(fasta_file):
262 """ Iterates over all sequences in a fasta file. One at a time, without reading the whole file into the main memory.
263 """
264
265 try :
266 input = (gzip.open if fasta_file.endswith('.gz') else open)(fasta_file)
267 except IOError:
268 print "[Error] Cannot find fasta file : %s !" % fasta_file
269 exit(-1)
270 sanitize = re.compile(r'[^ACTGN]')
271 sanitize_seq_id = re.compile(r'[^A-Za-z0-9]')
272
273 chrom_seq = []
274 chrom_id = None
275 seen_ids = set()
276
277 for line in input:
278 if line[0] == '>':
279 if chrom_id is not None:
280 yield chrom_id, ''.join(chrom_seq)
281
282 chrom_id = sanitize_seq_id.sub('_', line.split()[0][1:])
283
284 if chrom_id in seen_ids:
285 error('BS Seeker found identical sequence ids (id: %s) in the fasta file: %s. Please, make sure that all sequence ids are unique and contain only alphanumeric characters: A-Za-z0-9_' % (chrom_id, fasta_file))
286 seen_ids.add(chrom_id)
287
288 chrom_seq = []
289
290 else:
291 chrom_seq.append(sanitize.sub('N', line.strip().upper()))
292
293 yield chrom_id, ''.join(chrom_seq)
294
295 input.close()
296
297
298 def serialize(obj, filename):
299 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
300 """
301 output = open(filename+'.data', 'w')
302 marshal.dump(obj, output)
303 output.close()
304
305 def deserialize(filename):
306 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
307 """
308 try:
309 input = open(filename+'.data')
310 except IOError:
311 print "\n[Error]:\n\t Cannot find file: %s.data" % filename
312 exit(-1)
313
314 obj = marshal.load(input)
315 input.close()
316 return obj
317
318
319
320 def run_in_parallel(commands):
321
322 commands = [(cmd[0], open(cmd[1], 'w')) if type(cmd) is tuple else (cmd, None) for cmd in commands]
323
324 logm('Starting commands:\n' + '\n'.join([cmd for cmd, stdout in commands]))
325 for i, proc in enumerate([subprocess.Popen(args = shlex.split(cmd), stdout = stdout) for cmd, stdout in commands]):
326 return_code = proc.wait()
327 logm('Finished: ' + commands[i][0])
328 if return_code != 0:
329 error('%s \nexit with an error code: %d. Please, check the log files.' % (commands[i], return_code))
330 for _, stdout in commands:
331 if stdout is not None:
332 stdout.close()