Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_utils/utils.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 import gzip | |
2 import os | |
3 | |
4 import datetime | |
5 import re | |
6 import shlex | |
7 import shutil | |
8 import string | |
9 import subprocess | |
10 import types | |
11 #from itertools import izip | |
12 | |
13 import marshal | |
14 import sys | |
15 | |
16 | |
17 | |
18 _rc_trans = string.maketrans('ACGT', 'TGCA') | |
19 def reverse_compl_seq(strseq): | |
20 return strseq.translate(_rc_trans)[::-1] | |
21 | |
22 | |
23 | |
24 def show_version() : | |
25 print "" | |
26 print " BS-Seeker2 v2.0.3 - May 19, 2013 " | |
27 print "" | |
28 | |
29 | |
30 | |
31 """ | |
32 IUPAC nucleotide code Base | |
33 A Adenine | |
34 C Cytosine | |
35 G Guanine | |
36 T (or U) Thymine (or Uracil) | |
37 R A or G | |
38 Y C or T | |
39 S G or C | |
40 W A or T | |
41 K G or T | |
42 M A or C | |
43 B C or G or T | |
44 D A or G or T | |
45 H A or C or T | |
46 V A or C or G | |
47 N any base | |
48 . or - gap | |
49 """ | |
50 | |
51 | |
52 def IUPAC ( nuc ) : | |
53 if nuc == 'R' : | |
54 return ('A','G') | |
55 elif nuc == 'Y' : | |
56 return ('C', 'T') | |
57 elif nuc == 'S' : | |
58 return ('G', 'C') | |
59 elif nuc == 'W' : | |
60 return ('A', 'T') | |
61 elif nuc == 'K' : | |
62 return ('G','T') | |
63 elif nuc == 'M' : | |
64 return ('A','C') | |
65 elif nuc == 'B' : | |
66 return ('C', 'G', 'T') | |
67 elif nuc == 'D' : | |
68 return ('A', 'G', 'T') | |
69 elif nuc == 'H' : | |
70 return ('A', 'C', 'T') | |
71 elif nuc == 'V' : | |
72 return ('A', 'C', 'G') | |
73 elif nuc == 'N' : | |
74 return ('A', 'C', 'G', 'T') | |
75 else : | |
76 return (nuc) | |
77 | |
78 | |
79 def uniq(inlist): | |
80 # order preserving | |
81 uniques = [] | |
82 for item in inlist: | |
83 if item not in uniques: | |
84 uniques.append(item) | |
85 return uniques | |
86 | |
87 from itertools import product | |
88 | |
89 def EnumerateIUPAC ( context_lst ) : | |
90 tag_list = [] | |
91 # context_lst = [context] | |
92 for one_context in context_lst : | |
93 for m in product(*[ IUPAC(i) for i in list(one_context)]) : | |
94 tag_list.append(''.join(m)) | |
95 return uniq(tag_list) | |
96 | |
97 from itertools import product | |
98 | |
99 # example: cut3_context="CGG" | |
100 # return generator for : ["CGG","TGG"] | |
101 # wild-card C to both C and T | |
102 def Enumerate_C_to_CT ( cut3_context_lst ) : | |
103 tag_list = [] | |
104 for context in cut3_context_lst : | |
105 for m in product(*[i if (i is not 'C') else ('C','T') for i in context]) : | |
106 tag_list.append(''.join(m)) | |
107 return uniq(tag_list) | |
108 | |
109 #------------------------------------------------------------------------------------- | |
110 | |
111 # set a reasonable defaults | |
112 def find_location(program): | |
113 def is_exe(fpath): | |
114 return os.path.exists(fpath) and os.access(fpath, os.X_OK) | |
115 for path in os.environ["PATH"].split(os.pathsep): | |
116 if is_exe(os.path.join(path, program)): | |
117 return path | |
118 | |
119 return None | |
120 | |
121 BOWTIE = 'bowtie' | |
122 BOWTIE2 = 'bowtie2' | |
123 SOAP = 'soap' | |
124 RMAP = 'rmap' | |
125 | |
126 supported_aligners = [ | |
127 BOWTIE, | |
128 BOWTIE2, | |
129 SOAP, | |
130 RMAP | |
131 ] | |
132 | |
133 aligner_options_prefixes = { BOWTIE : '--bt-', | |
134 BOWTIE2 : '--bt2-', | |
135 SOAP : '--soap-', | |
136 RMAP : '--rmap-' } | |
137 | |
138 aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or default_path)) | |
139 for aligner, default_path in | |
140 [(BOWTIE,'~/bowtie-0.12.7/'), | |
141 (BOWTIE2, '~/bowtie-0.12.7/'), | |
142 (SOAP, '~/soap2.21release/'), | |
143 (RMAP, '~/rmap_v2.05/bin') | |
144 ]) | |
145 | |
146 | |
147 reference_genome_path = os.path.join(os.path.split(globals()['__file__'])[0],'reference_genomes') | |
148 | |
149 | |
150 | |
151 def error(msg): | |
152 print >> sys.stderr, 'ERROR: %s' % msg | |
153 exit(1) | |
154 | |
155 | |
156 global_stime = datetime.datetime.now() | |
157 def elapsed(msg = None): | |
158 print "[%s]" % msg if msg is not None else "+", "Last:" , datetime.datetime.now() - elapsed.stime, '\tTotal:', datetime.datetime.now() - global_stime | |
159 | |
160 elapsed.stime = datetime.datetime.now() | |
161 | |
162 elapsed.stime = datetime.datetime.now() | |
163 | |
164 | |
165 def open_log(fname): | |
166 open_log.logfile = open(fname, 'w', 1) | |
167 | |
168 def logm(message): | |
169 log_message = "[%s] %s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message) | |
170 print log_message, | |
171 open_log.logfile.write(log_message) | |
172 | |
173 def close_log(): | |
174 open_log.logfile.close() | |
175 | |
176 | |
177 | |
178 | |
179 def clear_dir(path): | |
180 """ If path does not exist, it creates a new directory. | |
181 If path points to a directory, it deletes all of its content. | |
182 If path points to a file, it raises an exception.""" | |
183 | |
184 if os.path.exists(path): | |
185 if not os.path.isdir(path): | |
186 error("%s is a file. Please, delete it manually!" % path) | |
187 else: | |
188 for the_file in os.listdir(path): | |
189 file_path = os.path.join(path, the_file) | |
190 try: | |
191 if os.path.isfile(file_path): | |
192 os.unlink(file_path) | |
193 elif os.path.isdir(file_path): | |
194 shutil.rmtree(file_path) | |
195 except Exception, e: | |
196 print e | |
197 else: | |
198 os.mkdir(path) | |
199 | |
200 | |
201 def delete_files(*filenames): | |
202 # return | |
203 """ Deletes a number of files. filenames can contain generator expressions and/or lists, too""" | |
204 | |
205 for fname in filenames: | |
206 if type(fname) in [list, types.GeneratorType]: | |
207 delete_files(*list(fname)) | |
208 elif os.path.isdir(fname): | |
209 shutil.rmtree(fname) | |
210 else: | |
211 os.remove(fname) | |
212 | |
213 def split_file(filename, output_prefix, nlines): | |
214 """ Splits a file (equivalend to UNIX split -l ) """ | |
215 fno = 0 | |
216 lno = 0 | |
217 INPUT = open(filename, 'r') | |
218 output = None | |
219 for l in INPUT: | |
220 if lno == 0: | |
221 fno += 1 | |
222 if output is not None: output.close() | |
223 output = open('%s%d' % (output_prefix, fno), 'w') | |
224 lno = nlines | |
225 output.write(l) | |
226 lno -= 1 | |
227 output.close() | |
228 INPUT.close() | |
229 | |
230 def isplit_file(filename, output_prefix, nlines): | |
231 """ Splits a file (equivalend to UNIX split -l ) """ | |
232 fno = 0 | |
233 lno = 0 | |
234 try : | |
235 input = (gzip.open if filename.endswith('.gz') else open)(filename, 'r') | |
236 except IOError : | |
237 print "[Error] Cannot find file : %s !" % filename | |
238 exit(-1) | |
239 output = None | |
240 output_fname = None | |
241 for l in input: | |
242 if lno == 0: | |
243 | |
244 if output is not None: | |
245 output.close() | |
246 yield output_fname | |
247 | |
248 fno += 1 | |
249 output_fname = '%s%d' % (output_prefix, fno) | |
250 # output_fname = '%s_0' % output_prefix | |
251 output = open(output_fname, 'w') | |
252 lno = nlines | |
253 output.write(l) | |
254 lno -= 1 | |
255 if output is not None: | |
256 output.close() | |
257 yield output_fname | |
258 | |
259 input.close() | |
260 | |
261 def read_fasta(fasta_file): | |
262 """ Iterates over all sequences in a fasta file. One at a time, without reading the whole file into the main memory. | |
263 """ | |
264 | |
265 try : | |
266 input = (gzip.open if fasta_file.endswith('.gz') else open)(fasta_file) | |
267 except IOError: | |
268 print "[Error] Cannot find fasta file : %s !" % fasta_file | |
269 exit(-1) | |
270 sanitize = re.compile(r'[^ACTGN]') | |
271 sanitize_seq_id = re.compile(r'[^A-Za-z0-9]') | |
272 | |
273 chrom_seq = [] | |
274 chrom_id = None | |
275 seen_ids = set() | |
276 | |
277 for line in input: | |
278 if line[0] == '>': | |
279 if chrom_id is not None: | |
280 yield chrom_id, ''.join(chrom_seq) | |
281 | |
282 chrom_id = sanitize_seq_id.sub('_', line.split()[0][1:]) | |
283 | |
284 if chrom_id in seen_ids: | |
285 error('BS Seeker found identical sequence ids (id: %s) in the fasta file: %s. Please, make sure that all sequence ids are unique and contain only alphanumeric characters: A-Za-z0-9_' % (chrom_id, fasta_file)) | |
286 seen_ids.add(chrom_id) | |
287 | |
288 chrom_seq = [] | |
289 | |
290 else: | |
291 chrom_seq.append(sanitize.sub('N', line.strip().upper())) | |
292 | |
293 yield chrom_id, ''.join(chrom_seq) | |
294 | |
295 input.close() | |
296 | |
297 | |
298 def serialize(obj, filename): | |
299 """ Be careful what you serialize and deseriazlize! marshal.load is not secure! | |
300 """ | |
301 output = open(filename+'.data', 'w') | |
302 marshal.dump(obj, output) | |
303 output.close() | |
304 | |
305 def deserialize(filename): | |
306 """ Be careful what you serialize and deseriazlize! marshal.load is not secure! | |
307 """ | |
308 try: | |
309 input = open(filename+'.data') | |
310 except IOError: | |
311 print "\n[Error]:\n\t Cannot find file: %s.data" % filename | |
312 exit(-1) | |
313 | |
314 obj = marshal.load(input) | |
315 input.close() | |
316 return obj | |
317 | |
318 | |
319 | |
320 def run_in_parallel(commands): | |
321 | |
322 commands = [(cmd[0], open(cmd[1], 'w')) if type(cmd) is tuple else (cmd, None) for cmd in commands] | |
323 | |
324 logm('Starting commands:\n' + '\n'.join([cmd for cmd, stdout in commands])) | |
325 for i, proc in enumerate([subprocess.Popen(args = shlex.split(cmd), stdout = stdout) for cmd, stdout in commands]): | |
326 return_code = proc.wait() | |
327 logm('Finished: ' + commands[i][0]) | |
328 if return_code != 0: | |
329 error('%s \nexit with an error code: %d. Please, check the log files.' % (commands[i], return_code)) | |
330 for _, stdout in commands: | |
331 if stdout is not None: | |
332 stdout.close() |