microsatellite.py @ 0:20ab85af9505
Mercurial repository: arkarachai-fungtammasan / microsatellite_ngs

author: arkarachai-fungtammasan
date:   Fri, 03 Oct 2014 20:54:30 -0400
commit: Uploaded

#!/usr/bin/env python
"""
Snoop thru a fasta file looking for microsatellite repeats of given periods
Output format: length_of_repeat left_flank_length right_flank_length repeat_motif hamming_distance read_name read_sequence read_quality (additional columns)

If the --r option is turned on, the output will have these additional columns appended:
read_name read_chr pre_s pre_e tr_s tr_e suf_s suf_e tr_len tr_ref_seq

pre_s       where the read starts
pre_e       the last position before the microsatellite
tr_s        where the microsatellite starts
tr_e        where the microsatellite ends
suf_s       the first base after the microsatellite
tr_ref_seq  reference sequence corresponding to the microsatellite

* output positions are 0-based

:Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu)

modification log:

09/27/2013
replace function dense_intervals with function non_negative_intervals, which does not need to import the pyfracluster module

10/18/2013
modify function find_repeat_element for speed, under the condition that hamming_distance = 0, i.e. no mutations/indels are allowed

02/25/2014
add a function that can deal with mapped reads, with additional output

02/28/2014
fix the 0-based end points: since 0-based intervals are half-open [ ), the 0-based end site should always be increased by 1

03/05/2014
deal with multi-fasta
"""
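
# Example invocation (the file names here are hypothetical; see usage() below
# for the full option list):
#
#   microsatellite.py reads.fastq --fastq --period=2..5 --minlength=20 \
#       --hamming=0 --multipleruns --partialmotifs > microsats.out
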
from sys    import argv,stdin,stderr,exit
from string import maketrans
from md5    import new as md5_new
import math   # needed by int_with_unit() below
import re
#from pyfracluster import dense_intervals

def usage(s=None):
    message = """
usage: microsat_snoop [fasta_file] [options]
  <fasta_file>               Name of file to read sequences from; if absent,
                             sequences are read from stdin
  --fasta                    Input file is in fasta format
                             (this is the default)
  --fastq                    Input file is in fastq format
                             (default is fasta unless filename is .fastq)
  --fastq:noquals            Input file is in fastq format, but discard quals
  --sam                      Input file is a SAM file
  --r                        Report additional output columns; if given,
                             the --ref option is mandatory
  --ref=<filepath>           Reference file (absolute) path
  --period=<length>          (mandatory,cumulative) repeat length(s) to be
                             searched for
                             <length> is expected to be small, less than 10
                             <length> can also be a comma-separated list, or
                             a range <low>..<high>
  --rate=<fraction>          control the candidate repeat interval detector;
                             it will consider intervals with at least
                             <fraction> of matches when shifted by the period;
                             <fraction> is between 0 and 1 and can be either a
                             real number or <n>/<d>
                             (default is 6/7)
  --minlength=<length>       minimum length of intervals reported, in bp
                             (default is 20)
  --progress=<count>         how often to report the sequence we're searching
                             (default is no progress report)
  --allowduplicates          process all input sequences
                             (this is the default)
  --noduplicates             ignore any input sequence that's the same as an
                             earlier sequence
  --nonearduplicates         ignore any input sequence that has the same first
                             100 bp as an earlier sequence
  --nonearduplicate=<length> ignore any input sequence that has the same first
                             <length> bp as an earlier sequence
  --hamming=<count>          Don't report candidate repeat intervals that have
                             more than <count> mismatches
                             (default is to do no such filtering)
  --prefix=<length>          Don't report candidate repeat intervals that
                             start within <length> of the sequence start
                             (default is to do no such filtering)
  --suffix=<length>          Don't report candidate repeat intervals that
                             end within <length> of the sequence end
                             (default is to do no such filtering)
  --subsample=<k>/<n>        Process only the <k>th sequence of every group of
                             <n> sequences; <k> ranges from 1 to <n>
  --multipleruns             Consider all candidate intervals in a sequence
                             (default is to consider only the longest)
  --partialmotifs            Consider microsatellites with a partial motif
                             (default is to consider only whole motifs)
  --splitbyvalidity          Preprocess sequences, splitting at Ns; this
                             prevents candidates from including Ns
                             (default is not to split)
  --noflankdisplay           Show entire sequence as flanking regions
                             (this is the default)
  --flankdisplay=<length>    Limit length of flanking regions shown
  --readnamesuffix=<string>  Root of suffix to append to read names; e.g. 1
                             for forward, 2 for reverse; this triggers other
                             info to be included in the suffix
                             (default is "1" for fastq; no suffix for fasta)
  --head=<number>            limit the number of sequences processed
  --markend                  Write a marker line upon completion
                             (default is not to write a marker)
  --help=details             Describe the process, and quit"""

    if (s == None): exit (message)
    else:           exit ("%s\n%s" % (s,message))


detailedDescription = """In broad terms, the process works as follows:

(1) Identify intervals that are highly correlated with the interval shifted by
    P (the repeat period). These intervals are called "runs" or "candidates".
    The level of correlation required is controlled by rateThreshold.
    Depending on whether we want to look for more than one microsat, we either
    find the longest such run (simple algorithm) or many runs (more complicated
    algorithm). The following steps are then performed on each run.

(2) Find the most likely repeat motif in the run. This is done by counting
    all kmers (of length P) and choosing the most frequent. If that kmer is
    itself covered by a sub-repeat we discard this run. The idea is that we
    can ignore a 6-mer like ACGACG because we will find it when we are looking
    for 3-mers.

(3) Once we identify the most likely repeat motif, we then modify the
    interval, adjusting start and end to find the interval that has the fewest
    mismatches vs. a sequence of the motif repeated (hamming distance). Only
    whole copies of the motif are considered.

(4) At this point we have a valid microsat interval (in the eyes of the
    program). It is subjected to some filtering stages (hamming distance or too
    close to an end), and if it satisfies those conditions, it's reported to
    the user."""

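# In this file, step (1) above corresponds to correlation_sequence() plus
# longest_suitable_run() / all_suitable_runs(); steps (2) and (3) correspond
# to find_repeat_element(); step (4) is the filtering performed in main().
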
def main():
    global debug

    #=== parse the command line ===

    inputFilename         = None
    referenceFileName     = None   # added by Chen Sun on 02/25
    inputFormat           = None
    repeatPeriods         = []
    rateThreshold         = 6 / 7.0
    lengthThreshold       = 20
    reportProgress        = None
    discardDuplicates     = False
    discardNearDuplicates = False
    nearDuplicatePrefix   = 100
    hammingThreshold      = 0
    prefixThreshold       = None
    suffixThreshold       = None
    subsampleK            = None
    subsampleN            = None
    reportMultipleRuns    = False
    allowPartialMotifs    = False
    splitByValidity       = False
    flankDisplayLimit     = None
    readNameSuffix        = None
    headLimit             = None
    markEndOfFile         = False
    additionalInfo        = False
    debug                 = []

    for arg in argv[1:]:
        if (arg == "--fasta"):
            inputFormat = "fasta"
        elif (arg == "--fastq"):
            inputFormat = "fastq"
        elif (arg == "--fastq:noquals"):
            inputFormat = "fastq:noquals"
        elif (arg == "--sam"):
            inputFormat = "sam"
        elif (arg == "--r"):
            additionalInfo = True
        elif (arg.startswith("--ref=")):
            referenceFileName = arg.split("=",1)[1]
        elif (arg.startswith("--period=")):
            val = arg.split("=",1)[1]
            for period in val.split(","):
                if (".." in period):
                    (lowPeriod,highPeriod) = period.split("..",1)
                    lowPeriod  = int(lowPeriod)
                    highPeriod = int(highPeriod)
                    for period in xrange(lowPeriod,highPeriod+1):
                        repeatPeriods += [period]
                else:
                    repeatPeriods += [int(period)]
        elif (arg.startswith("--rate=")):
            val = arg.split("=",1)[1]
            rateThreshold = float_or_fraction(val)
            assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val
        elif (arg.startswith("--minlength=")):
            val = arg.split("=",1)[1]
            lengthThreshold = int(val)
            assert (lengthThreshold >= 0)
        elif (arg.startswith("--progress=")):
            val = arg.split("=",1)[1]
            reportProgress = int(val)
        elif (arg == "--allowduplicates"):
            discardDuplicates     = False
            discardNearDuplicates = False
        elif (arg == "--noduplicates"):
            discardDuplicates     = True
            discardNearDuplicates = False
        elif (arg == "--nonearduplicates"):
            discardDuplicates     = False
            discardNearDuplicates = True
        elif (arg.startswith("--nonearduplicate=")):
            val = arg.split("=",1)[1]
            discardDuplicates     = False
            discardNearDuplicates = True
            nearDuplicatePrefix   = int(val)
            assert (nearDuplicatePrefix > 0)
        elif (arg.startswith("--hamming=")):
            val = arg.split("=",1)[1]
            hammingThreshold = int(val)
            assert (hammingThreshold >= 0)
        elif (arg.startswith("--prefix=")):
            val = arg.split("=",1)[1]
            prefixThreshold = int(val)
            assert (prefixThreshold >= 0)
        elif (arg.startswith("--suffix=")):
            val = arg.split("=",1)[1]
            suffixThreshold = int(val)
            assert (suffixThreshold >= 0)
        elif (arg.startswith("--subsample=")):
            val = arg.split("=",1)[1]
            (k,n) = val.split("/",2)
            subsampleK = int(k)
            subsampleN = int(n)
            assert (0 < subsampleK <= subsampleN)
        elif (arg == "--multipleruns"):
            reportMultipleRuns = True
        elif (arg == "--partialmotifs"):
            allowPartialMotifs = True
        elif (arg == "--splitbyvalidity"):
            splitByValidity = True
        elif (arg == "--noflankdisplay"):
            flankDisplayLimit = None
        elif (arg.startswith("--flankdisplay=")):
            val = arg.split("=",1)[1]
            flankDisplayLimit = int(val)
            assert (flankDisplayLimit >= 0)
        elif (arg.startswith("--readnamesuffix=")):
            readNameSuffix = arg.split("=",1)[1]
        elif (arg.startswith("--head=")):
            headLimit = int_with_unit(arg.split("=",1)[1])
        elif (arg == "--markend"):
            markEndOfFile = True
        elif (arg == "--help=details"):
            exit (detailedDescription)
        elif (arg.startswith("--debug=")):
            debug += (arg.split("=",1)[1]).split(",")
        elif (arg.startswith("--")):
            usage("unrecognized option: %s" % arg)
        elif (inputFilename == None):
            inputFilename = arg
        else:
            usage("unrecognized option: %s" % arg)

    #=== determine periods of interest ===

    if (repeatPeriods == []):
        usage("you gotta give me a repeat period")

    if (additionalInfo == True):
        if (referenceFileName == None):
            usage("reference file path needed; use --ref=<filepath> to provide it")

    periodSeed = {}
    for period in repeatPeriods:
        if (period < 1): usage("period %d is not valid" % period)
        periodSeed[period] = True

    repeatPeriods = [period for period in periodSeed]
    repeatPeriods.sort()

    #=== determine input format ===

    if   (inputFormat == "fasta"):           sequence_reader = fasta_sequences
    elif (inputFormat == "fastq"):           sequence_reader = fastq_sequences
    elif (inputFormat == "fastq:noquals"):   sequence_reader = fastq_sequences
    elif (inputFormat == "sam"):             sequence_reader = sam_sequences
    elif (inputFilename == None):            sequence_reader = fasta_sequences
    elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences
    elif (inputFilename.endswith(".fq")):    sequence_reader = fastq_sequences
    elif (inputFilename.endswith(".sam")):   sequence_reader = sam_sequences
    else:                                    sequence_reader = fasta_sequences

    if (inputFilename != None): inputF = file(inputFilename,"rt")
    else:                       inputF = stdin

    if (readNameSuffix == None) \
       and (sequence_reader == fastq_sequences) \
       and (inputFormat != "fastq:noquals"):
        readNameSuffix = "1"

    #=== process the sequences ===

    refSequence = {}
    rightName   = ""
    sequence    = ""
    if additionalInfo:
        # read the reference fasta into a name -> sequence dict
        firstFasta = True
        originalRefF = open(referenceFileName)
        for line in originalRefF.readlines():
            line = line.replace('\r','')
            line = line.replace('\n','')
            if line.startswith(">"):
                if firstFasta:
                    firstFasta = False
                else:
                    refSequence[rightName] = sequence
                rightName = line[1:]
                sequence  = ""
                continue
            sequence += line
        originalRefF.close()
        refSequence[rightName] = sequence

    sequenceSeen = {}

    numSequences = 0
    for seqInfo in sequence_reader(inputF):
        numSequences += 1
        if (headLimit != None) and (numSequences > headLimit):
            print >>stderr, "limit of %d sequences reached" % headLimit
            break

        if (sequence_reader == sam_sequences):
            # seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar
            (name, sequence, refName, pre_s, cigar) = seqInfo
            quals = None
        elif (sequence_reader == fastq_sequences):
            (name,sequence,quals) = seqInfo
            if (inputFormat == "fastq:noquals"): quals = None
        else:
            (name,sequence) = seqInfo
            quals = None

        if (reportProgress != None) and (numSequences % reportProgress == 0):
            print >>stderr, "%s %d" % (name,numSequences)

        # if we're subsampling and not interested in this sequence, skip it

        if (subsampleN != None):
            if ((numSequences-1) % subsampleN != (subsampleK-1)):
                continue

        # if this sequence is shorter than the length of interest, skip it

        seqLen = len(sequence)
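        # (note: `period` here still holds the last value assigned by the
        # earlier loops over the requested periods, since Python 2 leaks loop
        # variables, so this is only a rough pre-filter; the real per-period
        # length check happens on chunkLen below)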
        if (seqLen < period) or (seqLen < lengthThreshold): continue

        # if we're not interested in duplicates and this is one, skip it;
        # note that we assume no hash collisions occur, i.e. that all hash
        # matches are truly sequence matches

        if (discardDuplicates):
            h = hash108(sequence)
            if (h in sequenceSeen): continue
            sequenceSeen[h] = True
        elif (discardNearDuplicates):
            h = hash108(sequence[:nearDuplicatePrefix])
            if (h in sequenceSeen): continue
            sequenceSeen[h] = True

        # split the sequence into chunks of valid nucleotides

        if (splitByValidity):
            chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)]
        else:
            chunks = [(0,len(sequence))]

        # evaluate for each period of interest

        for period in repeatPeriods:

            # operate on each chunk

            for (chunkStart,chunkEnd) in chunks:
                chunkLen = chunkEnd - chunkStart
                if (chunkLen < period) or (chunkLen < lengthThreshold): continue

                if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug):
                    print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd)

                # compute correlation sequence

                corr = correlation_sequence(sequence,period,chunkStart,chunkEnd)

                if ("correlation" in debug) or ("runs" in debug):
                    print >>stderr, sequence[chunkStart:chunkEnd]
                    print >>stderr, corr

                # find runs (candidates for being a microsat)

                if (reportMultipleRuns):
                    runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold,hammingThreshold)
                else:
                    runs = longest_suitable_run(corr,lengthThreshold,rateThreshold)
                if (runs == []): continue

                if ("runs" in debug):
                    for (start,end) in runs:
                        run = [" "] * seqLen
                        for ix in xrange(start-period,end):
                            run[ix] = "*"
                        print >>stderr, "".join(run)

                if ("candidates" in debug):
                    for (start,end) in runs:
                        print >>stderr, "%s %d %d" % (name,start,end)

                # process runs and report those that pass muster

                runCount = 0
                for (start,end) in runs:
                    runCount += 1

                    start = chunkStart + start - period
                    end   = chunkStart + end

                    (kmer,d,start,end) = find_repeat_element(hammingThreshold,period,sequence,start,end,allowPartials=allowPartialMotifs)
                    if (kmer == None): continue   # (no useful repeat kmer was found)

                    rptExtent = end - start
                    prefixLen = start
                    suffixLen = seqLen - end
                    if (rptExtent <= period): continue
                    if (hammingThreshold != None) and (d         > hammingThreshold): continue
                    if (prefixThreshold  != None) and (prefixLen < prefixThreshold):  continue
                    if (suffixThreshold  != None) and (suffixLen < suffixThreshold):  continue

                    if (flankDisplayLimit == None):
                        seq = sequence[:start] \
                            + sequence[start:end].lower() \
                            + sequence[end:]
                    else:
                        seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \
                            + sequence[start:end].lower() \
                            + sequence[end:min(chunkEnd,end+flankDisplayLimit)]
                    reportName = name
                    if (readNameSuffix != None):
                        reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount)
                    if (quals == None or quals == "." or quals == "\t."): quals = "\t."
                    else:                                                 quals = "\t" + quals
                    if not additionalInfo:
                        print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \
                            % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals)
                    else:
                        #pre_e = pre_s + prefixLen - 1
                        refPoint   = pre_s
                        donorPoint = 0

                        donorBeforeStart = prefixLen - 1                     #pre_e
                        donorMicroStart  = prefixLen                         #tr_s
                        donorMicroEnd    = donorMicroStart + rptExtent - 1   #tr_e
                        donorAfterMicro  = donorMicroEnd + 1                 #suf_s
                        donorEnd         = len(seq) - 1                      #suf_e

                        set_pre_e = False
                        set_tr_s  = False
                        set_tr_e  = False
                        set_suf_s = False
                        set_suf_e = False

                        pre_e = 0
                        tr_s  = 0
                        tr_e  = 0
                        suf_s = 0
                        suf_e = 0

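                        # Walk the CIGAR string to translate read (donor)
                        # coordinates into reference coordinates: M advances
                        # both the read and the reference, I advances only the
                        # read, and D advances only the reference.  Each of
                        # the five donor* offsets above is converted to a
                        # reference position (pre_e, tr_s, tr_e, suf_s, suf_e)
                        # as soon as the walk reaches it.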
                        matchList = re.findall('(\d+)([IDM])', cigar)
                        unCognitiveCigar = False
                        for matchN, matchType in matchList:
                            matchNum = int(matchN)
                            if matchType == "M":
                                donorPoint = donorPoint + matchNum
                                refPoint   = refPoint + matchNum
                            elif matchType == "D":
                                refPoint = refPoint + matchNum
                                continue
                            elif matchType == "I":
                                donorPoint = donorPoint + matchNum
                            else:
                                unCognitiveCigar = True
                                break

                            if not set_pre_e:
                                if donorPoint >= donorBeforeStart:
                                    pre_e = refPoint - (donorPoint - donorBeforeStart)
                                    set_pre_e = True
                                else:
                                    continue

                            if not set_tr_s:
                                if donorPoint >= donorMicroStart:
                                    tr_s = refPoint - (donorPoint - donorMicroStart)
                                    set_tr_s = True
                                else:
                                    continue

                            if not set_tr_e:
                                if donorPoint >= donorMicroEnd:
                                    tr_e = refPoint - (donorPoint - donorMicroEnd)
                                    set_tr_e = True
                                else:
                                    continue

                            if not set_suf_s:
                                if donorPoint >= donorAfterMicro:
                                    suf_s = refPoint - (donorPoint - donorAfterMicro)
                                    set_suf_s = True
                                else:
                                    continue

                            if not set_suf_e:
                                if donorPoint >= donorEnd:
                                    suf_e = refPoint - (donorPoint - donorEnd)
                                    set_suf_e = True
                                else:
                                    continue

                        if unCognitiveCigar:
                            break
                        tr_len = tr_e - tr_s + 1

                        if refName not in refSequence:
                            tr_ref_seq = "."
                        else:
                            if refSequence[refName] == "":
                                tr_ref_seq = "."
                            elif len(refSequence[refName]) <= tr_e:
                                tr_ref_seq = "."
                            else:
                                tr_ref_seq = refSequence[refName][tr_s:tr_e+1]

                        pre_e += 1
                        tr_e  += 1
                        suf_e += 1
                        print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \
                            % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)

    if (markEndOfFile):
        print "# microsat_snoop end-of-file"

    if (inputF != stdin):
        inputF.close()

# non_negative_intervals--
#   Find intervals that contain only "+" and no "-",
#   from a string like this: +++++++++---+++++++++
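#
# For example (worked by hand from the code below), with minLength=5 the
# string "+++++++++---+++++++++" yields the intervals (0,9) and (12,21),
# i.e. each maximal block of "+" reported as a half-open (start,end) pair.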
def non_negative_intervals(seq, minLength=None):

    start = -1
    end   = -1
    firstPlus = 1
    #print seq
    for ix in range(len(seq)):        # for every char in seq
        ch = seq[ix]
        if (ch == "+"):
            if (firstPlus):
                firstPlus = 0
                start = ix
            else:
                continue
        elif (ch == "-"):
            if (start >= 0):
                end = ix-1
                if ((end - start + 1) >= minLength):
                    yield (start,end+1)
                start = -1
                firstPlus = 1
    if (start >= 0):                  # flush a trailing run of "+"
        if ((ix - start + 1) >= minLength):
            yield (start, ix+1)


###################################################################
# modified by Chen Sun on 7/11/2014
# We do not want to depend on other modules, so the functions below
# are included here directly.
###################################################################

# parse a string of the form {positives}/{positives_and_neutrals}

def parse_spec(s):
    if ("/" not in s): raise ValueError
    (n,d) = s.split("/",1)
    if (not n.startswith("{")) or (not n.endswith("}")): raise ValueError
    if (not d.startswith("{")) or (not d.endswith("}")): raise ValueError

    positives = n[1:-1]
    d         = d[1:-1]

    for ch in positives:
        if (ch not in d): raise ValueError

    neutrals = [ch for ch in d if (ch not in positives)]
    return (positives,neutrals)


# convert a string to a number, allowing fractions

def float_or_fraction(s):
    if ("/" in s):
        (numer,denom) = s.split("/",1)
        return float(numer)/float(denom)
    else:
        return float(s)


# dense_intervals--
#   Find all non-overlapping runs with a good enough rate (of positives), and
#   which meet our length threshold.
#
#   The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing
#   long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999.
#
# $$$ we use the denominator as the threshold, but we really should use the
# $$$ .. numerator, comparing it to minLength*rate

def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None):

    if (blockers == None):
        blockers = "".join([chr(n) for n in range(1,256)
                                   if  (chr(n) not in positives)
                                   and (chr(n) not in neutrals)])

    stackLeft       = [None]   # stack with each entry containing five
    stackRight      = [None]   # .. elements; note that entry zero is not
    stackLeftScore  = [None]   # .. used
    stackRightScore = [None]
    stackLower      = [None]
    top   = 0
    score = 0

    for ix in range(len(seq)):
        ch = seq[ix]
        if (ch in blockers):
            # emit intervals

            for sp in range(1,top+1):
                left  = stackLeft [sp] + 1
                right = stackRight[sp]

                while (left < right) and (seq[left]  not in positives): left  += 1
                while (right > left) and (seq[right] not in positives): right -= 1

                right += 1
                if (minLength == None) or (right - left >= minLength):
                    yield (left,right)

            # empty stack

            stackLeft       = [None]
            stackRight      = [None]
            stackLeftScore  = [None]
            stackRightScore = [None]
            stackLower      = [None]
            top   = 0
            score = 0
            continue

        if   (ch in positives): weight = 1-rate
        elif (ch in neutrals):  weight = -rate
        else:                   raise ValueError

        score += weight
        #if ("algorithm" in debug):
        #    print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score),

        if (weight < 0):
            #if ("algorithm" in debug):
            #    print >>sys.stderr
            continue

        if (top > 0) and (stackRight[top] == ix-1):
            # add this site to the interval on top of the stack

            stackRight     [top] = ix
            stackRightScore[top] = score

            #if ("algorithm" in debug):
            #    print >>sys.stderr, \
            #          " extending [%d] %d-%d %4.1f %4.1f" \
            #        % (top,
            #           stackLeft [top], stackRight [top],
            #           stackLeftScore[top], stackRightScore[top]),

        else:
            # create a one site interval

            top += 1
            if (top >= len(stackLeft)):
                stackLeft       += [None]
                stackRight      += [None]
                stackLeftScore  += [None]
                stackRightScore += [None]
                stackLower      += [None]

            stackLeft      [top] = ix - 1
            stackLeftScore [top] = score - weight
            stackRight     [top] = ix
            stackRightScore[top] = score
            stackLower     [top] = top - 1

            while (stackLower[top] > 0) \
              and (stackLeftScore[stackLower[top]] > stackLeftScore[top]):
                stackLower[top] = stackLower[stackLower[top]]

            #if ("algorithm" in debug):
            #    print >>sys.stderr, \
            #          " creating [%d] %d-%d %4.1f %4.1f -> %d" \
            #        % (top,
            #           stackLeft [top], stackRight [top],
            #           stackLeftScore[top], stackRightScore[top],
            #           stackLower [top]),

        # merge intervals; if there is a previous interval with a no-higher
        # left score and no-higher right score, merge this interval (and all
        # intervening ones) into that one

        while (top > 1) \
          and (stackLower[top] > 0) \
          and (stackRightScore[stackLower[top]] <= stackRightScore[top]):
            stackRight     [stackLower[top]] = stackRight     [top]
            stackRightScore[stackLower[top]] = stackRightScore[top]
            top = stackLower[top]

            #if ("algorithm" in debug):
            #    print >>sys.stderr, \
            #          "\n%*s merging [%d] %d-%d %4.1f %4.1f" \
            #        % (13, "", top,
            #           stackLeft[top], stackRight [top],
            #           stackLeftScore[top], stackRightScore[top]),

        #if ("algorithm" in debug):
        #    print >>sys.stderr

    # emit intervals

    for sp in range(1,top+1):
        left  = stackLeft [sp] + 1
        right = stackRight[sp]

        while (left < right) and (seq[left]  not in positives): left  += 1
        while (right > left) and (seq[right] not in positives): right -= 1

        right += 1
        if (minLength == None) or (right - left >= minLength):
            yield (left,right)


###################################################################
# modified by Chen Sun on 7/11/2014
#
###################################################################

# correlation_sequence--
#   Compute the correlation sequence for a given period. This is a sequence
#   of + and - indicating whether the base at a given position matches the one
#   P positions earlier (where P is the period). The first P positions are
#   blank. Positions with single character runs longer than the period are
#   considered as non-matches, unless the period is 1.
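#
# For example (worked by hand from the rules above), with period 3 the
# sequence ACGACGACGTT gives "   ++++++--": three leading blanks, then "+"
# wherever the base matches the one 3 positions back, and "-" for the two
# trailing Ts.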

def correlation_sequence(sequence,period,start=None,end=None):
    if (start == None): start = 0
    if (end   == None): end   = len(sequence)

    prevCh = sequence[start]
    run    = 1
    for ix in xrange(start+1,start+period):
        ch = sequence[ix]
        if (ch != prevCh): run =  1
        else:              run += 1
        prevCh = ch

    corr = [" "] * period
    for ix in xrange(start+period,end):
        rptCh = sequence[ix-period]
        ch    = sequence[ix]
        if (ch != prevCh): run =  1
        else:              run += 1
        if (ch in "ACGT") \
           and (ch == rptCh) \
           and ((period == 1) or (run < period)):
            corr += ["+"]
        else:
            corr += ["-"]
        prevCh = ch

    return "".join(corr)


# longest_suitable_run--
#   Find longest run with a good enough rate (of positives).
#
#   We score a "+" as 1-r and anything else as -r. This is based on the
#   following derivation (p is the number of "+"s, n is the number of non-"+"s):
#       p/(p+n) >= r
#   ==> p >= rp + rn
#   ==> (1-r)p - rn >= 0
#
#   We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing).
#
# $$$ we use the denominator as the threshold, but we really should use the
# $$$ .. numerator, comparing it to minLength*rate
#
# $$$ this needs to account for $$$ this situation:
# $$$ sequence: ACGACGACGACGTTATTATTATTA
# $$$ matches:     +++++++++---+++++++++
# $$$ this is currently considered to be one interval (if rate <= 6/7), but it
# $$$ ought to be two; we can't just post-process, though, because some other
# $$$ interval might be longer than the longest half of this; maybe what we
# $$$ need to do is consider matches at distances -P and -2P, or if we match
# $$$ -P but that itself was a mismatch, we should carry the mismatch forward
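#
# Numerical example (worked by hand from the scoring rule above): with
# rate r = 6/7, each "+" scores 1/7 and each non-"+" scores -6/7, so a window
# of six "+"s and one "-" scores 6*(1/7) - 6/7 = 0, exactly the break-even
# point for a 6/7 match rate.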

def longest_suitable_run(seq,minLength,rate):
    maxEndingHere = 0
    maxSoFar      = 0
    start         = None
    block         = -1      # last position *before* the current candidate run

    for ix in xrange(len(seq)):
        if (seq[ix] == "+"): s = 1-rate
        else:                s = -rate

        if (maxEndingHere+s < 0):
            maxEndingHere = 0
            block = ix
        else:
            maxEndingHere += s
            if (maxEndingHere >= maxSoFar):
                maxSoFar = maxEndingHere
                start    = block + 1
                end      = ix + 1

    if (start == None) or (end - start < minLength):
        return []
    else:
        return [(start,end)]


# all_suitable_runs--
#   Find all non-overlapping runs with a good enough rate (of positives), and
#   which meet our length threshold.
# $$$ this needs to post-process the intervals, splitting them to account for
# $$$ this situation:
# $$$ sequence: ACGACGACGACGTTATTATTATTA
# $$$ matches:     +++++++++---+++++++++
# $$$ this is currently reported as one interval (if rate <= 6/7), but it
# $$$ ought to be two

def all_suitable_runs(seq,minCorrLength,rate,hammingThreshold):

    ################################################################
    # modified by Chen Sun on 07/11/2014
    #
    ################################################################

    if hammingThreshold > 0:
        return [(start,end) for (start,end) in dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength)]
    elif hammingThreshold == 0:
        return [(start,end) for (start,end) in non_negative_intervals(seq,minLength=minCorrLength)]


# find_repeat_element--
#   Find the most plausible repeat element for a run, and nudge the ends of
#   the run if needed. Note that we will not consider kmers that represent
#   shorter repeats. For example, we won't report ACTACT as a 6-mer since we
#   consider this to have a shorter period than 6.

def find_repeat_element(hammingThreshold,period,seq,start,end,allowPartials=False):

    if hammingThreshold > 0:
        (kmer,bestD,bestStart,bestEnd) = find_hamming_repeat_element(period,seq,start,end,allowPartials)
        return (kmer,bestD,bestStart,bestEnd)

    # count the number of occurrences of each k-mer; note that we can't
    # reject kmers containing smaller repeats yet, since for a sequence like
    # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
    # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end
    # up reporting something like ACACAA as the best motif

    if ("element" in debug):
        print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)

    if ("partial" in debug):
        print period, seq, start, end, allowPartials
        print seq[start:end]

    kmerToCount = {}
    kmerToFirst = {}
    for ix in xrange(start,end-(period-1)):
        kmer = seq[ix:ix+period]
        if ("N" in kmer): continue
        if (kmer not in kmerToCount):
            kmerToCount[kmer] = 1
            kmerToFirst[kmer] = ix
        else:
            kmerToCount[kmer] += 1
        #if ("element" in debug):
        #    print >>stderr, " %d: %s" % (ix,kmer)

    # choose the best k-mer; this is simply the most frequently occurring one,
    # with ties broken by whichever one came first

    kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
    if (kmers == []): return (None,None,start,end)
    kmers.sort()

    if ("element" in debug):
        for (count,first,kmer) in kmers:
            print >>stderr, " %s: %d" % (kmer,-count)

    (count,first,kmer) = kmers[0]
    if (contains_repeat(kmer)): return (None,None,start,end)

    # determine the hamming distance between the run and a simple repeat, for
    # each "plausible" start and end; we compute the distance for each such
    # interval, and choose the one with the lowest hamming distance; ties are
    # broken in a deterministic-but-unspecified manner

    bestD = bestStart = bestEnd = None
    ###################################################################################
    # modified by Chen Sun (cxs1031@cse.psu.edu) on 10/18/2013
    # since we do not allow hamming_distance > 0 here (no mutations), we do not
    # need this section to produce bestStart and bestEnd
    ###################################################################################

    #for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
    #    d = hamming_distance(seq,s,e,kmer)
    #    if (d == None): continue
    #    if (bestD == None) or (d <= bestD):
    #        (bestD,bestStart,bestEnd) = (d,s,e)

    bestStart = start

    if (allowPartials):
        bestEnd = end
    elif (not allowPartials):
        bestEnd = start
        pattern = seq[start:start+period]
        if ("partial" in debug):
            print "kmer:", kmer
            if (pattern != kmer):
                print "pattern:", pattern

        while (bestEnd <= end-period):
            bestEnd += period

    # bestD will always be 0, as we do not allow mutations here
    bestD = 0

    if ("partial" in debug):
        print bestD, bestStart, bestEnd

    ###################################################################################
    # modified by Chen Sun (cxs1031@cse.psu.edu) on 10/10
    ###################################################################################
    return (kmer,bestD,bestStart,bestEnd)


def find_hamming_repeat_element(period,seq,start,end,allowPartials=False):

    # count the number of occurrences of each k-mer; note that we can't
    # reject kmers containing smaller repeats yet, since for a sequence like
    # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
    # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end
    # up reporting something like ACACAA as the best motif

    if ("element" in debug):
        print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)

    kmerToCount = {}
    kmerToFirst = {}
    for ix in xrange(start,end-(period-1)):
        kmer = seq[ix:ix+period]
        if ("N" in kmer): continue
        if (kmer not in kmerToCount):
            kmerToCount[kmer] = 1
            kmerToFirst[kmer] = ix
        else:
            kmerToCount[kmer] += 1
        #if ("element" in debug):
        #    print >>stderr, " %d: %s" % (ix,kmer)

    # choose the best k-mer; this is simply the most frequently occurring one,
    # with ties broken by whichever one came first

    kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
    if (kmers == []): return (None,None,start,end)
    kmers.sort()

    if ("element" in debug):
        for (count,first,kmer) in kmers:
            print >>stderr, " %s: %d" % (kmer,-count)

    (count,first,kmer) = kmers[0]
    if (contains_repeat(kmer)): return (None,None,start,end)

    # determine the hamming distance between the run and a simple repeat, for
    # each "plausible" start and end; we compute the distance for each such
    # interval, and choose the one with the lowest hamming distance; ties are
    # broken in a deterministic-but-unspecified manner

    bestD = bestStart = bestEnd = None

    for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
        d = hamming_distance(seq,s,e,kmer)
        if (d == None): continue
        if (bestD == None) or (d <= bestD):
            (bestD,bestStart,bestEnd) = (d,s,e)

    return (kmer,bestD,bestStart,bestEnd)

# plausible_intervals--
#   Yield all plausible intervals intersecting with a run. We generate all
#   starts within P bp of the run's start. For each of these, we either (a) try
#   all ends within P bp of the run's end, or (b) trim the new interval to a
#   whole multiple of the period, and report this short interval and the longer
#   interval with one more period appended. Case (a) allows partial motifs,
#   while case (b) only allows whole motifs.
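#
# For example (worked by hand), with period 3 and a run at start=10, end=22 in
# a long sequence, case (b) tries candStart values 8..12; for candStart=8 it
# yields (8,20) (four whole copies) and then (8,23) (one more copy appended).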

def plausible_intervals(start,end,period,seqLen,allowPartials=False):

    # generate intervals that allow a partial copy of the motif

    if (allowPartials):
        for candStart in xrange(start-(period-1),start+period):
            if (candStart < 0): continue
            for candEnd in xrange(end-(period-1),end+period):
                if (candEnd > seqLen): continue
                if (candEnd <= candStart+period): continue
                yield (candStart,candEnd)

    # -OR- generate intervals that allow only whole copies of the motif

    else:
        for candStart in xrange(start-(period-1),start+period):
            if (candStart < 0): continue
            candEnd = candStart + ((end-candStart)/period)*period
            yield (candStart,candEnd)
            candEnd += period
            if (candEnd <= seqLen): yield (candStart,candEnd)


# hamming_distance--
#   Determine the hamming distance between the run and a simple repeat.
# $$$ improve this by allowing gaps, and stopping when we reach a threshold
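#
# For example (worked by hand), hamming_distance("ACGACTACG",0,9,"ACG") is 1:
# the middle copy ACT differs from the motif ACG in one position.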

kmerToDiffs = {}   # (this is used for memo-ization)

def hamming_distance(seq,start,end,kmer):
    period = len(kmer)
    if (end < start + period): return None

    wholeEnd = start + ((end-start)/period)*period

    if (kmer not in kmerToDiffs):
        kmerToDiffs[kmer] = { kmer:0 }

    d = 0
    for ix in xrange(start,wholeEnd,period):
        qmer = seq[ix:ix+period]   # same size as the kmer motif
        if (qmer in kmerToDiffs[kmer]):
            d += kmerToDiffs[kmer][qmer]
            continue
        diffs = 0
        for iy in xrange(0,period):
            if (qmer[iy] != kmer[iy]): diffs += 1
        kmerToDiffs[kmer][qmer] = diffs
        d += diffs

    if (end > wholeEnd):
        qmer = seq[wholeEnd:end]   # shorter than the kmer motif
        if (qmer in kmerToDiffs[kmer]):
            d += kmerToDiffs[kmer][qmer]
        else:
            diffs = 0
            for iy in xrange(0,len(qmer)):
                if (qmer[iy] != kmer[iy]): diffs += 1
            kmerToDiffs[kmer][qmer] = diffs
            d += diffs

    return d


# fasta_sequences--
#   Read the fasta sequences from a file. Note that we convert to upper case,
#   and convert any letter other than ACGT to N.

nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN")

def fasta_sequences(f):
    seqName = None
    seqNucs = None

    for line in f:
        line = line.strip()
        if (line.startswith(">")):
            if (seqName != None):
                yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
            seqName = sequence_name(line)
            seqNucs = []
        elif (seqName == None):
            assert (False), "first sequence has no header"
        else:
            seqNucs += [line]

    if (seqName != None):
        yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))


# fastq_sequences--
#   Read the fastq sequences from a file. Note that we convert to upper case,
#   and convert any letter other than ACGT to N.

def fastq_sequences(f):
    lineNum = 0
    for line in f:
        lineNum += 1
        line = line.strip()

        if (lineNum % 4 == 1):
            assert (line.startswith("@")), \
                   "bad read name at line %d" % lineNum
            seqName = line[1:]
            continue

        if (lineNum % 4 == 2):
            seqNucs = line
            continue

        if (lineNum % 4 == 3):
            assert (line.startswith("+")), \
                   "can't understand line %d:\n%s" % (lineNum,line)
            continue

        quals = line
        assert (len(quals) == len(seqNucs)), \
               "length mismatch read vs. qualities at line %d" % lineNum
        yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap),quals)

    assert (lineNum % 4 == 0), \
           "incomplete read at end of file"

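# sam_sequences--
#   Read sequences from a SAM file, skipping header lines.  For each alignment
#   record this yields the read name (column 1), the read sequence (column 10,
#   upper-cased with non-ACGT letters mapped to N), the reference name
#   (column 3), the 0-based alignment start (column 4 minus 1), and the CIGAR
#   string (column 6).
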
def sam_sequences(f):
    lineNum = 0
    for line in f:
        lineNum += 1
        line = line.strip()

        if line.startswith("@"):
            continue

        columns = line.split("\t")
        seqName = columns[0]
        refName = columns[2]
        pre_s   = int(columns[3]) - 1
        cigar   = columns[5]
        seqNucs = columns[9]

        yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap),refName,pre_s,cigar)

# sequence_name--
#   Extract the sequence name from a fasta header.
# $$$ this may need to be improved $$$

def sequence_name(s):
    s = s[1:].strip()
    if (s == ""): return ""
    else:         return s.split()[0]


# nucleotide_runs--
#   Yield (start,end) for all runs of valid nucleotides in a sequence.

def nucleotide_runs(s):
    runs  = []
    start = None
    for (ix,nuc) in enumerate(s):
        if (nuc in "ACGT"):
            if (start == None):
                start = ix
        else:
            if (start != None):
                yield (start,ix)
                start = None

    if (start != None): yield (start,len(s))


# contains_repeat--
#   Determine whether a short sequence contains a repeated element, such as a
#   6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT). The repeat
#   must cover the entire sequence, without mismatches.
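#
# For example (worked by hand from the code below), contains_repeat("ACACAC")
# and contains_repeat("ACGACG") are both True, while contains_repeat("ACGTCA")
# is False.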

def contains_repeat(kmer):
    kmerLength = len(kmer)
    hasRepeat  = False
    rptLen     = 1
    while (not hasRepeat) and (2 * rptLen <= kmerLength):
        if (kmerLength % rptLen != 0):
            rptLen += 1
            continue
        isRepeat = True
        for i in xrange(rptLen,kmerLength,rptLen):
            if (kmer[i:i+rptLen] != kmer[:rptLen]):
                isRepeat = False
                break
        if (isRepeat):
            hasRepeat = True
            break
        rptLen += 1
    return hasRepeat


# hash108--
#   Return a 108-bit hash "value" of a string

def hash108(s):
    m = md5_new()
    m.update(s)
    return m.hexdigest()[:27]


# float_or_fraction--
#   Convert a string to a number, allowing fractions
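#
# For example, float_or_fraction("6/7") returns 0.8571... and
# float_or_fraction("0.85") returns 0.85.  (Note: this duplicates the
# definition earlier in the file; the later definition is the one Python keeps.)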

def float_or_fraction(s):
    if ("/" in s):
        (numer,denom) = s.split("/",1)
        return float(numer)/float(denom)
    else:
        return float(s)


# int_with_unit--
#   Parse a string as an integer, allowing unit suffixes
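#
# For example, int_with_unit("500") returns 500, int_with_unit("5K") returns
# 5000, and int_with_unit("2.5M") returns 2500000 (fractional values are
# rounded up; this relies on the math module imported above).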

def int_with_unit(s):
    if (s.endswith("K")):
        multiplier = 1000
        s = s[:-1]
    elif (s.endswith("M")):
        multiplier = 1000 * 1000
        s = s[:-1]
    elif (s.endswith("G")):
        multiplier = 1000 * 1000 * 1000
        s = s[:-1]
    else:
        multiplier = 1

    try:               return int(s) * multiplier
    except ValueError: return int(math.ceil(float(s) * multiplier))


if __name__ == "__main__": main()