comparison microsatellite.py @ 0:20ab85af9505

Uploaded
author arkarachai-fungtammasan
date Fri, 03 Oct 2014 20:54:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:20ab85af9505
1 #!/usr/bin/env python
2 """
3 Snoop thru a fasta file looking for microsatellite repeats of given periods
4 Output format: length_of_repeat left_flank_length right_flank_length repeat_motif hamming_distance read_name read_sequence read_quality (additional columns)
5
6 If --r option turned on, output format will have additional columns behind:
7 read_name read_chr pre_s pre_e tr_s tr_e suf_s suf_e tr_len tr_ref_seq
8
9 pre_s where the read start
10 pre_e the last position before microsatellite
11 tr_s where microsatellite start
12 tr_e where microsatellite end
13 suf_s first base after microsatellite
14 tr_ref_seq reference sequence corresponding to microsatellite
15
16 * output positions are 0 based
17
18 :Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu)
19
20 modifing log:
21
22 09/27/2013
23 replace function dense_intervals with function non_negative_intervals, which do not need to import such file.
24
25 10/18/2013
26 modify function find_repeat_element to get a quick speed, under the condition that hamming_distance = 0, which means do not allowed any mutation/indel
27
28 02/25/2014
29 add function that can deal with mapped reads
30 with additional output
31
32 02/28/2014
33 modify the 0-based end point, as in 0-base area, it is half-open [ )
34 so the 0-based site, should always be added by 1
35
36 03/05/2014
37 deal with multi-fasta
38 """
39 from sys import argv,stdin,stderr,exit
40 from string import maketrans
41 from md5 import new as md5_new
42 import re
43 #from pyfracluster import dense_intervals
44
45 def usage(s=None):
46 message = """
47 usage: microsat_snoop [fasta_file] [options]
48 <fasta_file> Name of file to read sequences from; if absent,
49 sequences are read from stdin
50 --fasta Input file is in fasta format
51 (this is the default)
52 --fastq Input file is in fastq format
53 (default is fasta unless filename is .fastq)
54 --fastq:noquals Input file is in fastq format, but discard quals
55 --sam Input file is SAM file
56 --r Indicate additional output information, if indicated,
57 --ref option is mendatory
58 --ref=<filepath> Reference file (absolute) path
59 --period=<length> (mandatory,cumulative) repeat length(s) to be
60 searched for
61 <length> is expected to be small, less than 10
62 <length> can also be a comma-separated list, or
63 a range <low>..<high>
64 --rate=<fraction> control the candidate repeat interval detector;
65 it will consider intervals with at least
66 <fraction> of matches when shifted by the period;
67 <fraction> is between 0 and 1 and can be either a
68 real number or <n>/<d>
69 (default is 6/7)
70 --minlength=<length> minimum length of intervals reported, in bp
71 (default is 20)
72 --progress=<count> how often to report the sequence we're searching
73 (default is no progress report)
74 --allowduplicates process all input sequences
75 (this is the default)
76 --noduplicates ignore any input sequence that's the same as an
77 earlier sequence
78 --nonearduplicates ignore any input sequence that has the same first
79 100 bp as an earlier sequence
80 --nonearduplicate=<length> ignore any input sequence that has the same first
81 <length> bp as an earlier sequence
82 --hamming=<count> Don't report candidate repeat intervals that have
83 more than <count> mismatches
84 (default is to do no such filtering)
85 --prefix=<length> Don't report candidate repeat intervals that
86 start within <length> of the sequence start
87 (default is to do no such filtering)
88 --suffix=<length> Don't report candidate repeat intervals that
89 end within <length> of the sequence end
90 (default is to do no such filtering)
91 --subsample=<k>/<n> Process only the <k>th sequence of every group of
92 <n> sequences; <k> ranges from 1 to <n>
93 --multipleruns Consider all candidate intervals in a sequence
94 (default is to consider only the longest)
95 --partialmotifs Consider microatelites with a partial motif
96 (default is to consider only whole motifs)
97 --splitbyvalidity Preprocess sequences, splitting at Ns; this
98 prevents candidates from including Ns
99 (default is not to split)
100 --noflankdisplay Show entire sequence as flanking regions
101 (this is the default)
102 --flankdisplay=<length> Limit length of flanking regions shown
103 --readnamesuffix=<string> Root of suffix to append to read names; e.g. 1
104 for forward, 2 for reverse; this triggers other
105 info to be included in the suffix
106 (default is "1" for fastq; no suffix for fasta)
107 --head=<number> limit the number of sequences processed
108 --markend Write a marker line upon completion
109 (default is not to write a marker)
110 --help=details Describe the process, and quit"""
111
112 if (s == None): exit (message)
113 else: exit ("%s\n%s" % (s,message))
114
115
116 detailedDescription = """In broad terms, the process works as follows:
117
118 (1) Identify intervals that are highly correlated with the interval shifted by
119 P (the repeat period). These intervals are called "runs" or "candidates".
120 The level of correlation required is controlled by rateThreshold.
121 Depending on whether we want to look for more than one microsat, we either
122 find the longest such run (simple algorithm) or many runs (more complicated
123 algorithm). The following steps are then performed on each run.
124
125 (2) Find the most likely repeat motif in the run. This is done by counting
126 all kmers (of length P) and choosing the most frequent. If that kmer is
127 itself covered by a sub-repeat we discard this run. The idea is that we
128 can ignore a 6-mer like ACGACG because we will find it when we are looking
129 for 3-mers.
130
131 (3) Once we identify the most likely repeat motif, we then modify the
132 interval, adjusting start and end to find the interval that has the fewest
133 mismatches vs. a sequence of the motif repeated (hamming distance). Only
134 whole copies of the motif are considered.
135
136 (4) At this point we have a valid microsat interval (in the eyes of the
137 program). It is subjected to some filtering stages (hamming distance or too
138 close to an end), and if it satisfies those conditions, it's reported to
139 the user."""
140
141 def main():
142 global debug
143
144 #=== parse the command line ===
145
146 inputFilename = None
147 referenceFileName = None #add by Chen Sun on 02/25
148 inputFormat = None
149 repeatPeriods = []
150 rateThreshold = 6 / 7.0
151 lengthThreshold = 20
152 reportProgress = None
153 discardDuplicates = False
154 discardNearDuplicates = False
155 nearDuplicatePrefix = 100
156 hammingThreshold = 0
157 prefixThreshold = None
158 suffixThreshold = None
159 subsampleK = None
160 subsampleN = None
161 reportMultipleRuns = False
162 allowPartialMotifs = False
163 splitByValidity = False
164 flankDisplayLimit = None
165 readNameSuffix = None
166 headLimit = None
167 markEndOfFile = False
168 additionalInfo = False
169 debug = []
170
171 for arg in argv[1:]:
172 if (arg == "--fasta"):
173 inputFormat = "fasta"
174 elif (arg == "--fastq"):
175 inputFormat = "fastq"
176 elif (arg == "--fastq:noquals"):
177 inputFormat = "fastq:noquals"
178 elif (arg == "--sam"):
179 inputFormat = "sam"
180 elif (arg == "--r"):
181 additionalInfo = True
182 elif (arg.startswith("--ref=")):
183 referenceFileName = arg.split("=",1)[1]
184 elif (arg.startswith("--period=")):
185 val = arg.split("=",1)[1]
186 for period in val.split(","):
187 if (".." in period):
188 (lowPeriod,highPeriod) = period.split("..",1)
189 lowPeriod = int(lowPeriod)
190 highPeriod = int(highPeriod)
191 for period in xrange(lowPeriod,highPeriod+1):
192 repeatPeriods += [period]
193 else:
194 repeatPeriods += [int(period)]
195 elif (arg.startswith("--rate=")):
196 val = arg.split("=",1)[1]
197 rateThreshold = float_or_fraction(val)
198 assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val
199 elif (arg.startswith("--minlength=")):
200 val = arg.split("=",1)[1]
201 lengthThreshold = int(val)
202 assert (lengthThreshold >= 0)
203 elif (arg.startswith("--progress=")):
204 val = arg.split("=",1)[1]
205 reportProgress = int(val)
206 elif (arg == "--allowduplicates"):
207 discardDuplicates = False
208 discardNearDuplicates = False
209 elif (arg == "--noduplicates"):
210 discardDuplicates = True
211 discardNearDuplicates = False
212 elif (arg == "--nonearduplicates"):
213 discardDuplicates = False
214 discardNearDuplicates = True
215 elif (arg.startswith("--nonearduplicate=")):
216 val = arg.split("=",1)[1]
217 discardDuplicates = False
218 discardNearDuplicates = True
219 nearDuplicatePrefix = int(val)
220 assert (nearDuplicatePrefix > 0)
221 elif (arg.startswith("--hamming=")):
222 val = arg.split("=",1)[1]
223 hammingThreshold = int(val)
224 assert (hammingThreshold >= 0)
225 elif (arg.startswith("--prefix=")):
226 val = arg.split("=",1)[1]
227 prefixThreshold = int(val)
228 assert (prefixThreshold >= 0)
229 elif (arg.startswith("--suffix=")):
230 val = arg.split("=",1)[1]
231 suffixThreshold = int(val)
232 assert (suffixThreshold >= 0)
233 elif (arg.startswith("--subsample=")):
234 val = arg.split("=",1)[1]
235 (k,n) = val.split("/",2)
236 subsampleK = int(k)
237 subsampleN = int(n)
238 assert (0 < subsampleK <= subsampleN)
239 elif (arg == "--multipleruns"):
240 reportMultipleRuns = True
241 elif (arg == "--partialmotifs"):
242 allowPartialMotifs = True
243 elif (arg == "--splitbyvalidity"):
244 splitByValidity = True
245 elif (arg == "--noflankdisplay"):
246 flankDisplayLimit = None
247 elif (arg.startswith("--flankdisplay=")):
248 val = arg.split("=",1)[1]
249 flankDisplayLimit = int(val)
250 assert (flankDisplayLimit >= 0)
251 elif (arg.startswith("--readnamesuffix")):
252 readNameSuffix = arg.split("=",1)[1]
253 elif (arg.startswith("--head=")):
254 headLimit = int_with_unit(arg.split("=",1)[1])
255 elif (arg == "--markend"):
256 markEndOfFile = True
257 elif (arg == "--help=details"):
258 exit (detailedDescription)
259 elif (arg.startswith("--debug=")):
260 debug += (arg.split("=",1)[1]).split(",")
261 elif (arg.startswith("--")):
262 usage("unrecognized option: %s" % arg)
263 elif (inputFilename == None):
264 inputFilename = arg
265 else:
266 usage("unrecognized option: %s" % arg)
267
268 #=== determine periods of interest ===
269
270 if (repeatPeriods == []):
271 usage("you gotta give me a repeat period")
272
273 if (additionalInfo == True):
274 if (referenceFileName == None):
275 usage("reference file path needed. use --ref=<reference> to indicate")
276
277 periodSeed = {}
278 for period in repeatPeriods:
279 if (period < 1): usage("period %d is not valid" % period)
280 periodSeed[period] = True
281
282 repeatPeriods = [period for period in periodSeed]
283 repeatPeriods.sort()
284
285 #=== determine input format ===
286
287 if (inputFormat == "fasta"): sequence_reader = fasta_sequences
288 elif (inputFormat == "fastq"): sequence_reader = fastq_sequences
289 elif (inputFormat == "fastq:noquals"): sequence_reader = fastq_sequences
290 elif (inputFormat == "sam"): sequence_reader = sam_sequences
291 elif (inputFilename == None): sequence_reader = fasta_sequences
292 elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences
293 elif (inputFilename.endswith(".fq")): sequence_reader = fastq_sequences
294 elif (inputFilename.endswith(".sam")): sequence_reader = sam_sequences
295 else: sequence_reader = fasta_sequences
296
297 if (inputFilename != None): inputF = file(inputFilename,"rt")
298 else: inputF = stdin
299
300 if (readNameSuffix == None) \
301 and (sequence_reader == fastq_sequences) \
302 and (inputFormat != "fastq:noquals"):
303 readNameSuffix = "1"
304
305 #=== process the sequences ===
306
307 refSequence = {}
308 rightName = ""
309 sequence = ""
310 if additionalInfo:
311 firstFasta = True
312 originalRefF = open(referenceFileName)
313 for line in originalRefF.readlines():
314 line = line.replace('\r','')
315 line = line.replace('\n','')
316 if line.startswith(">"):
317 if firstFasta:
318 firstFasta = False
319 else:
320 refSequence[rightName] = sequence
321 rightName = line[1:]
322 sequence = ""
323 continue
324 sequence += line
325 originalRefF.close()
326 refSequence[rightName] = sequence
327
328 sequenceSeen = {}
329
330 numSequences = 0
331 for seqInfo in sequence_reader(inputF):
332 numSequences += 1
333 if (headLimit != None) and (numSequences > headLimit):
334 print >>stderr, "limit of %d sequences reached" % headLimit
335 break
336
337 if (sequence_reader == sam_sequences):
338 #seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar
339 (name, sequence, refName, pre_s, cigar) = seqInfo
340 quals = None
341 elif (sequence_reader == fastq_sequences):
342 (name,sequence,quals) = seqInfo
343 if (inputFormat == "fastq:noquals"): quals = None
344 else:
345 (name,sequence) = seqInfo
346 quals = None
347
348 if (reportProgress != None) and (numSequences % reportProgress == 0):
349 print >>stderr, "%s %d" % (name,numSequences)
350
351 # if we're subsampling and not interested in this sequence, skip it
352
353 if (subsampleN != None):
354 if ((numSequences-1) % subsampleN != (subsampleK-1)):
355 continue
356
357 # if this sequence is shorter than the length of interest, skip it
358
359 seqLen = len(sequence)
360 if (seqLen < period) or (seqLen < lengthThreshold): continue
361
362 # if we're not interested in duplicates and this is one, skip it;
363 # note that we assume no hash collisions occur, i.e. that all hash
364 # matches are truly sequence matches
365
366 if (discardDuplicates):
367 h = hash108(sequence)
368 if (h in sequenceSeen): continue
369 sequenceSeen[h] = True
370 elif (discardNearDuplicates):
371 h = hash108(sequence[:nearDuplicatePrefix])
372 if (h in sequenceSeen): continue
373 sequenceSeen[h] = True
374
375 # split the sequence into chunks of valid nucleotides
376
377 if (splitByValidity):
378 chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)]
379 else:
380 chunks = [(0,len(sequence))]
381
382 # evaluate for each period of interest
383
384 for period in repeatPeriods:
385
386 # operate on each chunk
387
388 for (chunkStart,chunkEnd) in chunks:
389 chunkLen = chunkEnd - chunkStart
390 if (chunkLen < period) or (chunkLen < lengthThreshold): continue
391
392 if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug):
393 print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd)
394
395 # compute correlation sequence
396
397 corr = correlation_sequence(sequence,period,chunkStart,chunkEnd)
398
399 if ("correlation" in debug) or ("runs" in debug):
400 print >>stderr, sequence[chunkStart:chunkEnd]
401 print >>stderr, corr
402
403 # find runs (candidates for being a microsat)
404
405 if (reportMultipleRuns):
406 runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold, hammingThreshold)
407 else:
408 runs = longest_suitable_run(corr,lengthThreshold,rateThreshold)
409 if (runs == []): continue
410
411
412 if ("runs" in debug):
413 for (start,end) in runs:
414 run = [" "] * seqLen
415 for ix in xrange(start-period,end):
416 run[ix] = "*"
417 print >>stderr, "".join(run)
418
419 if ("candidates" in debug):
420 for (start,end) in runs:
421 print >>stderr, "%s %d %d" % (name,start,end)
422
423 # process runs and report those that pass muster
424
425 runCount = 0
426 for (start,end) in runs:
427 runCount += 1
428
429 start = chunkStart + start - period
430 end = chunkStart + end
431
432 (kmer,d,start,end) = find_repeat_element(hammingThreshold, period,sequence,start,end,allowPartials=allowPartialMotifs)
433 if (kmer == None): continue # (no useful repeat kmer was found)
434
435 rptExtent = end - start
436 prefixLen = start
437 suffixLen = seqLen - end
438 if (rptExtent <= period): continue
439 if (hammingThreshold != None) and (d > hammingThreshold): continue
440 if (prefixThreshold != None) and (prefixLen < prefixThreshold): continue
441 if (suffixThreshold != None) and (suffixLen < suffixThreshold): continue
442
443 if (flankDisplayLimit == None):
444 seq = sequence[:start] \
445 + sequence[start:end].lower() \
446 + sequence[end:]
447 else:
448 seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \
449 + sequence[start:end].lower() \
450 + sequence[end:min(chunkEnd,end+flankDisplayLimit)]
451 reportName = name
452 if (readNameSuffix != None):
453 reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount)
454 if (quals == None or quals == "." or quals == "\t."): quals = "\t."
455 else: quals = "\t" + quals
456 if not additionalInfo:
457 print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \
458 % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals)
459 else:
460 #pre_e = pre_s + prefixLen - 1
461 refPoint = pre_s
462 donorPoint = 0
463
464 donorBeforeStart = prefixLen - 1 #pre_e
465 donorMicroStart = prefixLen #tr_s
466 donorMicroEnd = donorMicroStart + rptExtent - 1 #tr_e
467 donorAfterMicro = donorMicroEnd + 1 #suf_s
468 donorEnd = len(seq) - 1 #suf_e
469
470 set_pre_e = False
471 set_tr_s = False
472 set_tr_e = False
473 set_suf_s = False
474 set_suf_e = False
475
476 pre_e = 0
477 tr_s = 0
478 tr_e = 0
479 suf_s = 0
480 suf_e = 0
481
482 matchList = re.findall('(\d+)([IDM])', cigar)
483 unCognitiveCigar = False
484 for matchN, matchType in matchList:
485 matchNum = int(matchN)
486 if matchType == "M":
487 donorPoint = donorPoint + matchNum
488 refPoint = refPoint + matchNum
489 elif matchType == "D":
490 refPoint = refPoint + matchNum
491 continue
492 elif matchType == "I":
493 donorPoint = donorPoint + matchNum
494 else:
495 unCognitiveCigar = True
496 break
497
498 if not set_pre_e:
499 if donorPoint >= donorBeforeStart:
500 pre_e = refPoint - (donorPoint - donorBeforeStart)
501 set_pre_e = True
502 else:
503 continue
504
505 if not set_tr_s:
506 if donorPoint >= donorMicroStart:
507 tr_s = refPoint - (donorPoint - donorMicroStart)
508 set_tr_s = True
509 else:
510 continue
511
512 if not set_tr_e:
513 if donorPoint >= donorMicroEnd:
514 tr_e = refPoint - (donorPoint - donorMicroEnd)
515 set_tr_e = True
516 else:
517 continue
518
519 if not set_suf_s:
520 if donorPoint >= donorAfterMicro:
521 suf_s = refPoint - (donorPoint - donorAfterMicro)
522 set_suf_s = True
523 else:
524 continue
525
526 if not set_suf_e:
527 if donorPoint >= donorEnd:
528 suf_e = refPoint - (donorPoint - donorEnd)
529 set_suf_e = True
530 else:
531 continue
532
533 if unCognitiveCigar:
534 break
535 tr_len = tr_e - tr_s + 1
536
537 if refName not in refSequence:
538 tr_ref_seq = "."
539 else:
540 if refSequence[refName] == "":
541 tr_ref_seq = "."
542 elif len(refSequence[refName]) <= tr_e:
543 tr_ref_seq = "."
544 else:
545 tr_ref_seq = refSequence[refName][tr_s:tr_e+1]
546
547 pre_e += 1
548 tr_e += 1
549 suf_e += 1
550 print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \
551 % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
552
553 if (markEndOfFile):
554 print "# microsat_snoop end-of-file"
555
556 if (inputF != stdin):
557 inputF.close()
558
559 # non_negative_intervals
560 # find intervals with exactly + and no -
561 # from string like this : +++++++++---+++++++++
562 def non_negative_intervals(seq, minLength=None):
563
564 start = -1
565 end = -1
566 firstPlus = 1
567 #print seq
568 for ix in range(len(seq)): # for every char in seq
569 ch = seq[ix]
570 if(ch == "+"):
571 if(firstPlus):
572 firstPlus = 0
573 start = ix
574 else:
575 continue
576 elif(ch == "-"):
577 if(start >= 0):
578 end = ix-1
579 if((end - start + 1) >= minLength):
580 yield (start,end+1)
581 start = -1
582 firstPlus = 1
583 if(start > 0):
584 if((ix - start + 1) >= minLength):
585 yield (start, ix+1)
586
587
588 ###################################################################
589 # modified by Chen Sun on 7/11/2014
590 # We do not want other modules, so parse these functions inside
591 #
592 ###################################################################
593
594 # parse a string of the form {positives}/{positives_and_neutrals}
595
596 def parse_spec(s):
597 if ("/" not in s): raise ValueError
598 (n,d) = s.split("/",1)
599 if (not n.startswith("{")) or (not n.endswith("}")): raise ValueError
600 if (not d.startswith("{")) or (not d.endswith("}")): raise ValueError
601
602 positives = n[1:-1]
603 d = d[1:-1]
604
605 for ch in positives:
606 if (ch not in d): raise ValueError
607
608 neutrals = [ch for ch in d if (ch not in positives)]
609 return (positives,neutrals)
610
611
612 # convert a string to a number, allowing fractions
613
614 def float_or_fraction(s):
615 if ("/" in s):
616 (numer,denom) = s.split("/",1)
617 return float(numer)/float(denom)
618 else:
619 return float(s)
620
621
622 # dense_intervals--
623 # Find all non-overlapping runs with a good enough rate (of positives), and
624 # which meet our length threshold.
625 #
626 # The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing
627 # long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999.
628 #
629 # $$$ we use the denominator as the threshold, but we really should use the
630 # $$$ .. numerator, comparing it to minLength*rate
631
632 def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None):
633
634 if (blockers == None):
635 blockers = "".join([chr(n) for n in range(1,256)
636 if (chr(n) not in positives)
637 and (chr(n) not in neutrals)])
638
639 stackLeft = [None] # stack with each entry containing five
640 stackRight = [None] # .. elements; note that entry zero is not
641 stackLeftScore = [None] # .. used
642 stackRightScore = [None]
643 stackLower = [None]
644 top = 0
645 score = 0
646
647 for ix in range(len(seq)):
648 ch = seq[ix]
649 if (ch in blockers):
650 # emit intervals
651
652 for sp in range(1,top+1):
653 left = stackLeft [sp] + 1
654 right = stackRight[sp]
655
656 while (left < right) and (seq[left] not in positives): left += 1
657 while (right > left) and (seq[right] not in positives): right -= 1
658
659 right += 1
660 if (minLength == None) or (right - left >= minLength):
661 yield (left,right)
662
663 #empty stack
664
665 stackLeft = [None]
666 stackRight = [None]
667 stackLeftScore = [None]
668 stackRightScore = [None]
669 stackLower = [None]
670 top = 0
671 score = 0
672 continue
673
674 if (ch in positives): weight = 1-rate
675 elif (ch in neutrals): weight = -rate
676 else: raise ValueError
677
678 score += weight
679 #if ("algorithm" in debug):
680 # print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score),
681
682 if (weight < 0):
683 #if ("algorithm" in debug):
684 # print >>sys.stderr
685 continue
686
687 if (top > 0) and (stackRight[top] == ix-1):
688 # add this site to the interval on top of the stack
689
690 stackRight [top] = ix
691 stackRightScore[top] = score
692
693 #if ("algorithm" in debug):
694 # print >>sys.stderr, \
695 # " extending [%d] %d-%d %4.1f %4.1f" \
696 # % (top,
697 # stackLeft [top], stackRight [top],
698 # stackLeftScore[top], stackRightScore[top]),
699
700 else:
701 # create a one site interval
702
703 top += 1
704 if (top >= len(stackLeft)):
705 stackLeft += [None]
706 stackRight += [None]
707 stackLeftScore += [None]
708 stackRightScore += [None]
709 stackLower += [None]
710
711 stackLeft [top] = ix - 1
712 stackLeftScore [top] = score - weight
713 stackRight [top] = ix
714 stackRightScore[top] = score
715 stackLower [top] = top - 1
716
717 while (stackLower[top] > 0) \
718 and (stackLeftScore[stackLower[top]] > stackLeftScore[top]):
719 stackLower[top] = stackLower[stackLower[top]]
720
721 #if ("algorithm" in debug):
722 # print >>sys.stderr, \
723 # " creating [%d] %d-%d %4.1f %4.1f -> %d" \
724 # % (top,
725 # stackLeft [top], stackRight [top],
726 # stackLeftScore[top], stackRightScore[top],
727 # stackLower [top]),
728
729 # merge intervals; if there is a previous interval with a no-higher
730 # left score and no-higher right score, merge this interval (and all
731 # intervening ones) into that one
732
733 while (top > 1) \
734 and (stackLower[top] > 0) \
735 and (stackRightScore[stackLower[top]] <= stackRightScore[top]):
736 stackRight [stackLower[top]] = stackRight [top]
737 stackRightScore[stackLower[top]] = stackRightScore[top]
738 top = stackLower[top]
739
740 #if ("algorithm" in debug):
741 # print >>sys.stderr, \
742 # "\n%*s merging [%d] %d-%d %4.1f %4.1f" \
743 # % (13, "", top,
744 # stackLeft[top], stackRight [top],
745 # stackLeftScore[top], stackRightScore[top]),
746
747 #if ("algorithm" in debug):
748 # print >>sys.stderr
749
750 # emit intervals
751
752 for sp in range(1,top+1):
753 left = stackLeft [sp] + 1
754 right = stackRight[sp]
755
756 while (left < right) and (seq[left] not in positives): left += 1
757 while (right > left) and (seq[right] not in positives): right -= 1
758
759 right += 1
760 if (minLength == None) or (right - left >= minLength):
761 yield (left,right)
762
763
764 ###################################################################
765 # modified by Chen Sun on 7/11/2014
766 #
767 ###################################################################
768
769 # correlation_sequence--
770 # Compute the correlation sequence for a given period. This is a sequence
771 # of + and - indicating whether the base at a given position matches the one
772 # P positions earlier (where P is the period). The first P positions are
773 # blank. Positions with single character runs longer than the period are
774 # considered as non-matches, unless the period is 1.
775
776 def correlation_sequence(sequence,period,start=None,end=None):
777 if (start == None): start = 0
778 if (end == None): end = len(sequence)
779
780 prevCh = sequence[start]
781 run = 1
782 for ix in xrange(start+1,start+period):
783 ch = sequence[ix]
784 if (ch != prevCh): run = 1
785 else: run += 1
786 prevCh = ch
787
788 corr = [" "] * period
789 for ix in xrange(start+period,end):
790 rptCh = sequence[ix-period]
791 ch = sequence[ix]
792 if (ch != prevCh): run = 1
793 else: run += 1
794 if (ch in "ACGT") \
795 and (ch == rptCh) \
796 and ((period == 1) or (run < period)):
797 corr += ["+"]
798 else:
799 corr += ["-"]
800 prevCh = ch
801
802 return "".join(corr)
803
804
805 # longest_suitable_run--
806 # Find longest run with a good enough rate (of positives).
807 #
808 # We score a "+" as 1-r and anything else as -r. This is based on the fol-
809 # lowing derivation (p is the number of "+"s, n is the number of non-"+"s):
810 # p/(p+n) >= r
811 # ==> p >= rp + rn
812 # ==> (1-r)p - rn >= 0
813 #
814 # We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing).
815 #
816 # $$$ we use the denominator as the threshold, but we really should use the
817 # $$$ .. numerator, comparing it to minLength*rate
818 #
819 # $$$ this needs to account for $$$ this situation:
820 # $$$ sequence: ACGACGACGACGTTATTATTATTA
821 # $$$ matches: +++++++++---+++++++++
822 # $$$ this is currently considered to be one interval (if rate <= 6/7), but it
823 # $$$ ought to be two; we can't just post-process, though, because some other
824 # $$$ interval might be longer than the longest half of this; maybe what we
825 # $$$ need to do is consider matches at distances -P and -2P, or if we match
826 # $$$ -P but that itself was a mismatch, we should carry the mismatch forward
827
828 def longest_suitable_run(seq,minLength,rate):
829 maxEndingHere = 0
830 maxSoFar = 0
831 start = None
832
833 for ix in xrange(len(seq)):
834 if (seq[ix] == "+"): s = 1-rate
835 else: s = -rate
836
837 if (maxEndingHere+s < 0):
838 maxEndingHere = 0
839 block = ix
840 else:
841 maxEndingHere += s
842 if (maxEndingHere >= maxSoFar):
843 maxSoFar = maxEndingHere
844 start = block + 1
845 end = ix + 1
846
847 if (start == None) or (end - start < minLength):
848 return []
849 else:
850 return [(start,end)]
851
852
853 # all_suitable_runs--
854 # Find all non-overlapping runs with a good enough rate (of positives), and
855 # which meet our length threshold.
856 # $$$ this needs to post-process the intervals, splitting them to account for
857 # $$$ this situation:
858 # $$$ sequence: ACGACGACGACGTTATTATTATTA
859 # $$$ matches: +++++++++---+++++++++
860 # $$$ this is currently reported as one interval (if rate <= 6/7), but it
861 # $$$ ought to be two
862
863 def all_suitable_runs(seq,minCorrLength,rate, hammingThreshold):
864
865 ################################################################
866 # modified by Chen Sun on 07/11/2014
867 #
868 ################################################################
869
870 if hammingThreshold > 0:
871 return [(start,end) for (start,end) in dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength)]
872 elif hammingThreshold == 0:
873 return [(start,end) for (start,end) in non_negative_intervals(seq, minLength=minCorrLength)]
874
875
876 # find_repeat_element--
877 # Find the most plausible repeat element for a run, and nudge the ends of
878 # the run if needed. Note that we will not consider kmers that represent
879 # shorter repeats. For example, we won't report ACTACT as a 6-mer since we
880 # consider this to have a shorter period than 6.
881
882 def find_repeat_element(hammingThreshold, period,seq,start,end,allowPartials=False):
883
884 if hammingThreshold > 0:
885 (kmer,bestD,bestStart,bestEnd) = find_hamming_repeat_element(period,seq,start,end,allowPartials)
886 return (kmer,bestD,bestStart,bestEnd)
887 # count the number of occurences of each k-mer; note that we can't
888 # reject kmers containing smaller repeats yet, since for a sequence like
889 # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
890 # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end
891 # up reporting something like ACACAA as the best motif
892
893 if ("element" in debug):
894 print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
895
896 if ("partial" in debug):
897 print period, seq, start, end, allowPartials;
898 print seq[start:end]
899
900 kmerToCount = {}
901 kmerToFirst = {}
902 for ix in xrange(start,end-(period-1)):
903 kmer = seq[ix:ix+period]
904 if ("N" in kmer): continue
905 if (kmer not in kmerToCount):
906 kmerToCount[kmer] = 1
907 kmerToFirst[kmer] = ix
908 else:
909 kmerToCount[kmer] += 1
910 #if ("element" in debug):
911 # print >>stderr, " %d: %s" % (ix,kmer)
912
913 # choose the best k-mer; this is simply the most frequently occurring one,
914 # with ties broken by whichever one came first
915
916 kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
917 if (kmers == []): return (None,None,start,end)
918 kmers.sort()
919
920 if ("element" in debug):
921 for (count,first,kmer) in kmers:
922 print >>stderr, " %s: %d" % (kmer,-count)
923
924 (count,first,kmer) = kmers[0]
925 if (contains_repeat(kmer)): return (None,None,start,end)
926
927 # determine the hamming distance between the run and a simple repeat, for
928 # each "plausible" start and end; we compute the distance for each such
929 # interval, and choose the one with the lowest hamming distance; ties are
930 # broken in a deterministic-but-unspecified manner
931
932 bestD = bestStart = bestEnd = None
933 ###################################################################################
934 # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/18/2013
935 # since we do not allow hamming_distance > 0, which means we do not allow mutation,
936 # we do not need this section to produce bestStart and End
937 ###################################################################################
938
939 #for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
940 # d = hamming_distance(seq,s,e,kmer)
941 # if (d == None): continue
942 # if (bestD == None) or (d <= bestD):
943 # (bestD,bestStart,bestEnd) = (d,s,e)
944
945
946
947 bestStart = start
948
949 if(allowPartials):
950 bestEnd = end
951 elif(not allowPartials):
952 bestEnd = start
953 pattern = seq[start:start+period]
954 if ("partial" in debug):
955 print "kmer:", kmer
956 if(pattern != kmer):
957 print "pattern:", pattern
958
959 while(bestEnd <= end-period):
960 bestEnd += period
961
962 # bestD will always be 0, as we do not allow mutation
963 bestD = 0
964
965 if ("partial" in debug):
966 print bestD, bestStart, bestEnd
967
968 ###################################################################################
969 # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/10
970 #
971 ###################################################################################
972 return (kmer,bestD,bestStart,bestEnd)
973
974
975 def find_hamming_repeat_element(period,seq,start,end,allowPartials=False):
976
977 # count the number of occurences of each k-mer; note that we can't
978 # reject kmers containing smaller repeats yet, since for a sequence like
979 # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
980 # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end
981 # up reporting something like ACACAA as the best motif
982
983 if ("element" in debug):
984 print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
985
986 kmerToCount = {}
987 kmerToFirst = {}
988 for ix in xrange(start,end-(period-1)):
989 kmer = seq[ix:ix+period]
990 if ("N" in kmer): continue
991 if (kmer not in kmerToCount):
992 kmerToCount[kmer] = 1
993 kmerToFirst[kmer] = ix
994 else:
995 kmerToCount[kmer] += 1
996 #if ("element" in debug):
997 # print >>stderr, " %d: %s" % (ix,kmer)
998
999 # choose the best k-mer; this is simply the most frequently occurring one,
1000 # with ties broken by whichever one came first
1001
1002 kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
1003 if (kmers == []): return (None,None,start,end)
1004 kmers.sort()
1005
1006 if ("element" in debug):
1007 for (count,first,kmer) in kmers:
1008 print >>stderr, " %s: %d" % (kmer,-count)
1009
1010 (count,first,kmer) = kmers[0]
1011 if (contains_repeat(kmer)): return (None,None,start,end)
1012
1013 # determine the hamming distance between the run and a simple repeat, for
1014 # each "plausible" start and end; we compute the distance for each such
1015 # interval, and choose the one with the lowest hamming distance; ties are
1016 # broken in a deterministic-but-unspecified manner
1017
1018 bestD = bestStart = bestEnd = None
1019
1020 for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
1021 d = hamming_distance(seq,s,e,kmer)
1022 if (d == None): continue
1023 if (bestD == None) or (d <= bestD):
1024 (bestD,bestStart,bestEnd) = (d,s,e)
1025
1026 return (kmer,bestD,bestStart,bestEnd)
1027
1028 # plausible_intervals--
1029 # Yield all plausible intervals intersecting with a run. We generate all
1030 # starts within P bp of the run's start. For each of these, we either (a) try
1031 # all ends within P bp of run's end, or (b) trim the new interval to a whole
1032 # multiple of the period, and report this short interval and the longer
1033 # interval with one more period appended. Case (a) allows partial motifs,
1034 # while case (b) only allows whole motifs.
1035
1036 def plausible_intervals(start,end,period,seqLen,allowPartials=False):
1037
1038 # generate intervals that allow a partial copy of the motif
1039
1040 if (allowPartials):
1041 for candStart in xrange(start-(period-1),start+period):
1042 if (candStart < 0): continue
1043 for candEnd in xrange(end-(period-1),end+period):
1044 if (candEnd > seqLen): continue
1045 if (candEnd <= candStart+period): continue
1046 yield (candStart,candEnd)
1047
1048 # -OR- generate intervals that allow only whole copies of the motif
1049
1050 else:
1051 for candStart in xrange(start-(period-1),start+period):
1052 if (candStart < 0): continue
1053 candEnd = candStart + ((end-candStart)/period)*period
1054 yield (candStart,candEnd)
1055 candEnd += period
1056 if (candEnd <= seqLen): yield (candStart,candEnd)
1057
1058
1059 # hamming_distance--
1060 # Determine the hamming distance between the run and a simple repeat.
1061 # $$$ improve this by allowing gaps, and stopping when we reach a threshold
1062
1063 kmerToDiffs = {} # (this is used for memo-ization)
1064
1065 def hamming_distance(seq,start,end,kmer):
1066 period = len(kmer)
1067 if (end < start + period): return None
1068
1069 wholeEnd = start + ((end-start)/period)*period
1070
1071 if (kmer not in kmerToDiffs):
1072 kmerToDiffs[kmer] = { kmer:0 }
1073
1074 d = 0
1075 for ix in xrange(start,wholeEnd,period):
1076 qmer = seq[ix:ix+period] # same size as the kmer motif
1077 if (qmer in kmerToDiffs[kmer]):
1078 d += kmerToDiffs[kmer][qmer]
1079 continue
1080 diffs = 0
1081 for iy in xrange(0,period):
1082 if (qmer[iy] != kmer[iy]): diffs += 1
1083 kmerToDiffs[kmer][qmer] = diffs
1084 d += diffs
1085
1086 if (end > wholeEnd):
1087 qmer = seq[wholeEnd:end] # shorter than the kmer motif
1088 if (qmer in kmerToDiffs[kmer]):
1089 d += kmerToDiffs[kmer][qmer]
1090 else:
1091 diffs = 0
1092 for iy in xrange(0,len(qmer)):
1093 if (qmer[iy] != kmer[iy]): diffs += 1
1094 kmerToDiffs[kmer][qmer] = diffs
1095 d += diffs
1096
1097 return d
1098
1099
1100 # fasta_sequences--
1101 # Read the fasta sequences from a file. Note that we convert to upper case,
1102 # and convert any letter other than ACGT to N.
1103
1104 nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN")
1105
1106 def fasta_sequences(f):
1107 seqName = None
1108 seqNucs = None
1109
1110 for line in f:
1111 line = line.strip()
1112 if (line.startswith(">")):
1113 if (seqName != None):
1114 yield (seqName,"".join(seqNucs))
1115 seqName = sequence_name(line)
1116 seqNucs = []
1117 elif (seqName == None):
1118 assert (False), "first sequence has no header"
1119 else:
1120 seqNucs += [line]
1121
1122 if (seqName != None):
1123 yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
1124
1125
1126 # fastq_sequences--
1127 # Read the fastq sequences from a file. Note that we convert to upper case,
1128 # and convert any letter other than ACGT to N.
1129
1130 def fastq_sequences(f):
1131 lineNum = 0
1132 for line in f:
1133 lineNum += 1
1134 line = line.strip()
1135
1136 if (lineNum % 4 == 1):
1137 assert (line.startswith("@")), \
1138 "bad read name at line %d" % lineNum
1139 seqName = line[1:]
1140 continue
1141
1142 if (lineNum % 4 == 2):
1143 seqNucs = line
1144 continue
1145
1146 if (lineNum % 4 == 3):
1147 assert (line.startswith("+")), \
1148 "can't understand line %d:\n%s" % (lineNum,line)
1149 continue
1150
1151 quals = line
1152 assert (len(quals) == len(seqNucs)), \
1153 "length mismatch read vs. qualities at line %d" % lineNum
1154 yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap),quals)
1155
1156 assert (lineNum % 4 == 0), \
1157 "incomplete read at end of file"
1158
1159 def sam_sequences(f):
1160 lineNum = 0
1161 for line in f:
1162 lineNum += 1
1163 line = line.strip()
1164
1165 if line.startswith("@"):
1166 continue
1167
1168 columns = line.split("\t")
1169 seqName = columns[0]
1170 refName = columns[2]
1171 pre_s = int(columns[3]) - 1
1172 cigar = columns[5]
1173 seqNucs = columns[9]
1174
1175 yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar)
1176
1177 # sequence_name--
1178 # Extract the sequence name from a fasta header.
1179 # $$$ this may need to be improved $$$
1180
1181 def sequence_name(s):
1182 s = s[1:].strip()
1183 if (s == ""): return ""
1184 else: return s.split()[0]
1185
1186
1187 # nucleotide_runs--
1188 # Yield (start,end) for all runs of valid nucleotides in a sequence.
1189
1190 def nucleotide_runs(s):
1191 runs = []
1192 start = None
1193 for (ix,nuc) in enumerate(s):
1194 if (nuc in "ACGT"):
1195 if (start == None):
1196 start = ix
1197 else:
1198 if (start != None):
1199 yield (start,ix)
1200 start = None
1201
1202 if (start != None): yield (start,len(s))
1203
1204
1205 # contains_repeat--
1206 # Determine whether a short sequence contains a repeated element, such as a
1207 # 6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT). The repeat
1208 # must cover the entire sequence, without mismatches.
1209
1210 def contains_repeat(kmer):
1211 kmerLength = len(kmer)
1212 hasRepeat = False
1213 rptLen = 1
1214 while (not hasRepeat) and (2 * rptLen <= kmerLength):
1215 if (kmerLength % rptLen != 0):
1216 rptLen += 1
1217 continue
1218 isRepeat = True
1219 for i in xrange(rptLen,kmerLength,rptLen):
1220 if (kmer[i:i+rptLen] != kmer[:rptLen]):
1221 isRepeat = False
1222 break
1223 if (isRepeat):
1224 hasRepeat = True
1225 break
1226 rptLen += 1
1227 return hasRepeat
1228
1229
1230 # hash108--
1231 # Return a 108-bit hash "value" of a string
1232
1233 def hash108(s):
1234 m = md5_new()
1235 m.update(s)
1236 return m.hexdigest()[:27]
1237
1238
1239 # float_or_fraction--
1240 # Convert a string to a number, allowing fractions
1241
1242 def float_or_fraction(s):
1243 if ("/" in s):
1244 (numer,denom) = s.split("/",1)
1245 return float(numer)/float(denom)
1246 else:
1247 return float(s)
1248
1249
1250 # int_with_unit--
1251 # Parse a string as an integer, allowing unit suffixes
1252
1253 def int_with_unit(s):
1254 if (s.endswith("K")):
1255 multiplier = 1000
1256 s = s[:-1]
1257 elif (s.endswith("M")):
1258 multiplier = 1000 * 1000
1259 s = s[:-1]
1260 elif (s.endswith("G")):
1261 multiplier = 1000 * 1000 * 1000
1262 s = s[:-1]
1263 else:
1264 multiplier = 1
1265
1266 try: return int(s) * multiplier
1267 except ValueError: return int(math.ceil(float(s) * multiplier))
1268
1269
1270 if __name__ == "__main__": main()
1271