0
+ − 1 #!/usr/bin/env python
+ − 2 """
+ − 3 Snoop thru a fasta file looking for microsatellite repeats of given periods
+ − 4 Output format: length_of_repeat left_flank_length right_flank_length repeat_motif hamming_distance read_name read_sequence read_quality (additional columns)
+ − 5
+ − 6 If --r option turned on, output format will have additional columns behind:
+ − 7 read_name read_chr pre_s pre_e tr_s tr_e suf_s suf_e tr_len tr_ref_seq
+ − 8
+ − 9 pre_s where the read start
+ − 10 pre_e the last position before microsatellite
+ − 11 tr_s where microsatellite start
+ − 12 tr_e where microsatellite end
+ − 13 suf_s first base after microsatellite
+ − 14 tr_ref_seq reference sequence corresponding to microsatellite
+ − 15
+ − 16 * output positions are 0 based
+ − 17
+ − 18 :Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu)
+ − 19
+ − 20 modifing log:
+ − 21
+ − 22 09/27/2013
+ − 23 replace function dense_intervals with function non_negative_intervals, which do not need to import such file.
+ − 24
+ − 25 10/18/2013
+ − 26 modify function find_repeat_element to get a quick speed, under the condition that hamming_distance = 0, which means do not allowed any mutation/indel
+ − 27
+ − 28 02/25/2014
+ − 29 add function that can deal with mapped reads
+ − 30 with additional output
+ − 31
+ − 32 02/28/2014
+ − 33 modify the 0-based end point, as in 0-base area, it is half-open [ )
+ − 34 so the 0-based site, should always be added by 1
+ − 35
+ − 36 03/05/2014
+ − 37 deal with multi-fasta
+ − 38 """
+ − 39 from sys import argv,stdin,stderr,exit
+ − 40 from string import maketrans
+ − 41 from md5 import new as md5_new
+ − 42 import re
+ − 43 #from pyfracluster import dense_intervals
+ − 44
+ − 45 def usage(s=None):
+ − 46 message = """
+ − 47 usage: microsat_snoop [fasta_file] [options]
+ − 48 <fasta_file> Name of file to read sequences from; if absent,
+ − 49 sequences are read from stdin
+ − 50 --fasta Input file is in fasta format
+ − 51 (this is the default)
+ − 52 --fastq Input file is in fastq format
+ − 53 (default is fasta unless filename is .fastq)
+ − 54 --fastq:noquals Input file is in fastq format, but discard quals
+ − 55 --sam Input file is SAM file
+ − 56 --r Indicate additional output information, if indicated,
+ − 57 --ref option is mendatory
+ − 58 --ref=<filepath> Reference file (absolute) path
+ − 59 --period=<length> (mandatory,cumulative) repeat length(s) to be
+ − 60 searched for
+ − 61 <length> is expected to be small, less than 10
+ − 62 <length> can also be a comma-separated list, or
+ − 63 a range <low>..<high>
+ − 64 --rate=<fraction> control the candidate repeat interval detector;
+ − 65 it will consider intervals with at least
+ − 66 <fraction> of matches when shifted by the period;
+ − 67 <fraction> is between 0 and 1 and can be either a
+ − 68 real number or <n>/<d>
+ − 69 (default is 6/7)
+ − 70 --minlength=<length> minimum length of intervals reported, in bp
+ − 71 (default is 20)
+ − 72 --progress=<count> how often to report the sequence we're searching
+ − 73 (default is no progress report)
+ − 74 --allowduplicates process all input sequences
+ − 75 (this is the default)
+ − 76 --noduplicates ignore any input sequence that's the same as an
+ − 77 earlier sequence
+ − 78 --nonearduplicates ignore any input sequence that has the same first
+ − 79 100 bp as an earlier sequence
+ − 80 --nonearduplicate=<length> ignore any input sequence that has the same first
+ − 81 <length> bp as an earlier sequence
+ − 82 --hamming=<count> Don't report candidate repeat intervals that have
+ − 83 more than <count> mismatches
+ − 84 (default is to do no such filtering)
+ − 85 --prefix=<length> Don't report candidate repeat intervals that
+ − 86 start within <length> of the sequence start
+ − 87 (default is to do no such filtering)
+ − 88 --suffix=<length> Don't report candidate repeat intervals that
+ − 89 end within <length> of the sequence end
+ − 90 (default is to do no such filtering)
+ − 91 --subsample=<k>/<n> Process only the <k>th sequence of every group of
+ − 92 <n> sequences; <k> ranges from 1 to <n>
+ − 93 --multipleruns Consider all candidate intervals in a sequence
+ − 94 (default is to consider only the longest)
+ − 95 --partialmotifs Consider microatelites with a partial motif
+ − 96 (default is to consider only whole motifs)
+ − 97 --splitbyvalidity Preprocess sequences, splitting at Ns; this
+ − 98 prevents candidates from including Ns
+ − 99 (default is not to split)
+ − 100 --noflankdisplay Show entire sequence as flanking regions
+ − 101 (this is the default)
+ − 102 --flankdisplay=<length> Limit length of flanking regions shown
+ − 103 --readnamesuffix=<string> Root of suffix to append to read names; e.g. 1
+ − 104 for forward, 2 for reverse; this triggers other
+ − 105 info to be included in the suffix
+ − 106 (default is "1" for fastq; no suffix for fasta)
+ − 107 --head=<number> limit the number of sequences processed
+ − 108 --markend Write a marker line upon completion
+ − 109 (default is not to write a marker)
+ − 110 --help=details Describe the process, and quit"""
+ − 111
+ − 112 if (s == None): exit (message)
+ − 113 else: exit ("%s\n%s" % (s,message))
+ − 114
+ − 115
+ − 116 detailedDescription = """In broad terms, the process works as follows:
+ − 117
+ − 118 (1) Identify intervals that are highly correlated with the interval shifted by
+ − 119 P (the repeat period). These intervals are called "runs" or "candidates".
+ − 120 The level of correlation required is controlled by rateThreshold.
+ − 121 Depending on whether we want to look for more than one microsat, we either
+ − 122 find the longest such run (simple algorithm) or many runs (more complicated
+ − 123 algorithm). The following steps are then performed on each run.
+ − 124
+ − 125 (2) Find the most likely repeat motif in the run. This is done by counting
+ − 126 all kmers (of length P) and choosing the most frequent. If that kmer is
+ − 127 itself covered by a sub-repeat we discard this run. The idea is that we
+ − 128 can ignore a 6-mer like ACGACG because we will find it when we are looking
+ − 129 for 3-mers.
+ − 130
+ − 131 (3) Once we identify the most likely repeat motif, we then modify the
+ − 132 interval, adjusting start and end to find the interval that has the fewest
+ − 133 mismatches vs. a sequence of the motif repeated (hamming distance). Only
+ − 134 whole copies of the motif are considered.
+ − 135
+ − 136 (4) At this point we have a valid microsat interval (in the eyes of the
+ − 137 program). It is subjected to some filtering stages (hamming distance or too
+ − 138 close to an end), and if it satisfies those conditions, it's reported to
+ − 139 the user."""
+ − 140
+ − 141 def main():
+ − 142 global debug
+ − 143
+ − 144 #=== parse the command line ===
+ − 145
+ − 146 inputFilename = None
+ − 147 referenceFileName = None #add by Chen Sun on 02/25
+ − 148 inputFormat = None
+ − 149 repeatPeriods = []
+ − 150 rateThreshold = 6 / 7.0
+ − 151 lengthThreshold = 20
+ − 152 reportProgress = None
+ − 153 discardDuplicates = False
+ − 154 discardNearDuplicates = False
+ − 155 nearDuplicatePrefix = 100
+ − 156 hammingThreshold = 0
+ − 157 prefixThreshold = None
+ − 158 suffixThreshold = None
+ − 159 subsampleK = None
+ − 160 subsampleN = None
+ − 161 reportMultipleRuns = False
+ − 162 allowPartialMotifs = False
+ − 163 splitByValidity = False
+ − 164 flankDisplayLimit = None
+ − 165 readNameSuffix = None
+ − 166 headLimit = None
+ − 167 markEndOfFile = False
+ − 168 additionalInfo = False
+ − 169 debug = []
+ − 170
+ − 171 for arg in argv[1:]:
+ − 172 if (arg == "--fasta"):
+ − 173 inputFormat = "fasta"
+ − 174 elif (arg == "--fastq"):
+ − 175 inputFormat = "fastq"
+ − 176 elif (arg == "--fastq:noquals"):
+ − 177 inputFormat = "fastq:noquals"
+ − 178 elif (arg == "--sam"):
+ − 179 inputFormat = "sam"
+ − 180 elif (arg == "--r"):
+ − 181 additionalInfo = True
+ − 182 elif (arg.startswith("--ref=")):
+ − 183 referenceFileName = arg.split("=",1)[1]
+ − 184 elif (arg.startswith("--period=")):
+ − 185 val = arg.split("=",1)[1]
+ − 186 for period in val.split(","):
+ − 187 if (".." in period):
+ − 188 (lowPeriod,highPeriod) = period.split("..",1)
+ − 189 lowPeriod = int(lowPeriod)
+ − 190 highPeriod = int(highPeriod)
+ − 191 for period in xrange(lowPeriod,highPeriod+1):
+ − 192 repeatPeriods += [period]
+ − 193 else:
+ − 194 repeatPeriods += [int(period)]
+ − 195 elif (arg.startswith("--rate=")):
+ − 196 val = arg.split("=",1)[1]
+ − 197 rateThreshold = float_or_fraction(val)
+ − 198 assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val
+ − 199 elif (arg.startswith("--minlength=")):
+ − 200 val = arg.split("=",1)[1]
+ − 201 lengthThreshold = int(val)
+ − 202 assert (lengthThreshold >= 0)
+ − 203 elif (arg.startswith("--progress=")):
+ − 204 val = arg.split("=",1)[1]
+ − 205 reportProgress = int(val)
+ − 206 elif (arg == "--allowduplicates"):
+ − 207 discardDuplicates = False
+ − 208 discardNearDuplicates = False
+ − 209 elif (arg == "--noduplicates"):
+ − 210 discardDuplicates = True
+ − 211 discardNearDuplicates = False
+ − 212 elif (arg == "--nonearduplicates"):
+ − 213 discardDuplicates = False
+ − 214 discardNearDuplicates = True
+ − 215 elif (arg.startswith("--nonearduplicate=")):
+ − 216 val = arg.split("=",1)[1]
+ − 217 discardDuplicates = False
+ − 218 discardNearDuplicates = True
+ − 219 nearDuplicatePrefix = int(val)
+ − 220 assert (nearDuplicatePrefix > 0)
+ − 221 elif (arg.startswith("--hamming=")):
+ − 222 val = arg.split("=",1)[1]
+ − 223 hammingThreshold = int(val)
+ − 224 assert (hammingThreshold >= 0)
+ − 225 elif (arg.startswith("--prefix=")):
+ − 226 val = arg.split("=",1)[1]
+ − 227 prefixThreshold = int(val)
+ − 228 assert (prefixThreshold >= 0)
+ − 229 elif (arg.startswith("--suffix=")):
+ − 230 val = arg.split("=",1)[1]
+ − 231 suffixThreshold = int(val)
+ − 232 assert (suffixThreshold >= 0)
+ − 233 elif (arg.startswith("--subsample=")):
+ − 234 val = arg.split("=",1)[1]
+ − 235 (k,n) = val.split("/",2)
+ − 236 subsampleK = int(k)
+ − 237 subsampleN = int(n)
+ − 238 assert (0 < subsampleK <= subsampleN)
+ − 239 elif (arg == "--multipleruns"):
+ − 240 reportMultipleRuns = True
+ − 241 elif (arg == "--partialmotifs"):
+ − 242 allowPartialMotifs = True
+ − 243 elif (arg == "--splitbyvalidity"):
+ − 244 splitByValidity = True
+ − 245 elif (arg == "--noflankdisplay"):
+ − 246 flankDisplayLimit = None
+ − 247 elif (arg.startswith("--flankdisplay=")):
+ − 248 val = arg.split("=",1)[1]
+ − 249 flankDisplayLimit = int(val)
+ − 250 assert (flankDisplayLimit >= 0)
+ − 251 elif (arg.startswith("--readnamesuffix")):
+ − 252 readNameSuffix = arg.split("=",1)[1]
+ − 253 elif (arg.startswith("--head=")):
+ − 254 headLimit = int_with_unit(arg.split("=",1)[1])
+ − 255 elif (arg == "--markend"):
+ − 256 markEndOfFile = True
+ − 257 elif (arg == "--help=details"):
+ − 258 exit (detailedDescription)
+ − 259 elif (arg.startswith("--debug=")):
+ − 260 debug += (arg.split("=",1)[1]).split(",")
+ − 261 elif (arg.startswith("--")):
+ − 262 usage("unrecognized option: %s" % arg)
+ − 263 elif (inputFilename == None):
+ − 264 inputFilename = arg
+ − 265 else:
+ − 266 usage("unrecognized option: %s" % arg)
+ − 267
+ − 268 #=== determine periods of interest ===
+ − 269
+ − 270 if (repeatPeriods == []):
+ − 271 usage("you gotta give me a repeat period")
+ − 272
+ − 273 if (additionalInfo == True):
+ − 274 if (referenceFileName == None):
+ − 275 usage("reference file path needed. use --ref=<reference> to indicate")
+ − 276
+ − 277 periodSeed = {}
+ − 278 for period in repeatPeriods:
+ − 279 if (period < 1): usage("period %d is not valid" % period)
+ − 280 periodSeed[period] = True
+ − 281
+ − 282 repeatPeriods = [period for period in periodSeed]
+ − 283 repeatPeriods.sort()
+ − 284
+ − 285 #=== determine input format ===
+ − 286
+ − 287 if (inputFormat == "fasta"): sequence_reader = fasta_sequences
+ − 288 elif (inputFormat == "fastq"): sequence_reader = fastq_sequences
+ − 289 elif (inputFormat == "fastq:noquals"): sequence_reader = fastq_sequences
+ − 290 elif (inputFormat == "sam"): sequence_reader = sam_sequences
+ − 291 elif (inputFilename == None): sequence_reader = fasta_sequences
+ − 292 elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences
+ − 293 elif (inputFilename.endswith(".fq")): sequence_reader = fastq_sequences
+ − 294 elif (inputFilename.endswith(".sam")): sequence_reader = sam_sequences
+ − 295 else: sequence_reader = fasta_sequences
+ − 296
+ − 297 if (inputFilename != None): inputF = file(inputFilename,"rt")
+ − 298 else: inputF = stdin
+ − 299
+ − 300 if (readNameSuffix == None) \
+ − 301 and (sequence_reader == fastq_sequences) \
+ − 302 and (inputFormat != "fastq:noquals"):
+ − 303 readNameSuffix = "1"
+ − 304
+ − 305 #=== process the sequences ===
+ − 306
+ − 307 refSequence = {}
+ − 308 rightName = ""
+ − 309 sequence = ""
+ − 310 if additionalInfo:
+ − 311 firstFasta = True
+ − 312 originalRefF = open(referenceFileName)
+ − 313 for line in originalRefF.readlines():
+ − 314 line = line.replace('\r','')
+ − 315 line = line.replace('\n','')
+ − 316 if line.startswith(">"):
+ − 317 if firstFasta:
+ − 318 firstFasta = False
+ − 319 else:
+ − 320 refSequence[rightName] = sequence
+ − 321 rightName = line[1:]
+ − 322 sequence = ""
+ − 323 continue
+ − 324 sequence += line
+ − 325 originalRefF.close()
+ − 326 refSequence[rightName] = sequence
+ − 327
+ − 328 sequenceSeen = {}
+ − 329
+ − 330 numSequences = 0
+ − 331 for seqInfo in sequence_reader(inputF):
+ − 332 numSequences += 1
+ − 333 if (headLimit != None) and (numSequences > headLimit):
+ − 334 print >>stderr, "limit of %d sequences reached" % headLimit
+ − 335 break
+ − 336
+ − 337 if (sequence_reader == sam_sequences):
+ − 338 #seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar
+ − 339 (name, sequence, refName, pre_s, cigar) = seqInfo
+ − 340 quals = None
+ − 341 elif (sequence_reader == fastq_sequences):
+ − 342 (name,sequence,quals) = seqInfo
+ − 343 if (inputFormat == "fastq:noquals"): quals = None
+ − 344 else:
+ − 345 (name,sequence) = seqInfo
+ − 346 quals = None
+ − 347
+ − 348 if (reportProgress != None) and (numSequences % reportProgress == 0):
+ − 349 print >>stderr, "%s %d" % (name,numSequences)
+ − 350
+ − 351 # if we're subsampling and not interested in this sequence, skip it
+ − 352
+ − 353 if (subsampleN != None):
+ − 354 if ((numSequences-1) % subsampleN != (subsampleK-1)):
+ − 355 continue
+ − 356
+ − 357 # if this sequence is shorter than the length of interest, skip it
+ − 358
+ − 359 seqLen = len(sequence)
+ − 360 if (seqLen < period) or (seqLen < lengthThreshold): continue
+ − 361
+ − 362 # if we're not interested in duplicates and this is one, skip it;
+ − 363 # note that we assume no hash collisions occur, i.e. that all hash
+ − 364 # matches are truly sequence matches
+ − 365
+ − 366 if (discardDuplicates):
+ − 367 h = hash108(sequence)
+ − 368 if (h in sequenceSeen): continue
+ − 369 sequenceSeen[h] = True
+ − 370 elif (discardNearDuplicates):
+ − 371 h = hash108(sequence[:nearDuplicatePrefix])
+ − 372 if (h in sequenceSeen): continue
+ − 373 sequenceSeen[h] = True
+ − 374
+ − 375 # split the sequence into chunks of valid nucleotides
+ − 376
+ − 377 if (splitByValidity):
+ − 378 chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)]
+ − 379 else:
+ − 380 chunks = [(0,len(sequence))]
+ − 381
+ − 382 # evaluate for each period of interest
+ − 383
+ − 384 for period in repeatPeriods:
+ − 385
+ − 386 # operate on each chunk
+ − 387
+ − 388 for (chunkStart,chunkEnd) in chunks:
+ − 389 chunkLen = chunkEnd - chunkStart
+ − 390 if (chunkLen < period) or (chunkLen < lengthThreshold): continue
+ − 391
+ − 392 if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug):
+ − 393 print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd)
+ − 394
+ − 395 # compute correlation sequence
+ − 396
+ − 397 corr = correlation_sequence(sequence,period,chunkStart,chunkEnd)
+ − 398
+ − 399 if ("correlation" in debug) or ("runs" in debug):
+ − 400 print >>stderr, sequence[chunkStart:chunkEnd]
+ − 401 print >>stderr, corr
+ − 402
+ − 403 # find runs (candidates for being a microsat)
+ − 404
+ − 405 if (reportMultipleRuns):
+ − 406 runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold, hammingThreshold)
+ − 407 else:
+ − 408 runs = longest_suitable_run(corr,lengthThreshold,rateThreshold)
+ − 409 if (runs == []): continue
+ − 410
+ − 411
+ − 412 if ("runs" in debug):
+ − 413 for (start,end) in runs:
+ − 414 run = [" "] * seqLen
+ − 415 for ix in xrange(start-period,end):
+ − 416 run[ix] = "*"
+ − 417 print >>stderr, "".join(run)
+ − 418
+ − 419 if ("candidates" in debug):
+ − 420 for (start,end) in runs:
+ − 421 print >>stderr, "%s %d %d" % (name,start,end)
+ − 422
+ − 423 # process runs and report those that pass muster
+ − 424
+ − 425 runCount = 0
+ − 426 for (start,end) in runs:
+ − 427 runCount += 1
+ − 428
+ − 429 start = chunkStart + start - period
+ − 430 end = chunkStart + end
+ − 431
+ − 432 (kmer,d,start,end) = find_repeat_element(hammingThreshold, period,sequence,start,end,allowPartials=allowPartialMotifs)
+ − 433 if (kmer == None): continue # (no useful repeat kmer was found)
+ − 434
+ − 435 rptExtent = end - start
+ − 436 prefixLen = start
+ − 437 suffixLen = seqLen - end
+ − 438 if (rptExtent <= period): continue
+ − 439 if (hammingThreshold != None) and (d > hammingThreshold): continue
+ − 440 if (prefixThreshold != None) and (prefixLen < prefixThreshold): continue
+ − 441 if (suffixThreshold != None) and (suffixLen < suffixThreshold): continue
+ − 442
+ − 443 if (flankDisplayLimit == None):
+ − 444 seq = sequence[:start] \
+ − 445 + sequence[start:end].lower() \
+ − 446 + sequence[end:]
+ − 447 else:
+ − 448 seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \
+ − 449 + sequence[start:end].lower() \
+ − 450 + sequence[end:min(chunkEnd,end+flankDisplayLimit)]
+ − 451 reportName = name
+ − 452 if (readNameSuffix != None):
+ − 453 reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount)
+ − 454 if (quals == None or quals == "." or quals == "\t."): quals = "\t."
+ − 455 else: quals = "\t" + quals
+ − 456 if not additionalInfo:
+ − 457 print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \
+ − 458 % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals)
+ − 459 else:
+ − 460 #pre_e = pre_s + prefixLen - 1
+ − 461 refPoint = pre_s
+ − 462 donorPoint = 0
+ − 463
+ − 464 donorBeforeStart = prefixLen - 1 #pre_e
+ − 465 donorMicroStart = prefixLen #tr_s
+ − 466 donorMicroEnd = donorMicroStart + rptExtent - 1 #tr_e
+ − 467 donorAfterMicro = donorMicroEnd + 1 #suf_s
+ − 468 donorEnd = len(seq) - 1 #suf_e
+ − 469
+ − 470 set_pre_e = False
+ − 471 set_tr_s = False
+ − 472 set_tr_e = False
+ − 473 set_suf_s = False
+ − 474 set_suf_e = False
+ − 475
+ − 476 pre_e = 0
+ − 477 tr_s = 0
+ − 478 tr_e = 0
+ − 479 suf_s = 0
+ − 480 suf_e = 0
+ − 481
+ − 482 matchList = re.findall('(\d+)([IDM])', cigar)
+ − 483 unCognitiveCigar = False
+ − 484 for matchN, matchType in matchList:
+ − 485 matchNum = int(matchN)
+ − 486 if matchType == "M":
+ − 487 donorPoint = donorPoint + matchNum
+ − 488 refPoint = refPoint + matchNum
+ − 489 elif matchType == "D":
+ − 490 refPoint = refPoint + matchNum
+ − 491 continue
+ − 492 elif matchType == "I":
+ − 493 donorPoint = donorPoint + matchNum
+ − 494 else:
+ − 495 unCognitiveCigar = True
+ − 496 break
+ − 497
+ − 498 if not set_pre_e:
+ − 499 if donorPoint >= donorBeforeStart:
+ − 500 pre_e = refPoint - (donorPoint - donorBeforeStart)
+ − 501 set_pre_e = True
+ − 502 else:
+ − 503 continue
+ − 504
+ − 505 if not set_tr_s:
+ − 506 if donorPoint >= donorMicroStart:
+ − 507 tr_s = refPoint - (donorPoint - donorMicroStart)
+ − 508 set_tr_s = True
+ − 509 else:
+ − 510 continue
+ − 511
+ − 512 if not set_tr_e:
+ − 513 if donorPoint >= donorMicroEnd:
+ − 514 tr_e = refPoint - (donorPoint - donorMicroEnd)
+ − 515 set_tr_e = True
+ − 516 else:
+ − 517 continue
+ − 518
+ − 519 if not set_suf_s:
+ − 520 if donorPoint >= donorAfterMicro:
+ − 521 suf_s = refPoint - (donorPoint - donorAfterMicro)
+ − 522 set_suf_s = True
+ − 523 else:
+ − 524 continue
+ − 525
+ − 526 if not set_suf_e:
+ − 527 if donorPoint >= donorEnd:
+ − 528 suf_e = refPoint - (donorPoint - donorEnd)
+ − 529 set_suf_e = True
+ − 530 else:
+ − 531 continue
+ − 532
+ − 533 if unCognitiveCigar:
+ − 534 break
+ − 535 tr_len = tr_e - tr_s + 1
+ − 536
+ − 537 if refName not in refSequence:
+ − 538 tr_ref_seq = "."
+ − 539 else:
+ − 540 if refSequence[refName] == "":
+ − 541 tr_ref_seq = "."
+ − 542 elif len(refSequence[refName]) <= tr_e:
+ − 543 tr_ref_seq = "."
+ − 544 else:
+ − 545 tr_ref_seq = refSequence[refName][tr_s:tr_e+1]
+ − 546
+ − 547 pre_e += 1
+ − 548 tr_e += 1
+ − 549 suf_e += 1
+ − 550 print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \
+ − 551 % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
+ − 552
+ − 553 if (markEndOfFile):
+ − 554 print "# microsat_snoop end-of-file"
+ − 555
+ − 556 if (inputF != stdin):
+ − 557 inputF.close()
+ − 558
+ − 559 # non_negative_intervals
+ − 560 # find intervals with exactly + and no -
+ − 561 # from string like this : +++++++++---+++++++++
+ − 562 def non_negative_intervals(seq, minLength=None):
+ − 563
+ − 564 start = -1
+ − 565 end = -1
+ − 566 firstPlus = 1
+ − 567 #print seq
+ − 568 for ix in range(len(seq)): # for every char in seq
+ − 569 ch = seq[ix]
+ − 570 if(ch == "+"):
+ − 571 if(firstPlus):
+ − 572 firstPlus = 0
+ − 573 start = ix
+ − 574 else:
+ − 575 continue
+ − 576 elif(ch == "-"):
+ − 577 if(start >= 0):
+ − 578 end = ix-1
+ − 579 if((end - start + 1) >= minLength):
+ − 580 yield (start,end+1)
+ − 581 start = -1
+ − 582 firstPlus = 1
+ − 583 if(start > 0):
+ − 584 if((ix - start + 1) >= minLength):
+ − 585 yield (start, ix+1)
+ − 586
+ − 587
+ − 588 ###################################################################
+ − 589 # modified by Chen Sun on 7/11/2014
+ − 590 # We do not want other modules, so parse these functions inside
+ − 591 #
+ − 592 ###################################################################
+ − 593
+ − 594 # parse a string of the form {positives}/{positives_and_neutrals}
+ − 595
+ − 596 def parse_spec(s):
+ − 597 if ("/" not in s): raise ValueError
+ − 598 (n,d) = s.split("/",1)
+ − 599 if (not n.startswith("{")) or (not n.endswith("}")): raise ValueError
+ − 600 if (not d.startswith("{")) or (not d.endswith("}")): raise ValueError
+ − 601
+ − 602 positives = n[1:-1]
+ − 603 d = d[1:-1]
+ − 604
+ − 605 for ch in positives:
+ − 606 if (ch not in d): raise ValueError
+ − 607
+ − 608 neutrals = [ch for ch in d if (ch not in positives)]
+ − 609 return (positives,neutrals)
+ − 610
+ − 611
+ − 612 # convert a string to a number, allowing fractions
+ − 613
+ − 614 def float_or_fraction(s):
+ − 615 if ("/" in s):
+ − 616 (numer,denom) = s.split("/",1)
+ − 617 return float(numer)/float(denom)
+ − 618 else:
+ − 619 return float(s)
+ − 620
+ − 621
+ − 622 # dense_intervals--
+ − 623 # Find all non-overlapping runs with a good enough rate (of positives), and
+ − 624 # which meet our length threshold.
+ − 625 #
+ − 626 # The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing
+ − 627 # long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999.
+ − 628 #
+ − 629 # $$$ we use the denominator as the threshold, but we really should use the
+ − 630 # $$$ .. numerator, comparing it to minLength*rate
+ − 631
+ − 632 def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None):
+ − 633
+ − 634 if (blockers == None):
+ − 635 blockers = "".join([chr(n) for n in range(1,256)
+ − 636 if (chr(n) not in positives)
+ − 637 and (chr(n) not in neutrals)])
+ − 638
+ − 639 stackLeft = [None] # stack with each entry containing five
+ − 640 stackRight = [None] # .. elements; note that entry zero is not
+ − 641 stackLeftScore = [None] # .. used
+ − 642 stackRightScore = [None]
+ − 643 stackLower = [None]
+ − 644 top = 0
+ − 645 score = 0
+ − 646
+ − 647 for ix in range(len(seq)):
+ − 648 ch = seq[ix]
+ − 649 if (ch in blockers):
+ − 650 # emit intervals
+ − 651
+ − 652 for sp in range(1,top+1):
+ − 653 left = stackLeft [sp] + 1
+ − 654 right = stackRight[sp]
+ − 655
+ − 656 while (left < right) and (seq[left] not in positives): left += 1
+ − 657 while (right > left) and (seq[right] not in positives): right -= 1
+ − 658
+ − 659 right += 1
+ − 660 if (minLength == None) or (right - left >= minLength):
+ − 661 yield (left,right)
+ − 662
+ − 663 #empty stack
+ − 664
+ − 665 stackLeft = [None]
+ − 666 stackRight = [None]
+ − 667 stackLeftScore = [None]
+ − 668 stackRightScore = [None]
+ − 669 stackLower = [None]
+ − 670 top = 0
+ − 671 score = 0
+ − 672 continue
+ − 673
+ − 674 if (ch in positives): weight = 1-rate
+ − 675 elif (ch in neutrals): weight = -rate
+ − 676 else: raise ValueError
+ − 677
+ − 678 score += weight
+ − 679 #if ("algorithm" in debug):
+ − 680 # print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score),
+ − 681
+ − 682 if (weight < 0):
+ − 683 #if ("algorithm" in debug):
+ − 684 # print >>sys.stderr
+ − 685 continue
+ − 686
+ − 687 if (top > 0) and (stackRight[top] == ix-1):
+ − 688 # add this site to the interval on top of the stack
+ − 689
+ − 690 stackRight [top] = ix
+ − 691 stackRightScore[top] = score
+ − 692
+ − 693 #if ("algorithm" in debug):
+ − 694 # print >>sys.stderr, \
+ − 695 # " extending [%d] %d-%d %4.1f %4.1f" \
+ − 696 # % (top,
+ − 697 # stackLeft [top], stackRight [top],
+ − 698 # stackLeftScore[top], stackRightScore[top]),
+ − 699
+ − 700 else:
+ − 701 # create a one site interval
+ − 702
+ − 703 top += 1
+ − 704 if (top >= len(stackLeft)):
+ − 705 stackLeft += [None]
+ − 706 stackRight += [None]
+ − 707 stackLeftScore += [None]
+ − 708 stackRightScore += [None]
+ − 709 stackLower += [None]
+ − 710
+ − 711 stackLeft [top] = ix - 1
+ − 712 stackLeftScore [top] = score - weight
+ − 713 stackRight [top] = ix
+ − 714 stackRightScore[top] = score
+ − 715 stackLower [top] = top - 1
+ − 716
+ − 717 while (stackLower[top] > 0) \
+ − 718 and (stackLeftScore[stackLower[top]] > stackLeftScore[top]):
+ − 719 stackLower[top] = stackLower[stackLower[top]]
+ − 720
+ − 721 #if ("algorithm" in debug):
+ − 722 # print >>sys.stderr, \
+ − 723 # " creating [%d] %d-%d %4.1f %4.1f -> %d" \
+ − 724 # % (top,
+ − 725 # stackLeft [top], stackRight [top],
+ − 726 # stackLeftScore[top], stackRightScore[top],
+ − 727 # stackLower [top]),
+ − 728
+ − 729 # merge intervals; if there is a previous interval with a no-higher
+ − 730 # left score and no-higher right score, merge this interval (and all
+ − 731 # intervening ones) into that one
+ − 732
+ − 733 while (top > 1) \
+ − 734 and (stackLower[top] > 0) \
+ − 735 and (stackRightScore[stackLower[top]] <= stackRightScore[top]):
+ − 736 stackRight [stackLower[top]] = stackRight [top]
+ − 737 stackRightScore[stackLower[top]] = stackRightScore[top]
+ − 738 top = stackLower[top]
+ − 739
+ − 740 #if ("algorithm" in debug):
+ − 741 # print >>sys.stderr, \
+ − 742 # "\n%*s merging [%d] %d-%d %4.1f %4.1f" \
+ − 743 # % (13, "", top,
+ − 744 # stackLeft[top], stackRight [top],
+ − 745 # stackLeftScore[top], stackRightScore[top]),
+ − 746
+ − 747 #if ("algorithm" in debug):
+ − 748 # print >>sys.stderr
+ − 749
+ − 750 # emit intervals
+ − 751
+ − 752 for sp in range(1,top+1):
+ − 753 left = stackLeft [sp] + 1
+ − 754 right = stackRight[sp]
+ − 755
+ − 756 while (left < right) and (seq[left] not in positives): left += 1
+ − 757 while (right > left) and (seq[right] not in positives): right -= 1
+ − 758
+ − 759 right += 1
+ − 760 if (minLength == None) or (right - left >= minLength):
+ − 761 yield (left,right)
+ − 762
+ − 763
+ − 764 ###################################################################
+ − 765 # modified by Chen Sun on 7/11/2014
+ − 766 #
+ − 767 ###################################################################
+ − 768
+ − 769 # correlation_sequence--
+ − 770 # Compute the correlation sequence for a given period. This is a sequence
+ − 771 # of + and - indicating whether the base at a given position matches the one
+ − 772 # P positions earlier (where P is the period). The first P positions are
+ − 773 # blank. Positions with single character runs longer than the period are
+ − 774 # considered as non-matches, unless the period is 1.
+ − 775
+ − 776 def correlation_sequence(sequence,period,start=None,end=None):
+ − 777 if (start == None): start = 0
+ − 778 if (end == None): end = len(sequence)
+ − 779
+ − 780 prevCh = sequence[start]
+ − 781 run = 1
+ − 782 for ix in xrange(start+1,start+period):
+ − 783 ch = sequence[ix]
+ − 784 if (ch != prevCh): run = 1
+ − 785 else: run += 1
+ − 786 prevCh = ch
+ − 787
+ − 788 corr = [" "] * period
+ − 789 for ix in xrange(start+period,end):
+ − 790 rptCh = sequence[ix-period]
+ − 791 ch = sequence[ix]
+ − 792 if (ch != prevCh): run = 1
+ − 793 else: run += 1
+ − 794 if (ch in "ACGT") \
+ − 795 and (ch == rptCh) \
+ − 796 and ((period == 1) or (run < period)):
+ − 797 corr += ["+"]
+ − 798 else:
+ − 799 corr += ["-"]
+ − 800 prevCh = ch
+ − 801
+ − 802 return "".join(corr)
+ − 803
+ − 804
+ − 805 # longest_suitable_run--
+ − 806 # Find longest run with a good enough rate (of positives).
+ − 807 #
+ − 808 # We score a "+" as 1-r and anything else as -r. This is based on the fol-
+ − 809 # lowing derivation (p is the number of "+"s, n is the number of non-"+"s):
+ − 810 # p/(p+n) >= r
+ − 811 # ==> p >= rp + rn
+ − 812 # ==> (1-r)p - rn >= 0
+ − 813 #
+ − 814 # We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing).
+ − 815 #
+ − 816 # $$$ we use the denominator as the threshold, but we really should use the
+ − 817 # $$$ .. numerator, comparing it to minLength*rate
+ − 818 #
+ − 819 # $$$ this needs to account for $$$ this situation:
+ − 820 # $$$ sequence: ACGACGACGACGTTATTATTATTA
+ − 821 # $$$ matches: +++++++++---+++++++++
+ − 822 # $$$ this is currently considered to be one interval (if rate <= 6/7), but it
+ − 823 # $$$ ought to be two; we can't just post-process, though, because some other
+ − 824 # $$$ interval might be longer than the longest half of this; maybe what we
+ − 825 # $$$ need to do is consider matches at distances -P and -2P, or if we match
+ − 826 # $$$ -P but that itself was a mismatch, we should carry the mismatch forward
+ − 827
+ − 828 def longest_suitable_run(seq,minLength,rate):
+ − 829 maxEndingHere = 0
+ − 830 maxSoFar = 0
+ − 831 start = None
+ − 832
+ − 833 for ix in xrange(len(seq)):
+ − 834 if (seq[ix] == "+"): s = 1-rate
+ − 835 else: s = -rate
+ − 836
+ − 837 if (maxEndingHere+s < 0):
+ − 838 maxEndingHere = 0
+ − 839 block = ix
+ − 840 else:
+ − 841 maxEndingHere += s
+ − 842 if (maxEndingHere >= maxSoFar):
+ − 843 maxSoFar = maxEndingHere
+ − 844 start = block + 1
+ − 845 end = ix + 1
+ − 846
+ − 847 if (start == None) or (end - start < minLength):
+ − 848 return []
+ − 849 else:
+ − 850 return [(start,end)]
+ − 851
+ − 852
+ − 853 # all_suitable_runs--
+ − 854 # Find all non-overlapping runs with a good enough rate (of positives), and
+ − 855 # which meet our length threshold.
+ − 856 # $$$ this needs to post-process the intervals, splitting them to account for
+ − 857 # $$$ this situation:
+ − 858 # $$$ sequence: ACGACGACGACGTTATTATTATTA
+ − 859 # $$$ matches: +++++++++---+++++++++
+ − 860 # $$$ this is currently reported as one interval (if rate <= 6/7), but it
+ − 861 # $$$ ought to be two
+ − 862
+ − 863 def all_suitable_runs(seq,minCorrLength,rate, hammingThreshold):
+ − 864
+ − 865 ################################################################
+ − 866 # modified by Chen Sun on 07/11/2014
+ − 867 #
+ − 868 ################################################################
+ − 869
+ − 870 if hammingThreshold > 0:
+ − 871 return [(start,end) for (start,end) in dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength)]
+ − 872 elif hammingThreshold == 0:
+ − 873 return [(start,end) for (start,end) in non_negative_intervals(seq, minLength=minCorrLength)]
+ − 874
+ − 875
+ − 876 # find_repeat_element--
+ − 877 # Find the most plausible repeat element for a run, and nudge the ends of
+ − 878 # the run if needed. Note that we will not consider kmers that represent
+ − 879 # shorter repeats. For example, we won't report ACTACT as a 6-mer since we
+ − 880 # consider this to have a shorter period than 6.
+ − 881
+ − 882 def find_repeat_element(hammingThreshold, period,seq,start,end,allowPartials=False):
+ − 883
+ − 884 if hammingThreshold > 0:
+ − 885 (kmer,bestD,bestStart,bestEnd) = find_hamming_repeat_element(period,seq,start,end,allowPartials)
+ − 886 return (kmer,bestD,bestStart,bestEnd)
+ − 887 # count the number of occurences of each k-mer; note that we can't
+ − 888 # reject kmers containing smaller repeats yet, since for a sequence like
+ − 889 # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
+ − 890 # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end
+ − 891 # up reporting something like ACACAA as the best motif
+ − 892
+ − 893 if ("element" in debug):
+ − 894 print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
+ − 895
+ − 896 if ("partial" in debug):
+ − 897 print period, seq, start, end, allowPartials;
+ − 898 print seq[start:end]
+ − 899
+ − 900 kmerToCount = {}
+ − 901 kmerToFirst = {}
+ − 902 for ix in xrange(start,end-(period-1)):
+ − 903 kmer = seq[ix:ix+period]
+ − 904 if ("N" in kmer): continue
+ − 905 if (kmer not in kmerToCount):
+ − 906 kmerToCount[kmer] = 1
+ − 907 kmerToFirst[kmer] = ix
+ − 908 else:
+ − 909 kmerToCount[kmer] += 1
+ − 910 #if ("element" in debug):
+ − 911 # print >>stderr, " %d: %s" % (ix,kmer)
+ − 912
+ − 913 # choose the best k-mer; this is simply the most frequently occurring one,
+ − 914 # with ties broken by whichever one came first
+ − 915
+ − 916 kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
+ − 917 if (kmers == []): return (None,None,start,end)
+ − 918 kmers.sort()
+ − 919
+ − 920 if ("element" in debug):
+ − 921 for (count,first,kmer) in kmers:
+ − 922 print >>stderr, " %s: %d" % (kmer,-count)
+ − 923
+ − 924 (count,first,kmer) = kmers[0]
+ − 925 if (contains_repeat(kmer)): return (None,None,start,end)
+ − 926
+ − 927 # determine the hamming distance between the run and a simple repeat, for
+ − 928 # each "plausible" start and end; we compute the distance for each such
+ − 929 # interval, and choose the one with the lowest hamming distance; ties are
+ − 930 # broken in a deterministic-but-unspecified manner
+ − 931
+ − 932 bestD = bestStart = bestEnd = None
+ − 933 ###################################################################################
+ − 934 # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/18/2013
+ − 935 # since we do not allow hamming_distance > 0, which means we do not allow mutation,
+ − 936 # we do not need this section to produce bestStart and End
+ − 937 ###################################################################################
+ − 938
+ − 939 #for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
+ − 940 # d = hamming_distance(seq,s,e,kmer)
+ − 941 # if (d == None): continue
+ − 942 # if (bestD == None) or (d <= bestD):
+ − 943 # (bestD,bestStart,bestEnd) = (d,s,e)
+ − 944
+ − 945
+ − 946
+ − 947 bestStart = start
+ − 948
+ − 949 if(allowPartials):
+ − 950 bestEnd = end
+ − 951 elif(not allowPartials):
+ − 952 bestEnd = start
+ − 953 pattern = seq[start:start+period]
+ − 954 if ("partial" in debug):
+ − 955 print "kmer:", kmer
+ − 956 if(pattern != kmer):
+ − 957 print "pattern:", pattern
+ − 958
+ − 959 while(bestEnd <= end-period):
+ − 960 bestEnd += period
+ − 961
+ − 962 # bestD will always be 0, as we do not allow mutation
+ − 963 bestD = 0
+ − 964
+ − 965 if ("partial" in debug):
+ − 966 print bestD, bestStart, bestEnd
+ − 967
+ − 968 ###################################################################################
+ − 969 # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/10
+ − 970 #
+ − 971 ###################################################################################
+ − 972 return (kmer,bestD,bestStart,bestEnd)
+ − 973
+ − 974
+ − 975 def find_hamming_repeat_element(period,seq,start,end,allowPartials=False):
+ − 976
+ − 977 # count the number of occurences of each k-mer; note that we can't
+ − 978 # reject kmers containing smaller repeats yet, since for a sequence like
+ − 979 # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
+ − 980 # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end
+ − 981 # up reporting something like ACACAA as the best motif
+ − 982
+ − 983 if ("element" in debug):
+ − 984 print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
+ − 985
+ − 986 kmerToCount = {}
+ − 987 kmerToFirst = {}
+ − 988 for ix in xrange(start,end-(period-1)):
+ − 989 kmer = seq[ix:ix+period]
+ − 990 if ("N" in kmer): continue
+ − 991 if (kmer not in kmerToCount):
+ − 992 kmerToCount[kmer] = 1
+ − 993 kmerToFirst[kmer] = ix
+ − 994 else:
+ − 995 kmerToCount[kmer] += 1
+ − 996 #if ("element" in debug):
+ − 997 # print >>stderr, " %d: %s" % (ix,kmer)
+ − 998
+ − 999 # choose the best k-mer; this is simply the most frequently occurring one,
+ − 1000 # with ties broken by whichever one came first
+ − 1001
+ − 1002 kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
+ − 1003 if (kmers == []): return (None,None,start,end)
+ − 1004 kmers.sort()
+ − 1005
+ − 1006 if ("element" in debug):
+ − 1007 for (count,first,kmer) in kmers:
+ − 1008 print >>stderr, " %s: %d" % (kmer,-count)
+ − 1009
+ − 1010 (count,first,kmer) = kmers[0]
+ − 1011 if (contains_repeat(kmer)): return (None,None,start,end)
+ − 1012
+ − 1013 # determine the hamming distance between the run and a simple repeat, for
+ − 1014 # each "plausible" start and end; we compute the distance for each such
+ − 1015 # interval, and choose the one with the lowest hamming distance; ties are
+ − 1016 # broken in a deterministic-but-unspecified manner
+ − 1017
+ − 1018 bestD = bestStart = bestEnd = None
+ − 1019
+ − 1020 for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
+ − 1021 d = hamming_distance(seq,s,e,kmer)
+ − 1022 if (d == None): continue
+ − 1023 if (bestD == None) or (d <= bestD):
+ − 1024 (bestD,bestStart,bestEnd) = (d,s,e)
+ − 1025
+ − 1026 return (kmer,bestD,bestStart,bestEnd)
+ − 1027
+ − 1028 # plausible_intervals--
+ − 1029 # Yield all plausible intervals intersecting with a run. We generate all
+ − 1030 # starts within P bp of the run's start. For each of these, we either (a) try
+ − 1031 # all ends within P bp of run's end, or (b) trim the new interval to a whole
+ − 1032 # multiple of the period, and report this short interval and the longer
+ − 1033 # interval with one more period appended. Case (a) allows partial motifs,
+ − 1034 # while case (b) only allows whole motifs.
+ − 1035
+ − 1036 def plausible_intervals(start,end,period,seqLen,allowPartials=False):
+ − 1037
+ − 1038 # generate intervals that allow a partial copy of the motif
+ − 1039
+ − 1040 if (allowPartials):
+ − 1041 for candStart in xrange(start-(period-1),start+period):
+ − 1042 if (candStart < 0): continue
+ − 1043 for candEnd in xrange(end-(period-1),end+period):
+ − 1044 if (candEnd > seqLen): continue
+ − 1045 if (candEnd <= candStart+period): continue
+ − 1046 yield (candStart,candEnd)
+ − 1047
+ − 1048 # -OR- generate intervals that allow only whole copies of the motif
+ − 1049
+ − 1050 else:
+ − 1051 for candStart in xrange(start-(period-1),start+period):
+ − 1052 if (candStart < 0): continue
+ − 1053 candEnd = candStart + ((end-candStart)/period)*period
+ − 1054 yield (candStart,candEnd)
+ − 1055 candEnd += period
+ − 1056 if (candEnd <= seqLen): yield (candStart,candEnd)
+ − 1057
+ − 1058
+ − 1059 # hamming_distance--
+ − 1060 # Determine the hamming distance between the run and a simple repeat.
+ − 1061 # $$$ improve this by allowing gaps, and stopping when we reach a threshold
+ − 1062
+ − 1063 kmerToDiffs = {} # (this is used for memo-ization)
+ − 1064
+ − 1065 def hamming_distance(seq,start,end,kmer):
+ − 1066 period = len(kmer)
+ − 1067 if (end < start + period): return None
+ − 1068
+ − 1069 wholeEnd = start + ((end-start)/period)*period
+ − 1070
+ − 1071 if (kmer not in kmerToDiffs):
+ − 1072 kmerToDiffs[kmer] = { kmer:0 }
+ − 1073
+ − 1074 d = 0
+ − 1075 for ix in xrange(start,wholeEnd,period):
+ − 1076 qmer = seq[ix:ix+period] # same size as the kmer motif
+ − 1077 if (qmer in kmerToDiffs[kmer]):
+ − 1078 d += kmerToDiffs[kmer][qmer]
+ − 1079 continue
+ − 1080 diffs = 0
+ − 1081 for iy in xrange(0,period):
+ − 1082 if (qmer[iy] != kmer[iy]): diffs += 1
+ − 1083 kmerToDiffs[kmer][qmer] = diffs
+ − 1084 d += diffs
+ − 1085
+ − 1086 if (end > wholeEnd):
+ − 1087 qmer = seq[wholeEnd:end] # shorter than the kmer motif
+ − 1088 if (qmer in kmerToDiffs[kmer]):
+ − 1089 d += kmerToDiffs[kmer][qmer]
+ − 1090 else:
+ − 1091 diffs = 0
+ − 1092 for iy in xrange(0,len(qmer)):
+ − 1093 if (qmer[iy] != kmer[iy]): diffs += 1
+ − 1094 kmerToDiffs[kmer][qmer] = diffs
+ − 1095 d += diffs
+ − 1096
+ − 1097 return d
+ − 1098
+ − 1099
+ − 1100 # fasta_sequences--
+ − 1101 # Read the fasta sequences from a file. Note that we convert to upper case,
+ − 1102 # and convert any letter other than ACGT to N.
+ − 1103
+ − 1104 nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN")
+ − 1105
+ − 1106 def fasta_sequences(f):
+ − 1107 seqName = None
+ − 1108 seqNucs = None
+ − 1109
+ − 1110 for line in f:
+ − 1111 line = line.strip()
+ − 1112 if (line.startswith(">")):
+ − 1113 if (seqName != None):
+ − 1114 yield (seqName,"".join(seqNucs))
+ − 1115 seqName = sequence_name(line)
+ − 1116 seqNucs = []
+ − 1117 elif (seqName == None):
+ − 1118 assert (False), "first sequence has no header"
+ − 1119 else:
+ − 1120 seqNucs += [line]
+ − 1121
+ − 1122 if (seqName != None):
+ − 1123 yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
+ − 1124
+ − 1125
+ − 1126 # fastq_sequences--
+ − 1127 # Read the fastq sequences from a file. Note that we convert to upper case,
+ − 1128 # and convert any letter other than ACGT to N.
+ − 1129
+ − 1130 def fastq_sequences(f):
+ − 1131 lineNum = 0
+ − 1132 for line in f:
+ − 1133 lineNum += 1
+ − 1134 line = line.strip()
+ − 1135
+ − 1136 if (lineNum % 4 == 1):
+ − 1137 assert (line.startswith("@")), \
+ − 1138 "bad read name at line %d" % lineNum
+ − 1139 seqName = line[1:]
+ − 1140 continue
+ − 1141
+ − 1142 if (lineNum % 4 == 2):
+ − 1143 seqNucs = line
+ − 1144 continue
+ − 1145
+ − 1146 if (lineNum % 4 == 3):
+ − 1147 assert (line.startswith("+")), \
+ − 1148 "can't understand line %d:\n%s" % (lineNum,line)
+ − 1149 continue
+ − 1150
+ − 1151 quals = line
+ − 1152 assert (len(quals) == len(seqNucs)), \
+ − 1153 "length mismatch read vs. qualities at line %d" % lineNum
+ − 1154 yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap),quals)
+ − 1155
+ − 1156 assert (lineNum % 4 == 0), \
+ − 1157 "incomplete read at end of file"
+ − 1158
+ − 1159 def sam_sequences(f):
+ − 1160 lineNum = 0
+ − 1161 for line in f:
+ − 1162 lineNum += 1
+ − 1163 line = line.strip()
+ − 1164
+ − 1165 if line.startswith("@"):
+ − 1166 continue
+ − 1167
+ − 1168 columns = line.split("\t")
+ − 1169 seqName = columns[0]
+ − 1170 refName = columns[2]
+ − 1171 pre_s = int(columns[3]) - 1
+ − 1172 cigar = columns[5]
+ − 1173 seqNucs = columns[9]
+ − 1174
+ − 1175 yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar)
+ − 1176
+ − 1177 # sequence_name--
+ − 1178 # Extract the sequence name from a fasta header.
+ − 1179 # $$$ this may need to be improved $$$
+ − 1180
+ − 1181 def sequence_name(s):
+ − 1182 s = s[1:].strip()
+ − 1183 if (s == ""): return ""
+ − 1184 else: return s.split()[0]
+ − 1185
+ − 1186
+ − 1187 # nucleotide_runs--
+ − 1188 # Yield (start,end) for all runs of valid nucleotides in a sequence.
+ − 1189
+ − 1190 def nucleotide_runs(s):
+ − 1191 runs = []
+ − 1192 start = None
+ − 1193 for (ix,nuc) in enumerate(s):
+ − 1194 if (nuc in "ACGT"):
+ − 1195 if (start == None):
+ − 1196 start = ix
+ − 1197 else:
+ − 1198 if (start != None):
+ − 1199 yield (start,ix)
+ − 1200 start = None
+ − 1201
+ − 1202 if (start != None): yield (start,len(s))
+ − 1203
+ − 1204
+ − 1205 # contains_repeat--
+ − 1206 # Determine whether a short sequence contains a repeated element, such as a
+ − 1207 # 6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT). The repeat
+ − 1208 # must cover the entire sequence, without mismatches.
+ − 1209
+ − 1210 def contains_repeat(kmer):
+ − 1211 kmerLength = len(kmer)
+ − 1212 hasRepeat = False
+ − 1213 rptLen = 1
+ − 1214 while (not hasRepeat) and (2 * rptLen <= kmerLength):
+ − 1215 if (kmerLength % rptLen != 0):
+ − 1216 rptLen += 1
+ − 1217 continue
+ − 1218 isRepeat = True
+ − 1219 for i in xrange(rptLen,kmerLength,rptLen):
+ − 1220 if (kmer[i:i+rptLen] != kmer[:rptLen]):
+ − 1221 isRepeat = False
+ − 1222 break
+ − 1223 if (isRepeat):
+ − 1224 hasRepeat = True
+ − 1225 break
+ − 1226 rptLen += 1
+ − 1227 return hasRepeat
+ − 1228
+ − 1229
+ − 1230 # hash108--
+ − 1231 # Return a 108-bit hash "value" of a string
+ − 1232
+ − 1233 def hash108(s):
+ − 1234 m = md5_new()
+ − 1235 m.update(s)
+ − 1236 return m.hexdigest()[:27]
+ − 1237
+ − 1238
+ − 1239 # float_or_fraction--
+ − 1240 # Convert a string to a number, allowing fractions
+ − 1241
+ − 1242 def float_or_fraction(s):
+ − 1243 if ("/" in s):
+ − 1244 (numer,denom) = s.split("/",1)
+ − 1245 return float(numer)/float(denom)
+ − 1246 else:
+ − 1247 return float(s)
+ − 1248
+ − 1249
+ − 1250 # int_with_unit--
+ − 1251 # Parse a string as an integer, allowing unit suffixes
+ − 1252
+ − 1253 def int_with_unit(s):
+ − 1254 if (s.endswith("K")):
+ − 1255 multiplier = 1000
+ − 1256 s = s[:-1]
+ − 1257 elif (s.endswith("M")):
+ − 1258 multiplier = 1000 * 1000
+ − 1259 s = s[:-1]
+ − 1260 elif (s.endswith("G")):
+ − 1261 multiplier = 1000 * 1000 * 1000
+ − 1262 s = s[:-1]
+ − 1263 else:
+ − 1264 multiplier = 1
+ − 1265
+ − 1266 try: return int(s) * multiplier
+ − 1267 except ValueError: return int(math.ceil(float(s) * multiplier))
+ − 1268
+ − 1269
+ − 1270 if __name__ == "__main__": main()
+ − 1271