Mercurial > repos > arkarachai-fungtammasan > microsatellite_ngs
diff microsatellite.py @ 0:20ab85af9505
Uploaded
author | arkarachai-fungtammasan |
---|---|
date | Fri, 03 Oct 2014 20:54:30 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microsatellite.py Fri Oct 03 20:54:30 2014 -0400 @@ -0,0 +1,1271 @@ +#!/usr/bin/env python +""" +Snoop thru a fasta file looking for microsatellite repeats of given periods +Output format: length_of_repeat left_flank_length right_flank_length repeat_motif hamming_distance read_name read_sequence read_quality (additional columns) + +If --r option turned on, output format will have additional columns behind: +read_name read_chr pre_s pre_e tr_s tr_e suf_s suf_e tr_len tr_ref_seq + +pre_s where the read start +pre_e the last position before microsatellite +tr_s where microsatellite start +tr_e where microsatellite end +suf_s first base after microsatellite +tr_ref_seq reference sequence corresponding to microsatellite + +* output positions are 0 based + +:Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu) + +modifing log: + +09/27/2013 +replace function dense_intervals with function non_negative_intervals, which do not need to import such file. + +10/18/2013 +modify function find_repeat_element to get a quick speed, under the condition that hamming_distance = 0, which means do not allowed any mutation/indel + +02/25/2014 +add function that can deal with mapped reads +with additional output + +02/28/2014 +modify the 0-based end point, as in 0-base area, it is half-open [ ) +so the 0-based site, should always be added by 1 + +03/05/2014 +deal with multi-fasta +""" +from sys import argv,stdin,stderr,exit +from string import maketrans +from md5 import new as md5_new +import re +#from pyfracluster import dense_intervals + +def usage(s=None): + message = """ +usage: microsat_snoop [fasta_file] [options] + <fasta_file> Name of file to read sequences from; if absent, + sequences are read from stdin + --fasta Input file is in fasta format + (this is the default) + --fastq Input file is in fastq format + (default is fasta unless filename is .fastq) + --fastq:noquals Input file is in fastq format, but discard quals + --sam Input file is SAM file + --r Indicate additional output information, if indicated, + --ref option is mendatory + --ref=<filepath> Reference file (absolute) path + --period=<length> (mandatory,cumulative) repeat length(s) to be + searched for + <length> is expected to be small, less than 10 + <length> can also be a comma-separated list, or + a range <low>..<high> + --rate=<fraction> control the candidate repeat interval detector; + it will consider intervals with at least + <fraction> of matches when shifted by the period; + <fraction> is between 0 and 1 and can be either a + real number or <n>/<d> + (default is 6/7) + --minlength=<length> minimum length of intervals reported, in bp + (default is 20) + --progress=<count> how often to report the sequence we're searching + (default is no progress report) + --allowduplicates process all input sequences + (this is the default) + --noduplicates ignore any input sequence that's the same as an + earlier sequence + --nonearduplicates ignore any input sequence that has the same first + 100 bp as an earlier sequence + --nonearduplicate=<length> ignore any input sequence that has the same first + <length> bp as an earlier sequence + --hamming=<count> Don't report candidate repeat intervals that have + more than <count> mismatches + (default is to do no such filtering) + --prefix=<length> Don't report candidate repeat intervals that + start within <length> of the sequence start + (default is to do no such filtering) + --suffix=<length> Don't report candidate repeat intervals that + end within <length> of the sequence end + (default is to do no such filtering) + --subsample=<k>/<n> Process only the <k>th sequence of every group of + <n> sequences; <k> ranges from 1 to <n> + --multipleruns Consider all candidate intervals in a sequence + (default is to consider only the longest) + --partialmotifs Consider microatelites with a partial motif + (default is to consider only whole motifs) + --splitbyvalidity Preprocess sequences, splitting at Ns; this + prevents candidates from including Ns + (default is not to split) + --noflankdisplay Show entire sequence as flanking regions + (this is the default) + --flankdisplay=<length> Limit length of flanking regions shown + --readnamesuffix=<string> Root of suffix to append to read names; e.g. 1 + for forward, 2 for reverse; this triggers other + info to be included in the suffix + (default is "1" for fastq; no suffix for fasta) + --head=<number> limit the number of sequences processed + --markend Write a marker line upon completion + (default is not to write a marker) + --help=details Describe the process, and quit""" + + if (s == None): exit (message) + else: exit ("%s\n%s" % (s,message)) + + +detailedDescription = """In broad terms, the process works as follows: + +(1) Identify intervals that are highly correlated with the interval shifted by + P (the repeat period). These intervals are called "runs" or "candidates". + The level of correlation required is controlled by rateThreshold. + Depending on whether we want to look for more than one microsat, we either + find the longest such run (simple algorithm) or many runs (more complicated + algorithm). The following steps are then performed on each run. + +(2) Find the most likely repeat motif in the run. This is done by counting + all kmers (of length P) and choosing the most frequent. If that kmer is + itself covered by a sub-repeat we discard this run. The idea is that we + can ignore a 6-mer like ACGACG because we will find it when we are looking + for 3-mers. + +(3) Once we identify the most likely repeat motif, we then modify the + interval, adjusting start and end to find the interval that has the fewest + mismatches vs. a sequence of the motif repeated (hamming distance). Only + whole copies of the motif are considered. + +(4) At this point we have a valid microsat interval (in the eyes of the + program). It is subjected to some filtering stages (hamming distance or too + close to an end), and if it satisfies those conditions, it's reported to + the user.""" + +def main(): + global debug + + #=== parse the command line === + + inputFilename = None + referenceFileName = None #add by Chen Sun on 02/25 + inputFormat = None + repeatPeriods = [] + rateThreshold = 6 / 7.0 + lengthThreshold = 20 + reportProgress = None + discardDuplicates = False + discardNearDuplicates = False + nearDuplicatePrefix = 100 + hammingThreshold = 0 + prefixThreshold = None + suffixThreshold = None + subsampleK = None + subsampleN = None + reportMultipleRuns = False + allowPartialMotifs = False + splitByValidity = False + flankDisplayLimit = None + readNameSuffix = None + headLimit = None + markEndOfFile = False + additionalInfo = False + debug = [] + + for arg in argv[1:]: + if (arg == "--fasta"): + inputFormat = "fasta" + elif (arg == "--fastq"): + inputFormat = "fastq" + elif (arg == "--fastq:noquals"): + inputFormat = "fastq:noquals" + elif (arg == "--sam"): + inputFormat = "sam" + elif (arg == "--r"): + additionalInfo = True + elif (arg.startswith("--ref=")): + referenceFileName = arg.split("=",1)[1] + elif (arg.startswith("--period=")): + val = arg.split("=",1)[1] + for period in val.split(","): + if (".." in period): + (lowPeriod,highPeriod) = period.split("..",1) + lowPeriod = int(lowPeriod) + highPeriod = int(highPeriod) + for period in xrange(lowPeriod,highPeriod+1): + repeatPeriods += [period] + else: + repeatPeriods += [int(period)] + elif (arg.startswith("--rate=")): + val = arg.split("=",1)[1] + rateThreshold = float_or_fraction(val) + assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val + elif (arg.startswith("--minlength=")): + val = arg.split("=",1)[1] + lengthThreshold = int(val) + assert (lengthThreshold >= 0) + elif (arg.startswith("--progress=")): + val = arg.split("=",1)[1] + reportProgress = int(val) + elif (arg == "--allowduplicates"): + discardDuplicates = False + discardNearDuplicates = False + elif (arg == "--noduplicates"): + discardDuplicates = True + discardNearDuplicates = False + elif (arg == "--nonearduplicates"): + discardDuplicates = False + discardNearDuplicates = True + elif (arg.startswith("--nonearduplicate=")): + val = arg.split("=",1)[1] + discardDuplicates = False + discardNearDuplicates = True + nearDuplicatePrefix = int(val) + assert (nearDuplicatePrefix > 0) + elif (arg.startswith("--hamming=")): + val = arg.split("=",1)[1] + hammingThreshold = int(val) + assert (hammingThreshold >= 0) + elif (arg.startswith("--prefix=")): + val = arg.split("=",1)[1] + prefixThreshold = int(val) + assert (prefixThreshold >= 0) + elif (arg.startswith("--suffix=")): + val = arg.split("=",1)[1] + suffixThreshold = int(val) + assert (suffixThreshold >= 0) + elif (arg.startswith("--subsample=")): + val = arg.split("=",1)[1] + (k,n) = val.split("/",2) + subsampleK = int(k) + subsampleN = int(n) + assert (0 < subsampleK <= subsampleN) + elif (arg == "--multipleruns"): + reportMultipleRuns = True + elif (arg == "--partialmotifs"): + allowPartialMotifs = True + elif (arg == "--splitbyvalidity"): + splitByValidity = True + elif (arg == "--noflankdisplay"): + flankDisplayLimit = None + elif (arg.startswith("--flankdisplay=")): + val = arg.split("=",1)[1] + flankDisplayLimit = int(val) + assert (flankDisplayLimit >= 0) + elif (arg.startswith("--readnamesuffix")): + readNameSuffix = arg.split("=",1)[1] + elif (arg.startswith("--head=")): + headLimit = int_with_unit(arg.split("=",1)[1]) + elif (arg == "--markend"): + markEndOfFile = True + elif (arg == "--help=details"): + exit (detailedDescription) + elif (arg.startswith("--debug=")): + debug += (arg.split("=",1)[1]).split(",") + elif (arg.startswith("--")): + usage("unrecognized option: %s" % arg) + elif (inputFilename == None): + inputFilename = arg + else: + usage("unrecognized option: %s" % arg) + + #=== determine periods of interest === + + if (repeatPeriods == []): + usage("you gotta give me a repeat period") + + if (additionalInfo == True): + if (referenceFileName == None): + usage("reference file path needed. use --ref=<reference> to indicate") + + periodSeed = {} + for period in repeatPeriods: + if (period < 1): usage("period %d is not valid" % period) + periodSeed[period] = True + + repeatPeriods = [period for period in periodSeed] + repeatPeriods.sort() + + #=== determine input format === + + if (inputFormat == "fasta"): sequence_reader = fasta_sequences + elif (inputFormat == "fastq"): sequence_reader = fastq_sequences + elif (inputFormat == "fastq:noquals"): sequence_reader = fastq_sequences + elif (inputFormat == "sam"): sequence_reader = sam_sequences + elif (inputFilename == None): sequence_reader = fasta_sequences + elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences + elif (inputFilename.endswith(".fq")): sequence_reader = fastq_sequences + elif (inputFilename.endswith(".sam")): sequence_reader = sam_sequences + else: sequence_reader = fasta_sequences + + if (inputFilename != None): inputF = file(inputFilename,"rt") + else: inputF = stdin + + if (readNameSuffix == None) \ + and (sequence_reader == fastq_sequences) \ + and (inputFormat != "fastq:noquals"): + readNameSuffix = "1" + + #=== process the sequences === + + refSequence = {} + rightName = "" + sequence = "" + if additionalInfo: + firstFasta = True + originalRefF = open(referenceFileName) + for line in originalRefF.readlines(): + line = line.replace('\r','') + line = line.replace('\n','') + if line.startswith(">"): + if firstFasta: + firstFasta = False + else: + refSequence[rightName] = sequence + rightName = line[1:] + sequence = "" + continue + sequence += line + originalRefF.close() + refSequence[rightName] = sequence + + sequenceSeen = {} + + numSequences = 0 + for seqInfo in sequence_reader(inputF): + numSequences += 1 + if (headLimit != None) and (numSequences > headLimit): + print >>stderr, "limit of %d sequences reached" % headLimit + break + + if (sequence_reader == sam_sequences): + #seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar + (name, sequence, refName, pre_s, cigar) = seqInfo + quals = None + elif (sequence_reader == fastq_sequences): + (name,sequence,quals) = seqInfo + if (inputFormat == "fastq:noquals"): quals = None + else: + (name,sequence) = seqInfo + quals = None + + if (reportProgress != None) and (numSequences % reportProgress == 0): + print >>stderr, "%s %d" % (name,numSequences) + + # if we're subsampling and not interested in this sequence, skip it + + if (subsampleN != None): + if ((numSequences-1) % subsampleN != (subsampleK-1)): + continue + + # if this sequence is shorter than the length of interest, skip it + + seqLen = len(sequence) + if (seqLen < period) or (seqLen < lengthThreshold): continue + + # if we're not interested in duplicates and this is one, skip it; + # note that we assume no hash collisions occur, i.e. that all hash + # matches are truly sequence matches + + if (discardDuplicates): + h = hash108(sequence) + if (h in sequenceSeen): continue + sequenceSeen[h] = True + elif (discardNearDuplicates): + h = hash108(sequence[:nearDuplicatePrefix]) + if (h in sequenceSeen): continue + sequenceSeen[h] = True + + # split the sequence into chunks of valid nucleotides + + if (splitByValidity): + chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)] + else: + chunks = [(0,len(sequence))] + + # evaluate for each period of interest + + for period in repeatPeriods: + + # operate on each chunk + + for (chunkStart,chunkEnd) in chunks: + chunkLen = chunkEnd - chunkStart + if (chunkLen < period) or (chunkLen < lengthThreshold): continue + + if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug): + print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd) + + # compute correlation sequence + + corr = correlation_sequence(sequence,period,chunkStart,chunkEnd) + + if ("correlation" in debug) or ("runs" in debug): + print >>stderr, sequence[chunkStart:chunkEnd] + print >>stderr, corr + + # find runs (candidates for being a microsat) + + if (reportMultipleRuns): + runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold, hammingThreshold) + else: + runs = longest_suitable_run(corr,lengthThreshold,rateThreshold) + if (runs == []): continue + + + if ("runs" in debug): + for (start,end) in runs: + run = [" "] * seqLen + for ix in xrange(start-period,end): + run[ix] = "*" + print >>stderr, "".join(run) + + if ("candidates" in debug): + for (start,end) in runs: + print >>stderr, "%s %d %d" % (name,start,end) + + # process runs and report those that pass muster + + runCount = 0 + for (start,end) in runs: + runCount += 1 + + start = chunkStart + start - period + end = chunkStart + end + + (kmer,d,start,end) = find_repeat_element(hammingThreshold, period,sequence,start,end,allowPartials=allowPartialMotifs) + if (kmer == None): continue # (no useful repeat kmer was found) + + rptExtent = end - start + prefixLen = start + suffixLen = seqLen - end + if (rptExtent <= period): continue + if (hammingThreshold != None) and (d > hammingThreshold): continue + if (prefixThreshold != None) and (prefixLen < prefixThreshold): continue + if (suffixThreshold != None) and (suffixLen < suffixThreshold): continue + + if (flankDisplayLimit == None): + seq = sequence[:start] \ + + sequence[start:end].lower() \ + + sequence[end:] + else: + seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \ + + sequence[start:end].lower() \ + + sequence[end:min(chunkEnd,end+flankDisplayLimit)] + reportName = name + if (readNameSuffix != None): + reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount) + if (quals == None or quals == "." or quals == "\t."): quals = "\t." + else: quals = "\t" + quals + if not additionalInfo: + print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \ + % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals) + else: + #pre_e = pre_s + prefixLen - 1 + refPoint = pre_s + donorPoint = 0 + + donorBeforeStart = prefixLen - 1 #pre_e + donorMicroStart = prefixLen #tr_s + donorMicroEnd = donorMicroStart + rptExtent - 1 #tr_e + donorAfterMicro = donorMicroEnd + 1 #suf_s + donorEnd = len(seq) - 1 #suf_e + + set_pre_e = False + set_tr_s = False + set_tr_e = False + set_suf_s = False + set_suf_e = False + + pre_e = 0 + tr_s = 0 + tr_e = 0 + suf_s = 0 + suf_e = 0 + + matchList = re.findall('(\d+)([IDM])', cigar) + unCognitiveCigar = False + for matchN, matchType in matchList: + matchNum = int(matchN) + if matchType == "M": + donorPoint = donorPoint + matchNum + refPoint = refPoint + matchNum + elif matchType == "D": + refPoint = refPoint + matchNum + continue + elif matchType == "I": + donorPoint = donorPoint + matchNum + else: + unCognitiveCigar = True + break + + if not set_pre_e: + if donorPoint >= donorBeforeStart: + pre_e = refPoint - (donorPoint - donorBeforeStart) + set_pre_e = True + else: + continue + + if not set_tr_s: + if donorPoint >= donorMicroStart: + tr_s = refPoint - (donorPoint - donorMicroStart) + set_tr_s = True + else: + continue + + if not set_tr_e: + if donorPoint >= donorMicroEnd: + tr_e = refPoint - (donorPoint - donorMicroEnd) + set_tr_e = True + else: + continue + + if not set_suf_s: + if donorPoint >= donorAfterMicro: + suf_s = refPoint - (donorPoint - donorAfterMicro) + set_suf_s = True + else: + continue + + if not set_suf_e: + if donorPoint >= donorEnd: + suf_e = refPoint - (donorPoint - donorEnd) + set_suf_e = True + else: + continue + + if unCognitiveCigar: + break + tr_len = tr_e - tr_s + 1 + + if refName not in refSequence: + tr_ref_seq = "." + else: + if refSequence[refName] == "": + tr_ref_seq = "." + elif len(refSequence[refName]) <= tr_e: + tr_ref_seq = "." + else: + tr_ref_seq = refSequence[refName][tr_s:tr_e+1] + + pre_e += 1 + tr_e += 1 + suf_e += 1 + print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \ + % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq) + + if (markEndOfFile): + print "# microsat_snoop end-of-file" + + if (inputF != stdin): + inputF.close() + +# non_negative_intervals +# find intervals with exactly + and no - +# from string like this : +++++++++---+++++++++ +def non_negative_intervals(seq, minLength=None): + + start = -1 + end = -1 + firstPlus = 1 + #print seq + for ix in range(len(seq)): # for every char in seq + ch = seq[ix] + if(ch == "+"): + if(firstPlus): + firstPlus = 0 + start = ix + else: + continue + elif(ch == "-"): + if(start >= 0): + end = ix-1 + if((end - start + 1) >= minLength): + yield (start,end+1) + start = -1 + firstPlus = 1 + if(start > 0): + if((ix - start + 1) >= minLength): + yield (start, ix+1) + + +################################################################### +# modified by Chen Sun on 7/11/2014 +# We do not want other modules, so parse these functions inside +# +################################################################### + +# parse a string of the form {positives}/{positives_and_neutrals} + +def parse_spec(s): + if ("/" not in s): raise ValueError + (n,d) = s.split("/",1) + if (not n.startswith("{")) or (not n.endswith("}")): raise ValueError + if (not d.startswith("{")) or (not d.endswith("}")): raise ValueError + + positives = n[1:-1] + d = d[1:-1] + + for ch in positives: + if (ch not in d): raise ValueError + + neutrals = [ch for ch in d if (ch not in positives)] + return (positives,neutrals) + + +# convert a string to a number, allowing fractions + +def float_or_fraction(s): + if ("/" in s): + (numer,denom) = s.split("/",1) + return float(numer)/float(denom) + else: + return float(s) + + +# dense_intervals-- +# Find all non-overlapping runs with a good enough rate (of positives), and +# which meet our length threshold. +# +# The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing +# long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999. +# +# $$$ we use the denominator as the threshold, but we really should use the +# $$$ .. numerator, comparing it to minLength*rate + +def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None): + + if (blockers == None): + blockers = "".join([chr(n) for n in range(1,256) + if (chr(n) not in positives) + and (chr(n) not in neutrals)]) + + stackLeft = [None] # stack with each entry containing five + stackRight = [None] # .. elements; note that entry zero is not + stackLeftScore = [None] # .. used + stackRightScore = [None] + stackLower = [None] + top = 0 + score = 0 + + for ix in range(len(seq)): + ch = seq[ix] + if (ch in blockers): + # emit intervals + + for sp in range(1,top+1): + left = stackLeft [sp] + 1 + right = stackRight[sp] + + while (left < right) and (seq[left] not in positives): left += 1 + while (right > left) and (seq[right] not in positives): right -= 1 + + right += 1 + if (minLength == None) or (right - left >= minLength): + yield (left,right) + + #empty stack + + stackLeft = [None] + stackRight = [None] + stackLeftScore = [None] + stackRightScore = [None] + stackLower = [None] + top = 0 + score = 0 + continue + + if (ch in positives): weight = 1-rate + elif (ch in neutrals): weight = -rate + else: raise ValueError + + score += weight + #if ("algorithm" in debug): + # print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score), + + if (weight < 0): + #if ("algorithm" in debug): + # print >>sys.stderr + continue + + if (top > 0) and (stackRight[top] == ix-1): + # add this site to the interval on top of the stack + + stackRight [top] = ix + stackRightScore[top] = score + + #if ("algorithm" in debug): + # print >>sys.stderr, \ + # " extending [%d] %d-%d %4.1f %4.1f" \ + # % (top, + # stackLeft [top], stackRight [top], + # stackLeftScore[top], stackRightScore[top]), + + else: + # create a one site interval + + top += 1 + if (top >= len(stackLeft)): + stackLeft += [None] + stackRight += [None] + stackLeftScore += [None] + stackRightScore += [None] + stackLower += [None] + + stackLeft [top] = ix - 1 + stackLeftScore [top] = score - weight + stackRight [top] = ix + stackRightScore[top] = score + stackLower [top] = top - 1 + + while (stackLower[top] > 0) \ + and (stackLeftScore[stackLower[top]] > stackLeftScore[top]): + stackLower[top] = stackLower[stackLower[top]] + + #if ("algorithm" in debug): + # print >>sys.stderr, \ + # " creating [%d] %d-%d %4.1f %4.1f -> %d" \ + # % (top, + # stackLeft [top], stackRight [top], + # stackLeftScore[top], stackRightScore[top], + # stackLower [top]), + + # merge intervals; if there is a previous interval with a no-higher + # left score and no-higher right score, merge this interval (and all + # intervening ones) into that one + + while (top > 1) \ + and (stackLower[top] > 0) \ + and (stackRightScore[stackLower[top]] <= stackRightScore[top]): + stackRight [stackLower[top]] = stackRight [top] + stackRightScore[stackLower[top]] = stackRightScore[top] + top = stackLower[top] + + #if ("algorithm" in debug): + # print >>sys.stderr, \ + # "\n%*s merging [%d] %d-%d %4.1f %4.1f" \ + # % (13, "", top, + # stackLeft[top], stackRight [top], + # stackLeftScore[top], stackRightScore[top]), + + #if ("algorithm" in debug): + # print >>sys.stderr + + # emit intervals + + for sp in range(1,top+1): + left = stackLeft [sp] + 1 + right = stackRight[sp] + + while (left < right) and (seq[left] not in positives): left += 1 + while (right > left) and (seq[right] not in positives): right -= 1 + + right += 1 + if (minLength == None) or (right - left >= minLength): + yield (left,right) + + +################################################################### +# modified by Chen Sun on 7/11/2014 +# +################################################################### + +# correlation_sequence-- +# Compute the correlation sequence for a given period. This is a sequence +# of + and - indicating whether the base at a given position matches the one +# P positions earlier (where P is the period). The first P positions are +# blank. Positions with single character runs longer than the period are +# considered as non-matches, unless the period is 1. + +def correlation_sequence(sequence,period,start=None,end=None): + if (start == None): start = 0 + if (end == None): end = len(sequence) + + prevCh = sequence[start] + run = 1 + for ix in xrange(start+1,start+period): + ch = sequence[ix] + if (ch != prevCh): run = 1 + else: run += 1 + prevCh = ch + + corr = [" "] * period + for ix in xrange(start+period,end): + rptCh = sequence[ix-period] + ch = sequence[ix] + if (ch != prevCh): run = 1 + else: run += 1 + if (ch in "ACGT") \ + and (ch == rptCh) \ + and ((period == 1) or (run < period)): + corr += ["+"] + else: + corr += ["-"] + prevCh = ch + + return "".join(corr) + + +# longest_suitable_run-- +# Find longest run with a good enough rate (of positives). +# +# We score a "+" as 1-r and anything else as -r. This is based on the fol- +# lowing derivation (p is the number of "+"s, n is the number of non-"+"s): +# p/(p+n) >= r +# ==> p >= rp + rn +# ==> (1-r)p - rn >= 0 +# +# We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing). +# +# $$$ we use the denominator as the threshold, but we really should use the +# $$$ .. numerator, comparing it to minLength*rate +# +# $$$ this needs to account for $$$ this situation: +# $$$ sequence: ACGACGACGACGTTATTATTATTA +# $$$ matches: +++++++++---+++++++++ +# $$$ this is currently considered to be one interval (if rate <= 6/7), but it +# $$$ ought to be two; we can't just post-process, though, because some other +# $$$ interval might be longer than the longest half of this; maybe what we +# $$$ need to do is consider matches at distances -P and -2P, or if we match +# $$$ -P but that itself was a mismatch, we should carry the mismatch forward + +def longest_suitable_run(seq,minLength,rate): + maxEndingHere = 0 + maxSoFar = 0 + start = None + + for ix in xrange(len(seq)): + if (seq[ix] == "+"): s = 1-rate + else: s = -rate + + if (maxEndingHere+s < 0): + maxEndingHere = 0 + block = ix + else: + maxEndingHere += s + if (maxEndingHere >= maxSoFar): + maxSoFar = maxEndingHere + start = block + 1 + end = ix + 1 + + if (start == None) or (end - start < minLength): + return [] + else: + return [(start,end)] + + +# all_suitable_runs-- +# Find all non-overlapping runs with a good enough rate (of positives), and +# which meet our length threshold. +# $$$ this needs to post-process the intervals, splitting them to account for +# $$$ this situation: +# $$$ sequence: ACGACGACGACGTTATTATTATTA +# $$$ matches: +++++++++---+++++++++ +# $$$ this is currently reported as one interval (if rate <= 6/7), but it +# $$$ ought to be two + +def all_suitable_runs(seq,minCorrLength,rate, hammingThreshold): + + ################################################################ + # modified by Chen Sun on 07/11/2014 + # + ################################################################ + + if hammingThreshold > 0: + return [(start,end) for (start,end) in dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength)] + elif hammingThreshold == 0: + return [(start,end) for (start,end) in non_negative_intervals(seq, minLength=minCorrLength)] + + +# find_repeat_element-- +# Find the most plausible repeat element for a run, and nudge the ends of +# the run if needed. Note that we will not consider kmers that represent +# shorter repeats. For example, we won't report ACTACT as a 6-mer since we +# consider this to have a shorter period than 6. + +def find_repeat_element(hammingThreshold, period,seq,start,end,allowPartials=False): + + if hammingThreshold > 0: + (kmer,bestD,bestStart,bestEnd) = find_hamming_repeat_element(period,seq,start,end,allowPartials) + return (kmer,bestD,bestStart,bestEnd) + # count the number of occurences of each k-mer; note that we can't + # reject kmers containing smaller repeats yet, since for a sequence like + # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best + # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end + # up reporting something like ACACAA as the best motif + + if ("element" in debug): + print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end) + + if ("partial" in debug): + print period, seq, start, end, allowPartials; + print seq[start:end] + + kmerToCount = {} + kmerToFirst = {} + for ix in xrange(start,end-(period-1)): + kmer = seq[ix:ix+period] + if ("N" in kmer): continue + if (kmer not in kmerToCount): + kmerToCount[kmer] = 1 + kmerToFirst[kmer] = ix + else: + kmerToCount[kmer] += 1 + #if ("element" in debug): + # print >>stderr, " %d: %s" % (ix,kmer) + + # choose the best k-mer; this is simply the most frequently occurring one, + # with ties broken by whichever one came first + + kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount] + if (kmers == []): return (None,None,start,end) + kmers.sort() + + if ("element" in debug): + for (count,first,kmer) in kmers: + print >>stderr, " %s: %d" % (kmer,-count) + + (count,first,kmer) = kmers[0] + if (contains_repeat(kmer)): return (None,None,start,end) + + # determine the hamming distance between the run and a simple repeat, for + # each "plausible" start and end; we compute the distance for each such + # interval, and choose the one with the lowest hamming distance; ties are + # broken in a deterministic-but-unspecified manner + + bestD = bestStart = bestEnd = None + ################################################################################### + # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/18/2013 + # since we do not allow hamming_distance > 0, which means we do not allow mutation, + # we do not need this section to produce bestStart and End + ################################################################################### + + #for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials): + # d = hamming_distance(seq,s,e,kmer) + # if (d == None): continue + # if (bestD == None) or (d <= bestD): + # (bestD,bestStart,bestEnd) = (d,s,e) + + + + bestStart = start + + if(allowPartials): + bestEnd = end + elif(not allowPartials): + bestEnd = start + pattern = seq[start:start+period] + if ("partial" in debug): + print "kmer:", kmer + if(pattern != kmer): + print "pattern:", pattern + + while(bestEnd <= end-period): + bestEnd += period + + # bestD will always be 0, as we do not allow mutation + bestD = 0 + + if ("partial" in debug): + print bestD, bestStart, bestEnd + + ################################################################################### + # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/10 + # + ################################################################################### + return (kmer,bestD,bestStart,bestEnd) + + +def find_hamming_repeat_element(period,seq,start,end,allowPartials=False): + + # count the number of occurences of each k-mer; note that we can't + # reject kmers containing smaller repeats yet, since for a sequence like + # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best + # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end + # up reporting something like ACACAA as the best motif + + if ("element" in debug): + print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end) + + kmerToCount = {} + kmerToFirst = {} + for ix in xrange(start,end-(period-1)): + kmer = seq[ix:ix+period] + if ("N" in kmer): continue + if (kmer not in kmerToCount): + kmerToCount[kmer] = 1 + kmerToFirst[kmer] = ix + else: + kmerToCount[kmer] += 1 + #if ("element" in debug): + # print >>stderr, " %d: %s" % (ix,kmer) + + # choose the best k-mer; this is simply the most frequently occurring one, + # with ties broken by whichever one came first + + kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount] + if (kmers == []): return (None,None,start,end) + kmers.sort() + + if ("element" in debug): + for (count,first,kmer) in kmers: + print >>stderr, " %s: %d" % (kmer,-count) + + (count,first,kmer) = kmers[0] + if (contains_repeat(kmer)): return (None,None,start,end) + + # determine the hamming distance between the run and a simple repeat, for + # each "plausible" start and end; we compute the distance for each such + # interval, and choose the one with the lowest hamming distance; ties are + # broken in a deterministic-but-unspecified manner + + bestD = bestStart = bestEnd = None + + for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials): + d = hamming_distance(seq,s,e,kmer) + if (d == None): continue + if (bestD == None) or (d <= bestD): + (bestD,bestStart,bestEnd) = (d,s,e) + + return (kmer,bestD,bestStart,bestEnd) + +# plausible_intervals-- +# Yield all plausible intervals intersecting with a run. We generate all +# starts within P bp of the run's start. For each of these, we either (a) try +# all ends within P bp of run's end, or (b) trim the new interval to a whole +# multiple of the period, and report this short interval and the longer +# interval with one more period appended. Case (a) allows partial motifs, +# while case (b) only allows whole motifs. + +def plausible_intervals(start,end,period,seqLen,allowPartials=False): + + # generate intervals that allow a partial copy of the motif + + if (allowPartials): + for candStart in xrange(start-(period-1),start+period): + if (candStart < 0): continue + for candEnd in xrange(end-(period-1),end+period): + if (candEnd > seqLen): continue + if (candEnd <= candStart+period): continue + yield (candStart,candEnd) + + # -OR- generate intervals that allow only whole copies of the motif + + else: + for candStart in xrange(start-(period-1),start+period): + if (candStart < 0): continue + candEnd = candStart + ((end-candStart)/period)*period + yield (candStart,candEnd) + candEnd += period + if (candEnd <= seqLen): yield (candStart,candEnd) + + +# hamming_distance-- +# Determine the hamming distance between the run and a simple repeat. +# $$$ improve this by allowing gaps, and stopping when we reach a threshold + +kmerToDiffs = {} # (this is used for memo-ization) + +def hamming_distance(seq,start,end,kmer): + period = len(kmer) + if (end < start + period): return None + + wholeEnd = start + ((end-start)/period)*period + + if (kmer not in kmerToDiffs): + kmerToDiffs[kmer] = { kmer:0 } + + d = 0 + for ix in xrange(start,wholeEnd,period): + qmer = seq[ix:ix+period] # same size as the kmer motif + if (qmer in kmerToDiffs[kmer]): + d += kmerToDiffs[kmer][qmer] + continue + diffs = 0 + for iy in xrange(0,period): + if (qmer[iy] != kmer[iy]): diffs += 1 + kmerToDiffs[kmer][qmer] = diffs + d += diffs + + if (end > wholeEnd): + qmer = seq[wholeEnd:end] # shorter than the kmer motif + if (qmer in kmerToDiffs[kmer]): + d += kmerToDiffs[kmer][qmer] + else: + diffs = 0 + for iy in xrange(0,len(qmer)): + if (qmer[iy] != kmer[iy]): diffs += 1 + kmerToDiffs[kmer][qmer] = diffs + d += diffs + + return d + + +# fasta_sequences-- +# Read the fasta sequences from a file. Note that we convert to upper case, +# and convert any letter other than ACGT to N. + +nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN") + +def fasta_sequences(f): + seqName = None + seqNucs = None + + for line in f: + line = line.strip() + if (line.startswith(">")): + if (seqName != None): + yield (seqName,"".join(seqNucs)) + seqName = sequence_name(line) + seqNucs = [] + elif (seqName == None): + assert (False), "first sequence has no header" + else: + seqNucs += [line] + + if (seqName != None): + yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap)) + + +# fastq_sequences-- +# Read the fastq sequences from a file. Note that we convert to upper case, +# and convert any letter other than ACGT to N. + +def fastq_sequences(f): + lineNum = 0 + for line in f: + lineNum += 1 + line = line.strip() + + if (lineNum % 4 == 1): + assert (line.startswith("@")), \ + "bad read name at line %d" % lineNum + seqName = line[1:] + continue + + if (lineNum % 4 == 2): + seqNucs = line + continue + + if (lineNum % 4 == 3): + assert (line.startswith("+")), \ + "can't understand line %d:\n%s" % (lineNum,line) + continue + + quals = line + assert (len(quals) == len(seqNucs)), \ + "length mismatch read vs. qualities at line %d" % lineNum + yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap),quals) + + assert (lineNum % 4 == 0), \ + "incomplete read at end of file" + +def sam_sequences(f): + lineNum = 0 + for line in f: + lineNum += 1 + line = line.strip() + + if line.startswith("@"): + continue + + columns = line.split("\t") + seqName = columns[0] + refName = columns[2] + pre_s = int(columns[3]) - 1 + cigar = columns[5] + seqNucs = columns[9] + + yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar) + +# sequence_name-- +# Extract the sequence name from a fasta header. +# $$$ this may need to be improved $$$ + +def sequence_name(s): + s = s[1:].strip() + if (s == ""): return "" + else: return s.split()[0] + + +# nucleotide_runs-- +# Yield (start,end) for all runs of valid nucleotides in a sequence. + +def nucleotide_runs(s): + runs = [] + start = None + for (ix,nuc) in enumerate(s): + if (nuc in "ACGT"): + if (start == None): + start = ix + else: + if (start != None): + yield (start,ix) + start = None + + if (start != None): yield (start,len(s)) + + +# contains_repeat-- +# Determine whether a short sequence contains a repeated element, such as a +# 6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT). The repeat +# must cover the entire sequence, without mismatches. + +def contains_repeat(kmer): + kmerLength = len(kmer) + hasRepeat = False + rptLen = 1 + while (not hasRepeat) and (2 * rptLen <= kmerLength): + if (kmerLength % rptLen != 0): + rptLen += 1 + continue + isRepeat = True + for i in xrange(rptLen,kmerLength,rptLen): + if (kmer[i:i+rptLen] != kmer[:rptLen]): + isRepeat = False + break + if (isRepeat): + hasRepeat = True + break + rptLen += 1 + return hasRepeat + + +# hash108-- +# Return a 108-bit hash "value" of a string + +def hash108(s): + m = md5_new() + m.update(s) + return m.hexdigest()[:27] + + +# float_or_fraction-- +# Convert a string to a number, allowing fractions + +def float_or_fraction(s): + if ("/" in s): + (numer,denom) = s.split("/",1) + return float(numer)/float(denom) + else: + return float(s) + + +# int_with_unit-- +# Parse a string as an integer, allowing unit suffixes + +def int_with_unit(s): + if (s.endswith("K")): + multiplier = 1000 + s = s[:-1] + elif (s.endswith("M")): + multiplier = 1000 * 1000 + s = s[:-1] + elif (s.endswith("G")): + multiplier = 1000 * 1000 * 1000 + s = s[:-1] + else: + multiplier = 1 + + try: return int(s) * multiplier + except ValueError: return int(math.ceil(float(s) * multiplier)) + + +if __name__ == "__main__": main() +