| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 """
 | 
| 
 | 
     3 Snoop thru a fasta file looking for microsatellite repeats of given periods
 | 
| 
 | 
     4 Output format: length_of_repeat left_flank_length right_flank_length  repeat_motif  hamming_distance  read_name read_sequence read_quality  (additional columns)
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 If --r option turned on, output format will have additional columns behind:
 | 
| 
 | 
     7 read_name read_chr  pre_s pre_e tr_s  tr_e  suf_s suf_e tr_len  tr_ref_seq
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 pre_s           where the read start
 | 
| 
 | 
    10 pre_e           the last position before microsatellite
 | 
| 
 | 
    11 tr_s            where microsatellite start
 | 
| 
 | 
    12 tr_e            where microsatellite end
 | 
| 
 | 
    13 suf_s           first base after microsatellite
 | 
| 
 | 
    14 tr_ref_seq      reference sequence corresponding to microsatellite
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 * output positions are 0 based
 | 
| 
 | 
    17 
 | 
| 
 | 
    18 :Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu)
 | 
| 
 | 
    19           
 | 
| 
 | 
    20 modifing log:
 | 
| 
 | 
    21 
 | 
| 
 | 
    22 09/27/2013
 | 
| 
 | 
    23 replace function dense_intervals with function non_negative_intervals, which do not need to import such file.
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 10/18/2013
 | 
| 
 | 
    26 modify function find_repeat_element to get a quick speed, under the condition that hamming_distance = 0, which means do not allowed any mutation/indel
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 02/25/2014
 | 
| 
 | 
    29 add function that can deal with mapped reads
 | 
| 
 | 
    30 with additional output
 | 
| 
 | 
    31 
 | 
| 
 | 
    32 02/28/2014
 | 
| 
 | 
    33 modify the 0-based end point, as in 0-base area, it is half-open [ )
 | 
| 
 | 
    34 so the 0-based site, should always be added by 1
 | 
| 
 | 
    35 
 | 
| 
 | 
    36 03/05/2014
 | 
| 
 | 
    37 deal with multi-fasta
 | 
| 
 | 
    38 """
 | 
| 
 | 
    39 from sys          import argv,stdin,stderr,exit
 | 
| 
 | 
    40 from string       import maketrans
 | 
| 
 | 
    41 from md5          import new as md5_new
 | 
| 
 | 
    42 import re
 | 
| 
 | 
    43 #from pyfracluster import dense_intervals
 | 
| 
 | 
    44 
 | 
| 
 | 
    45 def usage(s=None):
 | 
| 
 | 
    46     message = """
 | 
| 
 | 
    47 usage: microsat_snoop [fasta_file] [options]
 | 
| 
 | 
    48   <fasta_file>                Name of file to read sequences from;  if absent,
 | 
| 
 | 
    49                               sequences are read from stdin
 | 
| 
 | 
    50   --fasta                     Input file is in fasta format
 | 
| 
 | 
    51                               (this is the default)
 | 
| 
 | 
    52   --fastq                     Input file is in fastq format
 | 
| 
 | 
    53                               (default is fasta unless filename is .fastq)
 | 
| 
 | 
    54   --fastq:noquals             Input file is in fastq format, but discard quals
 | 
| 
 | 
    55   --sam                       Input file is SAM file 
 | 
| 
 | 
    56   --r                         Indicate additional output information, if indicated,
 | 
| 
 | 
    57                               --ref option is mendatory
 | 
| 
 | 
    58   --ref=<filepath>            Reference file (absolute) path
 | 
| 
 | 
    59   --period=<length>           (mandatory,cumulative) repeat length(s) to be
 | 
| 
 | 
    60                               searched for
 | 
| 
 | 
    61                               <length> is expected to be small, less than 10
 | 
| 
 | 
    62                               <length> can also be a comma-separated list, or
 | 
| 
 | 
    63                               a range <low>..<high>
 | 
| 
 | 
    64   --rate=<fraction>           control the candidate repeat interval detector;
 | 
| 
 | 
    65                               it will consider intervals with at least
 | 
| 
 | 
    66                               <fraction> of matches when shifted by the period;
 | 
| 
 | 
    67                               <fraction> is between 0 and 1 and can be either a
 | 
| 
 | 
    68                               real number or <n>/<d>
 | 
| 
 | 
    69                               (default is 6/7)
 | 
| 
 | 
    70   --minlength=<length>        minimum length of intervals reported, in bp
 | 
| 
 | 
    71                               (default is 20)
 | 
| 
 | 
    72   --progress=<count>          how often to report the sequence we're searching
 | 
| 
 | 
    73                               (default is no progress report)
 | 
| 
 | 
    74   --allowduplicates           process all input sequences
 | 
| 
 | 
    75                               (this is the default)
 | 
| 
 | 
    76   --noduplicates              ignore any input sequence that's the same as an
 | 
| 
 | 
    77                               earlier sequence
 | 
| 
 | 
    78   --nonearduplicates          ignore any input sequence that has the same first
 | 
| 
 | 
    79                               100 bp as an earlier sequence
 | 
| 
 | 
    80   --nonearduplicate=<length>  ignore any input sequence that has the same first
 | 
| 
 | 
    81                               <length> bp as an earlier sequence
 | 
| 
 | 
    82   --hamming=<count>           Don't report candidate repeat intervals that have
 | 
| 
 | 
    83                               more than <count> mismatches
 | 
| 
 | 
    84                               (default is to do no such filtering)
 | 
| 
 | 
    85   --prefix=<length>           Don't report candidate repeat intervals that
 | 
| 
 | 
    86                               start within <length> of the sequence start
 | 
| 
 | 
    87                               (default is to do no such filtering)
 | 
| 
 | 
    88   --suffix=<length>           Don't report candidate repeat intervals that
 | 
| 
 | 
    89                               end within <length> of the sequence end
 | 
| 
 | 
    90                               (default is to do no such filtering)
 | 
| 
 | 
    91   --subsample=<k>/<n>         Process only the <k>th sequence of every group of
 | 
| 
 | 
    92                               <n> sequences;  <k> ranges from 1 to <n>
 | 
| 
 | 
    93   --multipleruns              Consider all candidate intervals in a sequence
 | 
| 
 | 
    94                               (default is to consider only the longest)
 | 
| 
 | 
    95   --partialmotifs             Consider microatelites with a partial motif
 | 
| 
 | 
    96                               (default is to consider only whole motifs)
 | 
| 
 | 
    97   --splitbyvalidity           Preprocess sequences, splitting at Ns;  this
 | 
| 
 | 
    98                               prevents candidates from including Ns
 | 
| 
 | 
    99                               (default is not to split)
 | 
| 
 | 
   100   --noflankdisplay            Show entire sequence as flanking regions
 | 
| 
 | 
   101                               (this is the default)
 | 
| 
 | 
   102   --flankdisplay=<length>     Limit length of flanking regions shown
 | 
| 
 | 
   103   --readnamesuffix=<string>   Root of suffix to append to read names;  e.g. 1
 | 
| 
 | 
   104                               for forward, 2 for reverse;  this triggers other
 | 
| 
 | 
   105                               info to be included in the suffix
 | 
| 
 | 
   106                               (default is "1" for fastq;  no suffix for fasta)
 | 
| 
 | 
   107   --head=<number>             limit the number of sequences processed
 | 
| 
 | 
   108   --markend                   Write a marker line upon completion
 | 
| 
 | 
   109                               (default is not to write a marker)
 | 
| 
 | 
   110   --help=details              Describe the process, and quit"""
 | 
| 
 | 
   111 
 | 
| 
 | 
   112     if (s == None): exit (message)
 | 
| 
 | 
   113     else:           exit ("%s\n%s" % (s,message))
 | 
| 
 | 
   114 
 | 
| 
 | 
   115 
 | 
| 
 | 
   116 detailedDescription = """In broad terms, the process works as follows:
 | 
| 
 | 
   117 
 | 
| 
 | 
   118 (1) Identify intervals that are highly correlated with the interval shifted by
 | 
| 
 | 
   119     P (the repeat period).  These intervals are called "runs" or "candidates".
 | 
| 
 | 
   120     The level of correlation required is controlled by rateThreshold. 
 | 
| 
 | 
   121     Depending on whether we want to look for more than one microsat, we either
 | 
| 
 | 
   122     find the longest such run (simple algorithm) or many runs (more complicated
 | 
| 
 | 
   123     algorithm). The following steps are then performed on each run.
 | 
| 
 | 
   124 
 | 
| 
 | 
   125 (2) Find the most likely repeat motif in the run.  This is done by counting
 | 
| 
 | 
   126     all kmers (of length P) and choosing the most frequent.  If that kmer is
 | 
| 
 | 
   127     itself covered by a sub-repeat we discard this run.  The idea is that we
 | 
| 
 | 
   128     can ignore a 6-mer like ACGACG because we will find it when we are looking
 | 
| 
 | 
   129     for 3-mers.
 | 
| 
 | 
   130 
 | 
| 
 | 
   131 (3) Once we identify the most likely repeat motif, we then modify the
 | 
| 
 | 
   132     interval, adjusting start and end to find the interval that has the fewest
 | 
| 
 | 
   133     mismatches vs. a sequence of the motif repeated (hamming distance).  Only
 | 
| 
 | 
   134     whole copies of the motif are considered.
 | 
| 
 | 
   135 
 | 
| 
 | 
   136 (4) At this point we have a valid microsat interval (in the eyes of the
 | 
| 
 | 
   137     program). It is subjected to some filtering stages (hamming distance or too
 | 
| 
 | 
   138     close to an end), and if it satisfies those conditions, it's reported to
 | 
| 
 | 
   139     the user."""
 | 
| 
 | 
   140 
 | 
| 
 | 
   141 def main():
 | 
| 
 | 
   142     global debug
 | 
| 
 | 
   143 
 | 
| 
 | 
   144     #=== parse the command line ===
 | 
| 
 | 
   145 
 | 
| 
 | 
   146     inputFilename         = None
 | 
| 
 | 
   147     referenceFileName     = None #add by Chen Sun on 02/25
 | 
| 
 | 
   148     inputFormat           = None
 | 
| 
 | 
   149     repeatPeriods         = []
 | 
| 
 | 
   150     rateThreshold         = 6 / 7.0
 | 
| 
 | 
   151     lengthThreshold       = 20
 | 
| 
 | 
   152     reportProgress        = None
 | 
| 
 | 
   153     discardDuplicates     = False
 | 
| 
 | 
   154     discardNearDuplicates = False
 | 
| 
 | 
   155     nearDuplicatePrefix   = 100
 | 
| 
 | 
   156     hammingThreshold      = 0
 | 
| 
 | 
   157     prefixThreshold       = None
 | 
| 
 | 
   158     suffixThreshold       = None
 | 
| 
 | 
   159     subsampleK            = None
 | 
| 
 | 
   160     subsampleN            = None
 | 
| 
 | 
   161     reportMultipleRuns    = False
 | 
| 
 | 
   162     allowPartialMotifs    = False
 | 
| 
 | 
   163     splitByValidity       = False
 | 
| 
 | 
   164     flankDisplayLimit     = None
 | 
| 
 | 
   165     readNameSuffix        = None
 | 
| 
 | 
   166     headLimit             = None
 | 
| 
 | 
   167     markEndOfFile         = False
 | 
| 
 | 
   168     additionalInfo        = False
 | 
| 
 | 
   169     debug                 = []
 | 
| 
 | 
   170 
 | 
| 
 | 
   171     for arg in argv[1:]:
 | 
| 
 | 
   172         if (arg == "--fasta"):
 | 
| 
 | 
   173             inputFormat = "fasta"
 | 
| 
 | 
   174         elif (arg == "--fastq"):
 | 
| 
 | 
   175             inputFormat = "fastq"
 | 
| 
 | 
   176         elif (arg == "--fastq:noquals"):
 | 
| 
 | 
   177             inputFormat = "fastq:noquals"
 | 
| 
 | 
   178         elif (arg == "--sam"):
 | 
| 
 | 
   179             inputFormat = "sam"
 | 
| 
 | 
   180         elif (arg == "--r"):
 | 
| 
 | 
   181             additionalInfo = True
 | 
| 
 | 
   182         elif (arg.startswith("--ref=")):
 | 
| 
 | 
   183             referenceFileName = arg.split("=",1)[1]
 | 
| 
 | 
   184         elif (arg.startswith("--period=")):
 | 
| 
 | 
   185             val = arg.split("=",1)[1]
 | 
| 
 | 
   186             for period in val.split(","):
 | 
| 
 | 
   187                 if (".." in period):
 | 
| 
 | 
   188                     (lowPeriod,highPeriod) = period.split("..",1)
 | 
| 
 | 
   189                     lowPeriod  = int(lowPeriod)
 | 
| 
 | 
   190                     highPeriod = int(highPeriod)
 | 
| 
 | 
   191                     for period in xrange(lowPeriod,highPeriod+1):
 | 
| 
 | 
   192                         repeatPeriods += [period]
 | 
| 
 | 
   193                 else:
 | 
| 
 | 
   194                     repeatPeriods += [int(period)]
 | 
| 
 | 
   195         elif (arg.startswith("--rate=")):
 | 
| 
 | 
   196             val = arg.split("=",1)[1]
 | 
| 
 | 
   197             rateThreshold = float_or_fraction(val)
 | 
| 
 | 
   198             assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val
 | 
| 
 | 
   199         elif (arg.startswith("--minlength=")):
 | 
| 
 | 
   200             val = arg.split("=",1)[1]
 | 
| 
 | 
   201             lengthThreshold = int(val)
 | 
| 
 | 
   202             assert (lengthThreshold >= 0)
 | 
| 
 | 
   203         elif (arg.startswith("--progress=")):
 | 
| 
 | 
   204             val = arg.split("=",1)[1]
 | 
| 
 | 
   205             reportProgress = int(val)
 | 
| 
 | 
   206         elif (arg == "--allowduplicates"):
 | 
| 
 | 
   207             discardDuplicates     = False
 | 
| 
 | 
   208             discardNearDuplicates = False
 | 
| 
 | 
   209         elif (arg == "--noduplicates"):
 | 
| 
 | 
   210             discardDuplicates     = True
 | 
| 
 | 
   211             discardNearDuplicates = False
 | 
| 
 | 
   212         elif (arg == "--nonearduplicates"):
 | 
| 
 | 
   213             discardDuplicates     = False
 | 
| 
 | 
   214             discardNearDuplicates = True
 | 
| 
 | 
   215         elif (arg.startswith("--nonearduplicate=")):
 | 
| 
 | 
   216             val = arg.split("=",1)[1]
 | 
| 
 | 
   217             discardDuplicates     = False
 | 
| 
 | 
   218             discardNearDuplicates = True
 | 
| 
 | 
   219             nearDuplicatePrefix   = int(val)
 | 
| 
 | 
   220             assert (nearDuplicatePrefix > 0)
 | 
| 
 | 
   221         elif (arg.startswith("--hamming=")):
 | 
| 
 | 
   222             val = arg.split("=",1)[1]
 | 
| 
 | 
   223             hammingThreshold = int(val)
 | 
| 
 | 
   224             assert (hammingThreshold >= 0)
 | 
| 
 | 
   225         elif (arg.startswith("--prefix=")):
 | 
| 
 | 
   226             val = arg.split("=",1)[1]
 | 
| 
 | 
   227             prefixThreshold = int(val)
 | 
| 
 | 
   228             assert (prefixThreshold >= 0)
 | 
| 
 | 
   229         elif (arg.startswith("--suffix=")):
 | 
| 
 | 
   230             val = arg.split("=",1)[1]
 | 
| 
 | 
   231             suffixThreshold = int(val)
 | 
| 
 | 
   232             assert (suffixThreshold >= 0)
 | 
| 
 | 
   233         elif (arg.startswith("--subsample=")):
 | 
| 
 | 
   234             val = arg.split("=",1)[1]
 | 
| 
 | 
   235             (k,n) = val.split("/",2)
 | 
| 
 | 
   236             subsampleK = int(k)
 | 
| 
 | 
   237             subsampleN = int(n)
 | 
| 
 | 
   238             assert (0 < subsampleK <= subsampleN)
 | 
| 
 | 
   239         elif (arg == "--multipleruns"):
 | 
| 
 | 
   240             reportMultipleRuns = True
 | 
| 
 | 
   241         elif (arg == "--partialmotifs"):
 | 
| 
 | 
   242             allowPartialMotifs = True
 | 
| 
 | 
   243         elif (arg == "--splitbyvalidity"):
 | 
| 
 | 
   244             splitByValidity = True
 | 
| 
 | 
   245         elif (arg == "--noflankdisplay"):
 | 
| 
 | 
   246             flankDisplayLimit = None
 | 
| 
 | 
   247         elif (arg.startswith("--flankdisplay=")):
 | 
| 
 | 
   248             val = arg.split("=",1)[1]
 | 
| 
 | 
   249             flankDisplayLimit = int(val)
 | 
| 
 | 
   250             assert (flankDisplayLimit >= 0)
 | 
| 
 | 
   251         elif (arg.startswith("--readnamesuffix")):
 | 
| 
 | 
   252             readNameSuffix = arg.split("=",1)[1]
 | 
| 
 | 
   253         elif (arg.startswith("--head=")):
 | 
| 
 | 
   254             headLimit = int_with_unit(arg.split("=",1)[1])
 | 
| 
 | 
   255         elif (arg == "--markend"):
 | 
| 
 | 
   256             markEndOfFile = True
 | 
| 
 | 
   257         elif (arg == "--help=details"):
 | 
| 
 | 
   258             exit (detailedDescription)
 | 
| 
 | 
   259         elif (arg.startswith("--debug=")):
 | 
| 
 | 
   260             debug += (arg.split("=",1)[1]).split(",")
 | 
| 
 | 
   261         elif (arg.startswith("--")):
 | 
| 
 | 
   262             usage("unrecognized option: %s" % arg)
 | 
| 
 | 
   263         elif (inputFilename == None):
 | 
| 
 | 
   264             inputFilename = arg
 | 
| 
 | 
   265         else:
 | 
| 
 | 
   266             usage("unrecognized option: %s" % arg)
 | 
| 
 | 
   267 
 | 
| 
 | 
   268     #=== determine periods of interest ===
 | 
| 
 | 
   269 
 | 
| 
 | 
   270     if (repeatPeriods == []):
 | 
| 
 | 
   271         usage("you gotta give me a repeat period")
 | 
| 
 | 
   272 
 | 
| 
 | 
   273     if (additionalInfo == True):
 | 
| 
 | 
   274         if (referenceFileName == None):
 | 
| 
 | 
   275             usage("reference file path needed. use --ref=<reference> to indicate")
 | 
| 
 | 
   276 
 | 
| 
 | 
   277     periodSeed = {}
 | 
| 
 | 
   278     for period in repeatPeriods:
 | 
| 
 | 
   279         if (period < 1): usage("period %d is not valid" % period)
 | 
| 
 | 
   280         periodSeed[period] = True
 | 
| 
 | 
   281 
 | 
| 
 | 
   282     repeatPeriods = [period for period in periodSeed]
 | 
| 
 | 
   283     repeatPeriods.sort()
 | 
| 
 | 
   284 
 | 
| 
 | 
   285     #=== determine input format ===
 | 
| 
 | 
   286 
 | 
| 
 | 
   287     if   (inputFormat == "fasta"):           sequence_reader = fasta_sequences
 | 
| 
 | 
   288     elif (inputFormat == "fastq"):           sequence_reader = fastq_sequences
 | 
| 
 | 
   289     elif (inputFormat == "fastq:noquals"):   sequence_reader = fastq_sequences
 | 
| 
 | 
   290     elif (inputFormat == "sam"):             sequence_reader = sam_sequences
 | 
| 
 | 
   291     elif (inputFilename == None):            sequence_reader = fasta_sequences
 | 
| 
 | 
   292     elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences
 | 
| 
 | 
   293     elif (inputFilename.endswith(".fq")):    sequence_reader = fastq_sequences
 | 
| 
 | 
   294     elif (inputFilename.endswith(".sam")):   sequence_reader = sam_sequences
 | 
| 
 | 
   295     else:                                    sequence_reader = fasta_sequences
 | 
| 
 | 
   296 
 | 
| 
 | 
   297     if (inputFilename != None): inputF = file(inputFilename,"rt")
 | 
| 
 | 
   298     else:                       inputF = stdin
 | 
| 
 | 
   299 
 | 
| 
 | 
   300     if   (readNameSuffix == None) \
 | 
| 
 | 
   301      and (sequence_reader == fastq_sequences) \
 | 
| 
 | 
   302      and (inputFormat != "fastq:noquals"):
 | 
| 
 | 
   303         readNameSuffix = "1"
 | 
| 
 | 
   304 
 | 
| 
 | 
   305     #=== process the sequences ===
 | 
| 
 | 
   306     
 | 
| 
 | 
   307     refSequence = {}
 | 
| 
 | 
   308     rightName = ""
 | 
| 
 | 
   309     sequence = ""
 | 
| 
 | 
   310     if additionalInfo:
 | 
| 
 | 
   311         firstFasta = True
 | 
| 
 | 
   312         originalRefF = open(referenceFileName)
 | 
| 
 | 
   313         for line in originalRefF.readlines():
 | 
| 
 | 
   314             line = line.replace('\r','')
 | 
| 
 | 
   315             line = line.replace('\n','')
 | 
| 
 | 
   316             if line.startswith(">"):
 | 
| 
 | 
   317                 if firstFasta:
 | 
| 
 | 
   318                     firstFasta = False
 | 
| 
 | 
   319                 else:
 | 
| 
 | 
   320                     refSequence[rightName] = sequence
 | 
| 
 | 
   321                 rightName = line[1:]
 | 
| 
 | 
   322                 sequence = ""
 | 
| 
 | 
   323                 continue
 | 
| 
 | 
   324             sequence += line
 | 
| 
 | 
   325         originalRefF.close()
 | 
| 
 | 
   326         refSequence[rightName] = sequence
 | 
| 
 | 
   327 
 | 
| 
 | 
   328     sequenceSeen = {}
 | 
| 
 | 
   329 
 | 
| 
 | 
   330     numSequences = 0
 | 
| 
 | 
   331     for seqInfo in sequence_reader(inputF):
 | 
| 
 | 
   332         numSequences += 1
 | 
| 
 | 
   333         if (headLimit != None) and (numSequences > headLimit):
 | 
| 
 | 
   334             print >>stderr, "limit of %d sequences reached" % headLimit
 | 
| 
 | 
   335             break
 | 
| 
 | 
   336 
 | 
| 
 | 
   337         if (sequence_reader == sam_sequences):
 | 
| 
 | 
   338             #seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar
 | 
| 
 | 
   339             (name, sequence, refName, pre_s, cigar) = seqInfo
 | 
| 
 | 
   340             quals = None
 | 
| 
 | 
   341         elif (sequence_reader == fastq_sequences):
 | 
| 
 | 
   342             (name,sequence,quals) = seqInfo
 | 
| 
 | 
   343             if (inputFormat == "fastq:noquals"): quals = None
 | 
| 
 | 
   344         else:
 | 
| 
 | 
   345             (name,sequence) = seqInfo
 | 
| 
 | 
   346             quals = None
 | 
| 
 | 
   347 
 | 
| 
 | 
   348         if (reportProgress != None) and (numSequences % reportProgress == 0):
 | 
| 
 | 
   349             print >>stderr, "%s %d" % (name,numSequences)
 | 
| 
 | 
   350 
 | 
| 
 | 
   351         # if we're subsampling and not interested in this sequence, skip it
 | 
| 
 | 
   352 
 | 
| 
 | 
   353         if (subsampleN != None):
 | 
| 
 | 
   354             if ((numSequences-1) % subsampleN != (subsampleK-1)):
 | 
| 
 | 
   355                 continue
 | 
| 
 | 
   356 
 | 
| 
 | 
   357         # if this sequence is shorter than the length of interest, skip it
 | 
| 
 | 
   358 
 | 
| 
 | 
   359         seqLen = len(sequence)
 | 
| 
 | 
   360         if (seqLen < period) or (seqLen < lengthThreshold): continue
 | 
| 
 | 
   361 
 | 
| 
 | 
   362         # if we're not interested in duplicates and this is one, skip it;
 | 
| 
 | 
   363         # note that we assume no hash collisions occur, i.e. that all hash
 | 
| 
 | 
   364         # matches are truly sequence matches
 | 
| 
 | 
   365 
 | 
| 
 | 
   366         if (discardDuplicates):
 | 
| 
 | 
   367             h = hash108(sequence)
 | 
| 
 | 
   368             if (h in sequenceSeen): continue
 | 
| 
 | 
   369             sequenceSeen[h] = True
 | 
| 
 | 
   370         elif (discardNearDuplicates):
 | 
| 
 | 
   371             h = hash108(sequence[:nearDuplicatePrefix])
 | 
| 
 | 
   372             if (h in sequenceSeen): continue
 | 
| 
 | 
   373             sequenceSeen[h] = True
 | 
| 
 | 
   374 
 | 
| 
 | 
   375         # split the sequence into chunks of valid nucleotides
 | 
| 
 | 
   376 
 | 
| 
 | 
   377         if (splitByValidity):
 | 
| 
 | 
   378             chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)]
 | 
| 
 | 
   379         else:
 | 
| 
 | 
   380             chunks = [(0,len(sequence))]
 | 
| 
 | 
   381 
 | 
| 
 | 
   382         # evaluate for each period of interest
 | 
| 
 | 
   383 
 | 
| 
 | 
   384         for period in repeatPeriods:
 | 
| 
 | 
   385 
 | 
| 
 | 
   386             # operate on each chunk
 | 
| 
 | 
   387 
 | 
| 
 | 
   388             for (chunkStart,chunkEnd) in chunks:
 | 
| 
 | 
   389                 chunkLen = chunkEnd - chunkStart
 | 
| 
 | 
   390                 if (chunkLen < period) or (chunkLen < lengthThreshold): continue
 | 
| 
 | 
   391 
 | 
| 
 | 
   392                 if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug):
 | 
| 
 | 
   393                     print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd)
 | 
| 
 | 
   394 
 | 
| 
 | 
   395                 # compute correlation sequence
 | 
| 
 | 
   396 
 | 
| 
 | 
   397                 corr = correlation_sequence(sequence,period,chunkStart,chunkEnd)
 | 
| 
 | 
   398 
 | 
| 
 | 
   399                 if ("correlation" in debug) or ("runs" in debug):
 | 
| 
 | 
   400                     print >>stderr, sequence[chunkStart:chunkEnd]
 | 
| 
 | 
   401                     print >>stderr, corr
 | 
| 
 | 
   402 
 | 
| 
 | 
   403                 # find runs (candidates for being a microsat) 
 | 
| 
 | 
   404 
 | 
| 
 | 
   405                 if (reportMultipleRuns):
 | 
| 
 | 
   406                     runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold, hammingThreshold)
 | 
| 
 | 
   407                 else:
 | 
| 
 | 
   408                     runs = longest_suitable_run(corr,lengthThreshold,rateThreshold)
 | 
| 
 | 
   409                 if (runs == []): continue
 | 
| 
 | 
   410 
 | 
| 
 | 
   411 
 | 
| 
 | 
   412                 if ("runs" in debug):
 | 
| 
 | 
   413                     for (start,end) in runs:
 | 
| 
 | 
   414                         run = [" "] * seqLen
 | 
| 
 | 
   415                         for ix in xrange(start-period,end):
 | 
| 
 | 
   416                             run[ix] = "*"
 | 
| 
 | 
   417                         print >>stderr, "".join(run)
 | 
| 
 | 
   418 
 | 
| 
 | 
   419                 if ("candidates" in debug):
 | 
| 
 | 
   420                     for (start,end) in runs:
 | 
| 
 | 
   421                         print >>stderr, "%s %d %d" % (name,start,end)
 | 
| 
 | 
   422 
 | 
| 
 | 
   423                 # process runs and report those that pass muster
 | 
| 
 | 
   424 
 | 
| 
 | 
   425                 runCount = 0
 | 
| 
 | 
   426                 for (start,end) in runs:
 | 
| 
 | 
   427                     runCount += 1
 | 
| 
 | 
   428 
 | 
| 
 | 
   429                     start = chunkStart + start - period
 | 
| 
 | 
   430                     end   = chunkStart + end
 | 
| 
 | 
   431 
 | 
| 
 | 
   432                     (kmer,d,start,end) = find_repeat_element(hammingThreshold, period,sequence,start,end,allowPartials=allowPartialMotifs)
 | 
| 
 | 
   433                     if (kmer == None): continue    # (no useful repeat kmer was found)
 | 
| 
 | 
   434 
 | 
| 
 | 
   435                     rptExtent = end - start
 | 
| 
 | 
   436                     prefixLen = start
 | 
| 
 | 
   437                     suffixLen = seqLen - end
 | 
| 
 | 
   438                     if (rptExtent <= period): continue
 | 
| 
 | 
   439                     if (hammingThreshold != None) and (d         > hammingThreshold): continue
 | 
| 
 | 
   440                     if (prefixThreshold  != None) and (prefixLen < prefixThreshold):  continue
 | 
| 
 | 
   441                     if (suffixThreshold  != None) and (suffixLen < suffixThreshold):  continue
 | 
| 
 | 
   442 
 | 
| 
 | 
   443                     if (flankDisplayLimit == None):
 | 
| 
 | 
   444                         seq = sequence[:start] \
 | 
| 
 | 
   445                             + sequence[start:end].lower() \
 | 
| 
 | 
   446                             + sequence[end:]
 | 
| 
 | 
   447                     else:
 | 
| 
 | 
   448                         seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \
 | 
| 
 | 
   449                             + sequence[start:end].lower() \
 | 
| 
 | 
   450                             + sequence[end:min(chunkEnd,end+flankDisplayLimit)]
 | 
| 
 | 
   451                     reportName = name
 | 
| 
 | 
   452                     if (readNameSuffix != None):
 | 
| 
 | 
   453                         reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount)
 | 
| 
 | 
   454                     if (quals == None or quals == "." or quals == "\t."): quals = "\t."
 | 
| 
 | 
   455                     else:               quals = "\t" + quals
 | 
| 
 | 
   456                     if not additionalInfo:
 | 
| 
 | 
   457                         print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \
 | 
| 
 | 
   458                             % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals)
 | 
| 
 | 
   459                     else:
 | 
| 
 | 
   460                         #pre_e = pre_s + prefixLen - 1
 | 
| 
 | 
   461                         refPoint = pre_s
 | 
| 
 | 
   462                         donorPoint = 0
 | 
| 
 | 
   463                         
 | 
| 
 | 
   464                         donorBeforeStart = prefixLen - 1 #pre_e
 | 
| 
 | 
   465                         donorMicroStart = prefixLen     #tr_s
 | 
| 
 | 
   466                         donorMicroEnd = donorMicroStart + rptExtent - 1 #tr_e
 | 
| 
 | 
   467                         donorAfterMicro = donorMicroEnd + 1 #suf_s
 | 
| 
 | 
   468                         donorEnd = len(seq) - 1    #suf_e
 | 
| 
 | 
   469                         
 | 
| 
 | 
   470                         set_pre_e = False
 | 
| 
 | 
   471                         set_tr_s = False
 | 
| 
 | 
   472                         set_tr_e = False
 | 
| 
 | 
   473                         set_suf_s = False
 | 
| 
 | 
   474                         set_suf_e = False
 | 
| 
 | 
   475                         
 | 
| 
 | 
   476                         pre_e = 0
 | 
| 
 | 
   477                         tr_s = 0
 | 
| 
 | 
   478                         tr_e = 0
 | 
| 
 | 
   479                         suf_s = 0
 | 
| 
 | 
   480                         suf_e = 0
 | 
| 
 | 
   481                         
 | 
| 
 | 
   482                         matchList = re.findall('(\d+)([IDM])', cigar)
 | 
| 
 | 
   483                         unCognitiveCigar = False
 | 
| 
 | 
   484                         for matchN, matchType in matchList:
 | 
| 
 | 
   485                             matchNum = int(matchN)
 | 
| 
 | 
   486                             if matchType == "M":
 | 
| 
 | 
   487                                 donorPoint = donorPoint + matchNum
 | 
| 
 | 
   488                                 refPoint = refPoint + matchNum
 | 
| 
 | 
   489                             elif matchType == "D":
 | 
| 
 | 
   490                                 refPoint = refPoint + matchNum
 | 
| 
 | 
   491                                 continue
 | 
| 
 | 
   492                             elif matchType == "I":
 | 
| 
 | 
   493                                 donorPoint = donorPoint + matchNum
 | 
| 
 | 
   494                             else:
 | 
| 
 | 
   495                                 unCognitiveCigar = True
 | 
| 
 | 
   496                                 break
 | 
| 
 | 
   497 
 | 
| 
 | 
   498                             if not set_pre_e:
 | 
| 
 | 
   499                                 if donorPoint >= donorBeforeStart:
 | 
| 
 | 
   500                                     pre_e = refPoint - (donorPoint - donorBeforeStart)
 | 
| 
 | 
   501                                     set_pre_e = True
 | 
| 
 | 
   502                                 else:
 | 
| 
 | 
   503                                     continue
 | 
| 
 | 
   504                                     
 | 
| 
 | 
   505                             if not set_tr_s:
 | 
| 
 | 
   506                                 if donorPoint >= donorMicroStart:
 | 
| 
 | 
   507                                     tr_s = refPoint - (donorPoint - donorMicroStart)
 | 
| 
 | 
   508                                     set_tr_s = True
 | 
| 
 | 
   509                                 else:
 | 
| 
 | 
   510                                     continue
 | 
| 
 | 
   511                                     
 | 
| 
 | 
   512                             if not set_tr_e:
 | 
| 
 | 
   513                                 if donorPoint >= donorMicroEnd:
 | 
| 
 | 
   514                                     tr_e = refPoint - (donorPoint - donorMicroEnd)
 | 
| 
 | 
   515                                     set_tr_e = True
 | 
| 
 | 
   516                                 else:
 | 
| 
 | 
   517                                     continue
 | 
| 
 | 
   518                                     
 | 
| 
 | 
   519                             if not set_suf_s:
 | 
| 
 | 
   520                                 if donorPoint >= donorAfterMicro:
 | 
| 
 | 
   521                                     suf_s = refPoint - (donorPoint - donorAfterMicro)
 | 
| 
 | 
   522                                     set_suf_s = True
 | 
| 
 | 
   523                                 else:
 | 
| 
 | 
   524                                     continue
 | 
| 
 | 
   525                                     
 | 
| 
 | 
   526                             if not set_suf_e:
 | 
| 
 | 
   527                                 if donorPoint >= donorEnd:
 | 
| 
 | 
   528                                     suf_e = refPoint - (donorPoint - donorEnd)
 | 
| 
 | 
   529                                     set_suf_e = True
 | 
| 
 | 
   530                                 else:
 | 
| 
 | 
   531                                     continue
 | 
| 
 | 
   532                                 
 | 
| 
 | 
   533                         if unCognitiveCigar:
 | 
| 
 | 
   534                             break
 | 
| 
 | 
   535                         tr_len = tr_e - tr_s + 1
 | 
| 
 | 
   536 
 | 
| 
 | 
   537                         if refName not in refSequence:
 | 
| 
 | 
   538                             tr_ref_seq = "."
 | 
| 
 | 
   539                         else:
 | 
| 
 | 
   540                             if refSequence[refName] == "":
 | 
| 
 | 
   541                                 tr_ref_seq = "."
 | 
| 
 | 
   542                             elif len(refSequence[refName]) <= tr_e:
 | 
| 
 | 
   543                                 tr_ref_seq = "."
 | 
| 
 | 
   544                             else:
 | 
| 
 | 
   545                                 tr_ref_seq = refSequence[refName][tr_s:tr_e+1]
 | 
| 
 | 
   546 
 | 
| 
 | 
   547                         pre_e += 1
 | 
| 
 | 
   548                         tr_e += 1
 | 
| 
 | 
   549                         suf_e += 1
 | 
| 
 | 
   550                         print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \
 | 
| 
 | 
   551                             % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
 | 
| 
 | 
   552 
 | 
| 
 | 
   553     if (markEndOfFile):
 | 
| 
 | 
   554         print "# microsat_snoop end-of-file"
 | 
| 
 | 
   555 
 | 
| 
 | 
   556     if (inputF != stdin):
 | 
| 
 | 
   557         inputF.close()
 | 
| 
 | 
   558 
 | 
| 
 | 
   559 # non_negative_intervals
 | 
| 
 | 
   560 #     find intervals with exactly + and no -
 | 
| 
 | 
   561 #     from string like this : +++++++++---+++++++++
 | 
| 
 | 
   562 def non_negative_intervals(seq, minLength=None):
 | 
| 
 | 
   563 
 | 
| 
 | 
   564     start = -1
 | 
| 
 | 
   565     end = -1
 | 
| 
 | 
   566     firstPlus = 1
 | 
| 
 | 
   567     #print seq
 | 
| 
 | 
   568     for ix in range(len(seq)): # for every char in seq    
 | 
| 
 | 
   569         ch = seq[ix]
 | 
| 
 | 
   570         if(ch == "+"):
 | 
| 
 | 
   571             if(firstPlus):
 | 
| 
 | 
   572                 firstPlus = 0
 | 
| 
 | 
   573                 start = ix
 | 
| 
 | 
   574             else:
 | 
| 
 | 
   575                 continue
 | 
| 
 | 
   576         elif(ch == "-"):
 | 
| 
 | 
   577             if(start >= 0):
 | 
| 
 | 
   578                 end = ix-1
 | 
| 
 | 
   579                 if((end - start + 1) >= minLength):
 | 
| 
 | 
   580                     yield (start,end+1)
 | 
| 
 | 
   581                 start = -1
 | 
| 
 | 
   582                 firstPlus = 1
 | 
| 
 | 
   583     if(start > 0):
 | 
| 
 | 
   584         if((ix - start + 1) >= minLength):
 | 
| 
 | 
   585             yield (start, ix+1)
 | 
| 
 | 
   586 
 | 
| 
 | 
   587 
 | 
| 
 | 
   588 ###################################################################
 | 
| 
 | 
   589 # modified by Chen Sun on 7/11/2014
 | 
| 
 | 
   590 # We do not want other modules, so parse these functions inside
 | 
| 
 | 
   591 #
 | 
| 
 | 
   592 ###################################################################
 | 
| 
 | 
   593 
 | 
| 
 | 
   594 # parse a string of the form {positives}/{positives_and_neutrals}
 | 
| 
 | 
   595 
 | 
| 
 | 
   596 def parse_spec(s):
 | 
| 
 | 
   597     if ("/" not in s): raise ValueError
 | 
| 
 | 
   598     (n,d) = s.split("/",1)
 | 
| 
 | 
   599     if (not n.startswith("{")) or (not n.endswith("}")): raise ValueError
 | 
| 
 | 
   600     if (not d.startswith("{")) or (not d.endswith("}")): raise ValueError
 | 
| 
 | 
   601 
 | 
| 
 | 
   602     positives = n[1:-1]
 | 
| 
 | 
   603     d         = d[1:-1]
 | 
| 
 | 
   604 
 | 
| 
 | 
   605     for ch in positives:
 | 
| 
 | 
   606         if (ch not in d): raise ValueError
 | 
| 
 | 
   607 
 | 
| 
 | 
   608     neutrals = [ch for ch in d if (ch not in positives)]
 | 
| 
 | 
   609     return (positives,neutrals)
 | 
| 
 | 
   610 
 | 
| 
 | 
   611 
 | 
| 
 | 
   612 # convert a string to a number, allowing fractions
 | 
| 
 | 
   613 
 | 
| 
 | 
   614 def float_or_fraction(s):
 | 
| 
 | 
   615     if ("/" in s):
 | 
| 
 | 
   616         (numer,denom) = s.split("/",1)
 | 
| 
 | 
   617         return float(numer)/float(denom)
 | 
| 
 | 
   618     else:
 | 
| 
 | 
   619         return float(s)
 | 
| 
 | 
   620 
 | 
| 
 | 
   621 
 | 
| 
 | 
   622 # dense_intervals--
 | 
| 
 | 
   623 #    Find all non-overlapping runs with a good enough rate (of positives), and
 | 
| 
 | 
   624 #    which meet our length threshold.
 | 
| 
 | 
   625 #
 | 
| 
 | 
   626 #    The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing
 | 
| 
 | 
   627 #    long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999.
 | 
| 
 | 
   628 #
 | 
| 
 | 
   629 # $$$ we use the denominator as the threshold, but we really should use the
 | 
| 
 | 
   630 # $$$ .. numerator, comparing it to minLength*rate
 | 
| 
 | 
   631 
 | 
| 
 | 
   632 def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None):
 | 
| 
 | 
   633 
 | 
| 
 | 
   634     if (blockers == None):
 | 
| 
 | 
   635         blockers = "".join([chr(n) for n in range(1,256)
 | 
| 
 | 
   636                                    if  (chr(n) not in positives)
 | 
| 
 | 
   637                                    and (chr(n) not in neutrals)])
 | 
| 
 | 
   638 
 | 
| 
 | 
   639     stackLeft       = [None]    # stack with each entry containing five
 | 
| 
 | 
   640     stackRight      = [None]    # .. elements;  note that entry zero is not
 | 
| 
 | 
   641     stackLeftScore  = [None]    # .. used
 | 
| 
 | 
   642     stackRightScore = [None]
 | 
| 
 | 
   643     stackLower      = [None]
 | 
| 
 | 
   644     top   = 0
 | 
| 
 | 
   645     score = 0
 | 
| 
 | 
   646 
 | 
| 
 | 
   647     for ix in range(len(seq)):
 | 
| 
 | 
   648         ch = seq[ix]
 | 
| 
 | 
   649         if (ch in blockers):
 | 
| 
 | 
   650             # emit intervals
 | 
| 
 | 
   651 
 | 
| 
 | 
   652             for sp in range(1,top+1):
 | 
| 
 | 
   653                 left  = stackLeft [sp] + 1
 | 
| 
 | 
   654                 right = stackRight[sp]
 | 
| 
 | 
   655 
 | 
| 
 | 
   656                 while (left < right) and (seq[left]  not in positives): left  += 1
 | 
| 
 | 
   657                 while (right > left) and (seq[right] not in positives): right -= 1
 | 
| 
 | 
   658 
 | 
| 
 | 
   659                 right += 1
 | 
| 
 | 
   660                 if (minLength == None) or (right - left >= minLength):
 | 
| 
 | 
   661                     yield (left,right)
 | 
| 
 | 
   662 
 | 
| 
 | 
   663             #empty stack
 | 
| 
 | 
   664 
 | 
| 
 | 
   665             stackLeft       = [None]
 | 
| 
 | 
   666             stackRight      = [None]
 | 
| 
 | 
   667             stackLeftScore  = [None]
 | 
| 
 | 
   668             stackRightScore = [None]
 | 
| 
 | 
   669             stackLower      = [None]
 | 
| 
 | 
   670             top   = 0
 | 
| 
 | 
   671             score = 0
 | 
| 
 | 
   672             continue
 | 
| 
 | 
   673 
 | 
| 
 | 
   674         if   (ch in positives): weight = 1-rate
 | 
| 
 | 
   675         elif (ch in neutrals):  weight = -rate
 | 
| 
 | 
   676         else: raise ValueError
 | 
| 
 | 
   677 
 | 
| 
 | 
   678         score += weight
 | 
| 
 | 
   679         #if ("algorithm" in debug):
 | 
| 
 | 
   680         #    print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score),
 | 
| 
 | 
   681 
 | 
| 
 | 
   682         if (weight < 0):
 | 
| 
 | 
   683             #if ("algorithm" in debug):
 | 
| 
 | 
   684             #    print >>sys.stderr
 | 
| 
 | 
   685             continue
 | 
| 
 | 
   686 
 | 
| 
 | 
   687         if (top > 0) and (stackRight[top] == ix-1):
 | 
| 
 | 
   688             # add this site to the interval on top of the stack
 | 
| 
 | 
   689 
 | 
| 
 | 
   690             stackRight     [top] = ix
 | 
| 
 | 
   691             stackRightScore[top] = score
 | 
| 
 | 
   692 
 | 
| 
 | 
   693             #if ("algorithm" in debug):
 | 
| 
 | 
   694             #    print >>sys.stderr, \
 | 
| 
 | 
   695             #          " extending [%d] %d-%d %4.1f %4.1f" \
 | 
| 
 | 
   696             #        % (top,
 | 
| 
 | 
   697             #           stackLeft     [top], stackRight     [top],
 | 
| 
 | 
   698             #           stackLeftScore[top], stackRightScore[top]),
 | 
| 
 | 
   699 
 | 
| 
 | 
   700         else:
 | 
| 
 | 
   701             # create a one site interval
 | 
| 
 | 
   702 
 | 
| 
 | 
   703             top += 1
 | 
| 
 | 
   704             if (top >= len(stackLeft)):
 | 
| 
 | 
   705                 stackLeft       += [None]
 | 
| 
 | 
   706                 stackRight      += [None]
 | 
| 
 | 
   707                 stackLeftScore  += [None]
 | 
| 
 | 
   708                 stackRightScore += [None]
 | 
| 
 | 
   709                 stackLower      += [None]
 | 
| 
 | 
   710 
 | 
| 
 | 
   711             stackLeft      [top] = ix - 1
 | 
| 
 | 
   712             stackLeftScore [top] = score - weight
 | 
| 
 | 
   713             stackRight     [top] = ix
 | 
| 
 | 
   714             stackRightScore[top] = score
 | 
| 
 | 
   715             stackLower     [top] = top - 1
 | 
| 
 | 
   716 
 | 
| 
 | 
   717             while (stackLower[top] > 0) \
 | 
| 
 | 
   718               and (stackLeftScore[stackLower[top]] > stackLeftScore[top]):
 | 
| 
 | 
   719                 stackLower[top] = stackLower[stackLower[top]]
 | 
| 
 | 
   720 
 | 
| 
 | 
   721             #if ("algorithm" in debug):
 | 
| 
 | 
   722             #    print >>sys.stderr, \
 | 
| 
 | 
   723             #          " creating  [%d] %d-%d %4.1f %4.1f -> %d" \
 | 
| 
 | 
   724             #        % (top,
 | 
| 
 | 
   725             #           stackLeft     [top], stackRight     [top],
 | 
| 
 | 
   726             #           stackLeftScore[top], stackRightScore[top],
 | 
| 
 | 
   727             #           stackLower    [top]),
 | 
| 
 | 
   728 
 | 
| 
 | 
   729         # merge intervals;  if there is a previous interval with a no-higher
 | 
| 
 | 
   730         # left score and no-higher right score, merge this interval (and all
 | 
| 
 | 
   731         # intervening ones) into that one
 | 
| 
 | 
   732 
 | 
| 
 | 
   733         while (top > 1) \
 | 
| 
 | 
   734           and (stackLower[top] > 0) \
 | 
| 
 | 
   735           and (stackRightScore[stackLower[top]] <= stackRightScore[top]):
 | 
| 
 | 
   736             stackRight     [stackLower[top]] = stackRight     [top]
 | 
| 
 | 
   737             stackRightScore[stackLower[top]] = stackRightScore[top]
 | 
| 
 | 
   738             top = stackLower[top]
 | 
| 
 | 
   739 
 | 
| 
 | 
   740             #if ("algorithm" in debug):
 | 
| 
 | 
   741             #    print >>sys.stderr, \
 | 
| 
 | 
   742             #          "\n%*s merging   [%d] %d-%d %4.1f %4.1f" \
 | 
| 
 | 
   743             #        % (13, "", top,
 | 
| 
 | 
   744             #           stackLeft[top],      stackRight     [top],
 | 
| 
 | 
   745             #           stackLeftScore[top], stackRightScore[top]),
 | 
| 
 | 
   746 
 | 
| 
 | 
   747         #if ("algorithm" in debug):
 | 
| 
 | 
   748         #    print >>sys.stderr
 | 
| 
 | 
   749 
 | 
| 
 | 
   750     # emit intervals
 | 
| 
 | 
   751 
 | 
| 
 | 
   752     for sp in range(1,top+1):
 | 
| 
 | 
   753         left  = stackLeft [sp] + 1
 | 
| 
 | 
   754         right = stackRight[sp]
 | 
| 
 | 
   755 
 | 
| 
 | 
   756         while (left < right) and (seq[left]  not in positives): left  += 1
 | 
| 
 | 
   757         while (right > left) and (seq[right] not in positives): right -= 1
 | 
| 
 | 
   758 
 | 
| 
 | 
   759         right += 1
 | 
| 
 | 
   760         if (minLength == None) or (right - left >= minLength):
 | 
| 
 | 
   761             yield (left,right)
 | 
| 
 | 
   762             
 | 
| 
 | 
   763             
 | 
| 
 | 
   764 ###################################################################
 | 
| 
 | 
   765 # modified by Chen Sun on 7/11/2014
 | 
| 
 | 
   766 #
 | 
| 
 | 
   767 ###################################################################
 | 
| 
 | 
   768 
 | 
| 
 | 
   769 # correlation_sequence--
 | 
| 
 | 
   770 #    Compute the correlation sequence for a given period.  This is a sequence
 | 
| 
 | 
   771 #    of + and - indicating whether the base at a given position matches the one
 | 
| 
 | 
   772 #    P positions earlier (where P is the period).  The first P positions are
 | 
| 
 | 
   773 #    blank.  Positions with single character runs longer than the period are
 | 
| 
 | 
   774 #    considered as non-matches, unless the period is 1.
 | 
| 
 | 
   775 
 | 
| 
 | 
   776 def correlation_sequence(sequence,period,start=None,end=None):
 | 
| 
 | 
   777     if (start == None): start = 0
 | 
| 
 | 
   778     if (end   == None): end   = len(sequence)
 | 
| 
 | 
   779 
 | 
| 
 | 
   780     prevCh = sequence[start]
 | 
| 
 | 
   781     run    = 1
 | 
| 
 | 
   782     for ix in xrange(start+1,start+period):
 | 
| 
 | 
   783         ch = sequence[ix]
 | 
| 
 | 
   784         if (ch != prevCh): run =  1
 | 
| 
 | 
   785         else:              run += 1
 | 
| 
 | 
   786         prevCh = ch
 | 
| 
 | 
   787 
 | 
| 
 | 
   788     corr = [" "] * period
 | 
| 
 | 
   789     for ix in xrange(start+period,end):
 | 
| 
 | 
   790         rptCh = sequence[ix-period]
 | 
| 
 | 
   791         ch    = sequence[ix]
 | 
| 
 | 
   792         if (ch != prevCh): run =  1
 | 
| 
 | 
   793         else:              run += 1
 | 
| 
 | 
   794         if    (ch    in "ACGT") \
 | 
| 
 | 
   795           and (ch == rptCh) \
 | 
| 
 | 
   796           and ((period == 1) or (run < period)):
 | 
| 
 | 
   797             corr += ["+"]
 | 
| 
 | 
   798         else:
 | 
| 
 | 
   799             corr += ["-"]
 | 
| 
 | 
   800         prevCh = ch
 | 
| 
 | 
   801 
 | 
| 
 | 
   802     return "".join(corr)
 | 
| 
 | 
   803 
 | 
| 
 | 
   804 
 | 
| 
 | 
   805 # longest_suitable_run--
 | 
| 
 | 
   806 #    Find longest run with a good enough rate (of positives).
 | 
| 
 | 
   807 #
 | 
| 
 | 
   808 #    We score a "+" as 1-r and anything else as -r.  This is based on the fol-
 | 
| 
 | 
   809 #    lowing derivation (p is the number of "+"s, n is the number of non-"+"s):
 | 
| 
 | 
   810 #        p/(p+n) >= r
 | 
| 
 | 
   811 #        ==> p >= rp + rn
 | 
| 
 | 
   812 #        ==> (1-r)p - rn >= 0
 | 
| 
 | 
   813 #
 | 
| 
 | 
   814 #    We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing).
 | 
| 
 | 
   815 #
 | 
| 
 | 
   816 # $$$ we use the denominator as the threshold, but we really should use the
 | 
| 
 | 
   817 # $$$ .. numerator, comparing it to minLength*rate
 | 
| 
 | 
   818 #
 | 
| 
 | 
   819 # $$$ this needs to account for $$$ this situation:
 | 
| 
 | 
   820 # $$$   sequence: ACGACGACGACGTTATTATTATTA
 | 
| 
 | 
   821 # $$$   matches:     +++++++++---+++++++++
 | 
| 
 | 
   822 # $$$ this is currently considered to be one interval (if rate <= 6/7), but it
 | 
| 
 | 
   823 # $$$ ought to be two;  we can't just post-process, though, because some other
 | 
| 
 | 
   824 # $$$ interval might be longer than the longest half of this;  maybe what we
 | 
| 
 | 
   825 # $$$ need to do is consider matches at distances -P and -2P, or if we match
 | 
| 
 | 
   826 # $$$ -P but that itself was a mismatch, we should carry the mismatch forward
 | 
| 
 | 
   827 
 | 
| 
 | 
   828 def longest_suitable_run(seq,minLength,rate):
 | 
| 
 | 
   829     maxEndingHere = 0
 | 
| 
 | 
   830     maxSoFar      = 0
 | 
| 
 | 
   831     start         = None
 | 
| 
 | 
   832 
 | 
| 
 | 
   833     for ix in xrange(len(seq)):
 | 
| 
 | 
   834         if (seq[ix] == "+"): s = 1-rate
 | 
| 
 | 
   835         else:                s = -rate
 | 
| 
 | 
   836 
 | 
| 
 | 
   837         if (maxEndingHere+s < 0):
 | 
| 
 | 
   838             maxEndingHere = 0
 | 
| 
 | 
   839             block         = ix
 | 
| 
 | 
   840         else:
 | 
| 
 | 
   841             maxEndingHere += s
 | 
| 
 | 
   842             if (maxEndingHere >= maxSoFar):
 | 
| 
 | 
   843                 maxSoFar = maxEndingHere
 | 
| 
 | 
   844                 start    = block + 1
 | 
| 
 | 
   845                 end      = ix + 1
 | 
| 
 | 
   846 
 | 
| 
 | 
   847     if (start == None) or (end - start < minLength):
 | 
| 
 | 
   848         return []
 | 
| 
 | 
   849     else:
 | 
| 
 | 
   850         return [(start,end)]
 | 
| 
 | 
   851 
 | 
| 
 | 
   852 
 | 
| 
 | 
   853 # all_suitable_runs--
 | 
| 
 | 
   854 #    Find all non-overlapping runs with a good enough rate (of positives), and
 | 
| 
 | 
   855 #    which meet our length threshold.
 | 
| 
 | 
   856 # $$$ this needs to post-process the intervals, splitting them to account for
 | 
| 
 | 
   857 # $$$ this situation:
 | 
| 
 | 
   858 # $$$   sequence: ACGACGACGACGTTATTATTATTA
 | 
| 
 | 
   859 # $$$   matches:     +++++++++---+++++++++
 | 
| 
 | 
   860 # $$$ this is currently reported as one interval (if rate <= 6/7), but it
 | 
| 
 | 
   861 # $$$ ought to be two
 | 
| 
 | 
   862 
 | 
| 
 | 
   863 def all_suitable_runs(seq,minCorrLength,rate, hammingThreshold):
 | 
| 
 | 
   864     
 | 
| 
 | 
   865     ################################################################
 | 
| 
 | 
   866     # modified by Chen Sun on 07/11/2014
 | 
| 
 | 
   867     #
 | 
| 
 | 
   868     ################################################################
 | 
| 
 | 
   869     
 | 
| 
 | 
   870     if hammingThreshold > 0:
 | 
| 
 | 
   871         return [(start,end) for (start,end) in dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength)]
 | 
| 
 | 
   872     elif hammingThreshold == 0:
 | 
| 
 | 
   873         return [(start,end) for (start,end) in non_negative_intervals(seq, minLength=minCorrLength)]
 | 
| 
 | 
   874 
 | 
| 
 | 
   875 
 | 
| 
 | 
   876 # find_repeat_element--
 | 
| 
 | 
   877 #    Find the most plausible repeat element for a run, and nudge the ends of
 | 
| 
 | 
   878 #    the run if needed.  Note that we will not consider kmers that represent
 | 
| 
 | 
   879 #    shorter repeats.  For example, we won't report ACTACT as a 6-mer since we
 | 
| 
 | 
   880 #    consider this to have a shorter period than 6.
 | 
| 
 | 
   881 
 | 
| 
 | 
   882 def find_repeat_element(hammingThreshold, period,seq,start,end,allowPartials=False):
 | 
| 
 | 
   883 
 | 
| 
 | 
   884     if hammingThreshold > 0:
 | 
| 
 | 
   885         (kmer,bestD,bestStart,bestEnd) = find_hamming_repeat_element(period,seq,start,end,allowPartials)
 | 
| 
 | 
   886         return (kmer,bestD,bestStart,bestEnd)
 | 
| 
 | 
   887     # count the number of occurences of each k-mer;  note that we can't
 | 
| 
 | 
   888     # reject kmers containing smaller repeats yet, since for a sequence like
 | 
| 
 | 
   889     # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
 | 
| 
 | 
   890     # 6-mer, and THEN reject it;  if we reject ACACAC while counting, we'd end
 | 
| 
 | 
   891     # up reporting something like ACACAA as the best motif 
 | 
| 
 | 
   892 
 | 
| 
 | 
   893     if ("element" in debug):
 | 
| 
 | 
   894         print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
 | 
| 
 | 
   895 
 | 
| 
 | 
   896     if ("partial" in debug):
 | 
| 
 | 
   897         print period, seq, start, end, allowPartials;
 | 
| 
 | 
   898         print seq[start:end]
 | 
| 
 | 
   899 
 | 
| 
 | 
   900     kmerToCount = {}
 | 
| 
 | 
   901     kmerToFirst = {}
 | 
| 
 | 
   902     for ix in xrange(start,end-(period-1)):
 | 
| 
 | 
   903         kmer = seq[ix:ix+period]
 | 
| 
 | 
   904         if ("N" in kmer): continue
 | 
| 
 | 
   905         if (kmer not in kmerToCount):
 | 
| 
 | 
   906             kmerToCount[kmer] = 1
 | 
| 
 | 
   907             kmerToFirst[kmer] = ix
 | 
| 
 | 
   908         else:
 | 
| 
 | 
   909             kmerToCount[kmer] += 1
 | 
| 
 | 
   910         #if ("element" in debug):
 | 
| 
 | 
   911         #    print >>stderr, "    %d: %s" % (ix,kmer)
 | 
| 
 | 
   912 
 | 
| 
 | 
   913     # choose the best k-mer;  this is simply the most frequently occurring one,
 | 
| 
 | 
   914     # with ties broken by whichever one came first
 | 
| 
 | 
   915 
 | 
| 
 | 
   916     kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
 | 
| 
 | 
   917     if (kmers == []): return (None,None,start,end)
 | 
| 
 | 
   918     kmers.sort()
 | 
| 
 | 
   919 
 | 
| 
 | 
   920     if ("element" in debug):
 | 
| 
 | 
   921         for (count,first,kmer) in kmers:
 | 
| 
 | 
   922             print >>stderr, "    %s: %d" % (kmer,-count)
 | 
| 
 | 
   923 
 | 
| 
 | 
   924     (count,first,kmer) = kmers[0]
 | 
| 
 | 
   925     if (contains_repeat(kmer)): return (None,None,start,end)
 | 
| 
 | 
   926 
 | 
| 
 | 
   927     # determine the hamming distance between the run and a simple repeat, for
 | 
| 
 | 
   928     # each "plausible" start and end;  we compute the distance for each such
 | 
| 
 | 
   929     # interval, and choose the one with the lowest hamming distance;  ties are
 | 
| 
 | 
   930     # broken in a deterministic-but-unspecified manner
 | 
| 
 | 
   931 
 | 
| 
 | 
   932     bestD = bestStart = bestEnd = None
 | 
| 
 | 
   933     ###################################################################################
 | 
| 
 | 
   934     # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/18/2013
 | 
| 
 | 
   935     #     since we do not allow hamming_distance > 0, which means we do not allow mutation,
 | 
| 
 | 
   936     # we do not need this section to produce bestStart and End
 | 
| 
 | 
   937     ###################################################################################
 | 
| 
 | 
   938 
 | 
| 
 | 
   939     #for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
 | 
| 
 | 
   940     #    d = hamming_distance(seq,s,e,kmer)
 | 
| 
 | 
   941     #    if (d == None): continue
 | 
| 
 | 
   942     #    if (bestD == None) or (d <= bestD):
 | 
| 
 | 
   943     #        (bestD,bestStart,bestEnd) = (d,s,e)
 | 
| 
 | 
   944     
 | 
| 
 | 
   945 
 | 
| 
 | 
   946 
 | 
| 
 | 
   947     bestStart = start
 | 
| 
 | 
   948 
 | 
| 
 | 
   949     if(allowPartials):
 | 
| 
 | 
   950         bestEnd = end
 | 
| 
 | 
   951     elif(not allowPartials):
 | 
| 
 | 
   952         bestEnd = start
 | 
| 
 | 
   953         pattern = seq[start:start+period]
 | 
| 
 | 
   954         if ("partial" in debug):
 | 
| 
 | 
   955             print "kmer:", kmer
 | 
| 
 | 
   956             if(pattern != kmer):
 | 
| 
 | 
   957                 print "pattern:", pattern
 | 
| 
 | 
   958 
 | 
| 
 | 
   959         while(bestEnd <= end-period):
 | 
| 
 | 
   960             bestEnd += period
 | 
| 
 | 
   961 
 | 
| 
 | 
   962     # bestD will always be 0, as we do not allow mutation
 | 
| 
 | 
   963     bestD = 0
 | 
| 
 | 
   964     
 | 
| 
 | 
   965     if ("partial" in debug):
 | 
| 
 | 
   966         print bestD, bestStart, bestEnd
 | 
| 
 | 
   967 
 | 
| 
 | 
   968     ###################################################################################
 | 
| 
 | 
   969     # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/10
 | 
| 
 | 
   970     # 
 | 
| 
 | 
   971     ###################################################################################
 | 
| 
 | 
   972     return (kmer,bestD,bestStart,bestEnd)
 | 
| 
 | 
   973 
 | 
| 
 | 
   974 
 | 
| 
 | 
   975 def find_hamming_repeat_element(period,seq,start,end,allowPartials=False):
 | 
| 
 | 
   976     
 | 
| 
 | 
   977     # count the number of occurences of each k-mer;  note that we can't
 | 
| 
 | 
   978     # reject kmers containing smaller repeats yet, since for a sequence like
 | 
| 
 | 
   979     # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
 | 
| 
 | 
   980     # 6-mer, and THEN reject it;  if we reject ACACAC while counting, we'd end
 | 
| 
 | 
   981     # up reporting something like ACACAA as the best motif 
 | 
| 
 | 
   982 
 | 
| 
 | 
   983     if ("element" in debug):
 | 
| 
 | 
   984         print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
 | 
| 
 | 
   985 
 | 
| 
 | 
   986     kmerToCount = {}
 | 
| 
 | 
   987     kmerToFirst = {}
 | 
| 
 | 
   988     for ix in xrange(start,end-(period-1)):
 | 
| 
 | 
   989         kmer = seq[ix:ix+period]
 | 
| 
 | 
   990         if ("N" in kmer): continue
 | 
| 
 | 
   991         if (kmer not in kmerToCount):
 | 
| 
 | 
   992             kmerToCount[kmer] = 1
 | 
| 
 | 
   993             kmerToFirst[kmer] = ix
 | 
| 
 | 
   994         else:
 | 
| 
 | 
   995             kmerToCount[kmer] += 1
 | 
| 
 | 
   996         #if ("element" in debug):
 | 
| 
 | 
   997         #    print >>stderr, "    %d: %s" % (ix,kmer)
 | 
| 
 | 
   998 
 | 
| 
 | 
   999     # choose the best k-mer;  this is simply the most frequently occurring one,
 | 
| 
 | 
  1000     # with ties broken by whichever one came first
 | 
| 
 | 
  1001 
 | 
| 
 | 
  1002     kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
 | 
| 
 | 
  1003     if (kmers == []): return (None,None,start,end)
 | 
| 
 | 
  1004     kmers.sort()
 | 
| 
 | 
  1005 
 | 
| 
 | 
  1006     if ("element" in debug):
 | 
| 
 | 
  1007         for (count,first,kmer) in kmers:
 | 
| 
 | 
  1008             print >>stderr, "    %s: %d" % (kmer,-count)
 | 
| 
 | 
  1009 
 | 
| 
 | 
  1010     (count,first,kmer) = kmers[0]
 | 
| 
 | 
  1011     if (contains_repeat(kmer)): return (None,None,start,end)
 | 
| 
 | 
  1012 
 | 
| 
 | 
  1013     # determine the hamming distance between the run and a simple repeat, for
 | 
| 
 | 
  1014     # each "plausible" start and end;  we compute the distance for each such
 | 
| 
 | 
  1015     # interval, and choose the one with the lowest hamming distance;  ties are
 | 
| 
 | 
  1016     # broken in a deterministic-but-unspecified manner
 | 
| 
 | 
  1017 
 | 
| 
 | 
  1018     bestD = bestStart = bestEnd = None
 | 
| 
 | 
  1019 
 | 
| 
 | 
  1020     for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
 | 
| 
 | 
  1021         d = hamming_distance(seq,s,e,kmer)
 | 
| 
 | 
  1022         if (d == None): continue
 | 
| 
 | 
  1023         if (bestD == None) or (d <= bestD):
 | 
| 
 | 
  1024             (bestD,bestStart,bestEnd) = (d,s,e)
 | 
| 
 | 
  1025 
 | 
| 
 | 
  1026     return (kmer,bestD,bestStart,bestEnd)
 | 
| 
 | 
  1027 
 | 
| 
 | 
  1028 # plausible_intervals--
 | 
| 
 | 
  1029 #    Yield all plausible intervals intersecting with a run.  We generate all
 | 
| 
 | 
  1030 #    starts within P bp of the run's start.  For each of these, we either (a) try
 | 
| 
 | 
  1031 #    all ends within P bp of run's end, or (b) trim the new interval to a whole
 | 
| 
 | 
  1032 #    multiple of the period, and report this short interval and the longer
 | 
| 
 | 
  1033 #    interval with one more period appended.  Case (a) allows partial motifs,
 | 
| 
 | 
  1034 #    while case (b) only allows whole motifs.
 | 
| 
 | 
  1035 
 | 
| 
 | 
  1036 def plausible_intervals(start,end,period,seqLen,allowPartials=False):
 | 
| 
 | 
  1037 
 | 
| 
 | 
  1038     # generate intervals that allow a partial copy of the motif
 | 
| 
 | 
  1039 
 | 
| 
 | 
  1040     if (allowPartials):
 | 
| 
 | 
  1041         for candStart in xrange(start-(period-1),start+period):
 | 
| 
 | 
  1042             if (candStart < 0): continue
 | 
| 
 | 
  1043             for candEnd in xrange(end-(period-1),end+period):
 | 
| 
 | 
  1044                 if (candEnd > seqLen): continue
 | 
| 
 | 
  1045                 if (candEnd <= candStart+period): continue
 | 
| 
 | 
  1046                 yield (candStart,candEnd)
 | 
| 
 | 
  1047 
 | 
| 
 | 
  1048     # -OR- generate intervals that allow only whole copies of the motif
 | 
| 
 | 
  1049 
 | 
| 
 | 
  1050     else:
 | 
| 
 | 
  1051         for candStart in xrange(start-(period-1),start+period):
 | 
| 
 | 
  1052             if (candStart < 0): continue
 | 
| 
 | 
  1053             candEnd = candStart + ((end-candStart)/period)*period
 | 
| 
 | 
  1054             yield (candStart,candEnd)
 | 
| 
 | 
  1055             candEnd += period
 | 
| 
 | 
  1056             if (candEnd <= seqLen): yield (candStart,candEnd)
 | 
| 
 | 
  1057 
 | 
| 
 | 
  1058 
 | 
| 
 | 
  1059 # hamming_distance--
 | 
| 
 | 
  1060 #    Determine the hamming distance between the run and a simple repeat.
 | 
| 
 | 
  1061 # $$$ improve this by allowing gaps, and stopping when we reach a threshold
 | 
| 
 | 
  1062 
 | 
| 
 | 
  1063 kmerToDiffs = {}  # (this is used for memo-ization)
 | 
| 
 | 
  1064 
 | 
| 
 | 
  1065 def hamming_distance(seq,start,end,kmer):
 | 
| 
 | 
  1066     period = len(kmer)
 | 
| 
 | 
  1067     if (end < start + period): return None
 | 
| 
 | 
  1068 
 | 
| 
 | 
  1069     wholeEnd = start + ((end-start)/period)*period
 | 
| 
 | 
  1070 
 | 
| 
 | 
  1071     if (kmer not in kmerToDiffs):
 | 
| 
 | 
  1072         kmerToDiffs[kmer] = { kmer:0 }
 | 
| 
 | 
  1073 
 | 
| 
 | 
  1074     d = 0
 | 
| 
 | 
  1075     for ix in xrange(start,wholeEnd,period):
 | 
| 
 | 
  1076         qmer = seq[ix:ix+period]    # same size as the kmer motif
 | 
| 
 | 
  1077         if (qmer in kmerToDiffs[kmer]):
 | 
| 
 | 
  1078             d += kmerToDiffs[kmer][qmer]
 | 
| 
 | 
  1079             continue
 | 
| 
 | 
  1080         diffs = 0
 | 
| 
 | 
  1081         for iy in xrange(0,period):
 | 
| 
 | 
  1082             if (qmer[iy] != kmer[iy]): diffs += 1
 | 
| 
 | 
  1083         kmerToDiffs[kmer][qmer] = diffs
 | 
| 
 | 
  1084         d += diffs
 | 
| 
 | 
  1085 
 | 
| 
 | 
  1086     if (end > wholeEnd):
 | 
| 
 | 
  1087         qmer = seq[wholeEnd:end]    # shorter than the kmer motif
 | 
| 
 | 
  1088         if (qmer in kmerToDiffs[kmer]):
 | 
| 
 | 
  1089             d += kmerToDiffs[kmer][qmer]
 | 
| 
 | 
  1090         else:
 | 
| 
 | 
  1091             diffs = 0
 | 
| 
 | 
  1092             for iy in xrange(0,len(qmer)):
 | 
| 
 | 
  1093                 if (qmer[iy] != kmer[iy]): diffs += 1
 | 
| 
 | 
  1094             kmerToDiffs[kmer][qmer] = diffs
 | 
| 
 | 
  1095             d += diffs
 | 
| 
 | 
  1096 
 | 
| 
 | 
  1097     return d
 | 
| 
 | 
  1098 
 | 
| 
 | 
  1099 
 | 
| 
 | 
  1100 # fasta_sequences--
 | 
| 
 | 
  1101 #    Read the fasta sequences from a file.  Note that we convert to upper case,
 | 
| 
 | 
  1102 #    and convert any letter other than ACGT to N.
 | 
| 
 | 
  1103 
 | 
| 
 | 
  1104 nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN")
 | 
| 
 | 
  1105 
 | 
| 
 | 
  1106 def fasta_sequences(f):
 | 
| 
 | 
  1107     seqName = None
 | 
| 
 | 
  1108     seqNucs = None
 | 
| 
 | 
  1109 
 | 
| 
 | 
  1110     for line in f:
 | 
| 
 | 
  1111         line = line.strip()
 | 
| 
 | 
  1112         if (line.startswith(">")):
 | 
| 
 | 
  1113             if (seqName != None):
 | 
| 
 | 
  1114                 yield (seqName,"".join(seqNucs))
 | 
| 
 | 
  1115             seqName = sequence_name(line)
 | 
| 
 | 
  1116             seqNucs = []
 | 
| 
 | 
  1117         elif (seqName == None):
 | 
| 
 | 
  1118             assert (False), "first sequence has no header"
 | 
| 
 | 
  1119         else:
 | 
| 
 | 
  1120             seqNucs += [line]
 | 
| 
 | 
  1121 
 | 
| 
 | 
  1122     if (seqName != None):
 | 
| 
 | 
  1123         yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
 | 
| 
 | 
  1124 
 | 
| 
 | 
  1125 
 | 
| 
 | 
  1126 # fastq_sequences--
 | 
| 
 | 
  1127 #    Read the fastq sequences from a file.  Note that we convert to upper case,
 | 
| 
 | 
  1128 #    and convert any letter other than ACGT to N.
 | 
| 
 | 
  1129 
 | 
| 
 | 
  1130 def fastq_sequences(f):
 | 
| 
 | 
  1131     lineNum = 0
 | 
| 
 | 
  1132     for line in f:
 | 
| 
 | 
  1133         lineNum += 1
 | 
| 
 | 
  1134         line = line.strip()
 | 
| 
 | 
  1135 
 | 
| 
 | 
  1136         if (lineNum % 4 == 1):
 | 
| 
 | 
  1137             assert (line.startswith("@")), \
 | 
| 
 | 
  1138                    "bad read name at line %d" % lineNum
 | 
| 
 | 
  1139             seqName = line[1:]
 | 
| 
 | 
  1140             continue
 | 
| 
 | 
  1141 
 | 
| 
 | 
  1142         if (lineNum % 4 == 2):
 | 
| 
 | 
  1143             seqNucs = line
 | 
| 
 | 
  1144             continue
 | 
| 
 | 
  1145 
 | 
| 
 | 
  1146         if (lineNum % 4 == 3):
 | 
| 
 | 
  1147             assert (line.startswith("+")), \
 | 
| 
 | 
  1148                    "can't understand line %d:\n%s" % (lineNum,line)
 | 
| 
 | 
  1149             continue
 | 
| 
 | 
  1150 
 | 
| 
 | 
  1151         quals = line
 | 
| 
 | 
  1152         assert (len(quals) == len(seqNucs)), \
 | 
| 
 | 
  1153                "length mismatch read vs. qualities at line %d" % lineNum
 | 
| 
 | 
  1154         yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap),quals)
 | 
| 
 | 
  1155 
 | 
| 
 | 
  1156     assert (lineNum % 4 == 0), \
 | 
| 
 | 
  1157            "incomplete read at end of file"
 | 
| 
 | 
  1158 
 | 
| 
 | 
  1159 def sam_sequences(f):
 | 
| 
 | 
  1160     lineNum = 0
 | 
| 
 | 
  1161     for line in f:
 | 
| 
 | 
  1162         lineNum += 1
 | 
| 
 | 
  1163         line = line.strip()
 | 
| 
 | 
  1164 
 | 
| 
 | 
  1165         if line.startswith("@"):
 | 
| 
 | 
  1166             continue
 | 
| 
 | 
  1167 
 | 
| 
 | 
  1168         columns = line.split("\t")
 | 
| 
 | 
  1169         seqName = columns[0]
 | 
| 
 | 
  1170         refName = columns[2]
 | 
| 
 | 
  1171         pre_s = int(columns[3]) - 1
 | 
| 
 | 
  1172         cigar = columns[5]
 | 
| 
 | 
  1173         seqNucs = columns[9]
 | 
| 
 | 
  1174         
 | 
| 
 | 
  1175         yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar)
 | 
| 
 | 
  1176 
 | 
| 
 | 
  1177 # sequence_name--
 | 
| 
 | 
  1178 #    Extract the sequence name from a fasta header.
 | 
| 
 | 
  1179 #    $$$ this may need to be improved $$$
 | 
| 
 | 
  1180 
 | 
| 
 | 
  1181 def sequence_name(s):
 | 
| 
 | 
  1182     s = s[1:].strip()
 | 
| 
 | 
  1183     if (s == ""): return ""
 | 
| 
 | 
  1184     else:         return s.split()[0]
 | 
| 
 | 
  1185 
 | 
| 
 | 
  1186 
 | 
| 
 | 
  1187 # nucleotide_runs--
 | 
| 
 | 
  1188 #    Yield (start,end) for all runs of valid nucleotides in a sequence.
 | 
| 
 | 
  1189 
 | 
| 
 | 
  1190 def nucleotide_runs(s):
 | 
| 
 | 
  1191     runs  = []
 | 
| 
 | 
  1192     start = None
 | 
| 
 | 
  1193     for (ix,nuc) in enumerate(s):
 | 
| 
 | 
  1194         if (nuc in "ACGT"):
 | 
| 
 | 
  1195             if (start == None):
 | 
| 
 | 
  1196                 start = ix
 | 
| 
 | 
  1197         else:
 | 
| 
 | 
  1198             if (start != None):
 | 
| 
 | 
  1199                 yield (start,ix)
 | 
| 
 | 
  1200                 start = None
 | 
| 
 | 
  1201 
 | 
| 
 | 
  1202     if (start != None): yield (start,len(s))
 | 
| 
 | 
  1203 
 | 
| 
 | 
  1204 
 | 
| 
 | 
  1205 # contains_repeat--
 | 
| 
 | 
  1206 #    Determine whether a short sequence contains a repeated element, such as a
 | 
| 
 | 
  1207 #    6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT).  The repeat
 | 
| 
 | 
  1208 #    must cover the entire sequence, without mismatches.
 | 
| 
 | 
  1209 
 | 
| 
 | 
  1210 def contains_repeat(kmer):
 | 
| 
 | 
  1211     kmerLength = len(kmer)
 | 
| 
 | 
  1212     hasRepeat = False
 | 
| 
 | 
  1213     rptLen = 1
 | 
| 
 | 
  1214     while (not hasRepeat) and (2 * rptLen <= kmerLength):
 | 
| 
 | 
  1215         if (kmerLength % rptLen != 0):
 | 
| 
 | 
  1216             rptLen += 1
 | 
| 
 | 
  1217             continue
 | 
| 
 | 
  1218         isRepeat = True
 | 
| 
 | 
  1219         for i in xrange(rptLen,kmerLength,rptLen):
 | 
| 
 | 
  1220             if (kmer[i:i+rptLen] != kmer[:rptLen]):
 | 
| 
 | 
  1221                 isRepeat = False
 | 
| 
 | 
  1222                 break
 | 
| 
 | 
  1223         if (isRepeat):
 | 
| 
 | 
  1224             hasRepeat = True
 | 
| 
 | 
  1225             break
 | 
| 
 | 
  1226         rptLen += 1
 | 
| 
 | 
  1227     return hasRepeat
 | 
| 
 | 
  1228 
 | 
| 
 | 
  1229 
 | 
| 
 | 
  1230 # hash108--
 | 
| 
 | 
  1231 #    Return a 108-bit hash "value" of a string
 | 
| 
 | 
  1232 
 | 
| 
 | 
  1233 def hash108(s):
 | 
| 
 | 
  1234     m = md5_new()
 | 
| 
 | 
  1235     m.update(s)
 | 
| 
 | 
  1236     return m.hexdigest()[:27]
 | 
| 
 | 
  1237 
 | 
| 
 | 
  1238 
 | 
| 
 | 
  1239 # float_or_fraction--
 | 
| 
 | 
  1240 #    Convert a string to a number, allowing fractions
 | 
| 
 | 
  1241 
 | 
| 
 | 
  1242 def float_or_fraction(s):
 | 
| 
 | 
  1243     if ("/" in s):
 | 
| 
 | 
  1244         (numer,denom) = s.split("/",1)
 | 
| 
 | 
  1245         return float(numer)/float(denom)
 | 
| 
 | 
  1246     else:
 | 
| 
 | 
  1247         return float(s)
 | 
| 
 | 
  1248 
 | 
| 
 | 
  1249 
 | 
| 
 | 
  1250 # int_with_unit--
 | 
| 
 | 
  1251 #    Parse a string as an integer, allowing unit suffixes
 | 
| 
 | 
  1252 
 | 
| 
 | 
  1253 def int_with_unit(s):
 | 
| 
 | 
  1254     if (s.endswith("K")):
 | 
| 
 | 
  1255         multiplier = 1000
 | 
| 
 | 
  1256         s = s[:-1]
 | 
| 
 | 
  1257     elif (s.endswith("M")):
 | 
| 
 | 
  1258         multiplier = 1000 * 1000
 | 
| 
 | 
  1259         s = s[:-1]
 | 
| 
 | 
  1260     elif (s.endswith("G")):
 | 
| 
 | 
  1261         multiplier = 1000 * 1000 * 1000
 | 
| 
 | 
  1262         s = s[:-1]
 | 
| 
 | 
  1263     else:
 | 
| 
 | 
  1264         multiplier = 1
 | 
| 
 | 
  1265 
 | 
| 
 | 
  1266     try:               return               int(s)   * multiplier
 | 
| 
 | 
  1267     except ValueError: return int(math.ceil(float(s) * multiplier))
 | 
| 
 | 
  1268 
 | 
| 
 | 
  1269 
 | 
| 
 | 
  1270 if __name__ == "__main__": main()
 | 
| 
 | 
  1271 
 |