view uniqprimer-0.5.0/primertools/nucmerparser.py @ 6:8218425fda1d draft default tip

Uploaded
author dereeper
date Tue, 04 Jan 2022 16:05:55 +0000
parents 3249d78ecfc2
children
line wrap: on
line source

'''
Created on Jan 1, 2011

@author: John L. Herndon
@contact: herndon@cs.colostate.edu
@organization: Colorado State University
@group: Computer Science Department, Asa Ben-Hur's laboratory 
'''

import utils
import os
import re

def parseCoordMatchLine( match ):
    '''
    Parse one match line of a coord file.
    
    The line is cut into at most five '|'-separated columns. The first two
    space-separated tokens of the first column are read as the integer start
    and end of the match, and the first token of the last column is taken
    as the sequence ID.
    
    returns the object built by utils.Match( start, end, seqid )
    raises ValueError if start or end is not an integer, IndexError if
    the first column holds fewer than two tokens
    '''
    #tabs are turned into spaces so one split pattern covers every column
    columns = match.replace( '\t', ' ' ).split( '|', 4 )
    
    #first column: start and end locations of the match
    positionTokens = re.split( ' +', columns[ 0 ].strip( ) )
    start, end = int( positionTokens[ 0 ].strip( ) ), int( positionTokens[ 1 ].strip( ) )
    
    #last column: the sequenceID is its leading token
    idTokens = re.split( " +", columns[ -1 ].strip( ) )
    seqid = idTokens[ 0 ].strip( )
    
    return utils.Match( start, end, seqid )
        
def parseCoordMatchFile( coordFileName ):
    '''
    A method to parse the coord file.
    
    Everything up to and including the first line that starts with '=' is
    treated as header and skipped; every non-blank line after it is handed
    to parseCoordMatchLine.
    
    coordFileName: path of the coord file to read
    
    returns a list with one parseCoordMatchLine result (a utils.Match
    object) per match line. The list is empty when the file has no '='
    separator line or no match lines after it.
    raises utils.NoFileFoundException if coordFileName does not exist
    '''
    #throw if the file doesn't exist
    if not os.path.exists( coordFileName ):
        raise utils.NoFileFoundException( coordFileName )
    
    #read the nucmer file into memory; the with block closes the handle
    #even if reading fails
    with open( coordFileName ) as coordFile:
        lines = coordFile.readlines( )
    
    #skip forward to the start of the matches. A file without a separator
    #line (e.g. an empty file) has no matches rather than being an error
    matchLines = [ ]
    for index, line in enumerate( lines ):
        if line.startswith( '=' ):
            matchLines = lines[ index + 1 : ]
            break
    
    #parse each line for match start, end and sequenceID. Blank lines
    #(such as a trailing newline) carry no match and are skipped
    returnValue = [ parseCoordMatchLine( matchLine ) for matchLine in matchLines if matchLine.strip( ) ]
    
    utils.logMessage( "NucmerParser::parseCoordMatchFile( )", "Parse {0}, finding {1} matches".format( coordFileName, len( returnValue ) ) )

    return returnValue