Mercurial > repos > dereeper > uniqprimer
view uniqprimer-0.5.0/primertools/includefilemanager.py @ 3:3249d78ecfc2 draft
Uploaded
author | dereeper |
---|---|
date | Mon, 03 Jan 2022 09:56:55 +0000 |
parents | |
children |
line wrap: on
line source
''' Created on Jan 1, 2011 @author: John L. Herndon @contact: herndon@cs.colostate.edu @organization: Colorado State University @group: Computer Science Department, Asa Ben-Hur's laboratory ''' import fastaparser import utils import os import programs import nucmerparser import copy class IncludeFileManager( object ): """ A class to manage include files """ #This class needs some work. Need to come up with a way to find unique sequences between all include files.... def __init__( self ): """ Constructor """ self.includeFiles = [ ] self.nucmer = programs.Nucmer( ) self.isExcludeFileInitialized = False self.isReferenceFileInitialized = False self.referenceFile = None self.referenceSequence = None self.uniqueSequences = None def setExcludeFile( self, excludeFileName ): """ A function to set the exclude file that will be used when nucmer is called """ utils.logMessage( "IncludeFileManager::setExcludeFile( )", "fileName {0}".format( excludeFileName ) ) self.excludeFileName = excludeFileName self.isExcludeFileInitialized = True def findUniqueSequencesInFile(self, doWantFile, doNotWantFile ): utils.logMessage( "IncludeFileManager::findUniqueSequence( )", "running nucmer for reference file: {0}".format( doWantFile ) ) coordFile = self.nucmer.execute( [ doWantFile, doNotWantFile ] ) matches = nucmerparser.parseCoordMatchFile( coordFile ) sequences = fastaparser.parseFastaFileAsPrimerSequence( doWantFile ) for match in matches: if sequences.has_key( match.seqID ): primerData = sequences[ match.seqID ] primerData.addMatch( match ) else: print "Warning: id from .coords file not found in sequence data..." utils.logMessage( "IncludeFileManager::processMatches( )", "WARNING - an ID was read in a Match that does not correspond to a sequence read from the fasta file!" ) returnValue = [ ] for key in sequences.keys( ): sequence = sequences[ key ] subSequences = sequence.getNonMatchedSubSequences( ) returnValue.extend( subSequences ) return returnValue def findCommonSequencesInFile(self, want, alsoWant ): utils.logMessage( "IncludeFileManager::findUniqueSequence( )", "running nucmer for reference file: {0}".format( want ) ) print want, alsoWant coordFile = self.nucmer.execute( [ want, alsoWant ] ) matches = nucmerparser.parseCoordMatchFile( coordFile ) sequences = fastaparser.parseFastaFileAsPrimerSequence( want ) for match in matches: if sequences.has_key( match.seqID ): primerData = sequences[ match.seqID ] primerData.addMatch( match ) returnValue = [ ] for key in sequences: sequence = sequences[ key ] subSequences = sequence.getMatchedSubSequences( ) returnValue.extend( subSequences ) return returnValue def processIncludeFile( self, includeFileName ): """ A function that adds and processes and include file. An exclude file must be set for this function to be called. """ utils.logMessage( "IncludeFileManager::processIncludeFile( )", "processing {0}".format( includeFileName ) ) if self.isExcludeFileInitialized == False: utils.logMessage( "IncludeFileManager::processIncludeFile( )", "no exclude file set".format( includeFileName ) ) raise utils.ModuleNotInitializedException( "includefilemanager", "no exclude file set" ) if self.isReferenceFileInitialized == False: utils.logMessage( "IncludeFileManager::processIncludeFile( )", "running nucmer for reference file: {0}".format( includeFileName ) ) self.uniqueSequences = self.findUniqueSequencesInFile( includeFileName, self.excludeFileName ) self.referenceFile = includeFileName self.isReferenceFileInitialized = True else: #write the unique sequences to a temp file tempSequences = utils.getTemporaryDirectory( ) + "/tempSequences.fasta" fastaparser.writeFastaFile( self.uniqueSequences, tempSequences ) self.findCommonSequencesInFile( includeFileName, tempSequences ) self.includeFiles.append( includeFileName ) def getUniqueSequences( self ): """ getUniqueSequences - return a dictionary of all sequences that are found in include fasta files, but not the combined exclude fasta files. The dictionary is indexed by the file ID """ return self.uniqueSequences