view genome_diversity.py @ 32:03c22b722882

remove BeautifulSoup dependency
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:54:23 -0400
parents 2c498d40ecde
children
line wrap: on
line source

#!/usr/bin/env python

import sys
import cdblib

def _openfile( filename=None, mode='r' ):
    try:
        fh = open( filename, mode )
    except IOError, err:
        raise RuntimeError( "can't open file: %s\n" % str( err ) )
    return fh

def get_filename_from_loc( species=None, filename=None ):
    fh = _openfile( filename )
    for line in fh:
        if line and not line.startswith( '#' ):
            line = line.rstrip( '\r\n' )
            if line:
                elems = line.split( '\t' )
                if len( elems ) >= 2 and elems[0] == species:
                    return elems[1]

    raise RuntimeError( "can't find '%s' in location file: %s\n" % ( species, filename ) )


class SnpFile( object ):
    def __init__( self, filename=None, seq_col=1, pos_col=2, ref_seq_col=7, ref_pos_col=8 ):
        self.filename = filename
        self.fh = _openfile( filename )
        self.seq_col = seq_col
        self.pos_col = pos_col
        self.ref_seq_col = ref_seq_col
        self.ref_pos_col = ref_pos_col
        self.elems = None
        self.line = None
        self.comments = []

    def next( self ):
        while self.fh:
            try:
                self.line = self.fh.next()
            except StopIteration:
                self.line = None
                self.elems = None
                return None
            if self.line:
                self.line = self.line.rstrip( '\r\n' )
                if self.line:
                    if self.line.startswith( '#' ):
                        self.comments.append( self.line )
                    else:
                        self.elems = self.line.split( '\t' )
                        return 1

    def get_seq_pos( self ):
        if self.elems:
            return self.elems[ self.seq_col - 1 ], self.elems[ self.pos_col - 1 ]
        else:
            return None, None

    def get_ref_seq_pos( self ):
        if self.elems:
            return self.elems[ self.ref_seq_seq - 1 ], self.elems[ self.ref_pos_col - 1 ]
        else:
            return None, None


class IndexedFile( object ):

    def __init__( self, data_file=None, index_file=None ):
        self.data_file = data_file
        self.index_file = index_file
        self.data_fh = _openfile( data_file )
        self.index_fh = _openfile( index_file )
        self._reader = cdblib.Reader( self.index_fh.read(), hash )

    def get_indexed_line( self, key=None ):
        line = None
        if key in self._reader:
            offset = self._reader.getint( key )
            self.data_fh.seek( offset )
            try:
                line = self.data_fh.next()
            except StopIteration:
                raise RuntimeError( 'index file out of sync for %s' % key )
        return line

class PrimersFile( IndexedFile ):
    def get_primer_header( self, sequence=None, position=None ):
        key = "%s %s" % ( str( sequence ), str( position ) )
        header = self.get_indexed_line( key )
        if header:
            if header.startswith( '>' ):
                elems = header.split()
                if len( elems ) < 3:
                    raise RuntimeError( 'short primers header for %s' % key )
                if sequence != elems[1] or str( position ) != elems[2]:
                    raise RuntimeError( 'primers index for %s finds %s %s' % ( key, elems[1], elems[2] ) )
            else:
                raise RuntimeError( 'primers index out of sync for %s' % key )
        return header

    def get_entry( self, sequence=None, position=None ):
        entry = self.get_primer_header( sequence, position )
        if entry:
            while self.data_fh:
                try:
                    line = self.data_fh.next()
                except StopIteration:
                    break
                if line.startswith( '>' ):
                    break
                entry += line
        return entry

    def get_enzymes( self, sequence=None, position=None ):
        entry = self.get_primer_header( sequence, position )
        enzyme_list = []
        if entry:
            try:
                line = self.data_fh.next()
            except StopIteration:
                raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) )
            if line.startswith( '>' ):
                raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) )
            line.rstrip( '\r\n' )
            if line:
                enzymes = line.split( ',' )
                for enzyme in enzymes:
                    enzyme = enzyme.strip()
                    if enzyme:
                        enzyme_list.append( enzyme )
        return enzyme_list

class SnpcallsFile( IndexedFile ):
    def get_snp_seq( self, sequence=None, position=None ):
        key = "%s %s" % ( str( sequence ), str( position ) )
        line = self.get_indexed_line( key )
        if line:
            elems = line.split( '\t' )
            if len (elems) < 3:
                raise RuntimeError( 'short snpcalls line for %s' % key )
            if sequence != elems[0] or str( position ) != elems[1]:
                raise RuntimeError( 'snpcalls index for %s finds %s %s' % ( key, elems[0], elems[1] ) )
            return elems[2]
        else:
            return None

    def get_flanking_dna( self, sequence=None, position=None, format='fasta' ):
        if format != 'fasta' and format != 'primer3':
            raise RuntimeError( 'invalid format for flanking dna: %s' % str( format ) )
        seq = self.get_snp_seq( sequence, position )
        if seq:
            p = seq.find('[')
            if p == -1:
                raise RuntimeError( 'snpcalls entry for %s %s missing left bracket: %s' % ( str( sequence ), str( position ), seq ) )
            q = seq.find(']', p + 1)
            if q == -1:
                raise RuntimeError( 'snpcalls entry for %s %s missing right bracket: %s' % ( str( sequence ), str( position ), seq ) )
            q += 1

            if format == 'fasta':
                flanking_seq = '> '
            else:
                flanking_seq = 'SEQUENCE_ID='

            flanking_seq += "%s %s %s %s\n" % ( str( sequence ), str( position ), seq[p+1], seq[p+3] )

            if format == 'primer3':
                flanking_seq += 'SEQUENCE_TEMPLATE='

            flanking_seq += "%sn%s\n" % ( seq[0:p], seq[q:] )

            if format == 'primer3':
                flanking_seq += "SEQUENCE_TARGET=%d,11\n=\n" % ( p - 5 )

            return flanking_seq
        else:
            return None



class LocationFile( object ):
    def __init__(self, filename):
        self.build_map(filename)

    def build_map(self, filename):
        self.map = {}
        self.open_file(filename)
        for line in self.read_lines():
            elems = line.split('\t', 1)
            if len(elems) == 2:
                self.map[ elems[0].strip() ] = elems[1].strip()
        self.close_file()

    def read_lines(self):
        for line in self.fh:
            if not line.startswith('#'):
                line = line.rstrip('\r\n')
                yield line

    def open_file(self, filename):
        self.filename = filename
        try:
            self.fh = open(filename, 'r')
        except IOError, err:
            print >> sys.stderr, "Error opening location file '%s': %s" % (filename, str(err))
            sys.exit(1)

    def close_file(self):
        self.fh.close()

    def loc_file( self, key ):
        if key in self.map:
            return self.map[key]
        else:
            print >> sys.stderr, "'%s' does not appear in location file '%s'" % (key, self.filename)
            sys.exit(1)
        
class ChrLens( object ):
    def __init__( self, chrlen_filename ):
        self.chrlen_filename = chrlen_filename
        self.build_map()

    def build_map(self):
        self.map = {}
        self.open_file(self.chrlen_filename)
        for line in self.read_lines():
            elems = line.split('\t', 1)
            if len(elems) == 2:
                chrom = elems[0].strip()
                chrom_len_text = elems[1].strip()
                try:
                    chrom_len = int( chrom_len_text )
                except ValueError:
                    print >> sys.stderr, "Bad length '%s' for chromosome '%s' in '%s'" % (chrom_len_text, chrom, self.chrlen_filename)
                self.map[ chrom ] = chrom_len
        self.close_file()

    def read_lines(self):
        for line in self.fh:
            if not line.startswith('#'):
                line = line.rstrip('\r\n')
                yield line

    def open_file(self, filename):
        self.filename = filename
        try:
            self.fh = open(filename, 'r')
        except IOError, err:
            print >> sys.stderr, "Error opening chromosome length file '%s': %s" % (filename, str(err))
            sys.exit(1)

    def close_file(self):
        self.fh.close()

    def length( self, key ):
        if key in self.map:
            return self.map[key]
        else:
            return None

    def __iter__( self ):
        for chrom in self.map:
            yield chrom