view gmap/lib/galaxy/datatypes/gmap.py @ 0:d58d272914e7

Uploaded
author jjohnson
date Tue, 18 Oct 2011 12:42:42 -0400
parents
children 52da588232b0
line wrap: on
line source

"""
GMAP indexes
"""
import logging
import os,os.path,re
from data import Text
from metadata import MetadataElement

log = logging.getLogger(__name__)

class GmapDB( Text ):
    """
    A GMAP DB for indexes

    Composite datatype: the index files are read from the dataset's
    extra_files_path (see set_meta) and the primary file is a generated
    HTML page (see generate_primary_file / regenerate_primary_file).
    """
    MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True )
    MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True )
    MetadataElement( name="kmers", default=[''], desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True )
    MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True )
    MetadataElement( name="maps", default=[''], desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True )
    MetadataElement( name="snps", default=[''], desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True )
    MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True )
    MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True )

    file_ext = 'gmapdb'
    is_binary = True
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    # Recognises "<db>.<ref|met??|a2i??>[<basesize><kmer>]3positions[.<snps>]".
    # Group numbers used by set_meta:
    #   3 = 'ref', 4 = 'met', 5 = 'a2i', 7 = basesize, 8 = kmer, 10 = snp name
    # Raw string so that '\.' and '\d' reach the regex engine untouched;
    # compiled once instead of on every file name.
    _positions_re = re.compile( r'(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' )

    def generate_primary_file( self, dataset = None ):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately

        Returns the placeholder HTML as a string; ``dataset`` is not used.
        """
        return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'

    def regenerate_primary_file(self,dataset):
        """
        Rewrite the primary file (dataset.file_name) as an HTML summary of
        the metadata: db name, cmet/atoi flags and the list of map names.

        cannot do this until we are setting metadata
        """
        bn = dataset.metadata.db_name
        log.info( "GmapDB regenerate_primary_file %s", bn )
        rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)]
        # The maps list carries a '' placeholder entry; it is not a real map,
        # so do not render an empty bullet for it.
        rval.extend( '<li>%s' % name for name in dataset.metadata.maps if name )
        rval.append( '</ul></html>' )
        # open() + with: the handle is closed even if a write fails, and the
        # code no longer depends on the Python 2 only file() builtin.
        with open( dataset.file_name, 'w' ) as f:
            f.write( "\n".join( rval ) )
            f.write( '\n' )

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set dataset.peek and dataset.blurb from the metadata, or to fixed
        "purged" messages when dataset.dataset.purged is true.
        ``is_multi_byte`` is accepted but not used.
        """
        log.info( "GmapDB set_peek %s", dataset )
        if not dataset.dataset.purged:
            dataset.peek  = "GMAPDB index %s\n cmet %s\n atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps )
            dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """
        Return dataset.peek, falling back to a generic label if reading it fails.
        """
        try:
            return dataset.peek
        except Exception:
            # Best effort, but do not swallow KeyboardInterrupt / SystemExit.
            return "GMAP index file"

    def sniff( self, filename ):
        """
        Always returns False: this datatype is never detected by sniffing.
        """
        return False

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Scan dataset.extra_files_path and fill in the metadata
        (db_name, basesize, kmers, map_dir, maps, snps, cmet, atoi).

        Expecting:
        extra_files_path/<db_name>/<db_name>.ref<basesize><kmer>3<index>
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs
        index maps:
        extra_files_path/db_name/db_name.maps/*.iit

        Only the "...3positions" files are inspected. ``overwrite`` and
        ``kwd`` are accepted but not used.
        """
        efp = dataset.extra_files_path
        log.info( "GmapDB set_meta %s %s", dataset, efp )
        # Build fresh lists and assign them, rather than appending to whatever
        # dataset.metadata.maps / .snps return: the declared default [''] is a
        # single list object, so in-place appends risk leaking into it
        # (NOTE(review): depends on the metadata getter - confirm), and
        # re-running set_meta would keep accumulating duplicates.
        # The leading '' mirrors the declared default / no_value entry.
        maps = ['']
        snps = ['']
        for i,fname in enumerate( os.listdir( efp ) ):
            log.info( "GmapDB set_meta %s %s", i, fname )
            fpath = os.path.join( efp, fname )
            if not os.path.isdir( fpath ):
                continue
            kmers = {'':'default'} # HACK  '' empty key  added so user has default choice when selecting kmer from metadata
            for j,iname in enumerate( os.listdir( fpath ) ):
                log.info( "GmapDB set_meta file %s %s", j, iname )
                ipath = os.path.join( fpath, iname )
                if os.path.isdir( ipath ):  # find maps
                    dataset.metadata.map_dir = iname
                    for mapfile in os.listdir( ipath ):
                        # Strip only a trailing '.iit', not every occurrence.
                        if mapfile.endswith( '.iit' ):
                            mapname = mapfile[:-len( '.iit' )]
                        else:
                            mapname = mapfile
                        log.info( "GmapDB set_meta map %s %s", mapname, mapfile )
                        maps.append( mapname )
                    continue
                m = self._positions_re.match( iname )
                if not m:
                    continue
                log.info( "GmapDB set_meta m %s %s ", iname, m )
                dataset.metadata.db_name = fname
                if m.group( 3 ) == 'ref':
                    snp_name = m.group( 10 )
                    if snp_name is not None:
                        # A suffix after "3positions." names a SNP index.
                        if snp_name not in snps:
                            snps.append( snp_name )
                    else:
                        if m.group( 8 ) is not None:
                            k = int( m.group( 8 ) )
                            kmers[k] = k
                        if m.group( 7 ) is not None:
                            dataset.metadata.basesize = int( m.group( 7 ) )
                elif m.group( 4 ) == 'met':
                    dataset.metadata.cmet = True
                elif m.group( 5 ) == 'a2i':
                    dataset.metadata.atoi = True
            # list(): a real list on Python 3 as well, where keys() is a view.
            dataset.metadata.kmers = list( kmers )
        dataset.metadata.maps = maps
        dataset.metadata.snps = snps

##  class IntervalIndexTree( Text ):
##      """
##      A GMAP Interval Index Tree Map
##      created by iit_store
##      (/path/to/map)/(mapname).iit
##      """
##      MetadataElement( name="map_name", desc="The map name for this index set", default='unknown', set_in_upload=True, readonly=False )
##      file_ext = 'iit'
##      is_binary = True
##      composite_type = 'auto_primary_file'
##      allow_datatype_change = False
##  
##  class IntervalAnnotation(data.Text):
##      """
##      Class describing a GMAP Interval format:
##          >label coords optional_tag
##          optional_annotation (which may be zero, one, or multiple lines)
##      The coords should be of the form:
##          chr:position
##          chr:startposition..endposition
##      """
##      file_ext = 'gmapannotation'
##  
##  class SpliceSiteAnnotation(IntervalAnnotation):
##      file_ext = 'gmapsplicesites'
##      """
##      Example:
##          >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678
##          >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678
##          >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179
##          >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179
##          >NM_004449.ERG.exon1 21:38955452..38955451 donor 783
##          >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783
##          >NM_004449.ERG.exon2 21:38878638..38878637 donor 360
##          >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360
##      """
##  
##  class IntronAnnotation(IntervalAnnotation):
##      file_ext = 'gmapintrons'
##      """
##      Example:
##          >NM_004448.ERBB2.intron1 17:35110090..35116769
##          >NM_004448.ERBB2.intron2 17:35116920..35118100
##          >NM_004449.ERG.intron1 21:38955452..38878739
##          >NM_004449.ERG.intron2 21:38878638..38869541
##      """
##  
##  class SNPAnnotation(IntervalAnnotation):
##      file_ext = 'gmapsnps'
##      """
##      Example:
##          >rs62211261 21:14379270 CG
##          >rs62211262 21:14379281 CG
##      """