# Provenance (from hg changeset header): user diego, 2012-04-21,
# node d50638ebd809 -- uploaded rtg.py (RTG Galaxy datatypes).
"""
rtg datatypes

Galaxy datatype classes for Real Time Genomics (RTG) tools:
  * Sdf   -- composite datatype wrapping an RTG SDF directory
  * Cgtsv -- Complete Genomics TSV read files
  * Samix -- bgzipped SAM with a tabix index stored as metadata
"""

import data
from galaxy.datatypes import sequence
import logging, os, sys, time, tempfile, shutil, string, glob, re, subprocess
import galaxy.model
from galaxy.datatypes import metadata
from galaxy.datatypes.metadata import MetadataElement
from galaxy import util
from galaxy.datatypes.images import Html
from galaxy.datatypes.sequence import Sequence
from galaxy.datatypes.binary import Binary
from sniff import *
from pprint import pprint
from ConfigParser import ConfigParser

log = logging.getLogger(__name__)
basepath = os.path.dirname(__file__)
# Path to the rtg tool wrapper config, relative to this datatypes file.
rtgcfg = os.path.abspath(os.path.join(basepath, "..", "..", "..", "tools", "rtg", "rtg-galaxy.cfg"))


class FakeSecHead(object):
    """File-object wrapper that injects a fake '[asection]' INI section
    header so ConfigParser can parse a section-less config file."""

    def __init__(self, fp):
        self.fp = fp
        self.sechead = '[asection]\n'

    def readline(self):
        # Serve the fake header exactly once, then delegate to the real file.
        if self.sechead:
            try:
                return self.sechead
            finally:
                self.sechead = None
        return self.fp.readline()


cfg = ConfigParser()
cfg.readfp(FakeSecHead(open(rtgcfg)))


class Sdf(Html):
    """Composite datatype for an RTG SDF (sequence data file) directory."""

    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'sdf'

    MetadataElement(name="sdfId", desc="SDF Id", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="source", desc="Source", readonly="true",
                    values=[('UNKNOWN', 'Unknown'), ('CG', 'Complete Genomics'), ('SOLEXA', 'Solexa')],
                    param=metadata.SelectParameter)
    MetadataElement(name="sequences", desc="Number of Sequences", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="hasQuality", desc="Has Quality", readonly="true",
                    values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="type", desc="Type", readonly="true",
                    values=[('DNA', 'DNA'), ('PROTEIN', 'Protein')], param=metadata.SelectParameter)
    MetadataElement(name="paired", desc="Paired-End", readonly="true",
                    values=[('FALSE', 'False'), ('TRUE', 'True')], param=metadata.SelectParameter)
    MetadataElement(name="maxLength", desc="Maximum sequence length", readonly="true", param=metadata.MetadataParameter)
    MetadataElement(name="minLength", desc="Minimum sequence length", readonly="true", param=metadata.MetadataParameter)

    def __init__(self, **kwd):
        Html.__init__(self, **kwd)
        log.debug("Rtg log info %s" % ' __init__')
        # Declare the files that make up an SDF directory.
        self.add_composite_file('format.log', mimetype='text/plain', description='Log', substitute_name_with_metadata=None, is_binary=False)
        self.add_composite_file('done', mimetype='text/plain', description='Completion', substitute_name_with_metadata=None, is_binary=False)
        self.add_composite_file('progress', mimetype='text/plain', description='Progress', substitute_name_with_metadata=None, is_binary=False)
        self.add_composite_file('mainIndex', mimetype='application/octet-stream', description='Index', substitute_name_with_metadata=None, is_binary=True)
        self.add_composite_file('nameIndex0', mimetype='application/octet-stream', description='Index', substitute_name_with_metadata=None, is_binary=True)
        self.add_composite_file('namedata0', mimetype='application/octet-stream', description='Index', substitute_name_with_metadata=None, is_binary=True)
        self.add_composite_file('namepointer0', mimetype='application/octet-stream', description='Index', substitute_name_with_metadata=None, is_binary=True)
        self.add_composite_file('seqdata0', mimetype='application/octet-stream', description='Index', substitute_name_with_metadata=None, is_binary=True)
        self.add_composite_file('seqpointer0', mimetype='application/octet-stream', description='Index', substitute_name_with_metadata=None, is_binary=True)

    def generate_primary_file(self, dataset=None):
        """Return the initial HTML primary file for the composite dataset.

        NOTE(review): the original HTML markup in these string literals was
        destroyed when the patch was transferred; the markup below follows the
        standard Galaxy composite-datatype primary-file layout -- confirm
        against the upstream repository.
        """
        log.debug("Rtg log info %s %s" % ('generate_primary_file', dataset))
        rval = ['<html><head><title>RTG SDF Dataset</title></head><p/>']
        rval.append('<div>This SDF dataset is composed of the following files:<p/></div></html>')
        return "\n".join(rval)

    def regenerate_primary_file(self, dataset):
        """
        Rewrite the primary HTML file listing the dataset's component files.
        Cannot do this until we are setting metadata.

        NOTE(review): original markup lost in transit (see
        generate_primary_file); the file list loop is reconstructed from the
        computed-but-otherwise-unused ``flist`` -- confirm against upstream.
        """
        log.debug("Rtg log info %s %s" % ('regenerate_primary_file', dataset))
        bn = dataset.metadata.base_name
        flist = os.listdir(dataset.extra_files_path)
        rval = ['<html><head><title>Files for RTG SDF Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % bn]
        for fname in flist:
            rval.append('<li><a href="%s">%s</a></li>' % (fname, fname))
        rval.append('</ul></html>')
        f = open(dataset.file_name, 'w')
        f.write("\n".join(rval))
        f.write('\n')
        f.close()

    def set_meta(self, dataset, **kwd):
        """Populate SDF metadata by parsing ``rtg sdfstats`` output.

        A 'left' subdirectory marks a paired-end SDF; stats are then read
        from the left arm only.
        """
        Html.set_meta(self, dataset, **kwd)
        self.regenerate_primary_file(dataset)
        if os.path.isdir(dataset.extra_files_path + '/left'):
            sdfDir = dataset.extra_files_path + '/left'
            dataset.metadata.paired = 'TRUE'
        else:
            sdfDir = dataset.extra_files_path
            dataset.metadata.paired = 'FALSE'
        p = os.popen(cfg.get('asection', 'rtg') + ' sdfstats ' + sdfDir, "r")
        for line in p:
            if line.startswith('SDF-ID'):
                dataset.metadata.sdfId = line.split(':', 1)[1].strip()
            elif line.startswith('Number of sequences'):
                dataset.metadata.sequences = line.split(':', 1)[1].strip()
            elif line.startswith('Type'):
                dataset.metadata.type = line.split(':', 1)[1].strip()
            elif line.startswith('Source'):
                dataset.metadata.source = line.split(':', 1)[1].strip()
            elif line.startswith('Quality scores available'):
                dataset.metadata.hasQuality = 'TRUE'
            elif line.startswith('Maximum length'):
                dataset.metadata.maxLength = line.split(':', 1)[1].strip()
            elif line.startswith('Minimum length'):
                dataset.metadata.minLength = line.split(':', 1)[1].strip()
        p.close()
        # sdfstats only prints the quality line when qualities exist.
        if dataset.metadata.hasQuality != 'TRUE':
            dataset.metadata.hasQuality = 'FALSE'


class Cgtsv(Sequence):
    """Class representing a generic CG TSV sequence"""
    file_ext = "tsvcg"

    def set_meta(self, dataset, **kwd):
        """
        Set the number of sequences and the number of data lines
        in dataset.  Skipped (sequences metadata set to None) when the
        file exceeds the optional-metadata size limit.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.sequences = None
            return
        sequences = 0
        for line in open(dataset.file_name):
            line = line.strip()
            # Skip blank lines and comment/header lines -- they are not
            # sequence data.
            if not line or line.startswith('#') or line.startswith('>'):
                continue
            sequences += 1
        dataset.metadata.sequences = sequences

    def sniff(self, filename):
        """
        Determines whether the file is in CG TSV format
        For details, see http://media.completegenomics.com/documents/DataFileFormats.pdf
        """
        # Anchored at both ends: the field must consist solely of bases.
        # (The original pattern lacked the trailing '$' and therefore
        # matched every string via a zero-width match.)
        bases_regexp = re.compile("^[NGTAC]*$")
        headers = get_headers(filename, '\t')
        try:
            count = 0
            if len(headers) < 2:
                return False
            for hdr in headers:
                if len(hdr) > 1 and hdr[0]:
                    if hdr[0].startswith('#'):
                        continue
                    if len(hdr) != 3:
                        return False
                    if hdr[0].startswith('>'):
                        # Header row must be ">flags reads scores".
                        if hdr[0] != ">flags":
                            return False
                        if hdr[1] != "reads":
                            return False
                    else:
                        # Data row: integer flags column, then bases.
                        try:
                            int(hdr[0])
                            if not bases_regexp.match(hdr[1]):
                                return False
                        except ValueError:
                            return False
                    count += 1
                    if count >= 5:
                        return True
            # Do other necessary checking here...
        except Exception:
            return False
        # If we haven't yet returned False, then...
        return True


class Samix(Binary):
    """Class describing a tabix-ed SAM file"""
    file_ext = "sam.gz"
    MetadataElement(name="sam_index", desc="SAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True)

    def init_meta(self, dataset, copy_from=None):
        Binary.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, **kwd):
        """Creates the tabix index for the SAM file and stores it in the
        'sam_index' metadata file.

        Raises Exception when ``rtg index`` exits non-zero; non-fatal
        stderr output is printed and indexing continues.
        """
        # These metadata values are not accessible by users, always overwrite.
        index_file = dataset.metadata.sam_index
        if not index_file:
            index_file = dataset.metadata.spec['sam_index'].param.new_file(dataset=dataset)
        # Collect stderr in a named temp file so we can report it.
        stderr_name = tempfile.NamedTemporaryFile(prefix="sam_index_stderr").name
        command = cfg.get('asection', 'rtg') + (' index -f sam %s' % dataset.file_name)
        stderr_fh = open(stderr_name, 'wb')
        try:
            proc = subprocess.Popen(args=command, shell=True, stderr=stderr_fh)
            exit_code = proc.wait()
        finally:
            stderr_fh.close()
        stderr = open(stderr_name).read().strip()
        os.unlink(stderr_name)  # clean up on every path
        # Did index succeed?
        if stderr:
            if exit_code != 0:
                # (Original code called f.close() on an undefined name here.)
                raise Exception("Error Setting tabix-ed SAM Metadata: %s" % stderr)
            else:
                print(stderr)
        # rtg index writes <file>.tbi next to the input; move it into the
        # managed metadata file.
        shutil.move(dataset.file_name + '.tbi', index_file.file_name)
        dataset.metadata.sam_index = index_file

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            dataset.peek = "Tabix-ed sam alignments file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        try:
            return dataset.peek
        except Exception:
            return "Tabix-ed sam alignments file (%s)" % (data.nice_size(dataset.get_size()))


if __name__ == '__main__':
    # Moved to module level: the mangled patch showed this guard indented
    # inside the Sdf class body, where it would run at class-creation time.
    import doctest
    doctest.testmod(sys.modules[__name__])