Mercurial > repos > vipints > deseq_hts
diff dexseq-hts_1.0/src/dexseq_prepare_annotation.py @ 11:cec4b4fb30be draft default tip
DEXSeq version 1.6 added
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Tue, 08 Oct 2013 08:22:45 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/src/dexseq_prepare_annotation.py Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,131 @@ +import sys, collections, itertools, os.path + +if len( sys.argv ) != 3: + sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" ) + sys.stderr.write( "Usage: python %s <in.gtf> <out.gff>\n\n" % os.path.basename(sys.argv[0]) ) + sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" ) + sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" ) + sys.stderr.write( "with the count_in_exons.py script.\n" ) + sys.exit(1) + +try: + import HTSeq +except ImportError: + sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) + sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) + sys.exit(1) + +gtf_file = sys.argv[1] +out_file = sys.argv[2] + +## make sure that it can handle GFF files. +parent_child_map = dict() +for feature in HTSeq.GFF_Reader( gtf_file ): + if feature.type in ['mRNA', + 'transcript', + 'ncRNA', + 'miRNA', + 'pseudogenic_transcript', + 'rRNA', + 'snoRNA', + 'snRNA', + 'tRNA', + 'scRNA']: + parent_child_map[feature.attr['ID']] = feature.attr['Parent'] + +# Step 1: Store all exons with their gene and transcript ID +# in a GenomicArrayOfSets + +exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True ) +for f in HTSeq.GFF_Reader( gtf_file ): + if not f.type in ["exon", "pseudogenic_exon"]: + continue + if not f.attr.get('gene_id'): + f.attr['gene_id'] = parent_child_map[f.attr['Parent']] + f.attr['transcript_id'] = f.attr['Parent'] + f.attr['gene_id'] = f.attr['gene_id'].replace( ":", "_" ) + exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] ) + +# Step 2: Form sets of overlapping genes + +# We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set +# contains IDs of genes that overlap, i.e., share bases (on the same strand). +# The keys of 'gene_sets' are the IDs of all genes, and each key refers to +# the set that contains the gene. +# Each gene set forms an 'aggregate gene'. + +gene_sets = collections.defaultdict( lambda: set() ) +for iv, s in exons.steps(): + # For each step, make a set, 'full_set' of all the gene IDs occuring + # in the present step, and also add all those gene IDs, whch have been + # seen earlier to co-occur with each of the currently present gene IDs. + full_set = set() + for gene_id, transcript_id in s: + full_set.add( gene_id ) + full_set |= gene_sets[ gene_id ] + # Make sure that all genes that are now in full_set get associated + # with full_set, i.e., get to know about their new partners + for gene_id in full_set: + assert gene_sets[ gene_id ] <= full_set + gene_sets[ gene_id ] = full_set + + +# Step 3: Go through the steps again to get the exonic sections. Each step +# becomes an 'exonic part'. The exonic part is associated with an +# aggregate gene, i.e., a gene set as determined in the previous step, +# and a transcript set, containing all transcripts that occur in the step. +# The results are stored in the dict 'aggregates', which contains, for each +# aggregate ID, a list of all its exonic_part features. + +aggregates = collections.defaultdict( lambda: list() ) +for iv, s in exons.steps( ): + # Skip empty steps + if len(s) == 0: + continue + # Take one of the gene IDs, find the others via gene sets, and + # form the aggregate ID from all of them + gene_id = list(s)[0][0] + assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ] + aggregate_id = '+'.join( gene_sets[ gene_id ] ) + # Make the feature and store it in 'aggregates' + f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv ) + f.source = os.path.basename( sys.argv[1] ) + f.attr = {} + f.attr[ 'gene_id' ] = aggregate_id + transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) ) + f.attr[ 'transcripts' ] = '+'.join( transcript_set ) + aggregates[ aggregate_id ].append( f ) + + +# Step 4: For each aggregate, number the exonic parts + +aggregate_features = [] +for l in aggregates.values(): + for i in xrange( len(l)-1 ): + assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name" + assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early" + if l[i].iv.chrom != l[i+1].iv.chrom: + raise ValueError, "Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) ) + if l[i].iv.strand != l[i+1].iv.strand: + raise ValueError, "Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) ) + aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene", + HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start, + l[-1].iv.end, l[0].iv.strand ) ) + aggr_feat.source = os.path.basename( sys.argv[1] ) + aggr_feat.attr = { 'gene_id': aggr_feat.name } + for i in xrange( len(l) ): + l[i].attr['exonic_part_number'] = "%03d" % ( i+1 ) + aggregate_features.append( aggr_feat ) + + +# Step 5: Sort the aggregates, then write everything out + +aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) ) + +fout = open( out_file, "w" ) +for aggr_feat in aggregate_features: + fout.write( aggr_feat.get_gff_line() ) + for f in aggregates[ aggr_feat.name ]: + fout.write( f.get_gff_line() ) + +fout.close()