comparison: dexseq-hts_1.0/src/dexseq_prepare_annotation.py @ 11:cec4b4fb30be (draft, default, tip)
DEXSeq version 1.6 added
| author | vipints <vipin@cbio.mskcc.org> |
|---|---|
| date | Tue, 08 Oct 2013 08:22:45 -0400 |
| parents | |
| children | |
comparing changesets 10:2fe512c7bfdf and 11:cec4b4fb30be
```python
import sys, collections, itertools, os.path

if len( sys.argv ) != 3:
    sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" )
    sys.stderr.write( "Usage: python %s <in.gtf> <out.gff>\n\n" % os.path.basename(sys.argv[0]) )
    sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" )
    sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" )
    sys.stderr.write( "with the count_in_exons.py script.\n" )
    sys.exit(1)

try:
    import HTSeq
except ImportError:
    sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" )
    sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" )
    sys.exit(1)

gtf_file = sys.argv[1]
out_file = sys.argv[2]

## make sure that it can handle GFF files.
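## (GFF3 input carries the gene ID on the transcript-level feature rather than on
## the exon itself; parent_child_map records transcript ID -> gene ID so that exons
## lacking a 'gene_id' attribute can be resolved below via their 'Parent' attribute.)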
parent_child_map = dict()
for feature in HTSeq.GFF_Reader( gtf_file ):
    if feature.type in ['mRNA',
                        'transcript',
                        'ncRNA',
                        'miRNA',
                        'pseudogenic_transcript',
                        'rRNA',
                        'snoRNA',
                        'snRNA',
                        'tRNA',
                        'scRNA']:
        parent_child_map[feature.attr['ID']] = feature.attr['Parent']

# Step 1: Store all exons with their gene and transcript ID
# in a GenomicArrayOfSets
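# (A GenomicArrayOfSets partitions each chromosome into 'steps': maximal
# intervals over which the assigned set of (gene_id, transcript_id) pairs is
# constant. Steps 2 and 3 below iterate over exactly these steps.)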

exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True )
for f in HTSeq.GFF_Reader( gtf_file ):
    if not f.type in ["exon", "pseudogenic_exon"]:
        continue
    if not f.attr.get('gene_id'):
        f.attr['gene_id'] = parent_child_map[f.attr['Parent']]
        f.attr['transcript_id'] = f.attr['Parent']
    f.attr['gene_id'] = f.attr['gene_id'].replace( ":", "_" )
    exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] )

# Step 2: Form sets of overlapping genes

# We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set
# contains IDs of genes that overlap, i.e., share bases (on the same strand).
# The keys of 'gene_sets' are the IDs of all genes, and each key refers to
# the set that contains the gene.
# Each gene set forms an 'aggregate gene'.

gene_sets = collections.defaultdict( lambda: set() )
for iv, s in exons.steps():
    # For each step, make a set, 'full_set' of all the gene IDs occurring
    # in the present step, and also add all those gene IDs, which have been
    # seen earlier to co-occur with each of the currently present gene IDs.
    full_set = set()
    for gene_id, transcript_id in s:
        full_set.add( gene_id )
        full_set |= gene_sets[ gene_id ]
    # Make sure that all genes that are now in full_set get associated
    # with full_set, i.e., get to know about their new partners
    for gene_id in full_set:
        assert gene_sets[ gene_id ] <= full_set
        gene_sets[ gene_id ] = full_set


# Step 3: Go through the steps again to get the exonic sections. Each step
# becomes an 'exonic part'. The exonic part is associated with an
# aggregate gene, i.e., a gene set as determined in the previous step,
# and a transcript set, containing all transcripts that occur in the step.
# The results are stored in the dict 'aggregates', which contains, for each
# aggregate ID, a list of all its exonic_part features.

aggregates = collections.defaultdict( lambda: list() )
for iv, s in exons.steps( ):
    # Skip empty steps
    if len(s) == 0:
        continue
    # Take one of the gene IDs, find the others via gene sets, and
    # form the aggregate ID from all of them
    gene_id = list(s)[0][0]
    assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ]
    aggregate_id = '+'.join( gene_sets[ gene_id ] )
    # Make the feature and store it in 'aggregates'
    f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv )
    f.source = os.path.basename( sys.argv[1] )
    f.attr = {}
    f.attr[ 'gene_id' ] = aggregate_id
    transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) )
    f.attr[ 'transcripts' ] = '+'.join( transcript_set )
    aggregates[ aggregate_id ].append( f )


# Step 4: For each aggregate, number the exonic parts

aggregate_features = []
for l in aggregates.values():
    for i in xrange( len(l)-1 ):
        assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name"
        assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early"
        if l[i].iv.chrom != l[i+1].iv.chrom:
            raise ValueError, "Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) )
        if l[i].iv.strand != l[i+1].iv.strand:
            raise ValueError, "Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) )
    aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene",
                                      HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start,
                                                             l[-1].iv.end, l[0].iv.strand ) )
    aggr_feat.source = os.path.basename( sys.argv[1] )
    aggr_feat.attr = { 'gene_id': aggr_feat.name }
    for i in xrange( len(l) ):
        l[i].attr['exonic_part_number'] = "%03d" % ( i+1 )
    aggregate_features.append( aggr_feat )


# Step 5: Sort the aggregates, then write everything out

aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) )

fout = open( out_file, "w" )
for aggr_feat in aggregate_features:
    fout.write( aggr_feat.get_gff_line() )
    for f in aggregates[ aggr_feat.name ]:
        fout.write( f.get_gff_line() )

fout.close()
```
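The heart of the script is Step 2, where genes whose exons share bases on the same strand are pooled into a single "aggregate gene". The following standalone sketch reproduces that pooling on made-up coordinates without the HTSeq dependency; the gene IDs, intervals, and the overlaps() helper are hypothetical and purely illustrative.

```python
import collections

# Hypothetical exons: (chrom, start, end, strand, gene_id) -- toy data only.
toy_exons = [
    ("2L", 100, 200, "+", "geneA"),
    ("2L", 150, 300, "+", "geneB"),   # shares bases with geneA on the same strand
    ("2L", 500, 600, "+", "geneC"),   # overlaps nothing
]

def overlaps(a, b):
    """True if two toy exons share at least one base on the same chrom and strand."""
    return a[0] == b[0] and a[3] == b[3] and a[1] < b[2] and b[1] < a[2]

# Same idea as Step 2 of the script: every gene ID maps to the set of all gene
# IDs it overlaps with, directly or through a chain of overlaps.
gene_sets = collections.defaultdict(set)
for exon in toy_exons:
    gene_sets[exon[4]].add(exon[4])
for i, x in enumerate(toy_exons):
    for y in toy_exons[i + 1:]:
        if overlaps(x, y):
            # Merge the two genes' sets and hand the merged set to every member.
            full_set = gene_sets[x[4]] | gene_sets[y[4]]
            for gene_id in full_set:
                gene_sets[gene_id] = full_set

for exon in toy_exons:
    aggregate_id = "+".join(sorted(gene_sets[exon[4]]))
    print("%s:%d-%d (%s) -> aggregate %s" % (exon[0], exon[1], exon[2], exon[4], aggregate_id))
# geneA and geneB end up in the aggregate 'geneA+geneB'; geneC stays on its own.
```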
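The script itself targets Python 2 (note `xrange` and the old-style `raise ValueError, ...` statements) and is invoked as `python dexseq_prepare_annotation.py <in.gtf> <out.gff>`, as stated in its usage message. The flattened GFF it writes contains, for every aggregate gene, one `aggregate_gene` feature spanning the whole gene set, followed by its `exonic_part` features in genomic order, each annotated with the aggregate `gene_id` (e.g. `geneA+geneB` in the toy example above), a `transcripts` attribute listing all transcripts covering that part, and a three-digit `exonic_part_number` (`001`, `002`, ...). This is the annotation expected by the accompanying exon-counting script (`count_in_exons.py`).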
