Mercurial > repos > vipints > fml_mergeloci
comparison fml_gff_groomer/scripts/gff_loci_merge.py @ 0:79726c328621 default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
| author | vipints |
|---|---|
| date | Tue, 07 Jun 2011 17:29:24 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:79726c328621 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # | |
| 3 # This program is free software; you can redistribute it and/or modify | |
| 4 # it under the terms of the GNU General Public License as published by | |
| 5 # the Free Software Foundation; either version 3 of the License, or | |
| 6 # (at your option) any later version. | |
| 7 # | |
| 8 # Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society | |
| 9 # Copyright (C) 2010 Friedrich Miescher Laboratory of the Max Planck Society | |
| 10 # | |
| 11 # Description : to merge same transcripts in single loci and define as an alternative spliced form for the gene. | |
| 12 | |
def display_content(final_dict):
    """Print a per-(source, feature-type) count summary for 'gene' features.

    final_dict -- mapping as returned by available_limits(); only the
    'gff_source_type' sub-dict is read, and only entries whose feature
    type is 'gene' are reported.
    """
    # Single-expression parenthesized print is valid in both Python 2 and 3
    # (the original used Python-2-only print statements).
    print("\tUnique combination of Source(s), Feature type(s) and corresponding count:")
    for sftype, cnt in sorted(final_dict['gff_source_type'].items()):
        # sftype is a (source, feature_type) tuple; report gene rows only.
        if sftype[1] == 'gene':
            print('\t' + str(cnt) + '\t' + str(sftype[0]) + ', ' + str(sftype[1]))
| 19 | |
def available_limits(gff_file):
    """Figure out the available feature types from the given GFF file.

    Returns a dict with keys 'gff_id', 'gff_source_type', 'gff_source',
    'gff_type'; each value maps a tuple of column values to its line count.

    Fixes over the original: a blank line used to crash on
    line.strip('\n\r')[0] (IndexError on empty string); the handle is now
    closed even on error via a context manager; the deprecated 'rU' mode
    is dropped (default text mode already handles universal newlines).
    """
    # column indexes contributing to each summary key
    filter_info = dict(gff_id=[0], gff_source_type=[1, 2],
                       gff_source=[1], gff_type=[2])
    cur_limits = dict()
    for filter_key in filter_info:
        cur_limits[filter_key] = collections.defaultdict(int)
    with open(gff_file) as gff_handle:
        for line in gff_handle:
            line = line.strip('\n\r')
            if not line:
                continue  # blank line: original indexed [0] here and crashed
            if line[0] == "#":
                continue  # comment / pragma line
            parts = [p.strip() for p in line.split('\t')]
            if len(parts) == 1 and re.search(r'\w+', parts[0]):
                continue  # GFF files with FASTA sequence together
            assert len(parts) == 9, line
            for filter_key, cur_indexes in filter_info.items():
                cur_id = tuple([parts[i] for i in cur_indexes])
                cur_limits[filter_key][cur_id] += 1
    # get rid of the default dicts
    final_dict = dict()
    for key, value_dict in cur_limits.items():
        if len(key) == 1:
            key = key[0]  # no-op for the keys above; kept for parity with original
        final_dict[key] = dict(value_dict)
    return final_dict
| 44 | |
def GFFWriter(merged_info, genes, transcripts, exons, utr5, cds, utr3, out_file):
    """Write GFF3 file with merged feature description.

    merged_info -- {(chr, source, strand): {(start, stop): [gene ids]}}
    transcripts -- {gene id: [transcript dicts with ID/type/start/stop]}
    exons/utr5/cds/utr3 -- {transcript ID: [feature dicts with start/stop]}
    out_file -- path of the GFF3 file to create (overwritten).

    Output is identical to the original; the six duplicated write
    statements are factored into _write_sub_features and the handle is
    closed via a context manager even on error.
    """
    with open(out_file, 'w') as out_fh:
        for ginfo, regions in merged_info.items():
            chrom, source, strand = ginfo
            gene_cnt = 1  # per-(chr, source, strand) running gene counter
            for interval, features in sorted(regions.items()):
                # master gene feature line
                gene_id = 'Gene_' + chrom + '_' + str(gene_cnt).zfill(5)
                out_fh.write(chrom + '\t' + source + '\tgene\t' + str(interval[0]) + '\t' + str(interval[1]) + '\t.\t' + strand + '\t.\tID=' + gene_id + ';Name=' + gene_id + '\n')
                for geneid in features:
                    # corresponding transcript info
                    if geneid not in transcripts:
                        continue
                    for tinfo in transcripts[geneid]:
                        tid = tinfo['ID']
                        # transcript feature line
                        out_fh.write(chrom + '\t' + source + '\t' + tinfo['type'] + '\t' + str(tinfo['start']) + '\t' + str(tinfo['stop']) + '\t.\t' + strand + '\t.\tID=' + tid + ';Parent=' + gene_id + '\n')
                        # sub-features in the original emission order
                        _write_sub_features(out_fh, chrom, source, strand, tid, 'five_prime_UTR', utr5)
                        _write_sub_features(out_fh, chrom, source, strand, tid, 'CDS', cds)
                        _write_sub_features(out_fh, chrom, source, strand, tid, 'three_prime_UTR', utr3)
                        _write_sub_features(out_fh, chrom, source, strand, tid, 'exon', exons)
                gene_cnt += 1

def _write_sub_features(out_fh, chrom, source, strand, tid, ftype, flookup):
    """Write one GFF line per sub-feature of transcript tid, if any."""
    if tid not in flookup:
        return
    for finfo in flookup[tid]:
        out_fh.write(chrom + '\t' + source + '\t' + ftype + '\t' + str(finfo['start']) + '\t' + str(finfo['stop']) + '\t.\t' + strand + '\t.\tParent=' + tid + '\n')
| 67 | |
def UniqLoci(genes, transcripts, exons):
    """determine unique location where features annotated multiple times

    Clusters gene entries that share (chr, source, strand) into merged
    intervals: {(chr, source, strand): {(start, stop): [gene ids]}}.
    Intervals grow to cover overlapping candidates; single-transcript,
    single-exon candidates are treated as likely non-coding and get their
    own interval instead of being merged.

    NOTE(review): this function deletes keys while iterating
    uniq_loci[gene_info].keys() — safe on Python 2 (keys() is a list) but
    a RuntimeError on Python 3; confirm the target interpreter before
    porting.
    """
    uniq_loci = dict()
    for gid, parts in genes.items():
        gene_info = (parts['chr'], parts['source'], parts['strand'])
        if gene_info in uniq_loci:## same contig, orientation, source: look for merging transcripts based on the nearby location
            if (int(parts['start']), int(parts['stop'])) in uniq_loci[gene_info].keys(): ## similar transcripts will catch here (start and stop are same may be exon, CDS or intron content may vary)
                uniq_loci[gene_info][(int(parts['start']), int(parts['stop']))].append(gid)
            else: # heuristic approach to include closely related region on a single master loci.
                got_a_range = 0  # flag: candidate absorbed into (or placed next to) an existing interval
                for floc in uniq_loci[gene_info].keys():# look whether it lies closely to any intervel which is already defined
                    # NOTE(review): this guard SKIPS intervals whose boundary distance is
                    # < 150, which reads as the opposite of the comment above ("lies
                    # closely") — confirm whether `< 150` was meant to be `> 150`.
                    if (floc[1]-parts['start']) < 150 or (parts['stop']-floc[0]) < 150:continue ## TODO boundary spanning length in same orientation for genes of each species will be great.
                    if floc[0] <= parts['start'] and parts['start'] < floc[1]: # the start of the new candidate is inside of any of the already defined interval ?
                        non_coding = 0
                        try: # check for small transcript whether they belong to a existing one or a new non-coding candidate.
                            # single transcript with a single exon => treat as non-coding candidate
                            if len(transcripts[gid]) == 1:
                                if len(exons[transcripts[gid][0]['ID']]) == 1:non_coding = 1
                            if non_coding == 0:
                                if parts['stop'] > floc[1]:# making global gene coordinate from individual transcript model
                                    entries = uniq_loci[gene_info][floc]
                                    del uniq_loci[gene_info][floc] # remove the existing interval, here we got a longer downstream position from the candidate
                                    entries.append(gid)
                                    uniq_loci[gene_info][(floc[0], parts['stop'])] = entries
                                else:
                                    uniq_loci[gene_info][floc].append(gid)
                            else:# create a new interval for non-coding type entry
                                uniq_loci[gene_info][(parts['start'], parts['stop'])] = [gid]
                            got_a_range = 1
                            break
                        except: # dont have any transcripts or exons defined.
                            # NOTE(review): bare except also aborts the scan of the
                            # remaining intervals, leaving got_a_range == 0 so the
                            # candidate falls through to a fresh interval below.
                            break
                    elif floc[0] < parts['stop'] and parts['stop'] <= floc[1]: # the stop of the new candidate is inside of any of the pre-defined interval ? the candidate seems to be from more upstream
                        non_coding = 0
                        try:
                            if len(transcripts[gid]) == 1:
                                if len(exons[transcripts[gid][0]['ID']]) == 1:non_coding = 1
                            if non_coding == 0:
                                # extend the existing interval upstream to the candidate's start
                                entries = uniq_loci[gene_info][floc]
                                del uniq_loci[gene_info][floc] # remove the existing interval, here we got a upstream position from which the candidate transcribing
                                entries.append(gid)
                                uniq_loci[gene_info][(int(parts['start']), floc[1])] = entries
                            else: # create a new interval for non-coding type entry
                                uniq_loci[gene_info][(parts['start'], parts['stop'])] = [gid]
                            got_a_range = 1
                            break
                        except:
                            break
                    elif floc[0] > parts['start'] and floc[1] < parts['stop']: # whether the whole feature floc region (--) resides in the candidate location (----------) ?
                        non_coding = 0 # here the candidate seems to be longer than the pre-defined interval, check all entries from the pre-defined interval whether it is a small region, any chance as non-coding.
                        try:
                            for features in uniq_loci[gene_info][floc]:
                                if len(transcripts[features]) == 1:
                                    if len(exons[transcripts[features][0]['ID']]) == 1:non_coding = 1
                            if non_coding == 1: # create a new interval for non coding
                                # NOTE(review): the enclosed non-coding interval keeps its
                                # old key while the candidate takes (start, stop) — the two
                                # intervals now overlap; confirm this is intended.
                                uniq_loci[gene_info][(parts['start'], parts['stop'])] = [gid]
                            else: # append the existing transcript cluster, here change the interval position based on the candidate location
                                entries = uniq_loci[gene_info][floc]
                                del uniq_loci[gene_info][floc] # remove the existing interval, here we got a longer upstream and downstream region.
                                entries.append(gid)
                                uniq_loci[gene_info][(parts['start'], parts['stop'])] = entries
                            got_a_range = 1
                            break
                        except:
                            break
                ## or create a new interval ??
                if got_a_range == 0:uniq_loci[gene_info][(parts['start'], parts['stop'])] = [gid]
        else:
            # first candidate for this (chr, source, strand): open a new interval map
            uniq_loci[gene_info] = {(int(parts['start']), int(parts['stop'])): [gid]}

    return uniq_loci
| 139 | |
# transcript-level feature types recognised by ParseGFF
_TRANSCRIPT_TYPES = ('mRNA', 'transcript', 'ncRNA', 'tRNA', 'snRNA', 'scRNA',
                     'snoRNA', 'snlRNA', 'rRNA', 'miRNA')

def _base_feature(parts):
    """Build the common start/stop/chr/strand dict from one 9-column GFF row."""
    return {'start': int(parts[3]),
            'stop': int(parts[4]),
            'chr': parts[0],
            'strand': parts[6]}

def _parse_attributes(info, attr_column, id_key):
    """Merge key=value attributes from GFF column 9 into info.

    The attribute named id_key ('ID' or 'Parent') is returned instead of
    being stored; all other attributes are written into info. Returns ''
    when id_key is absent (matching the original behaviour).
    """
    fid = ''
    for attr in attr_column.split(';'):
        if attr == '':
            continue  # GFF line may end with a ';' symbol
        attr = attr.split('=')
        if attr[0] == id_key:
            fid = attr[1]
            continue
        info[attr[0]] = attr[1]
    return fid

def ParseGFF(gff_file):
    """feature extraction from provided GFF file

    Returns (genes, transcripts, exons, utr5, cds, utr3):
      genes       -- {gene ID: feature dict (+ 'source')}
      transcripts -- {parent gene ID: [feature dicts (+ 'type')]}
      exons/utr5/cds/utr3 -- {parent transcript ID: [feature dicts]}

    Same behaviour as the original six copy-pasted branches, factored
    through _base_feature/_parse_attributes; the handle is closed via a
    context manager and the deprecated 'rU' mode is dropped.
    """
    genes, transcripts, exons, utr5, cds, utr3 = dict(), dict(), dict(), dict(), dict(), dict()
    # dispatch table for the four sub-feature types sharing identical parsing
    sub_feature_maps = {'exon': exons, 'five_prime_UTR': utr5,
                        'CDS': cds, 'three_prime_UTR': utr3}
    with open(gff_file) as gff_handle:
        for gff_line in gff_handle:
            parts = gff_line.strip('\n\r').split('\t')
            if gff_line[0] == '#' or gff_line[0] == '>':
                continue
            if len(parts) == 1:
                continue  ## Some centers in the world create GFF files with FASTA sequence together
            if len(parts) != 9:
                sys.stdout.write('Warning: Found invalid GFF line\n' + gff_line.strip('\n\r') + '\n')
                continue
            if parts[3] == '' or parts[4] == '':
                sys.stdout.write('Warning: Found missing coordinate in GFF line\n' + gff_line.strip('\n\r') + '\n')
                continue
            ftype = parts[2]
            if ftype == 'gene':
                gene_info = _base_feature(parts)
                gene_info['source'] = parts[1]
                gid = _parse_attributes(gene_info, parts[-1], 'ID')
                if gid != '':
                    genes[gid] = gene_info
            elif ftype in _TRANSCRIPT_TYPES:
                mrna_info = _base_feature(parts)
                mrna_info['type'] = ftype
                gid = _parse_attributes(mrna_info, parts[-1], 'Parent')
                transcripts.setdefault(gid, []).append(mrna_info)
            elif ftype in sub_feature_maps:
                feat_info = _base_feature(parts)
                tid = _parse_attributes(feat_info, parts[-1], 'Parent')
                sub_feature_maps[ftype].setdefault(tid, []).append(feat_info)
    return genes, transcripts, exons, utr5, cds, utr3
| 248 | |
| 249 import re, sys | |
| 250 import time | |
| 251 import collections | |
| 252 | |
if __name__=='__main__':

    # Script entry: read a GFF3 file, merge co-located gene loci, write a
    # merged GFF3 result, and print before/after feature summaries.
    # Single-expression parenthesized print works on both Python 2 and 3.
    stime = time.asctime( time.localtime(time.time()) )
    print('-------------------------------------------------------')
    print('MergeLoci started on ' + stime)
    print('-------------------------------------------------------')
    try:
        gff_file = sys.argv[1]
        out_file = sys.argv[2]
    except IndexError:  # was a bare except:; only missing CLI arguments belong here
        sys.stderr.write("Missing GFF3 file, result file. terminating...\n")
        sys.stderr.write("USAGE: gff_loci_merge.py <gff file> <result file>\n")
        sys.exit(-1)
    print('--------')
    # hide the Galaxy install path in user-facing output
    print('Level: 1- ' + 'Reading GFF file: ' + re.sub(r'/home/galaxy/galaxy-2.1.2009', r'GALAXYDIR', gff_file))
    print('--------')
    print('--------')
    print('Level: 2- ' + 'BEFORE processing, Merging feature distribution in GFF file')
    print('--------')
    # initial feature distribution in file
    final_dict = available_limits(gff_file)
    display_content(final_dict)
    # determine the whole content from GFF file
    genes, transcripts, exons, utr5, cds, utr3 = ParseGFF(gff_file)
    print('--------')
    print('Level: 3- ' + 'Start merging feature(s) from similar locations...')
    print('--------')
    # determine the same gene loci on specific chromosome based on the same source
    merged_regions = UniqLoci(genes, transcripts, exons)
    print('\tDone.')
    print('--------')
    print('Level: 4- ' + 'Writing merged feature annotation to GFF format...')
    print('--------')
    # write new GFF file with merged loci information for gene feature
    GFFWriter(merged_regions, genes, transcripts, exons, utr5, cds, utr3, out_file)
    print('\tDone.')
    # after processing display the feature distribution in the result file
    print('--------')
    print('Level: 5- ' + 'Merged feature(s) summary from GFF file')
    print('--------')
    final_dict = available_limits(out_file)
    display_content(final_dict)
    print('')
    print('\tMerged result file: ' + re.sub(r'/home/galaxy/galaxy-2.1.2009', r'GALAXYDIR', out_file))
    stime = time.asctime( time.localtime(time.time()) )
    print('-------------------------------------------------------')
    print('MergeLoci finished at ' + stime)
    print('-------------------------------------------------------')
