Mercurial > repos > earlhaminst > gff3_to_json
comparison gff3_to_json.py @ 1:befe6021e476 draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147
| author | earlhaminst |
|---|---|
| date | Tue, 28 Feb 2017 12:06:04 -0500 |
| parents | be6cec883b02 |
| children |
comparison
equal
deleted
inserted
replaced
| 0:be6cec883b02 | 1:befe6021e476 |
|---|---|
| 2 | 2 |
| 3 import json | 3 import json |
| 4 import optparse | 4 import optparse |
| 5 import sys | 5 import sys |
| 6 | 6 |
| 7 cds_parent_dict = dict() | |
| 8 exon_parent_dict = dict() | |
| 9 five_prime_utr_parent_dict = dict() | |
| 10 gene_count = 0 | 7 gene_count = 0 |
| 11 gene_dict = dict() | 8 |
| 12 transcript_dict = dict() | 9 |
| 13 three_prime_utr_parent_dict = dict() | 10 def remove_type_from_list_of_ids(l): |
| 14 | 11 return ','.join(remove_type_from_id(_) for _ in l.split(',')) |
| 15 | 12 |
| 16 def feature_to_json(cols): | 13 |
| 14 def remove_type_from_id(id_): | |
| 15 colon_index = id_.find(':') | |
| 16 if colon_index >= 0: | |
| 17 return id_[colon_index + 1:] | |
| 18 else: | |
| 19 return id_ | |
| 20 | |
| 21 | |
| 22 def feature_to_dict(cols, parent_dict=None): | |
| 17 d = { | 23 d = { |
| 18 'end': int(cols[4]), | 24 'end': int(cols[4]), |
| 19 'start': int(cols[3]), | 25 'start': int(cols[3]), |
| 20 } | 26 } |
| 21 for attr in cols[8].split(';'): | 27 for attr in cols[8].split(';'): |
| 22 if '=' in attr: | 28 if '=' in attr: |
| 23 (tag, value) = attr.split('=') | 29 (tag, value) = attr.split('=') |
| 24 if tag == 'ID': | 30 if tag == 'ID': |
| 25 d['id'] = value | 31 tag = 'id' |
| 26 else: | 32 value = remove_type_from_id(value) |
| 27 d[tag] = value | 33 elif tag == 'Parent': |
| 34 value = remove_type_from_list_of_ids(value) | |
| 35 d[tag] = value | |
| 28 if cols[6] == '+': | 36 if cols[6] == '+': |
| 29 d['strand'] = 1 | 37 d['strand'] = 1 |
| 30 elif cols[6] == '-': | 38 elif cols[6] == '-': |
| 31 d['strand'] = -1 | 39 d['strand'] = -1 |
| 32 else: | 40 else: |
| 33 raise Exception("Unrecognized strand '%s'" % cols[6]) | 41 raise Exception("Unrecognized strand '%s'" % cols[6]) |
| 42 if parent_dict is not None and 'Parent' in d: | |
| 43 # a 3' UTR can be split among multiple exons | |
| 44 # a 5' UTR can be split among multiple exons | |
| 45 # a CDS can be part of multiple transcripts | |
| 46 for parent in d['Parent'].split(','): | |
| 47 if parent not in parent_dict: | |
| 48 parent_dict[parent] = [d] | |
| 49 else: | |
| 50 parent_dict[parent].append(d) | |
| 34 return d | 51 return d |
| 35 | 52 |
| 36 | 53 |
| 37 def gene_to_json(cols, species): | 54 def add_gene_to_dict(cols, species, gene_dict): |
| 38 global gene_count | 55 global gene_count |
| 39 gene = feature_to_json(cols) | 56 gene = feature_to_dict(cols) |
| 40 gene.update({ | 57 gene.update({ |
| 41 'member_id': gene_count, | 58 'member_id': gene_count, |
| 42 'object_type': 'Gene', | 59 'object_type': 'Gene', |
| 43 'seq_region_name': cols[0], | 60 'seq_region_name': cols[0], |
| 44 'species': species, | 61 'species': species, |
| 46 }) | 63 }) |
| 47 gene_dict[gene['id']] = gene | 64 gene_dict[gene['id']] = gene |
| 48 gene_count = gene_count + 1 | 65 gene_count = gene_count + 1 |
| 49 | 66 |
| 50 | 67 |
| 51 def transcript_to_json(cols, species): | 68 def add_transcript_to_dict(cols, species, transcript_dict): |
| 52 transcript = feature_to_json(cols) | 69 transcript = feature_to_dict(cols) |
| 53 transcript.update({ | 70 transcript.update({ |
| 54 'object_type': 'Transcript', | 71 'object_type': 'Transcript', |
| 55 'seq_region_name': cols[0], | 72 'seq_region_name': cols[0], |
| 56 'species': species, | 73 'species': species, |
| 57 }) | 74 }) |
| 58 transcript_dict[transcript['id']] = transcript | 75 transcript_dict[transcript['id']] = transcript |
| 59 | 76 |
| 60 | 77 |
| 61 def exon_to_json(cols, species): | 78 def add_exon_to_dict(cols, species, exon_parent_dict): |
| 62 exon = feature_to_json(cols) | 79 exon = feature_to_dict(cols, exon_parent_dict) |
| 63 exon.update({ | 80 exon.update({ |
| 64 'length': int(cols[4]) - int(cols[3]) + 1, | 81 'length': int(cols[4]) - int(cols[3]) + 1, |
| 65 'object_type': 'Exon', | 82 'object_type': 'Exon', |
| 66 'seq_region_name': cols[0], | 83 'seq_region_name': cols[0], |
| 67 'species': species, | 84 'species': species, |
| 68 }) | 85 }) |
| 69 if 'id' not in exon and 'Name' in exon: | 86 if 'id' not in exon and 'Name' in exon: |
| 70 exon['id'] = exon['Name'] | 87 exon['id'] = exon['Name'] |
| 71 | 88 |
| 72 if 'Parent' in exon: | 89 |
| 73 for parent in exon['Parent'].split(','): | 90 def add_cds_to_dict(cols, cds_parent_dict): |
| 74 if parent not in exon_parent_dict: | 91 cds = feature_to_dict(cols, cds_parent_dict) |
| 75 exon_parent_dict[parent] = [exon] | |
| 76 else: | |
| 77 exon_parent_dict[parent].append(exon) | |
| 78 | |
| 79 | |
| 80 def five_prime_utr_to_json(cols): | |
| 81 five_prime_utr = feature_to_json(cols) | |
| 82 if 'Parent' in five_prime_utr: | |
| 83 for parent in five_prime_utr['Parent'].split(','): | |
| 84 # the 5' UTR can be split among multiple exons | |
| 85 if parent not in five_prime_utr_parent_dict: | |
| 86 five_prime_utr_parent_dict[parent] = [five_prime_utr] | |
| 87 else: | |
| 88 five_prime_utr_parent_dict[parent].append(five_prime_utr) | |
| 89 | |
| 90 | |
| 91 def three_prime_utr_to_json(cols): | |
| 92 three_prime_utr = feature_to_json(cols) | |
| 93 if 'Parent' in three_prime_utr: | |
| 94 for parent in three_prime_utr['Parent'].split(','): | |
| 95 # the 3' UTR can be split among multiple exons | |
| 96 if parent not in three_prime_utr_parent_dict: | |
| 97 three_prime_utr_parent_dict[parent] = [three_prime_utr] | |
| 98 else: | |
| 99 three_prime_utr_parent_dict[parent].append(three_prime_utr) | |
| 100 | |
| 101 | |
| 102 def cds_to_json(cols): | |
| 103 cds = feature_to_json(cols) | |
| 104 if 'id' not in cds: | 92 if 'id' not in cds: |
| 105 if 'Name' in cds: | 93 if 'Name' in cds: |
| 106 cds['id'] = cds['Name'] | 94 cds['id'] = cds['Name'] |
| 107 elif 'Parent' in cds: | 95 elif 'Parent' in cds and ',' not in cds['Parent']: |
| 108 cds['id'] = cds['Parent'] | 96 cds['id'] = cds['Parent'] |
| 109 if 'Parent' in cds: | 97 |
| 110 # At this point we are sure than 'id' is in cds | 98 |
| 111 for parent in cds['Parent'].split(','): | 99 def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict): |
| 112 if parent not in cds_parent_dict: | |
| 113 cds_parent_dict[parent] = [cds] | |
| 114 else: | |
| 115 cds_parent_dict[parent].append(cds) | |
| 116 | |
| 117 | |
| 118 def join_dicts(): | |
| 119 for parent, exon_list in exon_parent_dict.items(): | 100 for parent, exon_list in exon_parent_dict.items(): |
| 120 exon_list.sort(key=lambda _: _['start']) | |
| 121 if parent in transcript_dict: | 101 if parent in transcript_dict: |
| 102 exon_list.sort(key=lambda _: _['start']) | |
| 122 transcript_dict[parent]['Exon'] = exon_list | 103 transcript_dict[parent]['Exon'] = exon_list |
| 123 | 104 |
| 124 for transcript_id, transcript in transcript_dict.items(): | 105 for transcript_id, transcript in transcript_dict.items(): |
| 125 translation = { | 106 translation = { |
| 126 'CDS': [], | 107 'CDS': [], |
| 179 for parent in transcript['Parent'].split(','): | 160 for parent in transcript['Parent'].split(','): |
| 180 if parent in gene_dict: | 161 if parent in gene_dict: |
| 181 gene_dict[parent]['Transcript'].append(transcript) | 162 gene_dict[parent]['Transcript'].append(transcript) |
| 182 | 163 |
| 183 | 164 |
| 184 def merge_dicts(json_arg): | 165 def update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict): |
| 185 with open(json_arg) as f: | 166 gene_intersection = set(full_gene_dict.keys()) & set(gene_dict.keys()) |
| 186 dict_from_json = json.load(f) | |
| 187 gene_intersection = set(gene_dict.keys()) & set(dict_from_json.keys()) | |
| 188 if gene_intersection: | 167 if gene_intersection: |
| 189 raise Exception("JSON file '%s' contains information for genes '%s', which are also present in other files" % (json_arg, ', '.join(gene_intersection))) | 168 raise Exception("Information for genes '%s' are present in multiple files" % ', '.join(gene_intersection)) |
| 190 gene_dict.update(dict_from_json) | 169 full_gene_dict.update(gene_dict) |
| 191 | 170 |
| 192 | 171 |
| 193 def write_json(outfile=None, sort_keys=False): | 172 def write_json(full_gene_dict, outfile=None, sort_keys=False): |
| 194 if outfile: | 173 if outfile: |
| 195 with open(outfile, 'w') as f: | 174 with open(outfile, 'w') as f: |
| 196 json.dump(gene_dict, f, sort_keys=sort_keys) | 175 json.dump(full_gene_dict, f, sort_keys=sort_keys) |
| 197 else: | 176 else: |
| 198 print(json.dumps(gene_dict, indent=3, sort_keys=sort_keys)) | 177 json.dump(full_gene_dict, sys.stdout, sort_keys=sort_keys) |
| 199 | 178 |
| 200 | 179 |
| 201 def __main__(): | 180 def __main__(): |
| 202 parser = optparse.OptionParser() | 181 parser = optparse.OptionParser() |
| 203 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') | 182 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') |
| 204 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') | 183 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') |
| 205 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output') | 184 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output') |
| 206 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output') | 185 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output') |
| 207 options, args = parser.parse_args() | 186 options, args = parser.parse_args() |
| 208 | |
| 209 if args: | 187 if args: |
| 210 raise Exception('Use options to provide inputs') | 188 raise Exception('Use options to provide inputs') |
| 189 | |
| 190 full_gene_dict = dict() | |
| 211 for gff3_arg in options.gff3: | 191 for gff3_arg in options.gff3: |
| 212 try: | 192 try: |
| 213 (species, filename) = gff3_arg.split(':') | 193 (species, filename) = gff3_arg.split(':') |
| 214 except ValueError: | 194 except ValueError: |
| 215 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) | 195 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) |
| 196 gene_dict = dict() | |
| 197 transcript_dict = dict() | |
| 198 exon_parent_dict = dict() | |
| 199 cds_parent_dict = dict() | |
| 200 five_prime_utr_parent_dict = dict() | |
| 201 three_prime_utr_parent_dict = dict() | |
| 216 with open(filename) as f: | 202 with open(filename) as f: |
| 217 for i, line in enumerate(f): | 203 for i, line in enumerate(f, start=1): |
| 218 line = line.strip() | 204 line = line.strip() |
| 219 if not line: | 205 if not line: |
| 220 # skip empty lines | 206 # skip empty lines |
| 221 continue | 207 continue |
| 222 if line[0] == '#': | 208 if line[0] == '#': |
| 226 if len(cols) != 9: | 212 if len(cols) != 9: |
| 227 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) | 213 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) |
| 228 feature_type = cols[2] | 214 feature_type = cols[2] |
| 229 try: | 215 try: |
| 230 if feature_type == 'gene': | 216 if feature_type == 'gene': |
| 231 gene_to_json(cols, species) | 217 add_gene_to_dict(cols, species, gene_dict) |
| 232 elif feature_type in ('mRNA', 'transcript'): | 218 elif feature_type in ('mRNA', 'transcript'): |
| 233 transcript_to_json(cols, species) | 219 add_transcript_to_dict(cols, species, transcript_dict) |
| 234 elif feature_type == 'exon': | 220 elif feature_type == 'exon': |
| 235 exon_to_json(cols, species) | 221 add_exon_to_dict(cols, species, exon_parent_dict) |
| 236 elif feature_type == 'five_prime_UTR': | 222 elif feature_type == 'five_prime_UTR': |
| 237 five_prime_utr_to_json(cols) | 223 feature_to_dict(cols, five_prime_utr_parent_dict) |
| 238 elif feature_type == 'three_prime_UTR': | 224 elif feature_type == 'three_prime_UTR': |
| 239 three_prime_utr_to_json(cols) | 225 feature_to_dict(cols, three_prime_utr_parent_dict) |
| 240 elif feature_type == 'CDS': | 226 elif feature_type == 'CDS': |
| 241 cds_to_json(cols) | 227 add_cds_to_dict(cols, cds_parent_dict) |
| 242 else: | 228 else: |
| 243 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) | 229 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr) |
| 244 except Exception as e: | 230 except Exception as e: |
| 245 raise Exception("Line %i in file '%s': %s" % (i, filename, e)) | 231 raise Exception("Line %i in file '%s': %s" % (i, filename, e)) |
| 246 join_dicts() | 232 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) |
| 233 update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict) | |
| 247 | 234 |
| 248 for json_arg in options.json: | 235 for json_arg in options.json: |
| 249 merge_dicts(json_arg) | 236 with open(json_arg) as f: |
| 250 | 237 update_full_gene_dict_no_overwrite(full_gene_dict, json.load(f)) |
| 251 write_json(options.output, options.sort) | 238 |
| 239 write_json(full_gene_dict, options.output, options.sort) | |
| 252 | 240 |
| 253 | 241 |
| 254 if __name__ == '__main__': | 242 if __name__ == '__main__': |
| 255 __main__() | 243 __main__() |
