gff3_to_json: gff3_to_json.py comparison

comparison gff3_to_json.py @ 1:befe6021e476 draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gff3_to_json commit 5e5fbe362ed5a4714debda0f2c0834cbbfd34147

author	earlhaminst
date	Tue, 28 Feb 2017 12:06:04 -0500
parents	be6cec883b02
children

comparison

Legend: equal, deleted, inserted, replaced

-:be6cec883b02
+:befe6021e476
 import json
 import optparse
 import sys
-cds_parent_dict = dict()
-exon_parent_dict = dict()
-five_prime_utr_parent_dict = dict()
 gene_count = 0
-gene_dict = dict()
-transcript_dict = dict()
-three_prime_utr_parent_dict = dict()
+def remove_type_from_list_of_ids(l):
+return ','.join(remove_type_from_id(_) for _ in l.split(','))
-def feature_to_json(cols):
+def remove_type_from_id(id_):
+colon_index = id_.find(':')
+if colon_index >= 0:
+return id_[colon_index + 1:]
+else:
+return id_
+def feature_to_dict(cols, parent_dict=None):
 d = {
 'end': int(cols[4]),
 'start': int(cols[3]),
 }
 for attr in cols[8].split(';'):
 if '=' in attr:
 (tag, value) = attr.split('=')
 if tag == 'ID':
-d['id'] = value
+tag = 'id'
-else:
+value = remove_type_from_id(value)
-d[tag] = value
+elif tag == 'Parent':
+value = remove_type_from_list_of_ids(value)
+d[tag] = value
 if cols[6] == '+':
 d['strand'] = 1
 elif cols[6] == '-':
 d['strand'] = -1
 else:
 raise Exception("Unrecognized strand '%s'" % cols[6])
+if parent_dict is not None and 'Parent' in d:
+# a 3' UTR can be split among multiple exons
+# a 5' UTR can be split among multiple exons
+# a CDS can be part of multiple transcripts
+for parent in d['Parent'].split(','):
+if parent not in parent_dict:
+parent_dict[parent] = [d]
+else:
+parent_dict[parent].append(d)
 return d
-def gene_to_json(cols, species):
+def add_gene_to_dict(cols, species, gene_dict):
 global gene_count
-gene = feature_to_json(cols)
+gene = feature_to_dict(cols)
 gene.update({
 'member_id': gene_count,
 'object_type': 'Gene',
 'seq_region_name': cols[0],
 'species': species,
 })
 gene_dict[gene['id']] = gene
 gene_count = gene_count + 1
-def transcript_to_json(cols, species):
+def add_transcript_to_dict(cols, species, transcript_dict):
-transcript = feature_to_json(cols)
+transcript = feature_to_dict(cols)
 transcript.update({
 'object_type': 'Transcript',
 'seq_region_name': cols[0],
 'species': species,
 })
 transcript_dict[transcript['id']] = transcript
-def exon_to_json(cols, species):
+def add_exon_to_dict(cols, species, exon_parent_dict):
-exon = feature_to_json(cols)
+exon = feature_to_dict(cols, exon_parent_dict)
 exon.update({
 'length': int(cols[4]) - int(cols[3]) + 1,
 'object_type': 'Exon',
 'seq_region_name': cols[0],
 'species': species,
 })
 if 'id' not in exon and 'Name' in exon:
 exon['id'] = exon['Name']
-if 'Parent' in exon:
-for parent in exon['Parent'].split(','):
+def add_cds_to_dict(cols, cds_parent_dict):
-if parent not in exon_parent_dict:
+cds = feature_to_dict(cols, cds_parent_dict)
-exon_parent_dict[parent] = [exon]
-else:
-exon_parent_dict[parent].append(exon)
-def five_prime_utr_to_json(cols):
-five_prime_utr = feature_to_json(cols)
-if 'Parent' in five_prime_utr:
-for parent in five_prime_utr['Parent'].split(','):
-# the 5' UTR can be split among multiple exons
-if parent not in five_prime_utr_parent_dict:
-five_prime_utr_parent_dict[parent] = [five_prime_utr]
-else:
-five_prime_utr_parent_dict[parent].append(five_prime_utr)
-def three_prime_utr_to_json(cols):
-three_prime_utr = feature_to_json(cols)
-if 'Parent' in three_prime_utr:
-for parent in three_prime_utr['Parent'].split(','):
-# the 3' UTR can be split among multiple exons
-if parent not in three_prime_utr_parent_dict:
-three_prime_utr_parent_dict[parent] = [three_prime_utr]
-else:
-three_prime_utr_parent_dict[parent].append(three_prime_utr)
-def cds_to_json(cols):
-cds = feature_to_json(cols)
 if 'id' not in cds:
 if 'Name' in cds:
 cds['id'] = cds['Name']
-elif 'Parent' in cds:
+elif 'Parent' in cds and ',' not in cds['Parent']:
 cds['id'] = cds['Parent']
-if 'Parent' in cds:
-# At this point we are sure that 'id' is in cds
-for parent in cds['Parent'].split(','):
+def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
-if parent not in cds_parent_dict:
-cds_parent_dict[parent] = [cds]
-else:
-cds_parent_dict[parent].append(cds)
-def join_dicts():
 for parent, exon_list in exon_parent_dict.items():
-exon_list.sort(key=lambda _: _['start'])
 if parent in transcript_dict:
+exon_list.sort(key=lambda _: _['start'])
 transcript_dict[parent]['Exon'] = exon_list
 for transcript_id, transcript in transcript_dict.items():
 translation = {
 'CDS': [],
 for parent in transcript['Parent'].split(','):
 if parent in gene_dict:
 gene_dict[parent]['Transcript'].append(transcript)
-def merge_dicts(json_arg):
+def update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict):
-with open(json_arg) as f:
+gene_intersection = set(full_gene_dict.keys()) & set(gene_dict.keys())
-dict_from_json = json.load(f)
-gene_intersection = set(gene_dict.keys()) & set(dict_from_json.keys())
 if gene_intersection:
-raise Exception("JSON file '%s' contains information for genes '%s', which are also present in other files" % (json_arg, ', '.join(gene_intersection)))
+raise Exception("Information for genes '%s' are present in multiple files" % ', '.join(gene_intersection))
-gene_dict.update(dict_from_json)
+full_gene_dict.update(gene_dict)
-def write_json(outfile=None, sort_keys=False):
+def write_json(full_gene_dict, outfile=None, sort_keys=False):
 if outfile:
 with open(outfile, 'w') as f:
-json.dump(gene_dict, f, sort_keys=sort_keys)
+json.dump(full_gene_dict, f, sort_keys=sort_keys)
 else:
-print(json.dumps(gene_dict, indent=3, sort_keys=sort_keys))
+json.dump(full_gene_dict, sys.stdout, sort_keys=sort_keys)
 def __main__():
 parser = optparse.OptionParser()
 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
 parser.add_option('-s', '--sort', action='store_true', help='Sort the keys in the JSON output')
 parser.add_option('-o', '--output', help='Path of the output file. If not specified, will print on the standard output')
 options, args = parser.parse_args()
 if args:
 raise Exception('Use options to provide inputs')
+full_gene_dict = dict()
 for gff3_arg in options.gff3:
 try:
 (species, filename) = gff3_arg.split(':')
 except ValueError:
 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg)
+gene_dict = dict()
+transcript_dict = dict()
+exon_parent_dict = dict()
+cds_parent_dict = dict()
+five_prime_utr_parent_dict = dict()
+three_prime_utr_parent_dict = dict()
 with open(filename) as f:
-for i, line in enumerate(f):
+for i, line in enumerate(f, start=1):
 line = line.strip()
 if not line:
 # skip empty lines
 continue
 if line[0] == '#':
 if len(cols) != 9:
 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line))
 feature_type = cols[2]
 try:
 if feature_type == 'gene':
-gene_to_json(cols, species)
+add_gene_to_dict(cols, species, gene_dict)
 elif feature_type in ('mRNA', 'transcript'):
-transcript_to_json(cols, species)
+add_transcript_to_dict(cols, species, transcript_dict)
 elif feature_type == 'exon':
-exon_to_json(cols, species)
+add_exon_to_dict(cols, species, exon_parent_dict)
 elif feature_type == 'five_prime_UTR':
-five_prime_utr_to_json(cols)
+feature_to_dict(cols, five_prime_utr_parent_dict)
 elif feature_type == 'three_prime_UTR':
-three_prime_utr_to_json(cols)
+feature_to_dict(cols, three_prime_utr_parent_dict)
 elif feature_type == 'CDS':
-cds_to_json(cols)
+add_cds_to_dict(cols, cds_parent_dict)
 else:
 print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
 except Exception as e:
 raise Exception("Line %i in file '%s': %s" % (i, filename, e))
-join_dicts()
+join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
+update_full_gene_dict_no_overwrite(full_gene_dict, gene_dict)
 for json_arg in options.json:
-merge_dicts(json_arg)
+with open(json_arg) as f:
+update_full_gene_dict_no_overwrite(full_gene_dict, json.load(f))
-write_json(options.output, options.sort)
+write_json(full_gene_dict, options.output, options.sort)
 if __name__ == '__main__':
 __main__()

Mercurial > repos > earlhaminst > gff3_to_json

comparison gff3_to_json.py @ 1:befe6021e476 draft default tip