Mercurial > repos > artbio > mircounts
diff mature_mir_gff_translation.py @ 5:9ea96a02c416 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 04980585c257ab5f8eb5d10de007316c47c5d1ce
| author | artbio |
|---|---|
| date | Tue, 05 Sep 2017 06:33:16 -0400 |
| parents | 6b8adacd4750 |
| children | 3f62272192f9 |
line wrap: on
line diff
--- a/mature_mir_gff_translation.py Mon Sep 04 17:55:01 2017 -0400 +++ b/mature_mir_gff_translation.py Tue Sep 05 06:33:16 2017 -0400 @@ -1,6 +1,6 @@ -#!/usr/bin/env python +import argparse -import argparse +from datetime import datetime def Parser(): @@ -14,19 +14,14 @@ return args -GFF3_header = '''##gff-version 3 -##generated by mature_mir_gff_translation.py -# -# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** -# microRNAs: miRBase current_version -# genome-build-id: check http://mirbase.org/ -# -# Hairpin precursor sequences have type "miRNA_primary_transcript". -# Note, these sequences do not represent the full primary transcript, -# rather a predicted stem-loop portion that includes the precursor -# miRNA. Mature sequences have type "miRNA". -# -''' +def get_gff_header(gff_input_file): + string_list = [] + for line in open(gff_input_file, "r"): + if line[0] == '#': + string_list.append(line) + string_list.append('# generated by mature_mir_gff_translation.py %s\n#\n' % + str(datetime.now())) + return ''.join(string_list) def load_gff_in_dict(gff_input_file): @@ -51,9 +46,9 @@ gff_dict[ID]["strand"] = gff_fields[6] gff_dict[ID]["phase"] = gff_fields[7] gff_dict[ID]["attributes"] = gff_fields[8] - if "Derives_from" in gff_dict[ID]["attributes"]: + if "erives_from" in gff_dict[ID]["attributes"]: parent_primary_transcript = gff_dict[ID]["attributes"].split( - "Derives_from=")[1] + "erives_from=")[1] parent_primary_transcript = gff_dict[parent_primary_transcript][ "attributes"].split("Name=")[1] gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % ( @@ -61,7 +56,7 @@ return gff_dict -def genome_to_mir_gff(gff_dict, output): +def genome_to_mir_gff(gff_dict, output, header): ''' Converts seqid field from chromosome to item Name Then converts coordinates relative to "miRNA_primary_transcript" @@ -70,9 +65,9 @@ for key in gff_dict: name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] gff_dict[key]["seqid"] = name - if 
"Derives_from=" in gff_dict[key]["attributes"]: + if "erives_from=" in gff_dict[key]["attributes"]: parent_ID = gff_dict[key]["attributes"].split( - "Derives_from=")[1].split(";")[0] + "erives_from=")[1].split(";")[0] gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int( gff_dict[parent_ID]["start"])+1) gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( @@ -95,7 +90,7 @@ "Name=")[1].split( ";")[0] with open(output, "w") as output: - output.write(GFF3_header) + output.write(header) for ID in sorted(hairpins, key=hairpins.get): output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"], gff_dict[ID]["type"], @@ -117,7 +112,7 @@ def main(infile, outfile): gff_dict = load_gff_in_dict(infile) - genome_to_mir_gff(gff_dict, outfile) + genome_to_mir_gff(gff_dict, outfile, get_gff_header(infile)) if __name__ == "__main__":