diff mature_mir_gff_translation.py @ 0:da29af78a960 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit d4d8106d66b65679a1a685ab94bfcf99cdb7b959
author artbio
date Mon, 24 Jul 2017 06:27:50 -0400
parents
children 6b8adacd4750
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mature_mir_gff_translation.py	Mon Jul 24 06:27:50 2017 -0400
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+
+
+def Parser():
+    the_parser = argparse.ArgumentParser()
+    the_parser.add_argument(
+        '--input', action="store", type=str, help="input miRBase GFF3 file")
+    the_parser.add_argument(
+        '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates")
+    args = the_parser.parse_args()
+    return args
+
+GFF3_header= '''##gff-version 3
+##generated by mature_mir_gff_translation.py
+#
+# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **
+# microRNAs:               miRBase current_version
+# genome-build-id:         check http://mirbase.org/
+#
+# Hairpin precursor sequences have type "miRNA_primary_transcript". 
+# Note, these sequences do not represent the full primary transcript, 
+# rather a predicted stem-loop portion that includes the precursor 
+# miRNA. Mature sequences have type "miRNA".
+#
+'''
+
+def load_gff_in_dict(gff_input_file):
+    '''
+    Reads the gff3 file and return a dictionary of dictionaries
+    with keys equal to standard gff3 fields (9)
+    Note that the key of the primary dictionary is the ID
+    '''
+    gff_dict = {}
+    for line in open(gff_input_file, "r"):
+        if line[0]=="#":
+            continue
+        gff_fields=line[:-1].split("\t")
+        ID=gff_fields[8].split("ID=")[1].split(";")[0]
+        gff_dict[ID] = {}
+        gff_dict[ID]["seqid"]=gff_fields[0]
+        gff_dict[ID]["source"]=gff_fields[1]
+        gff_dict[ID]["type"]=gff_fields[2]
+        gff_dict[ID]["start"]=gff_fields[3]
+        gff_dict[ID]["end"]=gff_fields[4]
+        gff_dict[ID]["score"]=gff_fields[5]
+        gff_dict[ID]["strand"]=gff_fields[6]
+        gff_dict[ID]["phase"]=gff_fields[7]
+        gff_dict[ID]["attributes"]=gff_fields[8]
+        if "Derives_from" in gff_dict[ID]["attributes"]:
+            parent_primary_transcript=gff_dict[ID]["attributes"].split("Derives_from=")[1]
+            parent_primary_transcript=gff_dict[parent_primary_transcript]["attributes"].split("Name=")[1]
+            gff_dict[ID]["attributes"]="%s;Parent_mir_Name=%s" % (gff_dict[ID]["attributes"], parent_primary_transcript)
+    return gff_dict
+    
+
+def genome_to_mir_gff(gff_dict, output):
+    '''
+    Converts seqid field from chromosome to item Name
+    Then converts coordinates relative to "miRNA_primary_transcript"
+    Note that GFF files are 1-based coordinates
+    '''
+    for key in gff_dict:
+        name=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
+        gff_dict[key]["seqid"]=name
+        if "Derives_from=" in gff_dict[key]["attributes"]:
+            parent_ID=gff_dict[key]["attributes"].split("Derives_from=")[1].split(";")[0]
+            gff_dict[key]["start"]=str(int(gff_dict[key]["start"]) - int(gff_dict[parent_ID]["start"]) + 1)
+            gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[parent_ID]["start"]) + 1)
+    hairpins={}
+    matures={}
+    for key in gff_dict:  ## treats miRNA_primary_transcript coordinates in a second loop to avoid errors in conversion
+        if gff_dict[key]["type"]=="miRNA_primary_transcript":
+            gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[key]["start"]) + 1)
+            gff_dict[key]["start"]="1"
+            # now, do a dict[ID]=Name but only for miRNA_primary_transcript
+            hairpins[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
+        else:
+            matures[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
+    with open(output, "w") as output:
+        output.write(GFF3_header)
+        for ID in sorted(hairpins, key=hairpins.get):
+            output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"],
+                gff_dict[ID]["type"], gff_dict[ID]["start"], gff_dict[ID]["end"],
+                gff_dict[ID]["score"], gff_dict[ID]["strand"], gff_dict[ID]["phase"],
+                gff_dict[ID]["attributes"]]))
+            output.write("\n")
+            for id in sorted(matures, key=matures.get, reverse=True):
+                if ID in gff_dict[id]["attributes"]:
+                    output.write("\t".join([gff_dict[id]["seqid"], gff_dict[id]["source"],
+                        gff_dict[id]["type"], gff_dict[id]["start"], gff_dict[id]["end"],
+                        gff_dict[id]["score"], gff_dict[id]["strand"],
+                        gff_dict[id]["phase"], gff_dict[id]["attributes"]]))
+                    output.write("\n")
+
+
+def main(infile, outfile):
+    gff_dict = load_gff_in_dict(infile)
+    genome_to_mir_gff(gff_dict, outfile)
+
+
+if __name__ == "__main__":
+    args = Parser()
+    main(args.input, args.output)
\ No newline at end of file