diff mature_mir_gff_translation.py @ 5:9ea96a02c416 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 04980585c257ab5f8eb5d10de007316c47c5d1ce
author artbio
date Tue, 05 Sep 2017 06:33:16 -0400
parents 6b8adacd4750
children 3f62272192f9
line wrap: on
line diff
--- a/mature_mir_gff_translation.py	Mon Sep 04 17:55:01 2017 -0400
+++ b/mature_mir_gff_translation.py	Tue Sep 05 06:33:16 2017 -0400
@@ -1,6 +1,6 @@
-#!/usr/bin/env python
+import argparse
 
-import argparse
+from datetime import datetime
 
 
 def Parser():
@@ -14,19 +14,14 @@
     return args
 
 
-GFF3_header = '''##gff-version 3
-##generated by mature_mir_gff_translation.py
-#
-# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **
-# microRNAs:               miRBase current_version
-# genome-build-id:         check http://mirbase.org/
-#
-# Hairpin precursor sequences have type "miRNA_primary_transcript".
-# Note, these sequences do not represent the full primary transcript,
-# rather a predicted stem-loop portion that includes the precursor
-# miRNA. Mature sequences have type "miRNA".
-#
-'''
+def get_gff_header(gff_input_file):
+    '''Return the '#' lines of gff_input_file plus a generation stamp'''
+    # "with" closes the handle deterministically instead of leaking it
+    with open(gff_input_file, "r") as gff_handle:
+        string_list = [line for line in gff_handle if line.startswith('#')]
+    string_list.append('# generated by mature_mir_gff_translation.py %s\n#\n' %
+                       str(datetime.now()))
+    return ''.join(string_list)
 
 
 def load_gff_in_dict(gff_input_file):
@@ -51,9 +46,9 @@
         gff_dict[ID]["strand"] = gff_fields[6]
         gff_dict[ID]["phase"] = gff_fields[7]
         gff_dict[ID]["attributes"] = gff_fields[8]
-        if "Derives_from" in gff_dict[ID]["attributes"]:
+        if "erives_from" in gff_dict[ID]["attributes"]:
             parent_primary_transcript = gff_dict[ID]["attributes"].split(
-                "Derives_from=")[1]
+                "erives_from=")[1]
             parent_primary_transcript = gff_dict[parent_primary_transcript][
                                             "attributes"].split("Name=")[1]
             gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % (
@@ -61,7 +56,7 @@
     return gff_dict
 
 
-def genome_to_mir_gff(gff_dict, output):
+def genome_to_mir_gff(gff_dict, output, header):
     '''
     Converts seqid field from chromosome to item Name
     Then converts coordinates relative to "miRNA_primary_transcript"
@@ -70,9 +65,9 @@
     for key in gff_dict:
         name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
         gff_dict[key]["seqid"] = name
-        if "Derives_from=" in gff_dict[key]["attributes"]:
+        if "erives_from=" in gff_dict[key]["attributes"]:
             parent_ID = gff_dict[key]["attributes"].split(
-                                        "Derives_from=")[1].split(";")[0]
+                                        "erives_from=")[1].split(";")[0]
             gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int(
                                        gff_dict[parent_ID]["start"])+1)
             gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int(
@@ -95,7 +90,7 @@
                                                         "Name=")[1].split(
                                                          ";")[0]
     with open(output, "w") as output:
-        output.write(GFF3_header)
+        output.write(header)
         for ID in sorted(hairpins, key=hairpins.get):
             output.write("\t".join([gff_dict[ID]["seqid"],
                          gff_dict[ID]["source"], gff_dict[ID]["type"],
@@ -117,7 +112,7 @@
 
 def main(infile, outfile):
     gff_dict = load_gff_in_dict(infile)
-    genome_to_mir_gff(gff_dict, outfile)
+    genome_to_mir_gff(gff_dict, outfile, get_gff_header(infile))
 
 
 if __name__ == "__main__":