diff getLongestORF.py @ 0:ec898924d8c7 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 8e118a4d24047e2c62912b962e854f789d6ff559
author mbernt
date Wed, 20 Jun 2018 11:02:06 -0400
parents
children 1c4b24e9bb16
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getLongestORF.py	Wed Jun 20 11:02:06 2018 -0400
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+
+"""
+usage: getLongestORF.py input output.fas output.tab
+
+
+input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf")
+output.fas: fasta file with all longest ORFs per transcript
+output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs
+
+example:
+
+>253936-254394(+)_1 [28 - 63] 
+LTNYCQMVHNIL
+>253936-254394(+)_2 [18 - 77] 
+HKLIDKLLPNGAQYFVKSTQ
+>253936-254394(+)_3 [32 - 148] 
+QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV
+>253936-254394(+)_4 [117 - 152] 
+LGIIVPSLLLCN
+>248351-252461(+)_1 [14 - 85] 
+VLARKYPRCLSPSKKSPCQLRQRS
+>248351-252461(+)_2 [21 - 161] 
+PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP
+>248351-252461(+)_3 [89 - 202] 
+VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG
+>248351-252461(+)_4 [206 - 259] 
+SRTTSGRQSVLSEKLWRR
+>248351-252461(+)_5 [263 - 313] 
+CLSPLWVPCCSRHSCHG
+"""
+
+import sys,re
+
+def findlongestOrf(transcriptDict,old_seqID):
+    #write for previous seqID
+    prevTranscript = transcriptDict[old_seqID]
+    i_max = 0
+    #find longest orf in transcript
+    for i in range(0,len(prevTranscript)):
+        if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
+            i_max = i
+    for i in range(0,len(prevTranscript)):
+        prevStart = prevTranscript[i][0]
+        prevEnd = prevTranscript[i][1]
+        prevLength = prevTranscript[i][2]
+        output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength)
+        if (end - start > 0):
+            output+="\tForward"
+        else:
+            output+="\tReverse"
+        if(i == i_max):
+            output += "\ty\n"
+        else:
+            output += "\tn\n"
+        OUTPUT_ORF_SUMMARY.write(output)
+    transcriptDict.pop(old_seqID, None)
+    return None
+
+INPUT = open(sys.argv[1],"r")
+OUTPUT_FASTA = open(sys.argv[2],"w")
+OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w")
+
+seqID = ""
+old_seqID = ""
+lengthDict = {}
+seqDict = {}
+headerDict = {}
+transcriptDict = {}
+skip = False
+
+OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n")
+
+for line in INPUT:
+    line = line.strip()
+#    print line
+    if(re.match(">",line)): #header
+        seqID = "_".join(line.split(">")[1].split("_")[:-1])
+        #seqID = line.split(">")[1].split("_")[0]
+        start = int (re.search('\ \[(\d+)\ -', line).group(1))
+        end = int (re.search('-\ (\d+)\]',line).group(1))
+        length = abs(end - start)
+        if(seqID not in transcriptDict and old_seqID != ""): #new transcript
+            findlongestOrf(transcriptDict,old_seqID)
+        if seqID not in transcriptDict:
+            transcriptDict[seqID] = []
+        transcriptDict[seqID].append([start,end,length])
+        if(seqID not in lengthDict and old_seqID != ""): #new transcript
+            #write FASTA
+            OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n")
+            #delete old dict entry
+            headerDict.pop(old_seqID, None)
+            seqDict.pop(old_seqID, None)
+            lengthDict.pop(old_seqID, None)
+        #if several longest sequences exist with the same length, the dictionary saves the last occuring.
+        if(seqID not in lengthDict or length >= lengthDict[seqID]):
+            headerDict[seqID] = line
+            lengthDict[seqID] = length
+            seqDict[seqID] = ""
+            skip = False
+        else:
+            skip = True
+            next
+        old_seqID = seqID
+    elif(skip):
+        next
+    else:
+        seqDict[seqID] += line
+
+OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID])
+findlongestOrf(transcriptDict,old_seqID)
+INPUT.close()
+OUTPUT_FASTA.close()
+OUTPUT_ORF_SUMMARY.close()