changeset 0:ec898924d8c7 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 8e118a4d24047e2c62912b962e854f789d6ff559
author mbernt
date Wed, 20 Jun 2018 11:02:06 -0400
parents
children 1c4b24e9bb16
files getLongestORF.py longORF.xml test-data/test_input.fasta test-data/test_output.fasta test-data/test_output.tab
diffstat 5 files changed, 340 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getLongestORF.py	Wed Jun 20 11:02:06 2018 -0400
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+
+"""
+usage: getLongestORF.py input.fas output.fas output.tab
+
+
+input.fas: an amino acid fasta file of all open reading frames (ORFs) listed by transcript (output of the Galaxy tool "getorf")
+output.fas: fasta file with all longest ORFs per transcript
+output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs
+
+example:
+
+>253936-254394(+)_1 [28 - 63] 
+LTNYCQMVHNIL
+>253936-254394(+)_2 [18 - 77] 
+HKLIDKLLPNGAQYFVKSTQ
+>253936-254394(+)_3 [32 - 148] 
+QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV
+>253936-254394(+)_4 [117 - 152] 
+LGIIVPSLLLCN
+>248351-252461(+)_1 [14 - 85] 
+VLARKYPRCLSPSKKSPCQLRQRS
+>248351-252461(+)_2 [21 - 161] 
+PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP
+>248351-252461(+)_3 [89 - 202] 
+VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG
+>248351-252461(+)_4 [206 - 259] 
+SRTTSGRQSVLSEKLWRR
+>248351-252461(+)_5 [263 - 313] 
+CLSPLWVPCCSRHSCHG
+"""
+
+import sys,re
+
+def findlongestOrf(transcriptDict,old_seqID):
+    """Write one summary row per ORF of transcript old_seqID, then forget the transcript.
+
+    transcriptDict maps a seqID to a list of [start, end, length] entries.
+    Each row is: seqID, start, end, length, orientation (Forward/Reverse), longest (y/n).
+    Rows are written to the module-level OUTPUT_ORF_SUMMARY handle and the
+    entry for old_seqID is removed from transcriptDict. Returns None.
+    """
+    orfs = transcriptDict[old_seqID]
+    #index of the longest ORF; because of ">=" the last one wins on ties
+    i_max = 0
+    for i, orf in enumerate(orfs):
+        if orf[2] >= orfs[i_max][2]:
+            i_max = i
+    for i, (orfStart, orfEnd, orfLength) in enumerate(orfs):
+        #Use this ORF's own coordinates: the globals start/end belong to the
+        #header parsed last, which labelled every row of a transcript alike.
+        #NOTE(review): test-data/test_output.tab still shows the old labels.
+        orientation = "Forward" if orfEnd - orfStart > 0 else "Reverse"
+        longest = "y" if i == i_max else "n"
+        fields = [old_seqID, orfStart, orfEnd, orfLength, orientation, longest]
+        OUTPUT_ORF_SUMMARY.write("\t".join(str(f) for f in fields) + "\n")
+    transcriptDict.pop(old_seqID, None)
+    return None
+
+#Read the getorf FASTA once: per transcript, write the longest ORF to OUTPUT_FASTA and one row per ORF to OUTPUT_ORF_SUMMARY.
+#Header coordinates look like " [28 - 63]". The patterns are raw strings so that
+#"\d" and "\[" are not read as (invalid) string escapes, and are compiled once.
+START_RE = re.compile(r'\ \[(\d+)\ -')
+END_RE = re.compile(r'-\ (\d+)\]')
+
+seqID = ""          #transcript ID of the current header
+old_seqID = ""      #transcript ID of the previous header ("" before the first one)
+lengthDict = {}     #seqID -> length of the longest ORF seen so far
+seqDict = {}        #seqID -> sequence of that ORF
+headerDict = {}     #seqID -> header line of that ORF
+transcriptDict = {} #seqID -> [start,end,length] of every ORF of the transcript
+skip = False        #True while the sequence lines of a shorter ORF are read
+
+#The handles stay module-level names because findlongestOrf writes to
+#OUTPUT_ORF_SUMMARY; the with statement closes all three even on errors.
+with open(sys.argv[1],"r") as INPUT, \
+     open(sys.argv[2],"w") as OUTPUT_FASTA, \
+     open(sys.argv[3],"w") as OUTPUT_ORF_SUMMARY:
+    OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n")
+
+    for line in INPUT:
+        line = line.strip()
+        if line.startswith(">"): #header
+            #transcript ID = sequence ID without the trailing "_<ORF number>"
+            seqID = "_".join(line.split(">")[1].split("_")[:-1])
+            start = int(START_RE.search(line).group(1))
+            end = int(END_RE.search(line).group(1))
+            length = abs(end - start)
+            if(seqID not in transcriptDict and old_seqID != ""): #new transcript
+                findlongestOrf(transcriptDict,old_seqID)
+            transcriptDict.setdefault(seqID, []).append([start,end,length])
+            if(seqID not in lengthDict and old_seqID != ""): #new transcript
+                #write the longest ORF of the finished transcript and forget it
+                OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n")
+                headerDict.pop(old_seqID, None)
+                seqDict.pop(old_seqID, None)
+                lengthDict.pop(old_seqID, None)
+            #if several longest sequences exist with the same length, the dictionary saves the last occurring.
+            skip = seqID in lengthDict and length < lengthDict[seqID]
+            if not skip:
+                headerDict[seqID] = line
+                lengthDict[seqID] = length
+                seqDict[seqID] = ""
+            old_seqID = seqID
+        elif not skip:
+            #sequences may span several lines; they are joined without line breaks
+            seqDict[seqID] += line
+
+    #Flush the last transcript. An empty input has none, so the lookup is
+    #guarded instead of failing with a KeyError.
+    if old_seqID in headerDict:
+        #no trailing newline after the last record
+        OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID])
+        findlongestOrf(transcriptDict,old_seqID)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/longORF.xml	Wed Jun 20 11:02:06 2018 -0400
@@ -0,0 +1,35 @@
+<tool id="longORF" name="Obtain longest ORFs" version="0.1.0">
+    <description> in six-frame translations</description>
+    <!-- Arguments: input FASTA, output FASTA (longest ORFs), output table (all ORFs).
+         The values are quoted so that paths containing spaces stay single arguments. -->
+    <command><![CDATA[
+        python '$__tool_directory__/getLongestORF.py' '$input' '$output_longestORF' '$output_ORFs'
+    ]]></command>
+    <inputs>
+        <param name="input" format="fasta" type="data" label="sequences"/>
+    </inputs>
+    <outputs>
+        <data name="output_longestORF" format="fasta"/>
+        <data name="output_ORFs" format="tabular"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="test_input.fasta"/>
+            <output name="output_longestORF" file="test_output.fasta"/>
+            <output name="output_ORFs" file="test_output.tab"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+This tool identifies the longest Open Reading Frames within the six-frame translations of a set of sequences.
+
+**Input**
+
+It takes an amino acid fasta file with all open reading frames (+ and - strand) listed by the corresponding transcript. The tool is designed to process the output of the Galaxy tool "getorf" from the EMBOSS package.
+
+**Output**
+
+For each transcript, the respective longest ORF is identified and listed in fasta format. Furthermore, a table with information about seqID, start, end, length, orientation, longest for all ORFs is given.]]>
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input.fasta	Wed Jun 20 11:02:06 2018 -0400
@@ -0,0 +1,127 @@
+>14520830-14521117(-)_1 [2 - 37] 
+KPLENISASREF
+>14520830-14521117(-)_2 [3 - 47] 
+SPWRIFQPAENFDLQ
+>14520830-14521117(-)_3 [41 - 94] 
+LAVGFGLIFLRSGWMPCL
+>14520830-14521117(-)_4 [63 - 152] 
+FSYDLGGCLACDSCSSYSPNEGQCPARKLE
+>14520830-14521117(-)_5 [146 - 175] 
+VGMMDLCSET
+>14520830-14521117(-)_6 [156 - 200] 
+WTCVQRLNRTNKQNK
+>14520830-14521117(-)_7 [1 - 240] 
+KAPGEYFSQQRILTCSRIWFDFPTIWVDALPVTVAVPIRQMKGSAPHVSWNDGPVFRDLT
+EPTSKTSENRKKEEDTGINS
+>14520830-14521117(-)_8 [179 - 325] 
+QNQQAKQVRTGKRKRTLESILESCTTWFFFHSKFRGTKPLENISASREF
+>14520830-14521117(-)_9 [204 - 335] 
+EQEKGRGHWNQFLRVALLGFSSIPSFVGQSPWRIFQPAENFDLQ
+>14520830-14521117(-)_10 [329 - 382] 
+LAVGFGLIFLRSGWMPCL
+>14520830-14521117(-)_11 [351 - 440] 
+FSYDLGGCLACDSCSSYSPNEGQCPARKLE
+>14520830-14521117(-)_12 [434 - 463] 
+VGMMDLCSET
+>14520830-14521117(-)_13 [444 - 488] 
+WTCVQRLNRTNKQNK
+>14520830-14521117(-)_14 [244 - 528] 
+ELHYLVFLPFQVSWDKAPGEYFSQQRILTCSRIWFDFPTIWVDALPVTVAVPIRQMKGSA
+PHVSWNDGPVFRDLTEPTSKTSENRKKEEDTGINS
+>14520830-14521117(-)_15 [467 - 574] 
+QNQQAKQVRTGKRKRTLESILESCTTWFFFHSKFRG
+>14520830-14521117(-)_16 [492 - 575] 
+EQEKGRGHWNQFLRVALLGFSSIPSFVG
+>14520830-14521117(-)_17 [532 - 576] 
+ELHYLVFLPFQVSWD
+>14520830-14521117(-)_18 [575 - 543] (REVERSE SENSE) 
+SHETWNGRKTK
+>14520830-14521117(-)_19 [574 - 524] (REVERSE SENSE) 
+PTKLGMEEKPSSATLKN
+>14520830-14521117(-)_20 [576 - 466] (REVERSE SENSE) 
+VPRNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC
+>14520830-14521117(-)_21 [520 - 458] (REVERSE SENSE) 
+FQCPLPFSCSHLFCLLVLLSL
+>14520830-14521117(-)_22 [454 - 401] (REVERSE SENSE) 
+TQVHHSNLRAGHCPSFGE
+>14520830-14521117(-)_23 [397 - 359] (REVERSE SENSE) 
+ELQLSQARHPPRS
+>14520830-14521117(-)_24 [355 - 311] (REVERSE SENSE) 
+ENQTKSYCKSKFSAG
+>14520830-14521117(-)_25 [539 - 255] (REVERSE SENSE) 
+CNSQELIPVSSSFFLFSLVLLVGSVKSLNTGPSFQLTCGALPFIWRIGTATVTGKASTQI
+VGKSNQILLQVKILCWLKYSPGALSHETWNGRKTK
+>14520830-14521117(-)_26 [307 - 236] (REVERSE SENSE) 
+NILQGLCPTKLGMEEKPSSATLKN
+>14520830-14521117(-)_27 [462 - 178] (REVERSE SENSE) 
+VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGFVP
+RNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC
+>14520830-14521117(-)_28 [232 - 170] (REVERSE SENSE) 
+FQCPLPFSCSHLFCLLVLLSL
+>14520830-14521117(-)_29 [166 - 113] (REVERSE SENSE) 
+TQVHHSNLRAGHCPSFGE
+>14520830-14521117(-)_30 [109 - 71] (REVERSE SENSE) 
+ELQLSQARHPPRS
+>14520830-14521117(-)_31 [67 - 23] (REVERSE SENSE) 
+ENQTKSYCKSKFSAG
+>14520830-14521117(-)_32 [251 - 3] (REVERSE SENSE) 
+CNSQELIPVSSSFFLFSLVLLVGSVKSLNTGPSFQLTCGALPFIWRIGTATVTGKASTQI
+VGKSNQILLQVKILCWLKYSPGA
+>14520830-14521117(-)_33 [174 - 1] (REVERSE SENSE) 
+VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGF
+>103089310-103089560(-)_1 [2 - 37] 
+GTSEKFLKILLS
+>103089310-103089560(-)_2 [24 - 92] 
+RFYYHRYLFWFCVSVLSADGPKL
+>103089310-103089560(-)_3 [13 - 117] 
+KVSEDFIIIDTCFGFVYLYSLQMVQNCNGVCIRRK
+>103089310-103089560(-)_4 [138 - 167] 
+TACIWLHCGL
+>103089310-103089560(-)_5 [180 - 260] 
+NHPYVSVSGYTRKRKESQSGTKSGRYV
+>103089310-103089560(-)_6 [41 - 271] 
+ILVLVLCICTLCRWSKIVMESVLEENKGKIRLNCMYMAPLWLVTLLKSSVCQCIWIHEEK
+ERVSEWNKEWEVRLKSF
+>103089310-103089560(-)_7 [127 - 288] 
+NQAELHVYGSTVACDTFKIIRMSVYLDTRGKGKSLRVEQRVGGTSEKFLKILLS
+>103089310-103089560(-)_8 [275 - 343] 
+RFYYHRYLFWFCVSVLSADGPKL
+>103089310-103089560(-)_9 [264 - 368] 
+KVSEDFIIIDTCFGFVYLYSLQMVQNCNGVCIRRK
+>103089310-103089560(-)_10 [389 - 418] 
+TACIWLHCGL
+>103089310-103089560(-)_11 [378 - 500] 
+NQAELHVYGSTVACDTFKIIRMSVYLDTRGKGKSLRVEQRV
+>103089310-103089560(-)_12 [292 - 501] 
+ILVLVLCICTLCRWSKIVMESVLEENKGKIRLNCMYMAPLWLVTLLKSSVCQCIWIHEEK
+ERVSEWNKEW
+>103089310-103089560(-)_13 [431 - 502] 
+NHPYVSVSGYTRKRKESQSGTKSG
+>103089310-103089560(-)_14 [500 - 447] (REVERSE SENSE) 
+HSLFHSETLSFSSCIQIH
+>103089310-103089560(-)_15 [480 - 436] (REVERSE SENSE) 
+DSFLFLVYPDTLTYG
+>103089310-103089560(-)_16 [502 - 383] (REVERSE SENSE) 
+PTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA
+>103089310-103089560(-)_17 [426 - 361] (REVERSE SENSE) 
+KCHKPQWSHIHAVQPDFTLIFF
+>103089310-103089560(-)_18 [357 - 289] (REVERSE SENSE) 
+YRLHYNFGPSAESTDTQNQNKYL
+>103089310-103089560(-)_19 [379 - 233] (REVERSE SENSE) 
+FYPYFLLIQTPLQFWTICREYRYTKPKQVSMIIKSSETFQTYLPLFVPL
+>103089310-103089560(-)_20 [279 - 196] (REVERSE SENSE) 
+NLQKLFRRTSHSLFHSETLSFSSCIQIH
+>103089310-103089560(-)_21 [229 - 185] (REVERSE SENSE) 
+DSFLFLVYPDTLTYG
+>103089310-103089560(-)_22 [443 - 132] (REVERSE SENSE) 
+HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNF
+SDVPPTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA
+>103089310-103089560(-)_23 [175 - 110] (REVERSE SENSE) 
+KCHKPQWSHIHAVQPDFTLIFF
+>103089310-103089560(-)_24 [106 - 38] (REVERSE SENSE) 
+YRLHYNFGPSAESTDTQNQNKYL
+>103089310-103089560(-)_25 [128 - 3] (REVERSE SENSE) 
+FYPYFLLIQTPLQFWTICREYRYTKPKQVSMIIKSSETFQTY
+>103089310-103089560(-)_26 [192 - 1] (REVERSE SENSE) 
+HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNF
+SDVP
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output.fasta	Wed Jun 20 11:02:06 2018 -0400
@@ -0,0 +1,4 @@
+>14520830-14521117(-)_27 [462 - 178] (REVERSE SENSE)
+VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGFVPRNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC
+>103089310-103089560(-)_22 [443 - 132] (REVERSE SENSE)
+HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNFSDVPPTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_output.tab	Wed Jun 20 11:02:06 2018 -0400
@@ -0,0 +1,60 @@
+seqID	start	end	length	orientation	longest
+14520830-14521117(-)	2	37	35	Forward	n
+14520830-14521117(-)	3	47	44	Forward	n
+14520830-14521117(-)	41	94	53	Forward	n
+14520830-14521117(-)	63	152	89	Forward	n
+14520830-14521117(-)	146	175	29	Forward	n
+14520830-14521117(-)	156	200	44	Forward	n
+14520830-14521117(-)	1	240	239	Forward	n
+14520830-14521117(-)	179	325	146	Forward	n
+14520830-14521117(-)	204	335	131	Forward	n
+14520830-14521117(-)	329	382	53	Forward	n
+14520830-14521117(-)	351	440	89	Forward	n
+14520830-14521117(-)	434	463	29	Forward	n
+14520830-14521117(-)	444	488	44	Forward	n
+14520830-14521117(-)	244	528	284	Forward	n
+14520830-14521117(-)	467	574	107	Forward	n
+14520830-14521117(-)	492	575	83	Forward	n
+14520830-14521117(-)	532	576	44	Forward	n
+14520830-14521117(-)	575	543	32	Forward	n
+14520830-14521117(-)	574	524	50	Forward	n
+14520830-14521117(-)	576	466	110	Forward	n
+14520830-14521117(-)	520	458	62	Forward	n
+14520830-14521117(-)	454	401	53	Forward	n
+14520830-14521117(-)	397	359	38	Forward	n
+14520830-14521117(-)	355	311	44	Forward	n
+14520830-14521117(-)	539	255	284	Forward	n
+14520830-14521117(-)	307	236	71	Forward	n
+14520830-14521117(-)	462	178	284	Forward	y
+14520830-14521117(-)	232	170	62	Forward	n
+14520830-14521117(-)	166	113	53	Forward	n
+14520830-14521117(-)	109	71	38	Forward	n
+14520830-14521117(-)	67	23	44	Forward	n
+14520830-14521117(-)	251	3	248	Forward	n
+14520830-14521117(-)	174	1	173	Forward	n
+103089310-103089560(-)	2	37	35	Reverse	n
+103089310-103089560(-)	24	92	68	Reverse	n
+103089310-103089560(-)	13	117	104	Reverse	n
+103089310-103089560(-)	138	167	29	Reverse	n
+103089310-103089560(-)	180	260	80	Reverse	n
+103089310-103089560(-)	41	271	230	Reverse	n
+103089310-103089560(-)	127	288	161	Reverse	n
+103089310-103089560(-)	275	343	68	Reverse	n
+103089310-103089560(-)	264	368	104	Reverse	n
+103089310-103089560(-)	389	418	29	Reverse	n
+103089310-103089560(-)	378	500	122	Reverse	n
+103089310-103089560(-)	292	501	209	Reverse	n
+103089310-103089560(-)	431	502	71	Reverse	n
+103089310-103089560(-)	500	447	53	Reverse	n
+103089310-103089560(-)	480	436	44	Reverse	n
+103089310-103089560(-)	502	383	119	Reverse	n
+103089310-103089560(-)	426	361	65	Reverse	n
+103089310-103089560(-)	357	289	68	Reverse	n
+103089310-103089560(-)	379	233	146	Reverse	n
+103089310-103089560(-)	279	196	83	Reverse	n
+103089310-103089560(-)	229	185	44	Reverse	n
+103089310-103089560(-)	443	132	311	Reverse	y
+103089310-103089560(-)	175	110	65	Reverse	n
+103089310-103089560(-)	106	38	68	Reverse	n
+103089310-103089560(-)	128	3	125	Reverse	n
+103089310-103089560(-)	192	1	191	Reverse	n