Mercurial > repos > mbernt > longorf
view getLongestORF.py @ 0:ec898924d8c7 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 8e118a4d24047e2c62912b962e854f789d6ff559
author | mbernt |
---|---|
date | Wed, 20 Jun 2018 11:02:06 -0400 |
parents | |
children | 1c4b24e9bb16 |
line wrap: on
line source
#!/usr/bin/env python """ usage: getLongestORF.py input output.fas output.tab input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf") output.fas: fasta file with all longest ORFs per transcript output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs example: >253936-254394(+)_1 [28 - 63] LTNYCQMVHNIL >253936-254394(+)_2 [18 - 77] HKLIDKLLPNGAQYFVKSTQ >253936-254394(+)_3 [32 - 148] QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV >253936-254394(+)_4 [117 - 152] LGIIVPSLLLCN >248351-252461(+)_1 [14 - 85] VLARKYPRCLSPSKKSPCQLRQRS >248351-252461(+)_2 [21 - 161] PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP >248351-252461(+)_3 [89 - 202] VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG >248351-252461(+)_4 [206 - 259] SRTTSGRQSVLSEKLWRR >248351-252461(+)_5 [263 - 313] CLSPLWVPCCSRHSCHG """ import sys,re def findlongestOrf(transcriptDict,old_seqID): #write for previous seqID prevTranscript = transcriptDict[old_seqID] i_max = 0 #find longest orf in transcript for i in range(0,len(prevTranscript)): if(prevTranscript[i][2] >= prevTranscript[i_max][2]): i_max = i for i in range(0,len(prevTranscript)): prevStart = prevTranscript[i][0] prevEnd = prevTranscript[i][1] prevLength = prevTranscript[i][2] output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength) if (end - start > 0): output+="\tForward" else: output+="\tReverse" if(i == i_max): output += "\ty\n" else: output += "\tn\n" OUTPUT_ORF_SUMMARY.write(output) transcriptDict.pop(old_seqID, None) return None INPUT = open(sys.argv[1],"r") OUTPUT_FASTA = open(sys.argv[2],"w") OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w") seqID = "" old_seqID = "" lengthDict = {} seqDict = {} headerDict = {} transcriptDict = {} skip = False OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n") for line in INPUT: line = line.strip() # print line if(re.match(">",line)): #header seqID = "_".join(line.split(">")[1].split("_")[:-1]) #seqID = line.split(">")[1].split("_")[0] start = int (re.search('\ \[(\d+)\ -', line).group(1)) end = int (re.search('-\ (\d+)\]',line).group(1)) length = abs(end - start) if(seqID not in transcriptDict and old_seqID != ""): #new transcript findlongestOrf(transcriptDict,old_seqID) if seqID not in transcriptDict: transcriptDict[seqID] = [] transcriptDict[seqID].append([start,end,length]) if(seqID not in lengthDict and old_seqID != ""): #new transcript #write FASTA OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n") #delete old dict entry headerDict.pop(old_seqID, None) seqDict.pop(old_seqID, None) lengthDict.pop(old_seqID, None) #if several longest sequences exist with the same length, the dictionary saves the last occuring. if(seqID not in lengthDict or length >= lengthDict[seqID]): headerDict[seqID] = line lengthDict[seqID] = length seqDict[seqID] = "" skip = False else: skip = True next old_seqID = seqID elif(skip): next else: seqDict[seqID] += line OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]) findlongestOrf(transcriptDict,old_seqID) INPUT.close() OUTPUT_FASTA.close() OUTPUT_ORF_SUMMARY.close()