annotate blastn_to_scaffold.py @ 0:ad9a1c117ac6 draft default tip

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Sun, 21 Jun 2015 14:41:10 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
1 #!/usr/bin/python
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
2 import sys
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
3 import argparse
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
4
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
5 def insert_newlines(string, every=60):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
6 lines = []
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
7 for i in xrange(0, len(string), every):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
8 lines.append(string[i:i+every])
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
9 return '\n'.join(lines)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
10
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
11 def getseq (fastadict, transcript, up, down, orientation="direct"):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
12 def reverse (seq):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
13 revdict = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
14 revseq = [revdict[i] for i in seq[::-1]]
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
15 return "".join(revseq)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
16 pickseq = fastadict[transcript][up-1:down]
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
17 if orientation == "direct":
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
18 return pickseq
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
19 else:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
20 return reverse(pickseq)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
21
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
22 def Parser():
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
23 the_parser = argparse.ArgumentParser(
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
24 description="Generate DNA scaffold from blastn alignment of Contigs")
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
25 the_parser.add_argument(
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
26 '--sequences', action="store", type=str, help="input sequence file in fasta format")
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
27 the_parser.add_argument(
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
28 '--guideSequence', action="store", type=str, help="the reference sequence to guide the scaffold assembly in fasta format")
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
29 the_parser.add_argument(
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
30 '--blastn-tab', dest="blastn_tab", action="store", type=str, help="13-columns tabular blastn output")
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
31 the_parser.add_argument(
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
32 '--output', action="store", type=str, help="output file path, fasta format")
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
33 args = the_parser.parse_args()
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
34 return args
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
35
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
36 def blatnInfo (file):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
37 blastlist = []
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
38 with open(file, "r") as f:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
39 for line in f:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
40 minilist = []
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
41 fields = line.rstrip().split()
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
42 minilist.append(fields[0])
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
43 minilist.extend(fields[6:10])
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
44 blastlist.append(minilist)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
45 blastlist.sort(key=lambda x: x[3], reverse=True)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
46 return blastlist
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
47
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
48 def myContigs (file):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
49 Contigs = {}
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
50 with open(file, "r") as f:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
51 for line in f:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
52 if line[0] == ">":
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
53 header = line[1:-1]
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
54 Contigs[header] = ""
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
55 else:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
56 Contigs[header] += line[:-1]
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
57 return Contigs
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
58
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
59 def myGuide (file):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
60 Guide = {}
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
61 coordinate = 0
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
62 with open(file, "r") as f:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
63 for line in f:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
64 if line[0] == ">":
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
65 continue
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
66 else:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
67 for nucleotide in line[:-1]:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
68 coordinate += 1
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
69 Guide[coordinate] = nucleotide.lower()
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
70 return Guide
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
71
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
72 def updateGuide (blastlist, GuideDict, ContigsDict):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
73 '''
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
74 the blastlist object is a list of list with
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
75 element [0] : name of the blasted Contig
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
76 element [1] : queryStart of the alignment to the reference
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
77 element [2] = queryStop of the alignment to the reference
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
78 element [3] : subjectStart of the alignment to the reference
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
79 element [4] = subjectStop of the alignment to the reference
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
80 '''
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
81 for fields in blastlist:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
82 seqHeader = fields[0]
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
83 queryStart = int(fields[1])
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
84 queryStop = int(fields[2])
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
85 subjectStart = int(fields[3])
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
86 subjectStop = int(fields[4])
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
87 if subjectStart > subjectStop:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
88 subjectStart, subjectStop = subjectStop, subjectStart
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
89 orientation = "reverse"
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
90 else:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
91 orientation = "direct"
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
92 sequence = getseq (ContigsDict, seqHeader, queryStart, queryStop, orientation)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
93 for i in range(subjectStart, subjectStop+1):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
94 try:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
95 del GuideDict[i]
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
96 except KeyError:
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
97 continue
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
98 for i, nucleotide in enumerate(sequence):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
99 GuideDict[i+subjectStart] = nucleotide
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
100
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
101 def finalAssembly (GuideDict, outputfile):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
102 finalSeqList = []
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
103 for keys in sorted(GuideDict):
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
104 finalSeqList.append(GuideDict[keys])
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
105 finalSequence = insert_newlines("".join(finalSeqList) )
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
106 Out = open (outputfile, "w")
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
107 print >> Out, ">Scaffold"
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
108 print >> Out, finalSequence
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
109 Out.close()
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
110
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
111 def __main__():
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
112 args = Parser()
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
113 ContigsDict = myContigs (args.sequences)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
114 GuideDict = myGuide (args.guideSequence)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
115 blastlist = blatnInfo(args.blastn_tab)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
116 updateGuide(blastlist, GuideDict, ContigsDict)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
117 finalAssembly(GuideDict, args.output)
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
118
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
119 if __name__ == "__main__":
ad9a1c117ac6 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
120 __main__()