Mercurial > repos > drosofff > blastn_to_scaffold
comparison blastn_to_scaffold.py @ 0:ad9a1c117ac6 draft default tip
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Sun, 21 Jun 2015 14:41:10 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ad9a1c117ac6 |
---|---|
1 #!/usr/bin/python | |
2 import sys | |
3 import argparse | |
4 | |
5 def insert_newlines(string, every=60): | |
6 lines = [] | |
7 for i in xrange(0, len(string), every): | |
8 lines.append(string[i:i+every]) | |
9 return '\n'.join(lines) | |
10 | |
11 def getseq (fastadict, transcript, up, down, orientation="direct"): | |
12 def reverse (seq): | |
13 revdict = {"A":"T","T":"A","G":"C","C":"G","N":"N"} | |
14 revseq = [revdict[i] for i in seq[::-1]] | |
15 return "".join(revseq) | |
16 pickseq = fastadict[transcript][up-1:down] | |
17 if orientation == "direct": | |
18 return pickseq | |
19 else: | |
20 return reverse(pickseq) | |
21 | |
22 def Parser(): | |
23 the_parser = argparse.ArgumentParser( | |
24 description="Generate DNA scaffold from blastn alignment of Contigs") | |
25 the_parser.add_argument( | |
26 '--sequences', action="store", type=str, help="input sequence file in fasta format") | |
27 the_parser.add_argument( | |
28 '--guideSequence', action="store", type=str, help="the reference sequence to guide the scaffold assembly in fasta format") | |
29 the_parser.add_argument( | |
30 '--blastn-tab', dest="blastn_tab", action="store", type=str, help="13-columns tabular blastn output") | |
31 the_parser.add_argument( | |
32 '--output', action="store", type=str, help="output file path, fasta format") | |
33 args = the_parser.parse_args() | |
34 return args | |
35 | |
36 def blatnInfo (file): | |
37 blastlist = [] | |
38 with open(file, "r") as f: | |
39 for line in f: | |
40 minilist = [] | |
41 fields = line.rstrip().split() | |
42 minilist.append(fields[0]) | |
43 minilist.extend(fields[6:10]) | |
44 blastlist.append(minilist) | |
45 blastlist.sort(key=lambda x: x[3], reverse=True) | |
46 return blastlist | |
47 | |
48 def myContigs (file): | |
49 Contigs = {} | |
50 with open(file, "r") as f: | |
51 for line in f: | |
52 if line[0] == ">": | |
53 header = line[1:-1] | |
54 Contigs[header] = "" | |
55 else: | |
56 Contigs[header] += line[:-1] | |
57 return Contigs | |
58 | |
59 def myGuide (file): | |
60 Guide = {} | |
61 coordinate = 0 | |
62 with open(file, "r") as f: | |
63 for line in f: | |
64 if line[0] == ">": | |
65 continue | |
66 else: | |
67 for nucleotide in line[:-1]: | |
68 coordinate += 1 | |
69 Guide[coordinate] = nucleotide.lower() | |
70 return Guide | |
71 | |
72 def updateGuide (blastlist, GuideDict, ContigsDict): | |
73 ''' | |
74 the blastlist object is a list of list with | |
75 element [0] : name of the blasted Contig | |
76 element [1] : queryStart of the alignment to the reference | |
77 element [2] = queryStop of the alignment to the reference | |
78 element [3] : subjectStart of the alignment to the reference | |
79 element [4] = subjectStop of the alignment to the reference | |
80 ''' | |
81 for fields in blastlist: | |
82 seqHeader = fields[0] | |
83 queryStart = int(fields[1]) | |
84 queryStop = int(fields[2]) | |
85 subjectStart = int(fields[3]) | |
86 subjectStop = int(fields[4]) | |
87 if subjectStart > subjectStop: | |
88 subjectStart, subjectStop = subjectStop, subjectStart | |
89 orientation = "reverse" | |
90 else: | |
91 orientation = "direct" | |
92 sequence = getseq (ContigsDict, seqHeader, queryStart, queryStop, orientation) | |
93 for i in range(subjectStart, subjectStop+1): | |
94 try: | |
95 del GuideDict[i] | |
96 except KeyError: | |
97 continue | |
98 for i, nucleotide in enumerate(sequence): | |
99 GuideDict[i+subjectStart] = nucleotide | |
100 | |
101 def finalAssembly (GuideDict, outputfile): | |
102 finalSeqList = [] | |
103 for keys in sorted(GuideDict): | |
104 finalSeqList.append(GuideDict[keys]) | |
105 finalSequence = insert_newlines("".join(finalSeqList) ) | |
106 Out = open (outputfile, "w") | |
107 print >> Out, ">Scaffold" | |
108 print >> Out, finalSequence | |
109 Out.close() | |
110 | |
111 def __main__(): | |
112 args = Parser() | |
113 ContigsDict = myContigs (args.sequences) | |
114 GuideDict = myGuide (args.guideSequence) | |
115 blastlist = blatnInfo(args.blastn_tab) | |
116 updateGuide(blastlist, GuideDict, ContigsDict) | |
117 finalAssembly(GuideDict, args.output) | |
118 | |
119 if __name__ == "__main__": | |
120 __main__() |