Mercurial > repos > artbio > blast_to_scaffold
comparison blast_to_scaffold.py @ 0:7d96b28eec49 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blast_to_scaffold commit 48a4098045106f363e92357949b32617a2e868c1
author | artbio |
---|---|
date | Sun, 15 Oct 2017 12:52:40 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7d96b28eec49 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 | |
4 | |
5 def insert_newlines(string, every=60): | |
6 lines = [] | |
7 for i in range(0, len(string), every): | |
8 lines.append(string[i:i+every]) | |
9 return '\n'.join(lines) | |
10 | |
11 | |
12 def getseq(fastadict, transcript, up, down, orientation="direct"): | |
13 def reverse(seq): | |
14 revdict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} | |
15 revseq = [revdict[i] for i in seq[::-1]] | |
16 return "".join(revseq) | |
17 pickseq = fastadict[transcript][up-1:down] | |
18 if orientation == "direct": | |
19 return pickseq | |
20 else: | |
21 return reverse(pickseq) | |
22 | |
23 | |
24 def Parser(): | |
25 the_parser = argparse.ArgumentParser( | |
26 description="Generate DNA scaffold from blastn or tblastx alignment\ | |
27 of Contigs") | |
28 the_parser.add_argument('--sequences', action="store", type=str, | |
29 help="input sequence file in fasta format") | |
30 the_parser.add_argument('--guideSequence', action="store", type=str, | |
31 help="the reference sequence to guide the scaffold\ | |
32 assembly in fasta format") | |
33 the_parser.add_argument('--blast-tab', dest="blast_tab", action="store", | |
34 type=str, | |
35 help="13-columns tabular blastn or tblastx output") | |
36 the_parser.add_argument('--output', action="store", type=str, | |
37 help="output file path, fasta format") | |
38 the_parser.add_argument('--scaffold_prefix', action="store", type=str, | |
39 help="the prefix that will be used for the header\ | |
40 of the fasta scaffold") | |
41 the_parser.add_argument('--scaffold_suffix', action="store", type=str, | |
42 help="the sufix that will be used for the header\ | |
43 of the fasta scaffold") | |
44 args = the_parser.parse_args() | |
45 return args | |
46 | |
47 | |
48 def blatnInfo(file): | |
49 blastlist = [] | |
50 with open(file, "r") as f: | |
51 for line in f: | |
52 minilist = [] | |
53 fields = line.rstrip().split() | |
54 minilist.append(fields[0]) | |
55 minilist.extend(fields[6:10]) | |
56 blastlist.append(minilist) | |
57 blastlist.sort(key=lambda x: x[3], reverse=True) | |
58 return blastlist | |
59 | |
60 | |
61 def myContigs(file): | |
62 Contigs = {} | |
63 with open(file, "r") as f: | |
64 for line in f: | |
65 if line[0] == ">": | |
66 header = line[1:-1] | |
67 Contigs[header] = "" | |
68 else: | |
69 Contigs[header] += line[:-1] | |
70 return Contigs | |
71 | |
72 | |
73 def myGuide(file): | |
74 Guide = {} | |
75 coordinate = 0 | |
76 with open(file, "r") as f: | |
77 for line in f: | |
78 if line[0] == ">": | |
79 continue | |
80 else: | |
81 for nucleotide in line[:-1]: | |
82 coordinate += 1 | |
83 Guide[coordinate] = nucleotide.lower() | |
84 return Guide | |
85 | |
86 | |
87 def updateGuide(blastlist, GuideDict, ContigsDict): | |
88 ''' | |
89 the blastlist object is a list of list with | |
90 element [0] : name of the blasted Contig | |
91 element [1] : queryStart of the alignment to the reference | |
92 element [2] = queryStop of the alignment to the reference | |
93 element [3] : subjectStart of the alignment to the reference | |
94 element [4] = subjectStop of the alignment to the reference | |
95 ''' | |
96 for fields in blastlist: | |
97 seqHeader = fields[0] | |
98 queryStart = int(fields[1]) | |
99 queryStop = int(fields[2]) | |
100 subjectStart = int(fields[3]) | |
101 subjectStop = int(fields[4]) | |
102 if subjectStart > subjectStop: | |
103 subjectStart, subjectStop = subjectStop, subjectStart | |
104 orientation = "reverse" | |
105 else: | |
106 orientation = "direct" | |
107 sequence = getseq(ContigsDict, seqHeader, queryStart, queryStop, | |
108 orientation) | |
109 for i in range(subjectStart, subjectStop+1): | |
110 try: | |
111 del GuideDict[i] | |
112 except KeyError: | |
113 continue | |
114 for i, nucleotide in enumerate(sequence): | |
115 GuideDict[i+subjectStart] = nucleotide | |
116 | |
117 | |
118 def finalAssembly(GuideDict, outputfile, prefix, suffix): | |
119 finalSeqList = [] | |
120 for keys in sorted(GuideDict): | |
121 finalSeqList.append(GuideDict[keys]) | |
122 finalSequence = insert_newlines("".join(finalSeqList)) | |
123 Out = open(outputfile, "w") | |
124 Out.write(">Scaffold_from_%s_guided_by_%s\n" % (prefix, suffix)) | |
125 Out.write("%s\n" % finalSequence) | |
126 Out.close() | |
127 | |
128 | |
129 def __main__(): | |
130 args = Parser() | |
131 ContigsDict = myContigs(args.sequences) | |
132 GuideDict = myGuide(args.guideSequence) | |
133 blastlist = blatnInfo(args.blast_tab) | |
134 updateGuide(blastlist, GuideDict, ContigsDict) | |
135 finalAssembly(GuideDict, args.output, args.scaffold_prefix, | |
136 args.scaffold_suffix) | |
137 | |
138 | |
139 if __name__ == "__main__": | |
140 __main__() |