comparison: BlastParser_and_hits.py @ 0:9dfb65ebb02e (draft)
commit message: planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blastparser_and_hits commit 48132e5edac97d54804ccbaf620068a5fb800bdc

author:    artbio
date:      Sun, 15 Oct 2017 18:43:37 -0400
parents:   (none)
children:  36103afa0934
compared revisions: -1:000000000000 -> 0:9dfb65ebb02e

#!/usr/bin/python
import argparse
from collections import defaultdict


def Parser():
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument('--blast', action="store", type=str,
                            help="Path to the blast output (tabular format,\
                            12 standard columns plus subject length as a\
                            13th column)")
    the_parser.add_argument('--sequences', action="store", type=str,
                            help="Path to the fasta file with blasted\
                            sequences")
    the_parser.add_argument('--fastaOutput', action="store", type=str,
                            help="fasta output file of blast hits")
    the_parser.add_argument('--tabularOutput', action="store", type=str,
                            help="tabular output file of blast analysis")
    the_parser.add_argument('--flanking', action="store", type=int,
                            help="number of flanking nucleotides\
                            added to the hit sequences")
    the_parser.add_argument('--mode', action="store",
                            choices=["verbose", "short"], type=str,
                            help="reporting (verbose) or not reporting (short)\
                            oases contigs")
    the_parser.add_argument('--filter_relativeCov', action="store", type=float,
                            default=0,
                            help="filter out relative coverages\
                            below the specified ratio (float number)")
    the_parser.add_argument('--filter_maxScore', action="store", type=float,
                            default=0, help="filter out best BitScores below\
                            the specified float number")
    the_parser.add_argument('--filter_meanScore', action="store", type=float,
                            default=0,
                            help="filter out mean BitScores below the\
                            specified float number")
    the_parser.add_argument('--filter_term_in', action="store", type=str,
                            default="",
                            help="select the specified term in the\
                            subject list")
    the_parser.add_argument('--filter_term_out', action="store", type=str,
                            default="",
                            help="exclude the specified term from\
                            the subject list")
    the_parser.add_argument('--al_sequences', action="store", type=str,
                            help="sequences that have been blast aligned")
    the_parser.add_argument('--un_sequences', action="store", type=str,
                            help="sequences that have not been blast aligned")
    the_parser.add_argument('--dataset_name', action="store", type=str,
                            default="",
                            help="the name of the dataset that has been parsed,\
                            to be reported in the output")
    args = the_parser.parse_args()
    if not all((args.sequences, args.blast, args.fastaOutput,
                args.tabularOutput)):
        the_parser.error('argument(s) missing, call the\
                         -h option of the script')
    if not args.flanking:
        args.flanking = 0
    return args
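
# Example invocation (added annotation, not part of the original tool): the
# file names below are hypothetical and only illustrate the options defined
# in Parser(); in the Galaxy tool the wrapper supplies the real paths.
#
#   python BlastParser_and_hits.py --blast blast_output.tab \
#       --sequences contigs.fa --fastaOutput hit_sequences.fa \
#       --tabularOutput blast_report.tab --flanking 5 --mode verbose \
#       --al_sequences matched.fa --un_sequences unmatched.fa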


def median(lst):
    lst = sorted(lst)
    if len(lst) < 1:
        return None
    if len(lst) % 2 == 1:
        # floor division keeps the index an integer under python 2 and 3
        return lst[((len(lst)+1)//2)-1]
    if len(lst) % 2 == 0:
        return float(sum(lst[(len(lst)//2)-1:(len(lst)//2)+1]))/2.0


def mean(lst):
    if len(lst) < 1:
        return 0
    return sum(lst) / float(len(lst))


def getfasta(fastafile):
    fastadic = {}
    for line in open(fastafile):
        if line[0] == ">":
            header = line[1:-1]
            fastadic[header] = ""
        else:
            fastadic[header] += line
    for header in fastadic:
        fastadic[header] = "".join(fastadic[header].split("\n"))
    return fastadic
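
# Illustration (added annotation): for a fasta file containing
#   >contig_1
#   ATGCGT
#   ACGT
# getfasta() returns {'contig_1': 'ATGCGTACGT'}: one entry per header line,
# with the sequence joined onto a single string. Headers are taken verbatim
# (minus '>' and the newline), so they must match the query names used in
# the blast tabular file.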


def insert_newlines(string, every=60):
    lines = []
    for i in range(0, len(string), every):
        lines.append(string[i:i+every])
    return '\n'.join(lines)


def getblast(blastfile):
    '''blastinfo [0] Percentage of identical matches
       blastinfo [1] Alignment length
       blastinfo [2] Number of mismatches
       blastinfo [3] Number of gap openings
       blastinfo [4] Start of alignment in query
       blastinfo [5] End of alignment in query
       blastinfo [6] Start of alignment in subject (database hit)
       blastinfo [7] End of alignment in subject (database hit)
       blastinfo [8] Expectation value (E-value)
       blastinfo [9] Bit score
       blastinfo [10] Subject length
       (NEEDS TO BE SPECIFIED WHEN RUNNING BLAST) '''
    blastdic = defaultdict(dict)
    for line in open(blastfile):
        fields = line[:-1].split("\t")
        transcript = fields[0]
        subject = fields[1]
        # blastinfo[0]
        blastinfo = [float(fields[2])]
        # blastinfo[1:8], items 1 to 7
        blastinfo = blastinfo + [int(i) for i in fields[3:10]]
        # blastinfo[8] E-value remains as a string type
        blastinfo.append(fields[10])
        # blastinfo[9] Bit score
        blastinfo.append(float(fields[11]))
        # blastinfo[10] Subject length MUST BE RETRIEVED
        # THROUGH A 13 COLUMN BLAST OUTPUT
        blastinfo.append(int(fields[12]))
        try:
            blastdic[subject][transcript].append(blastinfo)
        except Exception:
            blastdic[subject][transcript] = [blastinfo]
    return blastdic
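
# Illustration (added annotation): for a 13-column blast line such as
#   contig_1  virusX  98.5  200  3  0  1  200  50  249  1e-50  380  1000
# getblast() stores under blastdic['virusX']['contig_1'] the hit
#   [98.5, 200, 3, 0, 1, 200, 50, 249, '1e-50', 380.0, 1000]
# (percent identity as float, columns 4-10 as int, E-value kept as a string,
# bit score as float, subject length as int).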


def getseq(fastadict, transcript, up, down, orientation="direct"):
    def reverse(seq):
        revdict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
        revseq = [revdict[i] for i in seq[::-1]]
        return "".join(revseq)
    pickseq = fastadict[transcript][up-1:down]
    if orientation == "direct":
        return pickseq
    else:
        return reverse(pickseq)
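
# Note (added annotation): getseq() uses 1-based, inclusive coordinates, so
# getseq(fastadict, 'contig_1', 3, 6) returns characters 3 to 6 of the stored
# sequence; with any other orientation the slice is returned reverse
# complemented. The complement table only covers upper-case A, T, G, C and N,
# so other characters in the fasta would raise a KeyError.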


def subjectCoverage(fastadict, blastdict, subject,
                    QueriesFlankingNucleotides=0):
    SubjectCoverageList = []
    HitDic = {}
    bitScores = []
    for transcript in blastdict[subject]:
        prefix = "%s--%s_" % (subject, transcript)
        hitNumber = 0
        for hit in blastdict[subject][transcript]:
            hitNumber += 1
            suffix = "hit%s_IdMatch=%s,AligLength=%s,E-val=%s" % (hitNumber,
                                                                  hit[0],
                                                                  hit[1],
                                                                  hit[8])
            # query coverage by a hit is in hit[4:6]
            HitDic[prefix+suffix] = GetHitSequence(fastadict, transcript,
                                                   hit[4], hit[5],
                                                   QueriesFlankingNucleotides)
            # subject coverage by a hit is in hit[6:8]
            SubjectCoverageList += range(min([hit[6], hit[7]]),
                                         max([hit[6], hit[7]]) + 1)
            bitScores.append(hit[9])
            # always the same value for a given subject. Stupid but simple
            subjectLength = hit[10]
    TotalSubjectCoverage = len(set(SubjectCoverageList))
    RelativeSubjectCoverage = TotalSubjectCoverage/float(subjectLength)
    return (HitDic, subjectLength, TotalSubjectCoverage,
            RelativeSubjectCoverage, max(bitScores), mean(bitScores))
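
# Worked example (added annotation, hypothetical values): if a subject of
# length 1000 is covered by two hits spanning subject positions 50-249 and
# 200-399, SubjectCoverageList contains every covered position, so
# TotalSubjectCoverage = len(set(...)) = 350 (the overlap is counted once)
# and RelativeSubjectCoverage = 350/1000 = 0.35; maxBitScores and
# meanBitScores are computed over the bit scores of all hits on that subject.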


def GetHitSequence(fastadict, FastaHeader, leftCoordinate, rightCoordinate,
                   FlankingValue):
    if rightCoordinate > leftCoordinate:
        polarity = "direct"
    else:
        polarity = "reverse"
        leftCoordinate, rightCoordinate = rightCoordinate, leftCoordinate
    if leftCoordinate - FlankingValue > 0:
        leftCoordinate -= FlankingValue
    else:
        leftCoordinate = 1
    return getseq(fastadict, FastaHeader, leftCoordinate, rightCoordinate,
                  polarity)
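
# Illustration (added annotation): GetHitSequence() orients the hit from its
# query coordinates. A hit running 120 -> 80 on the query is reported in
# "reverse" polarity, the coordinates are swapped, and with FlankingValue = 5
# the left coordinate becomes 75 (clipped to 1 near the 5' end); the sequence
# is then returned reverse complemented by getseq(). Note that, as written,
# the flanking value is only applied to the left coordinate.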


def outputParsing(dataset_name, F, Fasta, results, Xblastdict, fastadict,
                  filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0,
                  filter_term_in="", filter_term_out="", mode="verbose"):
    def filter_results(results, filter_relativeCov=0, filter_maxScore=0,
                       filter_meanScore=0, filter_term_in="",
                       filter_term_out=""):
        # iterate over a copy of the keys so that entries can be deleted
        # while looping
        for subject in list(results.keys()):
            if results[subject][
                    "RelativeSubjectCoverage"] < filter_relativeCov:
                del results[subject]
                continue
            if results[subject]["maxBitScores"] < filter_maxScore:
                del results[subject]
                continue
            if results[subject]["meanBitScores"] < filter_meanScore:
                del results[subject]
                continue
            if filter_term_in in subject:
                pass
            else:
                del results[subject]
                continue
            if filter_term_out and filter_term_out in subject:
                del results[subject]
                continue
        return results

    F = open(F, "w")
    Fasta = open(Fasta, "w")
    blasted_transcripts = []
    filter_results(results, filter_relativeCov, filter_maxScore,
                   filter_meanScore, filter_term_in, filter_term_out)
    for subject in results:
        for transcript in Xblastdict[subject]:
            blasted_transcripts.append(transcript)
    blasted_transcripts = list(set(blasted_transcripts))
    if mode == "verbose":
        F.write("--- %s ---\n" % dataset_name)
        F.write("# %s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ("SeqId", "%Identity",
                                                        "AlignLength",
                                                        "StartSubject",
                                                        "EndSubject",
                                                        "%QueryHitCov",
                                                        "E-value",
                                                        "BitScore"))
        for subject in sorted(results,
                              key=lambda x: results[x]["meanBitScores"],
                              reverse=True):
            F.write(" \n# %s\n" % subject)
            F.write("# Subject Length: %s\n" %
                    results[subject]["subjectLength"])
            F.write("# Total Subject Coverage: %s\n" %
                    results[subject]["TotalCoverage"])
            F.write("# Relative Subject Coverage: %s\n" %
                    results[subject]["RelativeSubjectCoverage"])
            F.write("# Best Bit Score: %s\n" % results[subject][
                "maxBitScores"])
            F.write("# Mean Bit Score: %s\n" % results[subject][
                "meanBitScores"])
            for header in results[subject]["HitDic"]:
                Fasta.write(">%s\n%s\n" % (header,
                                           insert_newlines(results[subject][
                                               "HitDic"][
                                               header])))
            Fasta.write("\n")  # final carriage return for the sequence
            for transcript in Xblastdict[subject]:
                transcriptSize = float(len(fastadict[transcript]))
                for hit in Xblastdict[subject][transcript]:
                    percentIdentity = hit[0]
                    alignLength = hit[1]
                    subjectStart = hit[6]
                    subjectEnd = hit[7]
                    queryCov = "%.1f" % (abs(hit[5]-hit[4])/transcriptSize*100)
                    Eval, BitScore = hit[8], hit[9]
                    info = [transcript] + [percentIdentity, alignLength,
                                           subjectStart, subjectEnd, queryCov,
                                           Eval, BitScore]
                    info = [str(i) for i in info]
                    info = "\t".join(info)
                    F.write("%s\n" % info)
    else:
        F.write("--- %s ---\n" % dataset_name)
        F.write("# subject\tsubject length\tTotal Subject Coverage\t"
                "Relative Subject Coverage\tBest Bit Score\tMean Bit Score\n")
        for subject in sorted(results,
                              key=lambda x: results[x]["meanBitScores"],
                              reverse=True):
            line = []
            line.append(subject)
            line.append(results[subject]["subjectLength"])
            line.append(results[subject]["TotalCoverage"])
            line.append(results[subject]["RelativeSubjectCoverage"])
            line.append(results[subject]["maxBitScores"])
            line.append(results[subject]["meanBitScores"])
            line = [str(i) for i in line]
            F.write("%s\n" % "\t".join(line))
            for header in results[subject]["HitDic"]:
                Fasta.write(">%s\n%s\n" % (header,
                                           insert_newlines(
                                               results[subject][
                                                   "HitDic"][header])))
            Fasta.write("\n")  # final carriage return for the sequence
    F.close()
    Fasta.close()
    return blasted_transcripts
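
# Illustration (added annotation): in "short" mode the tabular output has one
# tab-separated line per retained subject (subject, subject length, total and
# relative subject coverage, best and mean bit score), subjects sorted by
# decreasing mean bit score; "verbose" mode additionally lists every
# individual hit (identity, alignment length, subject start/end, query hit
# coverage, E-value, bit score) under each subject.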


def dispatch_sequences(fastadict, blasted_transcripts, matched_sequences,
                       unmatched_sequences):
    '''output the sequences that matched and did not match in the blast'''
    F_matched = open(matched_sequences, "w")
    F_unmatched = open(unmatched_sequences, "w")
    for transcript in fastadict:
        if transcript in blasted_transcripts:
            '''the list of blasted_transcripts is generated
            by the outputParsing function'''
            F_matched.write(">%s\n%s\n" % (transcript, insert_newlines(
                fastadict[transcript])))
        else:
            F_unmatched.write(">%s\n%s\n" % (transcript, insert_newlines(
                fastadict[transcript])))
    F_matched.close()
    F_unmatched.close()
    return


def __main__():
    args = Parser()
    fastadict = getfasta(args.sequences)
    Xblastdict = getblast(args.blast)
    results = defaultdict(dict)
    for subject in Xblastdict:
        results[subject]["HitDic"], results[subject]["subjectLength"], results[
            subject]["TotalCoverage"], results[subject][
            "RelativeSubjectCoverage"], results[subject][
            "maxBitScores"], results[subject][
            "meanBitScores"] = subjectCoverage(fastadict, Xblastdict, subject,
                                               args.flanking)
    blasted_transcripts = outputParsing(
        args.dataset_name, args.tabularOutput,
        args.fastaOutput, results, Xblastdict, fastadict,
        filter_relativeCov=args.filter_relativeCov,
        filter_maxScore=args.filter_maxScore,
        filter_meanScore=args.filter_meanScore,
        filter_term_in=args.filter_term_in,
        filter_term_out=args.filter_term_out, mode=args.mode)
    dispatch_sequences(fastadict, blasted_transcripts, args.al_sequences,
                       args.un_sequences)


if __name__ == "__main__":
    __main__()