Mercurial > repos > cstrittmatter > seqsero_v2
comparison libs/mapping_and_assembly_hybrid.py @ 0:6275272ebcbc draft
planemo upload commit 9b152b4a900a8cd70df992da881c7e3fa00d4e4c-dirty
author | cstrittmatter |
---|---|
date | Thu, 21 Dec 2017 12:45:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6275272ebcbc |
---|---|
1 import os,sys,glob,time,itertools,subprocess | |
2 from Initial_Conditions import phase1 | |
3 from Initial_Conditions import phase2 | |
4 from Initial_Conditions import phaseO | |
5 from Initial_Conditions import sero | |
6 from distutils.version import LooseVersion | |
7 | |
8 | |
9 | |
10 | |
11 def xml_parse_score_comparision_seqsero(xmlfile): | |
12 #used to do seqsero xml analysis | |
13 from Bio.Blast import NCBIXML | |
14 handle=open(xmlfile) | |
15 handle=NCBIXML.parse(handle) | |
16 handle=list(handle) | |
17 List=[] | |
18 List_score=[] | |
19 List_ids=[] | |
20 for i in range(len(handle)): | |
21 if len(handle[i].alignments)>0: | |
22 for j in range(len(handle[i].alignments)): | |
23 score=0 | |
24 ids=0 | |
25 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def) | |
26 for z in range(len(handle[i].alignments[j].hsps)): | |
27 if "last" in handle[i].query or "first" in handle[i].query: | |
28 score+=handle[i].alignments[j].hsps[z].bits | |
29 ids+=float(handle[i].alignments[j].hsps[z].identities)/handle[i].query_length | |
30 else: | |
31 if handle[i].alignments[j].hsps[z].align_length>=30: | |
32 #for the long alleles, filter noise parts | |
33 score+=handle[i].alignments[j].hsps[z].bits | |
34 ids+=float(handle[i].alignments[j].hsps[z].identities)/handle[i].query_length | |
35 List_score.append(score) | |
36 List_ids.append(ids) | |
37 temp=zip(List,List_score,List_ids) | |
38 Final_list=sorted(temp, key=lambda d:d[1], reverse = True) | |
39 return Final_list | |
40 | |
41 | |
42 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number | |
43 Old=L | |
44 L.sort() | |
45 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]] | |
46 count=[] | |
47 for j in range(len(L)): | |
48 y=0 | |
49 for x in Old: | |
50 if L[j]==x: | |
51 y+=1 | |
52 count.append(y) | |
53 if sort_on_fre!="none": | |
54 d=zip(*sorted(zip(count, L))) | |
55 L=d[1] | |
56 count=d[0] | |
57 return (L,count) | |
58 | |
59 | |
60 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list): | |
61 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test | |
62 #this is mainly used for | |
63 a=nodes_vs_score_list | |
64 fliC_score=0 | |
65 fljB_score=0 | |
66 for z in a: | |
67 if "fliC" in z[0]: | |
68 fliC_score+=z[1] | |
69 elif "fljB" in z[0]: | |
70 fljB_score+=z[1] | |
71 if fliC_score>=fljB_score: | |
72 role="fliC" | |
73 else: | |
74 role="fljB" | |
75 return (role,abs(fliC_score-fljB_score)) | |
76 | |
77 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list_passed): | |
78 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small) | |
79 #also used when no head or tail got blasted score for the contig | |
80 role="" | |
81 for z in Final_list_passed: | |
82 if node_name in z[0]: | |
83 role=z[0].split("_")[0] | |
84 break | |
85 return role | |
86 | |
87 | |
88 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list_passed): | |
89 #nodes_list is the c created by c,d=Uniq(nodes) in below function | |
90 first_target="" | |
91 role_list=[] | |
92 for x in nodes_list: | |
93 a=[] | |
94 role="" | |
95 for y in tail_head_list: | |
96 if x in y[0]: | |
97 a.append(y) | |
98 if len(a)==4: | |
99 #compare two heads (37 > 30) | |
100 #four contigs, most perfect assembly, high quality | |
101 """ | |
102 for z in a: | |
103 if "fliC_first_37" in z[0]: | |
104 t1=z[1] | |
105 elif "fljB_first_37" in z[0]: | |
106 t2=z[1] | |
107 if t1>=t2: | |
108 role="fliC" | |
109 else: | |
110 role="fljB" | |
111 """ | |
112 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
113 if diff<20: | |
114 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed) | |
115 elif len(a)==3: | |
116 """ | |
117 #compare the number, because hybrid problem | |
118 temp=[] | |
119 for z in a: | |
120 temp.append(z[0].split("_")[0]) | |
121 m,n=Uniq(temp)#only two choices in m or n | |
122 if n[0]>n[1]: | |
123 role=m[0] | |
124 else: | |
125 role=m[1] | |
126 """ | |
127 ###however, if the one with highest score is the fewer one, compare their accumulation score | |
128 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
129 if diff<20: | |
130 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed) | |
131 ###end of above score comparison | |
132 elif len(a)==2: | |
133 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37) | |
134 temp=[] | |
135 for z in a: | |
136 temp.append(z[0].split("_")[0]) | |
137 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too | |
138 if len(m)==1: | |
139 pass | |
140 else: | |
141 pass | |
142 #print "head and tail not belong to same role, now let's guess based on maximum likelihood" | |
143 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
144 if diff<20: | |
145 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed) | |
146 """ | |
147 max_unit_score=0 | |
148 for z in a: | |
149 unit_score=z[-1]/int(z[0].split("__")[1]) | |
150 if unit_score>=max_unit_score: | |
151 role=z[0].split("_")[0] | |
152 max_unit_score=unit_score | |
153 """ | |
154 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation | |
155 elif len(a)==1: | |
156 #that one | |
157 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
158 if diff<20: | |
159 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed) | |
160 #role=a[0][0].split("_")[0] | |
161 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0" | |
162 else:#a==0 | |
163 #use Final_list_passed best match | |
164 for z in Final_list_passed: | |
165 if x in z[0]: | |
166 role=z[0].split("_")[0] | |
167 break | |
168 #print x,role,len(a) | |
169 role_list.append((role,x)) | |
170 if len(role_list)==2: | |
171 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase | |
172 #just use score to do a final test | |
173 role_list=[] | |
174 for x in nodes_list: | |
175 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed) | |
176 role_list.append((role,x)) | |
177 return role_list | |
178 | |
179 def decide_contig_roles_for_H_antigen(Final_list): | |
180 #used to decide which contig is FliC and which one is fljB | |
181 contigs=[] | |
182 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=3.5 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]))] | |
183 nodes=[] | |
184 for x in Final_list_passed: | |
185 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]: | |
186 nodes.append(x[0].split("___")[1].strip()) | |
187 c,d=Uniq(nodes)#c is node_list | |
188 #print c | |
189 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])] | |
190 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list_passed) | |
191 return roles | |
192 | |
193 def Combine(b,c): | |
194 fliC_combinations=[] | |
195 fliC_combinations.append(",".join(c)) | |
196 temp_combinations=[] | |
197 for i in range(len(b)): | |
198 for x in itertools.combinations(b,i+1): | |
199 temp_combinations.append(",".join(x)) | |
200 for x in temp_combinations: | |
201 temp=[] | |
202 for y in c: | |
203 temp.append(y) | |
204 temp.append(x) | |
205 temp=",".join(temp) | |
206 temp=temp.split(",") | |
207 temp.sort() | |
208 temp=",".join(temp) | |
209 fliC_combinations.append(temp) | |
210 return fliC_combinations | |
211 | |
212 def decide_O_type_and_get_special_genes(Final_list): | |
213 #decide O based on Final_list | |
214 O_choice="?" | |
215 O_list=[] | |
216 special_genes=[] | |
217 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=3.5 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]))] | |
218 nodes=[] | |
219 for x in Final_list_passed: | |
220 if x[0].startswith("O-"): | |
221 nodes.append(x[0].split("___")[1].strip()) | |
222 elif not x[0].startswith("fl"): | |
223 special_genes.append(x) | |
224 #print "special_genes:",special_genes | |
225 c,d=Uniq(nodes) | |
226 #print "potential O antigen contig",c | |
227 final_O=[] | |
228 O_nodes_list=[] | |
229 for x in c:#c is the list for contigs | |
230 temp=0 | |
231 for y in Final_list_passed: | |
232 if x in y[0] and y[0].startswith("O-"): | |
233 final_O.append(y) | |
234 break | |
235 ### O contig has the problem of two genes on same contig, so do additional test | |
236 potenial_new_gene="" | |
237 for x in final_O: | |
238 pointer=0 #for genes merged or not | |
239 #not consider O-1,3,19_not_in_3,10, too short compared with others | |
240 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])+800 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region | |
241 pointer=x[0].split("___")[1].strip()#store the contig name | |
242 print pointer | |
243 if pointer!=0:#it has potential merge event | |
244 for y in Final_list: | |
245 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O | |
246 potenial_new_gene=y | |
247 print potenial_new_gene | |
248 break | |
249 if potenial_new_gene!="": | |
250 print "two differnt genes in same contig, fix it for O antigen" | |
251 final_O.append(potenial_new_gene) | |
252 ### end of the two genes on same contig test | |
253 if len(final_O)==0: | |
254 #print "$$$No Otype, due to no hit"#may need to be changed | |
255 O_choice="-" | |
256 else: | |
257 O_list=[] | |
258 for x in final_O: | |
259 O_list.append(x[0].split("__")[0]) | |
260 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis | |
261 O_nodes_list.append(x[0].split("___")[1]) | |
262 ### special test for O9,46 and O3,10 family | |
263 if "O-9,46_wbaV" in O_list:#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 | |
264 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 | |
265 O_choice="O-9,46" | |
266 #print "$$$Most possilble Otype: O-9,46" | |
267 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1 | |
268 O_choice="O-9,46,27" | |
269 #print "$$$Most possilble Otype: O-9,46,27" | |
270 else: | |
271 O_choice="O-9"#next, detect O9 vs O2? | |
272 O2=0 | |
273 O9=0 | |
274 for z in special_genes: | |
275 if "tyr-O-9" in z[0]: | |
276 O9=z[1] | |
277 elif "tyr-O-2" in z[0]: | |
278 O2=z[1] | |
279 if O2>O9: | |
280 O_choice="O-2" | |
281 elif O2<O9: | |
282 pass | |
283 else: | |
284 pass | |
285 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility." | |
286 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1 | |
287 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1 | |
288 O_choice="O-3,10" | |
289 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)" | |
290 else: | |
291 O_choice="O-1,3,19" | |
292 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)" | |
293 ### end of special test for O9,46 and O3,10 family | |
294 else: | |
295 try: | |
296 max_score=0 | |
297 for x in final_O: | |
298 if x[1]>=max_score: | |
299 max_score=x[1] | |
300 O_choice=x[0].split("_")[0] | |
301 if O_choice=="O-1,3,19": | |
302 O_choice=final_O[1][0].split("_")[0] | |
303 #print "$$$Most possilble Otype: ",O_choice | |
304 except: | |
305 pass | |
306 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)" | |
307 #print "O:",O_choice,O_nodes_list | |
308 return O_choice,O_nodes_list,special_genes,final_O | |
309 | |
310 def seqsero_from_formula_to_serotypes(Otype,fliC,fljB,special_gene_list): | |
311 #like test_output_06012017.txt | |
312 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list) | |
313 from Initial_Conditions import phase1 | |
314 from Initial_Conditions import phase2 | |
315 from Initial_Conditions import phaseO | |
316 from Initial_Conditions import sero | |
317 seronames=[] | |
318 for i in range(len(phase1)): | |
319 fliC_combine=[] | |
320 fljB_combine=[] | |
321 if phaseO[i]==Otype: | |
322 ### for fliC, detect every possible combinations to avoid the effect of "[" | |
323 if phase1[i].count("[")==0: | |
324 fliC_combine.append(phase1[i]) | |
325 elif phase1[i].count("[")>=1: | |
326 c=[] | |
327 b=[] | |
328 if phase1[i][0]=="[" and phase1[i][-1]=="]" and phase1[i].count("[")==1: | |
329 content=phase1[i].replace("[","").replace("]","") | |
330 fliC_combine.append(content) | |
331 fliC_combine.append("-") | |
332 else: | |
333 for x in phase1[i].split(","): | |
334 if "[" in x: | |
335 b.append(x.replace("[","").replace("]","")) | |
336 else: | |
337 c.append(x) | |
338 fliC_combine=Combine(b,c) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t | |
339 ### end of fliC "[" detect | |
340 ### for fljB, detect every possible combinations to avoid the effect of "[" | |
341 if phase2[i].count("[")==0: | |
342 fljB_combine.append(phase2[i]) | |
343 elif phase2[i].count("[")>=1: | |
344 d=[] | |
345 e=[] | |
346 if phase2[i][0]=="[" and phase2[i][-1]=="]" and phase2[i].count("[")==1: | |
347 content=phase2[i].replace("[","").replace("]","") | |
348 fljB_combine.append(content) | |
349 fljB_combine.append("-") | |
350 else: | |
351 for x in phase2[i].split(","): | |
352 if "[" in x: | |
353 d.append(x.replace("[","").replace("]","")) | |
354 else: | |
355 e.append(x) | |
356 fljB_combine=Combine(d,e) | |
357 ### end of fljB "[" detect | |
358 new_fliC=fliC.split(",") #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings | |
359 new_fliC.sort() | |
360 new_fliC=",".join(new_fliC) | |
361 new_fljB=fljB.split(",") | |
362 new_fljB.sort() | |
363 new_fljB=",".join(new_fljB) | |
364 if (new_fliC in fliC_combine or fliC in fliC_combine) and (new_fljB in fljB_combine or fljB in fljB_combine): | |
365 seronames.append(sero[i]) | |
366 #analyze seronames | |
367 if len(seronames)==0: | |
368 seronames=["N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"] | |
369 star="" | |
370 star_line="" | |
371 if len(seronames)>1:#there are two possible predictions for serotypes | |
372 star="*" | |
373 star_line="The predicted serotypes share the same general formula:\t"+Otype+":"+fliC+":"+fljB+"\n"## | |
374 print "\n" | |
375 predict_form=Otype+":"+fliC+":"+fljB# | |
376 predict_sero=(" or ").join(seronames) | |
377 ###special test for Enteritidis | |
378 if predict_form=="9:g,m:-": | |
379 sdf="-" | |
380 for x in special_gene_list: | |
381 if x[0].startswith("sdf"): | |
382 sdf="+" | |
383 predict_form=predict_form+"\nSdf prediction:"+sdf | |
384 if sdf=="-": | |
385 star="*" | |
386 star_line="Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"#+## | |
387 predict_sero="See comments below" | |
388 ###end of special test for Enteritidis | |
389 elif predict_form=="4:i:-": | |
390 predict_sero="potential monophasic variant of Typhimurium" | |
391 elif predict_form=="4:r:-": | |
392 predict_sero="potential monophasic variant of Heidelberg" | |
393 elif predict_form=="4:b:-": | |
394 predict_sero="potential monophasic variant of Paratyphi B" | |
395 elif predict_form=="8:e,h:1,2": | |
396 predict_sero="Newport" | |
397 star="*" | |
398 star_line="Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." | |
399 claim="The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes."## | |
400 if "N/A" in predict_sero: | |
401 claim="" | |
402 if "Typhimurium" in predict_sero or predict_form=="4:i:-": | |
403 normal=0 | |
404 mutation=0 | |
405 for x in special_gene_list: | |
406 if "oafA-O-4_full" in x[0]: | |
407 normal=x[1] | |
408 elif "oafA-O-4_5-" in x[0]: | |
409 mutation=x[1] | |
410 if normal>mutation: | |
411 #print "$$$Typhimurium" | |
412 pass | |
413 elif normal<mutation: | |
414 predict_sero=predict_sero.strip()+"(O5-)" | |
415 star="*"# | |
416 star_line="Detected the deletion of O5-." | |
417 #print "$$$Typhimurium_O5-" | |
418 else: | |
419 #print "$$$Typhimurium, even no 7 bases difference" | |
420 pass | |
421 return predict_form,predict_sero,star,star_line,claim | |
422 | |
423 def main(): | |
424 database=sys.argv[1]#used to extract reads | |
425 mapping_mode=sys.argv[2]#mem or sampe | |
426 threads=sys.argv[3] | |
427 for_fq=sys.argv[4] | |
428 rev_fq=sys.argv[5] | |
429 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) | |
430 sam=for_fq+".sam" | |
431 bam=for_fq+".bam" | |
432 sorted_bam=for_fq+"_sorted.bam" | |
433 mapped_fq1=for_fq+"_mapped.fq" | |
434 mapped_fq2=rev_fq+"_mapped.fq" | |
435 combined_fq=for_fq+"_combined.fq" | |
436 for_sai=for_fq+".sai" | |
437 rev_sai=rev_fq+".sai" | |
438 print "building database..." | |
439 #os.system("bwa index "+database+ " 2> /dev/null") | |
440 os.system("bwa index "+database+ " 2>> data_log.txt ") | |
441 print "mapping..." | |
442 if mapping_mode=="mem": | |
443 os.system("bwa mem -t "+threads+" "+database+" "+for_fq+" "+rev_fq+" > "+sam+ " 2>> data_log.txt") | |
444 elif mapping_mode=="sam": | |
445 os.system("bwa aln -t "+threads+" "+database+" "+for_fq+" > "+for_sai+ " 2>> data_log.txt") | |
446 os.system("bwa aln -t "+threads+" "+database+" "+rev_fq+" > "+rev_sai+ " 2>> data_log.txt") | |
447 os.system("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+for_fq+" "+rev_fq+" > "+sam+ " 2>> data_log.txt") | |
448 os.system("samtools view -@ "+threads+" -F 4 -Sbh "+sam+" > "+bam) | |
449 os.system("samtools view -@ "+threads+" -h -o "+sam+" "+bam) | |
450 ### check the version of samtools then use differnt commands | |
451 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) | |
452 out, err = samtools_version.communicate() | |
453 version = err.split("ersion:")[1].strip().split(" ")[0].strip() | |
454 print "check samtools version:",version | |
455 if LooseVersion(version)<=LooseVersion("1.2"): | |
456 os.system("samtools sort -@ "+threads+" -n "+bam+" "+for_fq+"_sorted") | |
457 else: | |
458 os.system("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam) | |
459 ### end of samtools version check and its analysis | |
460 os.system("bamToFastq -i "+sorted_bam+" -fq "+combined_fq) | |
461 os.system("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt")#2> /dev/null if want no output | |
462 outdir=current_time+"_temp" | |
463 print "assembling..." | |
464 if int(threads)>4: | |
465 t="4" | |
466 else: | |
467 t=threads | |
468 os.system("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1") | |
469 new_fasta=for_fq+"_"+database+"_"+mapping_mode+".fasta" | |
470 os.system("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null") | |
471 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta | |
472 os.system("rm -rf "+outdir+ " 2> /dev/null") | |
473 ### begin blast | |
474 print "blasting..." | |
475 print "\n" | |
476 xmlfile=for_fq+"-extracted_vs_"+database+"_"+mapping_mode+".xml" | |
477 os.system('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1') #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015 | |
478 os.system("blastn -word_size 10 -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1")###1/27/2015 | |
479 Final_list=xml_parse_score_comparision_seqsero(xmlfile) | |
480 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=3.5 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]))] | |
481 fliC_choice="-" | |
482 fljB_choice="-" | |
483 fliC_contig="NA" | |
484 fljB_contig="NA" | |
485 fliC_length=0 #can be changed to coverage in future | |
486 fljB_length=0 #can be changed to coverage in future | |
487 O_choice=""#no need to decide O contig for now, should be only one | |
488 O_choice,O_nodes,special_gene_list,O_nodes_roles=decide_O_type_and_get_special_genes(Final_list)#decide the O antigen type and also return special-gene-list for further identification | |
489 O_choice=O_choice.split("-")[-1].strip() | |
490 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list)#decide the H antigen contig is fliC or fljB | |
491 log_file=open("SeqSero_hybrid_assembly_log.txt","a") | |
492 print "O_contigs:" | |
493 log_file.write("O_contigs:\n") | |
494 for x in O_nodes_roles: | |
495 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker | |
496 print x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%" | |
497 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+" "+"blast score: "+str(x[1])+"identity%:"+str(round(x[2]*100,2))+"%"+"\n") | |
498 print "H_contigs:" | |
499 log_file.write("H_contigs:\n") | |
500 H_contig_stat=[] | |
501 for i in range(len(H_contig_roles)): | |
502 x=H_contig_roles[i] | |
503 a=0 | |
504 for y in Final_list_passed: | |
505 if x[1] in y[0] and y[0].startswith(x[0]): | |
506 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide | |
507 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it | |
508 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB | |
509 print x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%" | |
510 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"%"+"\n") | |
511 H_contig_roles[i]="can't decide fliC or fljB, may be third phase" | |
512 break | |
513 else: | |
514 print x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%" | |
515 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"%"+"\n") | |
516 break | |
517 for x in H_contig_roles: | |
518 #if multiple choices, temporately select the one with longest length for now, will revise in further change | |
519 if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes:#remember to avoid the effect of O-type contig, so should not in O_node list | |
520 fliC_contig=x[1] | |
521 fliC_length=int(x[1].split("_")[3]) | |
522 elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes: | |
523 fljB_contig=x[1] | |
524 fljB_length=int(x[1].split("_")[3]) | |
525 for x in Final_list_passed: | |
526 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0] : | |
527 fliC_choice=x[0].split("_")[1] | |
528 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]: | |
529 fljB_choice=x[0].split("_")[1] | |
530 elif fliC_choice!="-" and fljB_choice!="-": | |
531 break | |
532 print "\n" | |
533 print "SeqSero Input files:",for_fq,rev_fq | |
534 print "Most possible O antigen:",O_choice | |
535 print "Most possible H1 antigen:",fliC_choice | |
536 print "Most possible H2 antigen:",fljB_choice | |
537 #print Final_list | |
538 ###output | |
539 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list) | |
540 new_file=open("Seqsero_result.txt","w") | |
541 new_file.write("Input files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+"O-"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## | |
542 new_file.close() | |
543 os.system("cat Seqsero_result.txt") | |
544 | |
545 if __name__ == '__main__': | |
546 main() |