comparison libs/mapping_and_assembly_hybrid.py @ 0:6275272ebcbc draft

planemo upload commit 9b152b4a900a8cd70df992da881c7e3fa00d4e4c-dirty
author cstrittmatter
date Thu, 21 Dec 2017 12:45:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6275272ebcbc
1 import os,sys,glob,time,itertools,subprocess
2 from Initial_Conditions import phase1
3 from Initial_Conditions import phase2
4 from Initial_Conditions import phaseO
5 from Initial_Conditions import sero
6 from distutils.version import LooseVersion
7
8
9
10
11 def xml_parse_score_comparision_seqsero(xmlfile):
12 #used to do seqsero xml analysis
13 from Bio.Blast import NCBIXML
14 handle=open(xmlfile)
15 handle=NCBIXML.parse(handle)
16 handle=list(handle)
17 List=[]
18 List_score=[]
19 List_ids=[]
20 for i in range(len(handle)):
21 if len(handle[i].alignments)>0:
22 for j in range(len(handle[i].alignments)):
23 score=0
24 ids=0
25 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
26 for z in range(len(handle[i].alignments[j].hsps)):
27 if "last" in handle[i].query or "first" in handle[i].query:
28 score+=handle[i].alignments[j].hsps[z].bits
29 ids+=float(handle[i].alignments[j].hsps[z].identities)/handle[i].query_length
30 else:
31 if handle[i].alignments[j].hsps[z].align_length>=30:
32 #for the long alleles, filter noise parts
33 score+=handle[i].alignments[j].hsps[z].bits
34 ids+=float(handle[i].alignments[j].hsps[z].identities)/handle[i].query_length
35 List_score.append(score)
36 List_ids.append(ids)
37 temp=zip(List,List_score,List_ids)
38 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
39 return Final_list
40
41
42 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
43 Old=L
44 L.sort()
45 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
46 count=[]
47 for j in range(len(L)):
48 y=0
49 for x in Old:
50 if L[j]==x:
51 y+=1
52 count.append(y)
53 if sort_on_fre!="none":
54 d=zip(*sorted(zip(count, L)))
55 L=d[1]
56 count=d[0]
57 return (L,count)
58
59
60 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
61 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test
62 #this is mainly used for
63 a=nodes_vs_score_list
64 fliC_score=0
65 fljB_score=0
66 for z in a:
67 if "fliC" in z[0]:
68 fliC_score+=z[1]
69 elif "fljB" in z[0]:
70 fljB_score+=z[1]
71 if fliC_score>=fljB_score:
72 role="fliC"
73 else:
74 role="fljB"
75 return (role,abs(fliC_score-fljB_score))
76
77 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list_passed):
78 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small)
79 #also used when no head or tail got blasted score for the contig
80 role=""
81 for z in Final_list_passed:
82 if node_name in z[0]:
83 role=z[0].split("_")[0]
84 break
85 return role
86
87
88 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list_passed):
89 #nodes_list is the c created by c,d=Uniq(nodes) in below function
90 first_target=""
91 role_list=[]
92 for x in nodes_list:
93 a=[]
94 role=""
95 for y in tail_head_list:
96 if x in y[0]:
97 a.append(y)
98 if len(a)==4:
99 #compare two heads (37 > 30)
100 #four contigs, most perfect assembly, high quality
101 """
102 for z in a:
103 if "fliC_first_37" in z[0]:
104 t1=z[1]
105 elif "fljB_first_37" in z[0]:
106 t2=z[1]
107 if t1>=t2:
108 role="fliC"
109 else:
110 role="fljB"
111 """
112 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
113 if diff<20:
114 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed)
115 elif len(a)==3:
116 """
117 #compare the number, because hybrid problem
118 temp=[]
119 for z in a:
120 temp.append(z[0].split("_")[0])
121 m,n=Uniq(temp)#only two choices in m or n
122 if n[0]>n[1]:
123 role=m[0]
124 else:
125 role=m[1]
126 """
127 ###however, if the one with highest score is the fewer one, compare their accumulation score
128 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
129 if diff<20:
130 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed)
131 ###end of above score comparison
132 elif len(a)==2:
133 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
134 temp=[]
135 for z in a:
136 temp.append(z[0].split("_")[0])
137 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
138 if len(m)==1:
139 pass
140 else:
141 pass
142 #print "head and tail not belong to same role, now let's guess based on maximum likelihood"
143 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
144 if diff<20:
145 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed)
146 """
147 max_unit_score=0
148 for z in a:
149 unit_score=z[-1]/int(z[0].split("__")[1])
150 if unit_score>=max_unit_score:
151 role=z[0].split("_")[0]
152 max_unit_score=unit_score
153 """
154 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
155 elif len(a)==1:
156 #that one
157 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
158 if diff<20:
159 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed)
160 #role=a[0][0].split("_")[0]
161 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
162 else:#a==0
163 #use Final_list_passed best match
164 for z in Final_list_passed:
165 if x in z[0]:
166 role=z[0].split("_")[0]
167 break
168 #print x,role,len(a)
169 role_list.append((role,x))
170 if len(role_list)==2:
171 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
172 #just use score to do a final test
173 role_list=[]
174 for x in nodes_list:
175 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list_passed)
176 role_list.append((role,x))
177 return role_list
178
179 def decide_contig_roles_for_H_antigen(Final_list):
180 #used to decide which contig is FliC and which one is fljB
181 contigs=[]
182 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=3.5 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]))]
183 nodes=[]
184 for x in Final_list_passed:
185 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
186 nodes.append(x[0].split("___")[1].strip())
187 c,d=Uniq(nodes)#c is node_list
188 #print c
189 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
190 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list_passed)
191 return roles
192
193 def Combine(b,c):
194 fliC_combinations=[]
195 fliC_combinations.append(",".join(c))
196 temp_combinations=[]
197 for i in range(len(b)):
198 for x in itertools.combinations(b,i+1):
199 temp_combinations.append(",".join(x))
200 for x in temp_combinations:
201 temp=[]
202 for y in c:
203 temp.append(y)
204 temp.append(x)
205 temp=",".join(temp)
206 temp=temp.split(",")
207 temp.sort()
208 temp=",".join(temp)
209 fliC_combinations.append(temp)
210 return fliC_combinations
211
212 def decide_O_type_and_get_special_genes(Final_list):
213 #decide O based on Final_list
214 O_choice="?"
215 O_list=[]
216 special_genes=[]
217 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=3.5 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]))]
218 nodes=[]
219 for x in Final_list_passed:
220 if x[0].startswith("O-"):
221 nodes.append(x[0].split("___")[1].strip())
222 elif not x[0].startswith("fl"):
223 special_genes.append(x)
224 #print "special_genes:",special_genes
225 c,d=Uniq(nodes)
226 #print "potential O antigen contig",c
227 final_O=[]
228 O_nodes_list=[]
229 for x in c:#c is the list for contigs
230 temp=0
231 for y in Final_list_passed:
232 if x in y[0] and y[0].startswith("O-"):
233 final_O.append(y)
234 break
235 ### O contig has the problem of two genes on same contig, so do additional test
236 potenial_new_gene=""
237 for x in final_O:
238 pointer=0 #for genes merged or not
239 #not consider O-1,3,19_not_in_3,10, too short compared with others
240 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])+800 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
241 pointer=x[0].split("___")[1].strip()#store the contig name
242 print pointer
243 if pointer!=0:#it has potential merge event
244 for y in Final_list:
245 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
246 potenial_new_gene=y
247 print potenial_new_gene
248 break
249 if potenial_new_gene!="":
250 print "two differnt genes in same contig, fix it for O antigen"
251 final_O.append(potenial_new_gene)
252 ### end of the two genes on same contig test
253 if len(final_O)==0:
254 #print "$$$No Otype, due to no hit"#may need to be changed
255 O_choice="-"
256 else:
257 O_list=[]
258 for x in final_O:
259 O_list.append(x[0].split("__")[0])
260 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis
261 O_nodes_list.append(x[0].split("___")[1])
262 ### special test for O9,46 and O3,10 family
263 if "O-9,46_wbaV" in O_list:#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
264 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
265 O_choice="O-9,46"
266 #print "$$$Most possilble Otype: O-9,46"
267 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
268 O_choice="O-9,46,27"
269 #print "$$$Most possilble Otype: O-9,46,27"
270 else:
271 O_choice="O-9"#next, detect O9 vs O2?
272 O2=0
273 O9=0
274 for z in special_genes:
275 if "tyr-O-9" in z[0]:
276 O9=z[1]
277 elif "tyr-O-2" in z[0]:
278 O2=z[1]
279 if O2>O9:
280 O_choice="O-2"
281 elif O2<O9:
282 pass
283 else:
284 pass
285 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
286 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
287 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
288 O_choice="O-3,10"
289 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
290 else:
291 O_choice="O-1,3,19"
292 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
293 ### end of special test for O9,46 and O3,10 family
294 else:
295 try:
296 max_score=0
297 for x in final_O:
298 if x[1]>=max_score:
299 max_score=x[1]
300 O_choice=x[0].split("_")[0]
301 if O_choice=="O-1,3,19":
302 O_choice=final_O[1][0].split("_")[0]
303 #print "$$$Most possilble Otype: ",O_choice
304 except:
305 pass
306 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
307 #print "O:",O_choice,O_nodes_list
308 return O_choice,O_nodes_list,special_genes,final_O
309
310 def seqsero_from_formula_to_serotypes(Otype,fliC,fljB,special_gene_list):
311 #like test_output_06012017.txt
312 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
313 from Initial_Conditions import phase1
314 from Initial_Conditions import phase2
315 from Initial_Conditions import phaseO
316 from Initial_Conditions import sero
317 seronames=[]
318 for i in range(len(phase1)):
319 fliC_combine=[]
320 fljB_combine=[]
321 if phaseO[i]==Otype:
322 ### for fliC, detect every possible combinations to avoid the effect of "["
323 if phase1[i].count("[")==0:
324 fliC_combine.append(phase1[i])
325 elif phase1[i].count("[")>=1:
326 c=[]
327 b=[]
328 if phase1[i][0]=="[" and phase1[i][-1]=="]" and phase1[i].count("[")==1:
329 content=phase1[i].replace("[","").replace("]","")
330 fliC_combine.append(content)
331 fliC_combine.append("-")
332 else:
333 for x in phase1[i].split(","):
334 if "[" in x:
335 b.append(x.replace("[","").replace("]",""))
336 else:
337 c.append(x)
338 fliC_combine=Combine(b,c) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
339 ### end of fliC "[" detect
340 ### for fljB, detect every possible combinations to avoid the effect of "["
341 if phase2[i].count("[")==0:
342 fljB_combine.append(phase2[i])
343 elif phase2[i].count("[")>=1:
344 d=[]
345 e=[]
346 if phase2[i][0]=="[" and phase2[i][-1]=="]" and phase2[i].count("[")==1:
347 content=phase2[i].replace("[","").replace("]","")
348 fljB_combine.append(content)
349 fljB_combine.append("-")
350 else:
351 for x in phase2[i].split(","):
352 if "[" in x:
353 d.append(x.replace("[","").replace("]",""))
354 else:
355 e.append(x)
356 fljB_combine=Combine(d,e)
357 ### end of fljB "[" detect
358 new_fliC=fliC.split(",") #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
359 new_fliC.sort()
360 new_fliC=",".join(new_fliC)
361 new_fljB=fljB.split(",")
362 new_fljB.sort()
363 new_fljB=",".join(new_fljB)
364 if (new_fliC in fliC_combine or fliC in fliC_combine) and (new_fljB in fljB_combine or fljB in fljB_combine):
365 seronames.append(sero[i])
366 #analyze seronames
367 if len(seronames)==0:
368 seronames=["N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"]
369 star=""
370 star_line=""
371 if len(seronames)>1:#there are two possible predictions for serotypes
372 star="*"
373 star_line="The predicted serotypes share the same general formula:\t"+Otype+":"+fliC+":"+fljB+"\n"##
374 print "\n"
375 predict_form=Otype+":"+fliC+":"+fljB#
376 predict_sero=(" or ").join(seronames)
377 ###special test for Enteritidis
378 if predict_form=="9:g,m:-":
379 sdf="-"
380 for x in special_gene_list:
381 if x[0].startswith("sdf"):
382 sdf="+"
383 predict_form=predict_form+"\nSdf prediction:"+sdf
384 if sdf=="-":
385 star="*"
386 star_line="Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"#+##
387 predict_sero="See comments below"
388 ###end of special test for Enteritidis
389 elif predict_form=="4:i:-":
390 predict_sero="potential monophasic variant of Typhimurium"
391 elif predict_form=="4:r:-":
392 predict_sero="potential monophasic variant of Heidelberg"
393 elif predict_form=="4:b:-":
394 predict_sero="potential monophasic variant of Paratyphi B"
395 elif predict_form=="8:e,h:1,2":
396 predict_sero="Newport"
397 star="*"
398 star_line="Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
399 claim="The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes."##
400 if "N/A" in predict_sero:
401 claim=""
402 if "Typhimurium" in predict_sero or predict_form=="4:i:-":
403 normal=0
404 mutation=0
405 for x in special_gene_list:
406 if "oafA-O-4_full" in x[0]:
407 normal=x[1]
408 elif "oafA-O-4_5-" in x[0]:
409 mutation=x[1]
410 if normal>mutation:
411 #print "$$$Typhimurium"
412 pass
413 elif normal<mutation:
414 predict_sero=predict_sero.strip()+"(O5-)"
415 star="*"#
416 star_line="Detected the deletion of O5-."
417 #print "$$$Typhimurium_O5-"
418 else:
419 #print "$$$Typhimurium, even no 7 bases difference"
420 pass
421 return predict_form,predict_sero,star,star_line,claim
422
423 def main():
424 database=sys.argv[1]#used to extract reads
425 mapping_mode=sys.argv[2]#mem or sampe
426 threads=sys.argv[3]
427 for_fq=sys.argv[4]
428 rev_fq=sys.argv[5]
429 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
430 sam=for_fq+".sam"
431 bam=for_fq+".bam"
432 sorted_bam=for_fq+"_sorted.bam"
433 mapped_fq1=for_fq+"_mapped.fq"
434 mapped_fq2=rev_fq+"_mapped.fq"
435 combined_fq=for_fq+"_combined.fq"
436 for_sai=for_fq+".sai"
437 rev_sai=rev_fq+".sai"
438 print "building database..."
439 #os.system("bwa index "+database+ " 2> /dev/null")
440 os.system("bwa index "+database+ " 2>> data_log.txt ")
441 print "mapping..."
442 if mapping_mode=="mem":
443 os.system("bwa mem -t "+threads+" "+database+" "+for_fq+" "+rev_fq+" > "+sam+ " 2>> data_log.txt")
444 elif mapping_mode=="sam":
445 os.system("bwa aln -t "+threads+" "+database+" "+for_fq+" > "+for_sai+ " 2>> data_log.txt")
446 os.system("bwa aln -t "+threads+" "+database+" "+rev_fq+" > "+rev_sai+ " 2>> data_log.txt")
447 os.system("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+for_fq+" "+rev_fq+" > "+sam+ " 2>> data_log.txt")
448 os.system("samtools view -@ "+threads+" -F 4 -Sbh "+sam+" > "+bam)
449 os.system("samtools view -@ "+threads+" -h -o "+sam+" "+bam)
450 ### check the version of samtools then use differnt commands
451 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
452 out, err = samtools_version.communicate()
453 version = err.split("ersion:")[1].strip().split(" ")[0].strip()
454 print "check samtools version:",version
455 if LooseVersion(version)<=LooseVersion("1.2"):
456 os.system("samtools sort -@ "+threads+" -n "+bam+" "+for_fq+"_sorted")
457 else:
458 os.system("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam)
459 ### end of samtools version check and its analysis
460 os.system("bamToFastq -i "+sorted_bam+" -fq "+combined_fq)
461 os.system("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt")#2> /dev/null if want no output
462 outdir=current_time+"_temp"
463 print "assembling..."
464 if int(threads)>4:
465 t="4"
466 else:
467 t=threads
468 os.system("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1")
469 new_fasta=for_fq+"_"+database+"_"+mapping_mode+".fasta"
470 os.system("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null")
471 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
472 os.system("rm -rf "+outdir+ " 2> /dev/null")
473 ### begin blast
474 print "blasting..."
475 print "\n"
476 xmlfile=for_fq+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
477 os.system('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1') #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
478 os.system("blastn -word_size 10 -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1")###1/27/2015
479 Final_list=xml_parse_score_comparision_seqsero(xmlfile)
480 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=3.5 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]))]
481 fliC_choice="-"
482 fljB_choice="-"
483 fliC_contig="NA"
484 fljB_contig="NA"
485 fliC_length=0 #can be changed to coverage in future
486 fljB_length=0 #can be changed to coverage in future
487 O_choice=""#no need to decide O contig for now, should be only one
488 O_choice,O_nodes,special_gene_list,O_nodes_roles=decide_O_type_and_get_special_genes(Final_list)#decide the O antigen type and also return special-gene-list for further identification
489 O_choice=O_choice.split("-")[-1].strip()
490 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list)#decide the H antigen contig is fliC or fljB
491 log_file=open("SeqSero_hybrid_assembly_log.txt","a")
492 print "O_contigs:"
493 log_file.write("O_contigs:\n")
494 for x in O_nodes_roles:
495 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
496 print x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%"
497 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+" "+"blast score: "+str(x[1])+"identity%:"+str(round(x[2]*100,2))+"%"+"\n")
498 print "H_contigs:"
499 log_file.write("H_contigs:\n")
500 H_contig_stat=[]
501 for i in range(len(H_contig_roles)):
502 x=H_contig_roles[i]
503 a=0
504 for y in Final_list_passed:
505 if x[1] in y[0] and y[0].startswith(x[0]):
506 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
507 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
508 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
509 print x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%"
510 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"%"+"\n")
511 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
512 break
513 else:
514 print x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%"
515 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"%"+"\n")
516 break
517 for x in H_contig_roles:
518 #if multiple choices, temporately select the one with longest length for now, will revise in further change
519 if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes:#remember to avoid the effect of O-type contig, so should not in O_node list
520 fliC_contig=x[1]
521 fliC_length=int(x[1].split("_")[3])
522 elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes:
523 fljB_contig=x[1]
524 fljB_length=int(x[1].split("_")[3])
525 for x in Final_list_passed:
526 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0] :
527 fliC_choice=x[0].split("_")[1]
528 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
529 fljB_choice=x[0].split("_")[1]
530 elif fliC_choice!="-" and fljB_choice!="-":
531 break
532 print "\n"
533 print "SeqSero Input files:",for_fq,rev_fq
534 print "Most possible O antigen:",O_choice
535 print "Most possible H1 antigen:",fliC_choice
536 print "Most possible H2 antigen:",fljB_choice
537 #print Final_list
538 ###output
539 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list)
540 new_file=open("Seqsero_result.txt","w")
541 new_file.write("Input files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+"O-"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
542 new_file.close()
543 os.system("cat Seqsero_result.txt")
544
545 if __name__ == '__main__':
546 main()