0
|
1 #!/usr/bin/env python
|
|
2
|
|
3
|
|
4
|
|
5 import os
|
|
6 from Bio import SeqIO
|
|
7 import sys
|
|
8 import itertools
|
|
9 from Initial_Conditions import phase1
|
|
10 from Initial_Conditions import phase2
|
|
11 from Initial_Conditions import phaseO
|
|
12 from Initial_Conditions import sero
|
|
13 import time
|
|
14 import multiprocessing
|
|
15 import string
|
|
16
|
|
17 #m=string.atoi(sys.argv[1])
|
|
18 m=1 #temperorily, m can be set as one, because we just need one core to deal with it
|
|
19 file_name=sys.argv[1]
|
|
20
|
|
21 def Combine(b,c):
|
|
22 fliC_combinations=[]
|
|
23 fliC_combinations.append(",".join(c))
|
|
24 temp_combinations=[]
|
|
25 for i in range(len(b)):
|
|
26 for x in itertools.combinations(b,i+1):
|
|
27 temp_combinations.append(",".join(x))
|
|
28 for x in temp_combinations:
|
|
29 temp=[]
|
|
30 for y in c:
|
|
31 temp.append(y)
|
|
32 temp.append(x)
|
|
33 temp=",".join(temp)
|
|
34 temp=temp.split(",")
|
|
35 temp.sort()
|
|
36 temp=",".join(temp)
|
|
37 fliC_combinations.append(temp)
|
|
38 return fliC_combinations
|
|
39
|
|
40
|
|
41 def Test(file1,z,q):
|
|
42 fliC="?"
|
|
43 fljB="?"
|
|
44 Otype="?"
|
|
45 oafA=""#$$$$
|
|
46 O3_10=""
|
|
47 O1_3_19=""
|
|
48 file2=file1.replace(' ','_').replace(":","__").replace("[","").replace("]","")
|
|
49 try:
|
|
50 os.rename(file1, file2)
|
|
51 real_file=file2
|
|
52 except:
|
|
53 real_file=file1
|
|
54 #print "###The genome name:",file1
|
|
55 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))###01/27/2015
|
|
56 os.system('touch result.txt')
|
|
57 database_path="database"###01/27/2015
|
|
58 os.system('python '+dirpath+'/Otype_determine_analysis.py '+database_path+'/Typhimurium_LT2_gnd_galF.fasta '+real_file+' '+database_path+'/new_Oserotype.fasta >temp_result_'+str(q)+'O.txt')
|
|
59 os.system('cat temp_result_'+str(q)+'O.txt>>data_log.txt')
|
|
60 handle=open('temp_result_'+str(q)+'O.txt',"r")
|
|
61 handle=handle.readlines()
|
|
62 for line in handle:
|
|
63 if "$$$ Most" in line and "O_type" in line:
|
|
64 Otype=line.split("O-")[1].split("_")[0].split(" ")[0]
|
|
65 Otype=Otype.replace("\n","").strip()
|
|
66 #print line,
|
|
67 elif "$$$ No" in line:
|
|
68 Otype="-"
|
|
69 if "O-9" in line:
|
|
70 Otype="9"
|
|
71 #print line,
|
|
72 elif "$$$O5-" in line:#$$$
|
|
73 oafA="-"
|
|
74 elif "$$$O3,10 more possible" in line:#$$$
|
|
75 O3_10="+"
|
|
76 elif "$$$O1,3,19 more possible" in line:
|
|
77 O1_3_19="+"
|
|
78 if Otype=="1,3,19" or Otype=="3,10":#$$$judge O3,10 before formula forms
|
|
79 if O3_10=="+":
|
|
80 Otype="3,10"
|
|
81 elif O1_3_19=="+":
|
|
82 Otype="1,3,19"
|
|
83 else:
|
|
84 print "No_O3,10_O1,3,19_spe_sequences"
|
|
85 os.system('python '+dirpath+'/H_combination_output_analysis.py '+real_file+' '+database_path+'/H_new_fliC_protein_database.fasta '+database_path+'/H_new_fljB_protein_database.fasta >temp_result_'+str(q)+'H.txt')
|
|
86 os.system('cat temp_result_'+str(q)+'H.txt>>data_log.txt')
|
|
87 handle2=open('temp_result_'+str(q)+'H.txt',"r")
|
|
88 handle2=handle2.readlines()
|
|
89 suspect="no" #for the first choice doesn't hit core sequence
|
|
90 for line in handle2:
|
|
91 if "$$$ Most" in line and "fliC" in line:
|
|
92 #print line,
|
|
93 fliC=line.split("fliC_type: ")[1].split("_")[0].strip()
|
|
94 if fliC=="g,m,p,s":
|
|
95 fliC="g,m,s"
|
|
96 elif "$$$ No" in line and "fliC" in line:
|
|
97 fliC="-"
|
|
98 #print line,
|
|
99 elif "$$$ Most" in line and "fljB" in line:
|
|
100 #print line,
|
|
101 fljB=line.split("fljB_type: ")[1].split("_")[0].strip()
|
|
102 elif "$$$ No" in line and "fljB" in line:
|
|
103 fljB="-"
|
|
104 #print line,
|
|
105 if Otype=="9" and fliC=="g,m" and fljB=="-":
|
|
106 os.system('python '+dirpath+'/special_gene_test_assemblies.py '+database_path+'/specific_genes.fasta '+real_file+' sdf >temp_result_'+str(q)+'sdf.txt')
|
|
107 os.system('cat temp_result_'+str(q)+'sdf.txt>>data_log.txt')
|
|
108 handle3=open('temp_result_'+str(q)+'sdf.txt',"r")
|
|
109 sdf=""
|
|
110 for line in handle3:
|
|
111 if "$$$" in line and "got a hit" in line:
|
|
112 #print line,
|
|
113 sdf="+"
|
|
114 if sdf!="+":
|
|
115 sdf="-"
|
|
116
|
|
117 seronames=[]
|
|
118 for i in range(len(phase1)):
|
|
119 fliC_combine=[]
|
|
120 fljB_combine=[]
|
|
121 if phaseO[i]==Otype:
|
|
122 if phase1[i].count("[")==0:
|
|
123 fliC_combine.append(phase1[i])
|
|
124 elif phase1[i].count("[")>=1:
|
|
125 c=[]
|
|
126 b=[]
|
|
127 if phase1[i][0]=="[" and phase1[i][-1]=="]" and phase1[i].count("[")==1:#for specific situations like [1,5]
|
|
128 content=phase1[i].replace("[","").replace("]","")
|
|
129 fliC_combine.append(content)
|
|
130 fliC_combine.append("-")
|
|
131 else:
|
|
132 for x in phase1[i].split(","):
|
|
133 if "[" in x:
|
|
134 b.append(x.replace("[","").replace("]",""))
|
|
135 else:
|
|
136 c.append(x)
|
|
137 fliC_combine=Combine(b,c) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
|
|
138 if phase2[i].count("[")==0:
|
|
139 fljB_combine.append(phase2[i])
|
|
140 elif phase2[i].count("[")>=1:
|
|
141 d=[]
|
|
142 e=[]
|
|
143 if phase2[i][0]=="[" and phase2[i][-1]=="]" and phase2[i].count("[")==1:
|
|
144 content=phase2[i].replace("[","").replace("]","")
|
|
145 fljB_combine.append(content)
|
|
146 fljB_combine.append("-")
|
|
147 else:
|
|
148 for x in phase2[i].split(","):
|
|
149 if "[" in x:
|
|
150 d.append(x.replace("[","").replace("]",""))
|
|
151 else:
|
|
152 e.append(x)
|
|
153 fljB_combine=Combine(d,e)
|
|
154 new_fliC=fliC.split(",") #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
|
|
155 new_fliC.sort()
|
|
156 new_fliC=",".join(new_fliC)
|
|
157 new_fljB=fljB.split(",")
|
|
158 new_fljB.sort()
|
|
159 new_fljB=",".join(new_fljB)
|
|
160 if (new_fliC in fliC_combine or fliC in fliC_combine) and (new_fljB in fljB_combine or fljB in fljB_combine):
|
|
161 seronames.append(sero[i])
|
|
162 if len(seronames)==0:
|
|
163 seronames=["N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"]
|
|
164 star=""
|
|
165 star_line=""
|
|
166 if len(seronames)>1:
|
|
167 star="*"
|
|
168 star_line="The predicted serotypes share the same general formula:\t"+Otype+":"+fliC+":"+fljB+"\n"##
|
|
169 #print "$$$The most possible formula is: (by the order O:H1:H2) ",Otype,":",fliC,":",fljB
|
|
170 #print "$$$The possible serotyes are:",seronames
|
|
171 m=0
|
|
172 for y in seronames:
|
|
173 if y in file1:
|
|
174 #print "$$$ Is the judgement true? Answer:Yes!" #here we use file1, because we want ":", while file2 turned it to "__"
|
|
175 answer="Yes"
|
|
176 m=1
|
|
177 if m==0:
|
|
178 #print "$$$ Is the judgement true? Answer: Need to check the records and file names"
|
|
179 answer="Not sure"
|
|
180 print "\n","\n"
|
|
181 predict_form=Otype+":"+fliC+":"+fljB
|
|
182 predict_sero=(" or ").join(seronames)
|
|
183 if predict_form=="9:g,m:-":#
|
|
184 predict_form=predict_form+"\nSdf prediction:"+sdf #
|
|
185 if sdf=="-":#
|
|
186 star="*"#
|
|
187 star_line="Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"##
|
|
188 predict_sero="See comments below"#
|
|
189 elif predict_form=="4:i:-":#03252016#
|
|
190 predict_sero="potential monophasic variant of Typhimurium"#03252016#
|
|
191 elif predict_form=="4:r:-":#03252016#
|
|
192 predict_sero="potential monophasic variant of Heidelberg"#03252016#
|
|
193 elif predict_form=="4:b:-":#03252016#
|
|
194 predict_sero="potential monophasic variant of Paratyphi B"#03252016#
|
|
195 elif predict_form=="8:e,h:1,2":#03282016#
|
|
196 predict_sero="Newport"#03282016#
|
|
197 star="*"##03282016#
|
|
198 star_line="Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."#03282016#
|
|
199 claim="The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes."##
|
|
200 if "N/A" in predict_sero:###added after standalone version, 2015/2/3
|
|
201 claim=""###added after standalone version, 2015/2/3
|
|
202 '''
|
|
203 new_file=open(file2+".txt","w")
|
|
204 new_file.write(file2+"\t"+"O-"+Otype+"\t"+fliC+"\t"+fljB+"\t"+Otype+":"+fliC+":"+fljB+"\t"+(" or ").join(seronames)+"\t"+answer+"\t"+suspect+"\n")
|
|
205 new_file.close()
|
|
206 '''
|
|
207 if "Typhimurium" in predict_sero and oafA=="-":#$$$$#03252016#
|
|
208 predict_sero=predict_sero.strip()+"(O5-)"#03252016#
|
|
209 star="*"#
|
|
210 star_line="Detected the deletion of O5-."
|
|
211 new_file=open("Seqsero_result.txt","w")
|
|
212 new_file.write("Input files:\t"+file2+"\n"+"O antigen prediction:\t"+"O-"+Otype+"\n"+"H1 antigen prediction(fliC):\t"+fliC+"\n"+"H2 antigen prediction(fljB):\t"+fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")##
|
|
213 new_file.close()
|
|
214 os.system("rm temp_result_"+str(q)+"*.txt")###01/28/2015
|
|
215 os.system("rm result.txt")###01/28/2015
|
|
216 #os.system("rm -rf database")###01/28/2015
|
|
217 os.system("rm *.fasta *.xml *.fa")###01/28/2015
|
|
218
|
|
219
|
|
220 def main():
|
|
221 files1=[]
|
|
222 files1.append(file_name)
|
|
223 file_names=[]
|
|
224 fastq_names=[]
|
|
225 for file1 in files1:
|
|
226 if file1[-6:]=='.fasta' or file1[-4:]=='.fna' or file1[-3:]=='.fa' or file1[-4:]=='.fsa':
|
|
227 file_names.append(file1)
|
|
228 if file1[-9:]==".fastq.gz" or file1[-6:]==".fastq":
|
|
229 core_name=file1[:8]
|
|
230 fastq_names.append(core_name)
|
|
231 fastq_names=list(set(fastq_names))
|
|
232 file_names=file_names+fastq_names
|
|
233 for i in range(0,len(file_names),m):
|
|
234 jobs=[]
|
|
235 txt_names=[]
|
|
236 if len(file_names)>=i+m:
|
|
237 for j in range(m):
|
|
238 p = multiprocessing.Process(target=Test,args=(file_names[j+i],i+j+1,i+j,))
|
|
239 jobs.append(p)
|
|
240 p.start()
|
|
241 else:
|
|
242 t=m+i-len(file_names)
|
|
243 for j in range(m-t):
|
|
244 p = multiprocessing.Process(target=Test,args=(file_names[j+i],i+j+1,i+j,))
|
|
245 jobs.append(p)
|
|
246 p.start()
|
|
247 '''
|
|
248 for j in xrange(len(jobs)):
|
|
249 jobs[j].join()
|
|
250 txt_names.append(file_names[j+i].replace(' ','_').replace(":","__").replace("[","").replace("]","")+".txt")
|
|
251 print txt_names
|
|
252 for j in xrange(len(txt_names)):
|
|
253 print i,"and",j
|
|
254 print i+j+1
|
|
255 file=open(txt_names[j],"r")
|
|
256 handle=list(file)
|
|
257 b=handle[0].split("\t")
|
|
258 print b
|
|
259 sheet.write(i+j+1,0,b[0])
|
|
260 sheet.write(i+j+1,1,b[1])
|
|
261 sheet.write(i+j+1,2,b[2])
|
|
262 sheet.write(i+j+1,3,b[3])
|
|
263 sheet.write(i+j+1,4,b[4])
|
|
264 sheet.write(i+j+1,5,b[5])
|
|
265 sheet.write(i+j+1,6,b[6])
|
|
266 sheet.write(i+j+1,7,b[7])
|
|
267
|
|
268 print "End time,",time.time()
|
|
269 file3.save("Seqsero_result2.xls")
|
|
270 '''
|
|
271
|
|
272
|
|
273 if __name__ == '__main__':
|
|
274 main()
|
|
275
|
|
276
|
|
277
|
|
278
|
|
279
|
|
280
|
|
281
|
|
282
|