Mercurial > repos > cstrittmatter > ss2v110
comparison bin/SeqSero2_package.py @ 0:fc22ec8e924e draft
planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
author | cstrittmatter |
---|---|
date | Tue, 21 Apr 2020 12:45:34 -0400 |
parents | |
children | aa54a94b9aeb |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fc22ec8e924e |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import sys | |
4 import time | |
5 import random | |
6 import os | |
7 import subprocess | |
8 import gzip | |
9 import io | |
10 import pickle | |
11 import argparse | |
12 import itertools | |
13 from distutils.version import LooseVersion | |
14 from distutils.spawn import find_executable | |
15 sys.path.insert(1,sys.path[0]+'/..') | |
16 | |
17 try: | |
18 from .version import SeqSero2_version | |
19 except Exception: #ImportError | |
20 from version import SeqSero2_version | |
21 | |
22 ### SeqSero Kmer | |
23 def parse_args(): | |
24 "Parse the input arguments, use '-h' for help." | |
25 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com\n\nVersion: v1.1.0')#add "-m <data_type>" in future | |
26 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data",type=os.path.abspath) ### add 'type=os.path.abspath' to generate absolute path of input data. | |
27 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore fasta, '6' for nanopore fastq") | |
28 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode") | |
29 parser.add_argument("-p",default="1",help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1") | |
30 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a") | |
31 parser.add_argument("-n",help="<string>: optional, to specify a sample name in the report output") | |
32 parser.add_argument("-d",help="<string>: optional, to specify an output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number") | |
33 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files") | |
34 parser.add_argument("-s",action="store_true",help="<flag>: if '-s' was flagged, SeqSero2 will not output header in SeqSero_result.tsv") | |
35 parser.add_argument("--check",action="store_true",help="<flag>: use '--check' flag to check the required dependencies") | |
36 parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SeqSero2_version) | |
37 return parser.parse_args() | |
38 | |
39 ### check paths of dependencies | |
40 check_dependencies = parse_args().check | |
41 dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py'] | |
42 if check_dependencies: | |
43 for item in dependencies: | |
44 ext_path = find_executable(item) | |
45 if ext_path is not None: | |
46 print ("Using "+item+" - "+ext_path) | |
47 else: | |
48 print ("ERROR: can not find "+item+" in PATH") | |
49 sys.exit() | |
50 ### end of --check | |
51 | |
52 def reverse_complement(sequence): | |
53 complement = { | |
54 'A': 'T', | |
55 'C': 'G', | |
56 'G': 'C', | |
57 'T': 'A', | |
58 'N': 'N', | |
59 'M': 'K', | |
60 'R': 'Y', | |
61 'W': 'W', | |
62 'S': 'S', | |
63 'Y': 'R', | |
64 'K': 'M', | |
65 'V': 'B', | |
66 'H': 'D', | |
67 'D': 'H', | |
68 'B': 'V' | |
69 } | |
70 return "".join(complement[base] for base in reversed(sequence)) | |
71 | |
72 | |
73 def createKmerDict_reads(list_of_strings, kmer): | |
74 kmer_table = {} | |
75 for string in list_of_strings: | |
76 sequence = string.strip('\n') | |
77 for i in range(len(sequence) - kmer + 1): | |
78 new_mer = sequence[i:i + kmer].upper() | |
79 new_mer_rc = reverse_complement(new_mer) | |
80 if new_mer in kmer_table: | |
81 kmer_table[new_mer.upper()] += 1 | |
82 else: | |
83 kmer_table[new_mer.upper()] = 1 | |
84 if new_mer_rc in kmer_table: | |
85 kmer_table[new_mer_rc.upper()] += 1 | |
86 else: | |
87 kmer_table[new_mer_rc.upper()] = 1 | |
88 return kmer_table | |
89 | |
90 | |
91 def multifasta_dict(multifasta): | |
92 multifasta_list = [ | |
93 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0 | |
94 ] | |
95 headers = [i for i in multifasta_list if i[0] == '>'] | |
96 multifasta_dict = {} | |
97 for h in headers: | |
98 start = multifasta_list.index(h) | |
99 for element in multifasta_list[start + 1:]: | |
100 if element[0] == '>': | |
101 break | |
102 else: | |
103 if h[1:] in multifasta_dict: | |
104 multifasta_dict[h[1:]] += element | |
105 else: | |
106 multifasta_dict[h[1:]] = element | |
107 return multifasta_dict | |
108 | |
109 | |
110 def multifasta_single_string(multifasta): | |
111 multifasta_list = [ | |
112 line.strip() for line in open(multifasta, 'r') | |
113 if (len(line.strip()) > 0) and (line.strip()[0] != '>') | |
114 ] | |
115 return ''.join(multifasta_list) | |
116 | |
117 | |
118 def chunk_a_long_sequence(long_sequence, chunk_size=60): | |
119 chunk_list = [] | |
120 steps = len(long_sequence) // 60 #how many chunks | |
121 for i in range(steps): | |
122 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size]) | |
123 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)]) | |
124 return chunk_list | |
125 | |
126 | |
127 def target_multifasta_kmerizer(multifasta, k, kmerDict): | |
128 forward_length = 300 #if find the target, put forward 300 bases | |
129 reverse_length = 2200 #if find the target, put backward 2200 bases | |
130 chunk_size = 60 #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences | |
131 target_mers = [] | |
132 long_single_string = multifasta_single_string(multifasta) | |
133 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size) | |
134 unit_length = len(multifasta_list[0]) | |
135 forward_lines = int(forward_length / unit_length) + 1 | |
136 reverse_lines = int(forward_length / unit_length) + 1 | |
137 start_num = 0 | |
138 end_num = 0 | |
139 for i in range(len(multifasta_list)): | |
140 if i not in range(start_num, end_num): #avoid computational repetition | |
141 line = multifasta_list[i] | |
142 start = int((len(line) - k) // 2) | |
143 s1 = line[start:k + start] | |
144 if s1 in kmerDict: #detect it is a potential read or not (use the middle part) | |
145 if i - forward_lines >= 0: | |
146 start_num = i - forward_lines | |
147 else: | |
148 start_num = 0 | |
149 if i + reverse_lines <= len(multifasta_list) - 1: | |
150 end_num = i + reverse_lines | |
151 else: | |
152 end_num = len(multifasta_list) - 1 | |
153 target_list = [ | |
154 x.strip() for x in multifasta_list[start_num:end_num] | |
155 ] | |
156 target_line = "".join(target_list) | |
157 target_mers += [ | |
158 k1 for k1 in createKmerDict_reads([str(target_line)], k) | |
159 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length) | |
160 else: | |
161 pass | |
162 return set(target_mers) | |
163 | |
164 | |
165 def target_read_kmerizer(file, k, kmerDict): | |
166 i = 1 | |
167 n_reads = 0 | |
168 total_coverage = 0 | |
169 target_mers = [] | |
170 if file.endswith(".gz"): | |
171 file_content = io.BufferedReader(gzip.open(file)) | |
172 else: | |
173 file_content = open(file, "r").readlines() | |
174 for line in file_content: | |
175 start = int((len(line) - k) // 2) | |
176 if i % 4 == 2: | |
177 if file.endswith(".gz"): | |
178 s1 = line[start:k + start].decode() | |
179 line = line.decode() | |
180 else: | |
181 s1 = line[start:k + start] | |
182 if s1 in kmerDict: #detect it is a potential read or not (use the middle part) | |
183 n_reads += 1 | |
184 total_coverage += len(line) | |
185 target_mers += [ | |
186 k1 for k1 in createKmerDict_reads([str(line)], k) | |
187 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length) | |
188 i += 1 | |
189 if total_coverage >= 4000000: | |
190 break | |
191 return set(target_mers) | |
192 | |
193 | |
194 def minion_fasta_kmerizer(file, k, kmerDict): | |
195 i = 1 | |
196 n_reads = 0 | |
197 total_coverage = 0 | |
198 target_mers = {} | |
199 for line in open(file): | |
200 if i % 2 == 0: | |
201 for kmer, rc_kmer in kmers(line.strip().upper(), k): | |
202 if (kmer in kmerDict) or (rc_kmer in kmerDict): | |
203 if kmer in target_mers: | |
204 target_mers[kmer] += 1 | |
205 else: | |
206 target_mers[kmer] = 1 | |
207 if rc_kmer in target_mers: | |
208 target_mers[rc_kmer] += 1 | |
209 else: | |
210 target_mers[rc_kmer] = 1 | |
211 i += 1 | |
212 return set([h for h in target_mers]) | |
213 | |
214 | |
215 def minion_fastq_kmerizer(file, k, kmerDict): | |
216 i = 1 | |
217 n_reads = 0 | |
218 total_coverage = 0 | |
219 target_mers = {} | |
220 for line in open(file): | |
221 if i % 4 == 2: | |
222 for kmer, rc_kmer in kmers(line.strip().upper(), k): | |
223 if (kmer in kmerDict) or (rc_kmer in kmerDict): | |
224 if kmer in target_mers: | |
225 target_mers[kmer] += 1 | |
226 else: | |
227 target_mers[kmer] = 1 | |
228 if rc_kmer in target_mers: | |
229 target_mers[rc_kmer] += 1 | |
230 else: | |
231 target_mers[rc_kmer] = 1 | |
232 i += 1 | |
233 return set([h for h in target_mers]) | |
234 | |
235 | |
236 def multifasta_single_string2(multifasta): | |
237 single_string = '' | |
238 with open(multifasta, 'r') as f: | |
239 for line in f: | |
240 if line.strip()[0] == '>': | |
241 pass | |
242 else: | |
243 single_string += line.strip() | |
244 return single_string | |
245 | |
246 | |
247 def kmers(seq, k): | |
248 rev_comp = reverse_complement(seq) | |
249 for start in range(1, len(seq) - k + 1): | |
250 yield seq[start:start + k], rev_comp[-(start + k):-start] | |
251 | |
252 | |
253 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set | |
254 multi_seq_dict = multifasta_dict(multifasta) | |
255 lib_dict = {} | |
256 for h in multi_seq_dict: | |
257 lib_dict[h] = set( | |
258 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)]) | |
259 return lib_dict | |
260 | |
261 | |
262 def Combine(b, c): | |
263 fliC_combinations = [] | |
264 fliC_combinations.append(",".join(c)) | |
265 temp_combinations = [] | |
266 for i in range(len(b)): | |
267 for x in itertools.combinations(b, i + 1): | |
268 temp_combinations.append(",".join(x)) | |
269 for x in temp_combinations: | |
270 temp = [] | |
271 for y in c: | |
272 temp.append(y) | |
273 temp.append(x) | |
274 temp = ",".join(temp) | |
275 temp = temp.split(",") | |
276 temp.sort() | |
277 temp = ",".join(temp) | |
278 fliC_combinations.append(temp) | |
279 return fliC_combinations | |
280 | |
281 | |
282 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies): | |
283 #like test_output_06012017.txt | |
284 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list) | |
285 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict | |
286 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict] | |
287 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to | |
288 seronames = [] | |
289 seronames_none_subspecies=[] | |
290 for i in range(len(phase1)): | |
291 fliC_combine = [] | |
292 fljB_combine = [] | |
293 if phaseO[i] == Otype: # no VII in KW, but it's there | |
294 ### for fliC, detect every possible combinations to avoid the effect of "[" | |
295 if phase1[i].count("[") == 0: | |
296 fliC_combine.append(phase1[i]) | |
297 elif phase1[i].count("[") >= 1: | |
298 c = [] | |
299 b = [] | |
300 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count( | |
301 "[") == 1: | |
302 content = phase1[i].replace("[", "").replace("]", "") | |
303 fliC_combine.append(content) | |
304 fliC_combine.append("-") | |
305 else: | |
306 for x in phase1[i].split(","): | |
307 if "[" in x: | |
308 b.append(x.replace("[", "").replace("]", "")) | |
309 else: | |
310 c.append(x) | |
311 fliC_combine = Combine( | |
312 b, c | |
313 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t | |
314 ### end of fliC "[" detect | |
315 ### for fljB, detect every possible combinations to avoid the effect of "[" | |
316 if phase2[i].count("[") == 0: | |
317 fljB_combine.append(phase2[i]) | |
318 elif phase2[i].count("[") >= 1: | |
319 d = [] | |
320 e = [] | |
321 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count( | |
322 "[") == 1: | |
323 content = phase2[i].replace("[", "").replace("]", "") | |
324 fljB_combine.append(content) | |
325 fljB_combine.append("-") | |
326 else: | |
327 for x in phase2[i].split(","): | |
328 if "[" in x: | |
329 d.append(x.replace("[", "").replace("]", "")) | |
330 else: | |
331 e.append(x) | |
332 fljB_combine = Combine(d, e) | |
333 ### end of fljB "[" detect | |
334 new_fliC = fliC.split( | |
335 "," | |
336 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings | |
337 new_fliC.sort() | |
338 new_fliC = ",".join(new_fliC) | |
339 new_fljB = fljB.split(",") | |
340 new_fljB.sort() | |
341 new_fljB = ",".join(new_fljB) | |
342 if (new_fliC in fliC_combine | |
343 or fliC in fliC_combine) and (new_fljB in fljB_combine | |
344 or fljB in fljB_combine): | |
345 ######start, remove_list,rename_dict, added on 11/11/2018 | |
346 if sero[i] not in remove_list: | |
347 temp_sero=sero[i] | |
348 if temp_sero in rename_dict: | |
349 temp_sero=rename_dict[temp_sero] #rename if in the rename list | |
350 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider | |
351 if subs[i] == subspecies: | |
352 seronames.append(temp_sero) | |
353 seronames_none_subspecies.append(temp_sero) | |
354 else: | |
355 pass | |
356 else: | |
357 pass | |
358 ######end, added on 11/11/2018 | |
359 #analyze seronames | |
360 subspecies_pointer="" | |
361 if len(seronames) == 0 and len(seronames_none_subspecies)!=0: | |
362 # ed_SL_12182019: modified to fix the subspecies output problem | |
363 #seronames=seronames_none_subspecies | |
364 seronames=["N/A"] | |
365 #subspecies_pointer="1" | |
366 subspecies_pointer="0" | |
367 if len(seronames) == 0: | |
368 seronames = [ | |
369 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)" | |
370 ] | |
371 star = "" | |
372 star_line = "" | |
373 if len(seronames) > 1: #there are two possible predictions for serotypes | |
374 star = "*" | |
375 #changed 04072019 | |
376 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" | |
377 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0: | |
378 star="*" | |
379 star_line="The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme. " + star_line | |
380 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line | |
381 if Otype=="": | |
382 Otype="-" | |
383 predict_form = Otype + ":" + fliC + ":" + fljB | |
384 predict_sero = (" or ").join(seronames) | |
385 ###special test for Enteritidis | |
386 if predict_form == "9:g,m:-": | |
387 sdf = "-" | |
388 for x in special_gene_list: | |
389 if x.startswith("sdf"): | |
390 sdf = "+" | |
391 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis" | |
392 star_line="sdf gene detected. " | |
393 #predict_form = predict_form + " Sdf prediction:" + sdf | |
394 predict_form = predict_form #changed 04072019 | |
395 if sdf == "-": | |
396 star = "*" | |
397 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis" | |
398 star_line="sdf gene not detected. " | |
399 #changed in 04072019, for new output | |
400 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n" | |
401 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement | |
402 predict_sero = "Gallinarum or Enteritidis" | |
403 ###end of special test for Enteritidis | |
404 elif predict_form == "4:i:-": | |
405 predict_sero = "I 4,[5],12:i:-" # change serotype name | |
406 elif predict_form == "4:r:-": | |
407 predict_sero = "N/A (4:r:-)" | |
408 elif predict_form == "4:b:-": | |
409 predict_sero = "N/A (4:b:-)" | |
410 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo | |
411 #predict_sero = "Newport" | |
412 #star = "*" | |
413 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." | |
414 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" | |
415 if "N/A" in predict_sero: | |
416 claim = "" | |
417 #special test for Typhimurium | |
418 if "Typhimurium" in predict_sero or predict_form == "4:i:-": | |
419 normal = 0 | |
420 mutation = 0 | |
421 for x in special_gene_list: | |
422 if "oafA-O-4_full" in x: | |
423 normal = float(special_gene_list[x]) | |
424 elif "oafA-O-4_5-" in x: | |
425 mutation = float(special_gene_list[x]) | |
426 if normal > mutation: | |
427 pass | |
428 elif normal < mutation: | |
429 #predict_sero = predict_sero.strip() + "(O5-)" | |
430 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019 | |
431 star = "*" | |
432 #star_line = "Detected the deletion of O5-." | |
433 star_line = "Detected a deletion that causes O5- variant of Typhimurium. " | |
434 else: | |
435 pass | |
436 #special test for Paratyphi B | |
437 if "Paratyphi B" in predict_sero or predict_form == "4:b:-": | |
438 normal = 0 | |
439 mutation = 0 | |
440 for x in special_gene_list: | |
441 if "gntR-family-regulatory-protein_dt-positive" in x: | |
442 normal = float(special_gene_list[x]) | |
443 elif "gntR-family-regulatory-protein_dt-negative" in x: | |
444 mutation = float(special_gene_list[x]) | |
445 #print(normal,mutation) | |
446 if normal > mutation: | |
447 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019 | |
448 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() | |
449 star = "*" | |
450 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)." | |
451 star_line = "The SNP that causes d-Tartrate nonfermentating phenotype of Paratyphi B was not detected. " | |
452 elif normal < mutation: | |
453 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019 | |
454 predict_sero = predict_sero.strip() | |
455 star = "*" | |
456 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B." | |
457 star_line = "Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B. " | |
458 else: | |
459 star = "*" | |
460 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not." | |
461 star_line = " " ## ed_SL_05152019: do not report this situation. | |
462 #special test for O13,22 and O13,23 | |
463 if Otype=="13": | |
464 #ex_dir = os.path.dirname(os.path.realpath(__file__)) | |
465 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019 | |
466 f = open(ex_dir + '/special.pickle', 'rb') | |
467 special = pickle.load(f) | |
468 O22_O23=special['O22_O23'] | |
469 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze | |
470 O22_score=0 | |
471 O23_score=0 | |
472 for x in special_gene_list: | |
473 if "O:22" in x: | |
474 O22_score = O22_score+float(special_gene_list[x]) | |
475 elif "O:23" in x: | |
476 O23_score = O23_score+float(special_gene_list[x]) | |
477 #print(O22_score,O23_score) | |
478 for z in O22_O23[0]: | |
479 if predict_sero.split(" or ")[0] in z: | |
480 if O22_score > O23_score: | |
481 star = "*" | |
482 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019 | |
483 predict_sero = z[0] | |
484 elif O22_score < O23_score: | |
485 star = "*" | |
486 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019 | |
487 predict_sero = z[1] | |
488 else: | |
489 star = "*" | |
490 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019 | |
491 if " or " in predict_sero: | |
492 star_line = star_line + "The predicted serotypes share the same general formula: " + Otype + ":" + fliC + ":" + fljB + "." | |
493 #special test for O6,8 | |
494 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list | |
495 #for x in merge_O68_list: | |
496 # if x in predict_sero: | |
497 # predict_sero=x | |
498 # star="" | |
499 # star_line="" | |
500 #special test for Montevideo; most of them are monophasic | |
501 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list | |
502 #star="*" | |
503 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination." | |
504 return predict_form, predict_sero, star, star_line, claim | |
505 ### End of SeqSero Kmer part | |
506 | |
507 ### Begin of SeqSero2 allele prediction and output | |
508 def xml_parse_score_comparision_seqsero(xmlfile): | |
509 #used to do seqsero xml analysis | |
510 from Bio.Blast import NCBIXML | |
511 handle=open(xmlfile) | |
512 handle=NCBIXML.parse(handle) | |
513 handle=list(handle) | |
514 List=[] | |
515 List_score=[] | |
516 List_ids=[] | |
517 List_query_region=[] | |
518 for i in range(len(handle)): | |
519 if len(handle[i].alignments)>0: | |
520 for j in range(len(handle[i].alignments)): | |
521 score=0 | |
522 ids=0 | |
523 cover_region=set() #fixed problem that repeated calculation leading percentage > 1 | |
524 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def) | |
525 for z in range(len(handle[i].alignments[j].hsps)): | |
526 hsp=handle[i].alignments[j].hsps[z] | |
527 temp=set(range(hsp.query_start,hsp.query_end)) | |
528 if len(cover_region)==0: | |
529 cover_region=cover_region|temp | |
530 fraction=1 | |
531 else: | |
532 fraction=1-len(cover_region&temp)/float(len(temp)) | |
533 cover_region=cover_region|temp | |
534 if "last" in handle[i].query or "first" in handle[i].query: | |
535 score+=hsp.bits*fraction | |
536 ids+=float(hsp.identities)/handle[i].query_length*fraction | |
537 else: | |
538 score+=hsp.bits*fraction | |
539 ids+=float(hsp.identities)/handle[i].query_length*fraction | |
540 List_score.append(score) | |
541 List_ids.append(ids) | |
542 List_query_region.append(cover_region) | |
543 temp=zip(List,List_score,List_ids,List_query_region) | |
544 Final_list=sorted(temp, key=lambda d:d[1], reverse = True) | |
545 return Final_list | |
546 | |
547 | |
548 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number | |
549 Old=L | |
550 L.sort() | |
551 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]] | |
552 count=[] | |
553 for j in range(len(L)): | |
554 y=0 | |
555 for x in Old: | |
556 if L[j]==x: | |
557 y+=1 | |
558 count.append(y) | |
559 if sort_on_fre!="none": | |
560 d=zip(*sorted(zip(count, L))) | |
561 L=d[1] | |
562 count=d[0] | |
563 return (L,count) | |
564 | |
565 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list): | |
566 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test | |
567 #this is mainly used for | |
568 a=nodes_vs_score_list | |
569 fliC_score=0 | |
570 fljB_score=0 | |
571 for z in a: | |
572 if "fliC" in z[0]: | |
573 fliC_score+=z[1] | |
574 elif "fljB" in z[0]: | |
575 fljB_score+=z[1] | |
576 if fliC_score>=fljB_score: | |
577 role="fliC" | |
578 else: | |
579 role="fljB" | |
580 return (role,abs(fliC_score-fljB_score)) | |
581 | |
582 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed): | |
583 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small) | |
584 #also used when no head or tail got blasted score for the contig | |
585 role="" | |
586 for z in Final_list_passed: | |
587 if node_name in z[0]: | |
588 role=z[0].split("_")[0] | |
589 break | |
590 return role | |
591 | |
592 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed): | |
593 #nodes_list is the c created by c,d=Uniq(nodes) in below function | |
594 first_target="" | |
595 role_list=[] | |
596 for x in nodes_list: | |
597 a=[] | |
598 role="" | |
599 for y in tail_head_list: | |
600 if x in y[0]: | |
601 a.append(y) | |
602 if len(a)==4: | |
603 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
604 if diff<20: | |
605 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
606 elif len(a)==3: | |
607 ###however, if the one with highest score is the fewer one, compare their accumulation score | |
608 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
609 if diff<20: | |
610 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
611 ###end of above score comparison | |
612 elif len(a)==2: | |
613 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37) | |
614 temp=[] | |
615 for z in a: | |
616 temp.append(z[0].split("_")[0]) | |
617 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too | |
618 if len(m)==1: | |
619 pass | |
620 else: | |
621 pass | |
622 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
623 if diff<20: | |
624 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
625 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation | |
626 elif len(a)==1: | |
627 #that one | |
628 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
629 if diff<20: | |
630 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
631 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0" | |
632 else:#a==0 | |
633 #use Final_list_passed best match | |
634 for z in Final_list_passed: | |
635 if x in z[0]: | |
636 role=z[0].split("_")[0] | |
637 break | |
638 #print x,role,len(a) | |
639 role_list.append((role,x)) | |
640 if len(role_list)==2: | |
641 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase | |
642 #just use score to do a final test | |
643 role_list=[] | |
644 for x in nodes_list: | |
645 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
646 role_list.append((role,x)) | |
647 return role_list | |
648 | |
649 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed): | |
650 #used to decide which contig is FliC and which one is fljB | |
651 contigs=[] | |
652 nodes=[] | |
653 for x in Final_list_passed: | |
654 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]: | |
655 nodes.append(x[0].split("___")[1].strip()) | |
656 c,d=Uniq(nodes)#c is node_list | |
657 #print c | |
658 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])] | |
659 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed) | |
660 return roles | |
661 | |
662 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed): | |
663 #decide O based on Final_list | |
664 O_choice="?" | |
665 O_list=[] | |
666 special_genes={} | |
667 nodes=[] | |
668 for x in Final_list_passed: | |
669 if x[0].startswith("O-"): | |
670 nodes.append(x[0].split("___")[1].strip()) | |
671 elif not x[0].startswith("fl"): | |
672 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1] | |
673 #print "special_genes:",special_genes | |
674 c,d=Uniq(nodes) | |
675 #print "potential O antigen contig",c | |
676 final_O=[] | |
677 O_nodes_list=[] | |
678 for x in c:#c is the list for contigs | |
679 temp=0 | |
680 for y in Final_list_passed: | |
681 if x in y[0] and y[0].startswith("O-"): | |
682 final_O.append(y) | |
683 break | |
684 ### O contig has the problem of two genes on same contig, so do additional test | |
685 potenial_new_gene="" | |
686 for x in final_O: | |
687 pointer=0 #for genes merged or not | |
688 #not consider O-1,3,19_not_in_3,10, too short compared with others | |
689 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region | |
690 pointer=x[0].split("___")[1].strip()#store the contig name | |
691 print(pointer) | |
692 if pointer!=0:#it has potential merge event | |
693 for y in Final_list: | |
694 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O | |
695 potenial_new_gene=y | |
696 #print(potenial_new_gene) | |
697 break | |
698 if potenial_new_gene!="": | |
699 print("two differnt genes in same contig, fix it for O antigen") | |
700 print(potenial_new_gene[:3]) | |
701 pointer=0 | |
702 for y in final_O: | |
703 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]: | |
704 pointer=1 | |
705 if pointer!=0: #changed to consider two genes in same contig | |
706 final_O.append(potenial_new_gene) | |
707 ### end of the two genes on same contig test | |
708 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted | |
709 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]): | |
710 #print "$$$No Otype, due to no hit"#may need to be changed | |
711 O_choice="-" | |
712 else: | |
713 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]]) | |
714 O_list=[] | |
715 O_list_less_contamination=[] | |
716 for x in final_O: | |
717 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off | |
718 O_list.append(x[0].split("__")[0]) | |
719 O_nodes_list.append(x[0].split("___")[1]) | |
720 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15: | |
721 O_list_less_contamination.append(x[0].split("__")[0]) | |
722 ### special test for O9,46 and O3,10 family | |
723 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 | |
724 if "O-9,46_wzy" in O_list or "O-9,46_wzy_partial" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 | |
725 O_choice="O-9,46" | |
726 #print "$$$Most possilble Otype: O-9,46" | |
727 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1 | |
728 O_choice="O-9,46,27" | |
729 #print "$$$Most possilble Otype: O-9,46,27" | |
730 else: | |
731 O_choice="O-9"#next, detect O9 vs O2? | |
732 O2=0 | |
733 O9=0 | |
734 for z in special_genes: | |
735 if "tyr-O-9" in z: | |
736 O9=special_genes[z] | |
737 elif "tyr-O-2" in z: | |
738 O2=special_genes[z] | |
739 if O2>O9: | |
740 O_choice="O-2" | |
741 elif O2<O9: | |
742 pass | |
743 else: | |
744 pass | |
745 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility." | |
746 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1 | |
747 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1 | |
748 O_choice="O-3,10" | |
749 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)" | |
750 else: | |
751 O_choice="O-1,3,19" | |
752 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)" | |
753 ### end of special test for O9,46 and O3,10 family | |
754 else: | |
755 try: | |
756 max_score=0 | |
757 for x in final_O: | |
758 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold | |
759 max_score=x[2]#change from x[-1] to x[2],08172018 | |
760 O_choice=x[0].split("_")[0] | |
761 if O_choice=="O-1,3,19": | |
762 O_choice=final_O[1][0].split("_")[0] | |
763 #print "$$$Most possilble Otype: ",O_choice | |
764 except: | |
765 pass | |
766 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)" | |
767 if O_choice=="O-9,46,27" and len(O_list)==2 and "O-4_wzx" in O_list: #special for very low chance sitatuion between O4 and O9,27,46, this is for serotypes like Bredeney and Schwarzengrund (normallly O-4 will have higher score, but sometimes sequencing quality may affect the prediction) | |
768 O_choice="O-4" | |
769 #print "O:",O_choice,O_nodes_list | |
770 Otypes=[] | |
771 for x in O_list: | |
772 if x!="O-1,3,19_not_in_3,10": | |
773 if "O-9,46_" not in x: | |
774 Otypes.append(x.split("_")[0]) | |
775 else: | |
776 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254 | |
777 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"] | |
778 Otypes_uniq,Otypes_fre=Uniq(Otypes) | |
779 contamination_O="" | |
780 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19": | |
781 if len(Otypes_uniq)>2: | |
782 contamination_O="potential contamination from O antigen signals" | |
783 else: | |
784 if len(Otypes_uniq)>1: | |
785 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund | |
786 contamination_O="" | |
787 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund | |
788 contamination_O="" | |
789 else: | |
790 contamination_O="potential contamination from O antigen signals" | |
791 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq | |
792 ### End of SeqSero2 allele prediction and output | |
793 | |
794 def get_input_files(make_dir,input_file,data_type,dirpath): | |
795 #tell input files from datatype | |
796 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)" | |
797 for_fq="" | |
798 rev_fq="" | |
799 os.chdir(make_dir) | |
800 if data_type=="1": | |
801 input_file=input_file[0].split("/")[-1] | |
802 if input_file.endswith(".sra"): | |
803 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True) | |
804 for_fq=input_file.replace(".sra","_1.fastq") | |
805 rev_fq=input_file.replace(".sra","_2.fastq") | |
806 else: | |
807 core_id=input_file.split(".fastq")[0].split(".fq")[0] | |
808 for_fq=core_id+"_1.fastq" | |
809 rev_fq=core_id+"_2.fastq" | |
810 if input_file.endswith(".gz"): | |
811 subprocess.check_call("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True) | |
812 else: | |
813 subprocess.check_call("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True) | |
814 elif data_type=="2": | |
815 for_fq=input_file[0].split("/")[-1] | |
816 rev_fq=input_file[1].split("/")[-1] | |
817 elif data_type=="3": | |
818 input_file=input_file[0].split("/")[-1] | |
819 if input_file.endswith(".sra"): | |
820 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True) | |
821 for_fq=input_file.replace(".sra","_1.fastq") | |
822 else: | |
823 for_fq=input_file | |
824 elif data_type in ["4","5","6"]: | |
825 for_fq=input_file[0].split("/")[-1] | |
826 os.chdir("..") | |
827 return for_fq,rev_fq | |
828 | |
829 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta): | |
830 #get O and H types from Final_list from blast parsing; allele mode | |
831 from Bio import SeqIO | |
832 fliC_choice="-" | |
833 fljB_choice="-" | |
834 fliC_contig="NA" | |
835 fljB_contig="NA" | |
836 fliC_region=set([0]) | |
837 fljB_region=set([0,]) | |
838 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length | |
839 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length | |
840 O_choice="-"#no need to decide O contig for now, should be only one | |
841 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification | |
842 O_choice=O_choice.split("-")[-1].strip() | |
843 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="": | |
844 O_choice="-" | |
845 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB | |
846 #add alignment locations, used for further selection, 03312019 | |
847 for i in range(len(H_contig_roles)): | |
848 x=H_contig_roles[i] | |
849 for y in Final_list_passed: | |
850 if x[1] in y[0] and y[0].startswith(x[0]): | |
851 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],) | |
852 break | |
853 log_file=open("SeqSero_log.txt","a") | |
854 extract_file=open("Extracted_antigen_alleles.fasta","a") | |
855 handle_fasta=list(SeqIO.parse(new_fasta,"fasta")) | |
856 | |
857 #print("O_contigs:") | |
858 log_file.write("O_contigs:\n") | |
859 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n") | |
860 extract_file.write("#O_contigs:\n") | |
861 for x in O_nodes_roles: | |
862 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker | |
863 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1]))) | |
864 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n") | |
865 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n" | |
866 seqs="" | |
867 for z in handle_fasta: | |
868 if x[0].split("___")[-1]==z.description: | |
869 seqs=str(z.seq) | |
870 extract_file.write(title+seqs+"\n") | |
871 if len(H_contig_roles)!=0: | |
872 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output | |
873 else: | |
874 highest_H_coverage=0 | |
875 for x in H_contig_roles: | |
876 #if multiple choices, temporately select the one with longest length for now, will revise in further change | |
877 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list | |
878 fliC_contig=x[1] | |
879 fliC_length=len(x[-1]) | |
880 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13: | |
881 fljB_contig=x[1] | |
882 fljB_length=len(x[-1]) | |
883 for x in Final_list_passed: | |
884 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]: | |
885 fliC_choice=x[0].split("_")[1] | |
886 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]: | |
887 fljB_choice=x[0].split("_")[1] | |
888 elif fliC_choice!="-" and fljB_choice!="-": | |
889 break | |
890 #now remove contigs not in middle core part | |
891 first_allele="NA" | |
892 first_allele_percentage=0 | |
893 for x in Final_list: | |
894 if x[0].startswith("fliC") or x[0].startswith("fljB"): | |
895 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs | |
896 first_allele_percentage=x[2] | |
897 break | |
898 additional_contigs=[] | |
899 for x in Final_list: | |
900 if first_allele in x[0]: | |
901 if (fliC_contig == x[0].split("___")[-1]): | |
902 fliC_region=x[3] | |
903 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]): | |
904 fljB_region=x[3] | |
905 else: | |
906 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1 | |
907 additional_contigs.append(x) | |
908 #else: | |
909 #print x[:3] | |
910 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off | |
911 if first_allele_percentage>0.9: | |
912 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000: | |
913 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) | |
914 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000: | |
915 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) | |
916 else: | |
917 target_region=set()#doesn't do anything | |
918 else: | |
919 target_region=set()#doesn't do anything | |
920 #print(target_region) | |
921 #print(additional_contigs) | |
922 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best | |
923 target_region=target_region2|target_region | |
924 for x in additional_contigs: | |
925 removal=0 | |
926 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0]) | |
927 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off | |
928 removal=1 | |
929 else: | |
930 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part) | |
931 removal=1 | |
932 else: | |
933 pass | |
934 if removal==1: | |
935 for y in H_contig_roles: | |
936 if y[1] in x[0]: | |
937 H_contig_roles.remove(y) | |
938 else: | |
939 pass | |
940 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5) | |
941 #end of removing none-middle contigs | |
942 #print("H_contigs:") | |
943 log_file.write("H_contigs:\n") | |
944 extract_file.write("#H_contigs:\n") | |
945 H_contig_stat=[] | |
946 H1_cont_stat={} | |
947 H2_cont_stat={} | |
948 for i in range(len(H_contig_roles)): | |
949 x=H_contig_roles[i] | |
950 a=0 | |
951 for y in Final_list_passed: | |
952 if x[1] in y[0] and y[0].startswith(x[0]): | |
953 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide | |
954 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it | |
955 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB | |
956 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) | |
957 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n") | |
958 H_contig_roles[i]="can't decide fliC or fljB, may be third phase" | |
959 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n" | |
960 seqs="" | |
961 for z in handle_fasta: | |
962 if x[1]==z.description: | |
963 seqs=str(z.seq) | |
964 extract_file.write(title+seqs+"\n") | |
965 break | |
966 else: | |
967 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) | |
968 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n") | |
969 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n" | |
970 seqs="" | |
971 for z in handle_fasta: | |
972 if x[1]==z.description: | |
973 seqs=str(z.seq) | |
974 extract_file.write(title+seqs+"\n") | |
975 if x[0]=="fliC": | |
976 if y[0].split("_")[1] not in H1_cont_stat: | |
977 H1_cont_stat[y[0].split("_")[1]]=y[2] | |
978 else: | |
979 H1_cont_stat[y[0].split("_")[1]]+=y[2] | |
980 if x[0]=="fljB": | |
981 if y[0].split("_")[1] not in H2_cont_stat: | |
982 H2_cont_stat[y[0].split("_")[1]]=y[2] | |
983 else: | |
984 H2_cont_stat[y[0].split("_")[1]]+=y[2] | |
985 break | |
986 #detect contaminations | |
987 #print(H1_cont_stat) | |
988 #print(H2_cont_stat) | |
989 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2] | |
990 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2] | |
991 contamination_H="" | |
992 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1: | |
993 contamination_H="potential contamination from H antigen signals" | |
994 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA": | |
995 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected" | |
996 #get additional antigens | |
997 """ | |
998 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 | |
999 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 | |
1000 O_choice="O-9,46" | |
1001 #print "$$$Most possilble Otype: O-9,46" | |
1002 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1 | |
1003 O_choice="O-9,46,27" | |
1004 #print "$$$Most possilble Otype: O-9,46,27" | |
1005 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1 | |
1006 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1 | |
1007 O_choice="O-3,10" | |
1008 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)" | |
1009 else: | |
1010 O_choice="O-1,3,19" | |
1011 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)" | |
1012 ### end of special test for O9,46 and O3,10 family | |
1013 | |
1014 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19": | |
1015 if len(Otypes_uniq)>2: | |
1016 contamination_O="potential contamination from O antigen signals" | |
1017 else: | |
1018 if len(Otypes_uniq)>1: | |
1019 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund | |
1020 contamination_O="" | |
1021 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund | |
1022 contamination_O="" | |
1023 """ | |
1024 additonal_antigents=[] | |
1025 #print(contamination_O) | |
1026 #print(contamination_H) | |
1027 log_file.write(contamination_O+"\n") | |
1028 log_file.write(contamination_H+"\n") | |
1029 log_file.close() | |
1030 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list | |
1031 | |
1032 def get_input_K(input_file,lib_dict,data_type,k_size): | |
1033 #kmer mode; get input_Ks from dict and data_type | |
1034 kmers = [] | |
1035 for h in lib_dict: | |
1036 kmers += lib_dict[h] | |
1037 if data_type == '4': | |
1038 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers)) | |
1039 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later | |
1040 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers)) | |
1041 elif data_type == '5':#minion_2d_fasta | |
1042 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers)) | |
1043 if data_type == '6':#minion_2d_fastq | |
1044 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers)) | |
1045 return input_Ks | |
1046 | |
1047 def get_kmer_dict(lib_dict,input_Ks): | |
1048 #kmer mode; get predicted types | |
1049 O_dict = {} | |
1050 H_dict = {} | |
1051 Special_dict = {} | |
1052 for h in lib_dict: | |
1053 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100 | |
1054 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases | |
1055 if h.startswith('O-') and score > 25: | |
1056 O_dict[h] = score | |
1057 if h.startswith('fl') and score > 40: | |
1058 H_dict[h] = score | |
1059 if (h[:2] != 'fl') and (h[:2] != 'O-'): | |
1060 Special_dict[h] = score | |
1061 return O_dict,H_dict,Special_dict | |
1062 | |
1063 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir): | |
1064 log_file=open("SeqSero_log.txt","a") | |
1065 log_file.write("O_scores:\n") | |
1066 #call O: | |
1067 highest_O = '-' | |
1068 if len(O_dict) == 0: | |
1069 pass | |
1070 else: | |
1071 for x in O_dict: | |
1072 log_file.write(x+"\t"+str(O_dict[x])+"\n") | |
1073 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1 | |
1074 #if 'O-9,46_wzy__1191' in O_dict or "O-9,46_wzy_partial__216" in O_dict: # and float(O946_wzy)/float(num_1) > 0.1 | |
1075 #modified to fix miscall of O-9,46 | |
1076 if ('O-9,46_wzy__1191' in O_dict and O_dict['O-9,46_wzy__1191']>40) or ("O-9,46_wzy_partial__216" in O_dict and O_dict["O-9,46_wzy_partial__216"]>40): # and float(O946_wzy)/float(num_1) > 0.1 | |
1077 highest_O = "O-9,46" | |
1078 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1 | |
1079 highest_O = "O-9,46,27" | |
1080 else: | |
1081 highest_O = "O-9" # next, detect O9 vs O2? | |
1082 O2 = 0 | |
1083 O9 = 0 | |
1084 for z in Special_dict: | |
1085 if "tyr-O-9" in z: | |
1086 O9 = float(Special_dict[z]) | |
1087 if "tyr-O-2" in z: | |
1088 O2 = float(Special_dict[z]) | |
1089 if O2 > O9: | |
1090 highest_O = "O-2" | |
1091 elif ("O-3,10_wzx__1539" in O_dict) and ( | |
1092 "O-9,46_wzy__1191" in O_dict | |
1093 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1 | |
1094 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1 | |
1095 highest_O = "O-3,10" | |
1096 else: | |
1097 highest_O = "O-1,3,19" | |
1098 ### end of special test for O9,46 and O3,10 family | |
1099 else: | |
1100 try: | |
1101 max_score = 0 | |
1102 for x in O_dict: | |
1103 if float(O_dict[x]) >= max_score: | |
1104 max_score = float(O_dict[x]) | |
1105 #highest_O = x.split("_")[0] | |
1106 # ed_SL_12182019: modified to fix the O-9,46 error example1 | |
1107 if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict): | |
1108 highest_O = "O-9" | |
1109 else: | |
1110 highest_O = x.split("_")[0] | |
1111 if highest_O == "O-1,3,19": | |
1112 highest_O = '-' | |
1113 max_score = 0 | |
1114 for x in O_dict: | |
1115 if x == 'O-1,3,19_not_in_3,10__130': | |
1116 pass | |
1117 else: | |
1118 if float(O_dict[x]) >= max_score: | |
1119 max_score = float(O_dict[x]) | |
1120 #highest_O = x.split("_")[0] | |
1121 # ed_SL_12182019: modified to fix the O-9,46 error example1 | |
1122 if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict): | |
1123 highest_O = "O-9" | |
1124 else: | |
1125 highest_O = x.split("_")[0] | |
1126 except: | |
1127 pass | |
1128 #call_fliC: | |
1129 if len(H_dict)!=0: | |
1130 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not | |
1131 else: | |
1132 highest_H_score_both_BC=0 | |
1133 highest_fliC = '-' | |
1134 highest_fliC_raw = '-' | |
1135 highest_Score = 0 | |
1136 log_file.write("\nH_scores:\n") | |
1137 for s in H_dict: | |
1138 log_file.write(s+"\t"+str(H_dict[s])+"\n") | |
1139 if s.startswith('fliC'): | |
1140 if float(H_dict[s]) > highest_Score: | |
1141 highest_fliC = s.split('_')[1] | |
1142 highest_fliC_raw = s | |
1143 highest_Score = float(H_dict[s]) | |
1144 #call_fljB | |
1145 highest_fljB = '-' | |
1146 highest_fljB_raw = '-' | |
1147 highest_Score = 0 | |
1148 for s in H_dict: | |
1149 if s.startswith('fljB'): | |
1150 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other | |
1151 #highest_fljB = s.split('_')[1] | |
1152 #highest_fljB_raw = s | |
1153 #highest_Score = float(H_dict[s]) | |
1154 if s.split('_')[1]!=highest_fliC: | |
1155 highest_fljB = s.split('_')[1] | |
1156 highest_fljB_raw = s | |
1157 highest_Score = float(H_dict[s]) | |
1158 log_file.write("\nSpecial_scores:\n") | |
1159 for s in Special_dict: | |
1160 log_file.write(s+"\t"+str(Special_dict[s])+"\n") | |
1161 log_file.close() | |
1162 return highest_O,highest_fliC,highest_fljB | |
1163 | |
1164 def get_temp_file_names(for_fq,rev_fq): | |
1165 #seqsero2 -a; get temp file names | |
1166 sam=for_fq+".sam" | |
1167 bam=for_fq+".bam" | |
1168 sorted_bam=for_fq+"_sorted.bam" | |
1169 mapped_fq1=for_fq+"_mapped.fq" | |
1170 mapped_fq2=rev_fq+"_mapped.fq" | |
1171 combined_fq=for_fq+"_combined.fq" | |
1172 for_sai=for_fq+".sai" | |
1173 rev_sai=rev_fq+".sai" | |
1174 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai | |
1175 | |
1176 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode): | |
1177 #seqsero2 -a; do mapping and sort | |
1178 print("building database...") | |
1179 subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True) | |
1180 print("mapping...") | |
1181 if mapping_mode=="mem": | |
1182 subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True) | |
1183 elif mapping_mode=="sam": | |
1184 if fnameB!="": | |
1185 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True) | |
1186 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True) | |
1187 subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True) | |
1188 else: | |
1189 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True) | |
1190 subprocess.check_call("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam) | |
1191 subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True) | |
1192 ### check the version of samtools then use differnt commands | |
1193 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) | |
1194 out, err = samtools_version.communicate() | |
1195 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip() | |
1196 print("check samtools version:",version) | |
1197 ### end of samtools version check and its analysis | |
1198 if LooseVersion(version)<=LooseVersion("1.2"): | |
1199 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True) | |
1200 else: | |
1201 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True) | |
1202 | |
1203 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode): | |
1204 #seqsero2 -a; extract, assembly and blast | |
1205 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True) | |
1206 #print("fnameA:",fnameA) | |
1207 #print("fnameB:",fnameB) | |
1208 if fnameB!="": | |
1209 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output | |
1210 else: | |
1211 pass | |
1212 outdir=current_time+"_temp" | |
1213 print("assembling...") | |
1214 if int(threads)>4: | |
1215 t="4" | |
1216 else: | |
1217 t=threads | |
1218 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-" | |
1219 if fnameB!="": | |
1220 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True) | |
1221 else: | |
1222 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True) | |
1223 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta" | |
1224 #new_fasta=fnameA+"_"+database.split('/')[-1]+"_"+mapping_mode+".fasta" # change path to databse for packaging | |
1225 subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True) | |
1226 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta | |
1227 subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True) | |
1228 print("blasting...","\n") | |
1229 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml" | |
1230 subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015 | |
1231 subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10" | |
1232 else: | |
1233 xmlfile="NA" | |
1234 return xmlfile,new_fasta | |
1235 | |
1236 def judge_subspecies(fnameA): | |
1237 #seqsero2 -a; judge subspecies on just forward raw reads fastq | |
1238 salmID_output=subprocess.Popen("SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE) | |
1239 out, err = salmID_output.communicate() | |
1240 out=out.decode("utf-8") | |
1241 file=open("data_log.txt","a") | |
1242 file.write(out) | |
1243 file.close() | |
1244 salm_species_scores=out.split("\n")[1].split("\t")[6:] | |
1245 salm_species_results=out.split("\n")[0].split("\t")[6:] | |
1246 max_score=0 | |
1247 max_score_index=1 #default is 1, means "I" | |
1248 for i in range(len(salm_species_scores)): | |
1249 if max_score<float(salm_species_scores[i]): | |
1250 max_score=float(salm_species_scores[i]) | |
1251 max_score_index=i | |
1252 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0] | |
1253 #if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare | |
1254 if float(out.split("\n")[1].split("\t")[4]) > 10 and float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): ## ed_SL_0318: change SalmID_ssp_threshold | |
1255 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part | |
1256 #if max_score<10: ## ed_SL_0318: change SalmID_ssp_threshold | |
1257 if max_score<60: | |
1258 prediction="-" | |
1259 return prediction | |
1260 | |
1261 def judge_subspecies_Kmer(Special_dict): | |
1262 #seqsero2 -k; | |
1263 max_score=0 | |
1264 prediction="-" #default should be I | |
1265 for x in Special_dict: | |
1266 #if "mer" in x: ## ed_SL_0318: change ssp_threshold | |
1267 if "mer" in x and float(Special_dict[x]) > 60: | |
1268 if max_score<float(Special_dict[x]): | |
1269 max_score=float(Special_dict[x]) | |
1270 prediction=x.split("_")[-1].strip() | |
1271 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica | |
1272 prediction="bongori" | |
1273 break | |
1274 return prediction | |
1275 | |
1276 ## ed_SL_11232019: add notes for missing antigen | |
1277 def check_antigens(ssp,O_antigen,H1_antigen,H2_antigen,NA_note): | |
1278 antigen_note = '' | |
1279 if ssp != '-': | |
1280 if O_antigen != '-' and H1_antigen == '-' and H2_antigen == '-': # O:-:- | |
1281 antigen_note = 'H antigens were not detected. This is an atypical result that should be further investigated. Most Salmonella strains have at least fliC, encoding the Phase 1 H antigen, even if it is not expressed. ' | |
1282 NA_note = '' | |
1283 elif O_antigen != '-' and H1_antigen == '-' and H2_antigen != '-': # O:-:H2 | |
1284 antigen_note = 'fliC was not detected. This is an atypical result that should be further investigated. Most Salmonella strains have fliC, encoding the Phase 1 H antigen, even if it is not expressed. ' | |
1285 NA_note = '' | |
1286 elif O_antigen == '-' and H1_antigen != '-': # -:H1:X | |
1287 antigen_note = 'O antigen was not detected. This result may be due to a rough strain that has deleted the rfb region. For raw reads input, the k-mer workflow is sometimes more sensitive than the microassembly workflow in detecting O antigen. Caution should be used with this approach because the k-mer result may be due to low levels of contamination. ' | |
1288 NA_note = '' | |
1289 elif O_antigen == '-' and H1_antigen == '-' and H2_antigen == '-': # -:-:- | |
1290 antigen_note = 'No serotype antigens were detected. This is an atypical result that should be further investigated. ' | |
1291 NA_note = '' | |
1292 else: | |
1293 antigen_note = 'The input genome cannot be identified as Salmonella. Check the input for taxonomic ID, contamination, or sequencing quality. ' | |
1294 NA_note = '' | |
1295 # if [O_antigen, H1_antigen, H2_antigen].count('-') >= 2: | |
1296 # antigen_note = 'No subspecies marker was detected and less than 2 serotype antigens were detected; further, this genome was not identified as Salmonella. This is an atypical result that should be further investigated. ' | |
1297 # else: | |
1298 # antigen_note = 'No subspecies marker was detected. This genome may not be Salmonella. This is an atypical result that should be further investigated. ' | |
1299 return (antigen_note,NA_note) | |
1300 | |
1301 def main(): | |
1302 #combine SeqSeroK and SeqSero2, also with SalmID | |
1303 args = parse_args() | |
1304 input_file = args.i | |
1305 data_type = args.t | |
1306 analysis_mode = args.m | |
1307 mapping_mode=args.b | |
1308 threads=args.p | |
1309 make_dir=args.d | |
1310 clean_mode=args.c | |
1311 sample_name=args.n | |
1312 ingore_header=args.s | |
1313 k_size=27 #will change for bug fixing | |
1314 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__))) | |
1315 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: add ex_dir for packaging | |
1316 seqsero2_db=ex_dir+"/H_and_O_and_specific_genes.fasta" # ed_SL_11092019: change path to database for packaging | |
1317 database="H_and_O_and_specific_genes.fasta" | |
1318 note="Note: " | |
1319 NA_note="This predicted serotype is not in the Kauffman-White scheme. " # ed_SL_09272019: add for new output format | |
1320 if len(sys.argv)==1: | |
1321 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file | |
1322 else: | |
1323 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime()) | |
1324 request_id += str(random.randint(1, 10000000)) | |
1325 if make_dir is None: | |
1326 make_dir="SeqSero_result_"+request_id | |
1327 if os.path.isdir(make_dir): | |
1328 pass | |
1329 else: | |
1330 subprocess.check_call(["mkdir",make_dir]) | |
1331 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) | |
1332 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) | |
1333 subprocess.check_call("ln -f -s "+seqsero2_db+" "+" ".join(input_file)+" "+make_dir,shell=True) # ed_SL_11092019: change path to database for packaging | |
1334 #subprocess.check_call("ln -f -s "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument). | |
1335 ############################begin the real analysis | |
1336 if analysis_mode=="a": | |
1337 if data_type in ["1","2","3"]:#use allele mode | |
1338 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath) | |
1339 os.chdir(make_dir) | |
1340 ###add a function to tell input files | |
1341 fnameA=for_fq.split("/")[-1] | |
1342 fnameB=rev_fq.split("/")[-1] | |
1343 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) | |
1344 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id | |
1345 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort | |
1346 ### avoid error out when micro assembly fails. ed_SL_03172020 | |
1347 try: | |
1348 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast | |
1349 except (UnboundLocalError, subprocess.CalledProcessError): | |
1350 xmlfile="NA" | |
1351 H1_cont_stat_list=[] | |
1352 H2_cont_stat_list=[] | |
1353 ### | |
1354 if xmlfile=="NA": | |
1355 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","") | |
1356 else: | |
1357 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results | |
1358 file=open("data_log.txt","a") | |
1359 for x in Final_list: | |
1360 file.write("\t".join(str(y) for y in x)+"\n") | |
1361 file.close() | |
1362 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] | |
1363 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB | |
1364 subspecies=judge_subspecies(fnameA) #predict subspecies | |
1365 ###output | |
1366 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies) | |
1367 claim="" #04132019, disable claim for new report requirement | |
1368 contamination_report="" | |
1369 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0] | |
1370 if contamination_O!="" and contamination_H=="": | |
1371 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"." | |
1372 elif contamination_O=="" and contamination_H!="": | |
1373 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"." | |
1374 elif contamination_O!="" and contamination_H!="": | |
1375 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"." | |
1376 if contamination_report!="": | |
1377 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019 | |
1378 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles. " | |
1379 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version | |
1380 #if contamination_report+star_line+claim=="": #0413, new output style | |
1381 # note="" | |
1382 #else: | |
1383 # note="Note:" | |
1384 | |
1385 ### ed_SL_11232019: add notes for missing antigen | |
1386 if O_choice=="": | |
1387 O_choice="-" | |
1388 antigen_note,NA_note=check_antigens(subspecies,O_choice,fliC_choice,fljB_choice,NA_note) | |
1389 if sample_name: | |
1390 print ("Sample name:\t"+sample_name) | |
1391 ### | |
1392 | |
1393 if clean_mode: | |
1394 subprocess.check_call("rm -rf ../"+make_dir,shell=True) | |
1395 make_dir="none-output-directory due to '-c' flag" | |
1396 else: | |
1397 new_file=open("SeqSero_result.txt","w") | |
1398 ### ed_SL_01152020: add new output | |
1399 conta_note="yes" if "inter-serotype contamination" in contamination_report else "no" | |
1400 tsv_file=open("SeqSero_result.tsv","w") | |
1401 if ingore_header: | |
1402 pass | |
1403 else: | |
1404 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted subspecies\tPredicted antigenic profile\tPredicted serotype\tPotential inter-serotype contamination\tNote\n") | |
1405 if sample_name: | |
1406 new_file.write("Sample name:\t"+sample_name+"\n") | |
1407 tsv_file.write(sample_name+'\t') | |
1408 else: | |
1409 tsv_file.write(input_file[0].split('/')[-1]+'\t') | |
1410 ### | |
1411 if "N/A" not in predict_sero: | |
1412 new_file.write("Output directory:\t"+make_dir+"\n"+ | |
1413 "Input files:\t"+"\t".join(input_file)+"\n"+ | |
1414 "O antigen prediction:\t"+O_choice+"\n"+ | |
1415 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+ | |
1416 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+ | |
1417 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1418 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1419 "Predicted serotype:\t"+predict_sero+"\n"+ | |
1420 note+contamination_report+star_line+claim+antigen_note+"\n")#+## | |
1421 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies+"\t"+predict_form+"\t"+predict_sero+"\t"+conta_note+"\t"+contamination_report+star_line+claim+antigen_note+"\n") | |
1422 else: | |
1423 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n" | |
1424 star_line="" #04132019, for new output requirement, diable star_line if "NA" in output | |
1425 new_file.write("Output directory:\t"+make_dir+"\n"+ | |
1426 "Input files:\t"+"\t".join(input_file)+"\n"+ | |
1427 "O antigen prediction:\t"+O_choice+"\n"+ | |
1428 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+ | |
1429 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+ | |
1430 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1431 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1432 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, add subspecies | |
1433 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")#+## | |
1434 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+conta_note+"\t"+NA_note+contamination_report+star_line+claim+antigen_note+"\n") | |
1435 new_file.close() | |
1436 tsv_file.close() | |
1437 #subprocess.check_call("cat Seqsero_result.txt",shell=True) | |
1438 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True) | |
1439 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True) | |
1440 if "N/A" not in predict_sero: | |
1441 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+## | |
1442 print("Output directory:\t"+make_dir+"\n"+ | |
1443 "Input files:\t"+"\t".join(input_file)+"\n"+ | |
1444 "O antigen prediction:\t"+O_choice+"\n"+ | |
1445 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+ | |
1446 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+ | |
1447 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1448 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1449 "Predicted serotype:\t"+predict_sero+"\n"+ | |
1450 note+contamination_report+star_line+claim+antigen_note+"\n")#+## | |
1451 else: | |
1452 print("Output directory:\t"+make_dir+"\n"+ | |
1453 "Input files:\t"+"\t".join(input_file)+"\n"+ | |
1454 "O antigen prediction:\t"+O_choice+"\n"+ | |
1455 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+ | |
1456 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+ | |
1457 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1458 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1459 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies | |
1460 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n") | |
1461 else: | |
1462 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'") | |
1463 elif analysis_mode=="k": | |
1464 #ex_dir = os.path.dirname(os.path.realpath(__file__)) | |
1465 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: change ex_dir for packaging | |
1466 #output_mode = args.mode | |
1467 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath) | |
1468 input_file = for_fq #-k will just use forward because not all reads were used | |
1469 os.chdir(make_dir) | |
1470 f = open(ex_dir + '/antigens.pickle', 'rb') | |
1471 lib_dict = pickle.load(f) | |
1472 f.close | |
1473 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size) | |
1474 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks) | |
1475 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir) | |
1476 subspecies=judge_subspecies_Kmer(Special_dict) | |
1477 if subspecies=="IIb" or subspecies=="IIa": | |
1478 subspecies="II" | |
1479 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes( | |
1480 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies) | |
1481 claim="" #no claim any more based on new output requirement | |
1482 #if star_line+claim=="": #0413, new output style | |
1483 # note="" | |
1484 #else: | |
1485 # note="Note:" | |
1486 | |
1487 ### ed_SL_11232019: add notes for missing antigen | |
1488 if highest_O.split('-')[-1]=="": | |
1489 O_choice="-" | |
1490 else: | |
1491 O_choice=highest_O.split('-')[-1] | |
1492 antigen_note,NA_note=check_antigens(subspecies,O_choice,highest_fliC,highest_fljB,NA_note) | |
1493 if sample_name: | |
1494 print ("Sample name:\t"+sample_name) | |
1495 ### | |
1496 | |
1497 if clean_mode: | |
1498 subprocess.check_call("rm -rf ../"+make_dir,shell=True) | |
1499 make_dir="none-output-directory due to '-c' flag" | |
1500 # ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c" | |
1501 # if highest_O.split('-')[-1]=="": | |
1502 # O_choice="-" | |
1503 # else: | |
1504 # O_choice=highest_O.split('-')[-1] | |
1505 # ### | |
1506 else: | |
1507 # if highest_O.split('-')[-1]=="": | |
1508 # O_choice="-" | |
1509 # else: | |
1510 # O_choice=highest_O.split('-')[-1] | |
1511 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero) | |
1512 new_file=open("SeqSero_result.txt","w") | |
1513 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## | |
1514 ### ed_SL_01152020: add new output | |
1515 tsv_file=open("SeqSero_result.tsv","w") | |
1516 if ingore_header: | |
1517 pass | |
1518 else: | |
1519 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted subspecies\tPredicted antigenic profile\tPredicted serotype\tNote\n") | |
1520 if sample_name: | |
1521 new_file.write("Sample name:\t"+sample_name+"\n") | |
1522 tsv_file.write(sample_name+'\t') | |
1523 else: | |
1524 tsv_file.write(input_file.split('/')[-1]+'\t') | |
1525 ### | |
1526 if "N/A" not in predict_sero: | |
1527 new_file.write("Output directory:\t"+make_dir+"\n"+ | |
1528 "Input files:\t"+input_file+"\n"+ | |
1529 "O antigen prediction:\t"+O_choice+"\n"+ | |
1530 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+ | |
1531 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+ | |
1532 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1533 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1534 "Predicted serotype:\t"+predict_sero+"\n"+ | |
1535 note+star_line+claim+antigen_note+"\n")#+## | |
1536 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies+"\t"+predict_form+"\t"+predict_sero+"\t"+star_line+claim+antigen_note+"\n") | |
1537 else: | |
1538 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n" | |
1539 star_line = "" #changed for new output requirement, 04132019 | |
1540 new_file.write("Output directory:\t"+make_dir+"\n"+ | |
1541 "Input files:\t"+input_file+"\n"+ | |
1542 "O antigen prediction:\t"+O_choice+"\n"+ | |
1543 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+ | |
1544 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+ | |
1545 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1546 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1547 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies | |
1548 note+NA_note+star_line+claim+antigen_note+"\n")#+## | |
1549 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+NA_note+star_line+claim+antigen_note+"\n") | |
1550 new_file.close() | |
1551 tsv_file.close() | |
1552 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True) | |
1553 if "N/A" not in predict_sero: | |
1554 print("Output directory:\t"+make_dir+"\n"+ | |
1555 "Input files:\t"+input_file+"\n"+ | |
1556 "O antigen prediction:\t"+O_choice+"\n"+ | |
1557 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+ | |
1558 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+ | |
1559 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1560 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1561 "Predicted serotype:\t"+predict_sero+"\n"+ | |
1562 note+star_line+claim+antigen_note+"\n")#+## | |
1563 else: | |
1564 print("Output directory:\t"+make_dir+"\n"+ | |
1565 "Input files:\t"+input_file+"\n"+ | |
1566 "O antigen prediction:\t"+O_choice+"\n"+ | |
1567 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+ | |
1568 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+ | |
1569 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1570 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1571 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies | |
1572 note+NA_note+star_line+claim+antigen_note+"\n")#+## | |
1573 | |
1574 if __name__ == '__main__': | |
1575 main() |