comparison SeqSero2_package.py @ 0:fc22ec8e924e draft

planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a
author cstrittmatter
date Tue, 21 Apr 2020 12:45:34 -0400
1 #!/usr/bin/env python3
2
3 import sys
4 import time
5 import random
6 import os
7 import subprocess
8 import gzip
9 import io
10 import pickle
11 import argparse
12 import itertools
13 import math
14 from distutils.version import LooseVersion
15 from distutils.spawn import find_executable
16
17 try:
18 from .version import SeqSero2_version
19 except Exception: #ImportError
20 from version import SeqSero2_version
21
22 ### SeqSero Kmer
23 def parse_args():
24 "Parse the input arguments, use '-h' for help."
25 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDeveloper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email: seqsero@gmail.com\n\nVersion: v1.0.2')#add "-m <data_type>" in future
26 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data",type=os.path.abspath) ### ed_SL_05282019: add 'type=os.path.abspath' to generate absolute path of input data.
27 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore fasta, '6' for nanopore fastq")
28 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode")
29 parser.add_argument("-p",default="1",help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1")
30 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a")
31 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
32 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files")
33 parser.add_argument("--check",action="store_true",help="<flag>: use '--check' flag to check the required dependencies")
34 parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SeqSero2_version)
35 return parser.parse_args()
36
37 ### ed_SL_05282019: check paths of dependencies
38 check_dependencies = parse_args().check
39 dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py']
40 if check_dependencies:
41 for item in dependencies:
42 ext_path = find_executable(item)
43 if ext_path is not None:
44 print ("Using "+item+" - "+ext_path)
45 else:
46 print ("ERROR: can not find "+item+" in PATH")
47 sys.exit()
48 ### end of --check
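# Illustrative invocations (not part of the original script; file names are hypothetical):
#   SeqSero2_package.py --check                              # verify the required dependencies
#   SeqSero2_package.py -t 2 -m a -p 4 -i R1.fastq R2.fastq  # allele mode on separated paired-end reads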
49
50 def reverse_complement(sequence):
51 complement = {
52 'A': 'T',
53 'C': 'G',
54 'G': 'C',
55 'T': 'A',
56 'N': 'N',
57 'M': 'K',
58 'R': 'Y',
59 'W': 'W',
60 'S': 'S',
61 'Y': 'R',
62 'K': 'M',
63 'V': 'B',
64 'H': 'D',
65 'D': 'H',
66 'B': 'V'
67 }
68 return "".join(complement[base] for base in reversed(sequence))
69
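# Illustrative usage (not part of the original script): IUPAC ambiguity codes are
# handled alongside plain bases, e.g.
#   reverse_complement("ATGC")  -> "GCAT"
#   reverse_complement("AMRN")  -> "NYKT"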
70
71 def createKmerDict_reads(list_of_strings, kmer):
72 kmer_table = {}
73 for string in list_of_strings:
74 sequence = string.strip('\n')
75 for i in range(len(sequence) - kmer + 1):
76 new_mer = sequence[i:i + kmer].upper()
77 new_mer_rc = reverse_complement(new_mer)
78 if new_mer in kmer_table:
79 kmer_table[new_mer.upper()] += 1
80 else:
81 kmer_table[new_mer.upper()] = 1
82 if new_mer_rc in kmer_table:
83 kmer_table[new_mer_rc.upper()] += 1
84 else:
85 kmer_table[new_mer_rc.upper()] = 1
86 return kmer_table
87
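# Illustrative usage (not part of the original script): both strands are counted,
# so kmerizing "ATGCA" with k=3 gives
#   createKmerDict_reads(["ATGCA"], 3)  -> {'ATG': 1, 'CAT': 1, 'TGC': 2, 'GCA': 2}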
88
89 def multifasta_dict(multifasta):
90 multifasta_list = [
91 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
92 ]
93 headers = [i for i in multifasta_list if i[0] == '>']
94 multifasta_dict = {}
95 for h in headers:
96 start = multifasta_list.index(h)
97 for element in multifasta_list[start + 1:]:
98 if element[0] == '>':
99 break
100 else:
101 if h[1:] in multifasta_dict:
102 multifasta_dict[h[1:]] += element
103 else:
104 multifasta_dict[h[1:]] = element
105 return multifasta_dict
106
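# Illustrative usage (not part of the original script): multi-line records are
# concatenated under their header, e.g. a FASTA with the lines
#   >alleleA / ATG / CCC / >alleleB / GGG   (one entry per line)
# parses to {'alleleA': 'ATGCCC', 'alleleB': 'GGG'}.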
107
108 def multifasta_single_string(multifasta):
109 multifasta_list = [
110 line.strip() for line in open(multifasta, 'r')
111 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
112 ]
113 return ''.join(multifasta_list)
114
115
116 def chunk_a_long_sequence(long_sequence, chunk_size=60):
117 chunk_list = []
118 steps = len(long_sequence) // chunk_size #how many full chunks
119 for i in range(steps):
120 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
121 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
122 return chunk_list
123
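# Illustrative usage (not part of the original script): a 130-base sequence is
# split into two full 60-base chunks plus the 10-base remainder:
#   [len(c) for c in chunk_a_long_sequence("A" * 130)]  -> [60, 60, 10]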
124
125 def target_multifasta_kmerizer(multifasta, k, kmerDict):
126 forward_length = 300 #if the target is found, extend 300 bases forward
127 reverse_length = 2200 #if the target is found, extend 2200 bases backward
128 chunk_size = 60 #the single long sequence is first chunked into smaller sequences; this controls their size
129 target_mers = []
130 long_single_string = multifasta_single_string(multifasta)
131 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
132 unit_length = len(multifasta_list[0])
133 forward_lines = int(forward_length / unit_length) + 1
134 reverse_lines = int(reverse_length / unit_length) + 1
135 start_num = 0
136 end_num = 0
137 for i in range(len(multifasta_list)):
138 if i not in range(start_num, end_num): #avoid computational repetition
139 line = multifasta_list[i]
140 start = int((len(line) - k) // 2)
141 s1 = line[start:k + start]
142 if s1 in kmerDict: #check whether this chunk is a potential target (using its middle k-mer)
143 if i - forward_lines >= 0:
144 start_num = i - forward_lines
145 else:
146 start_num = 0
147 if i + reverse_lines <= len(multifasta_list) - 1:
148 end_num = i + reverse_lines
149 else:
150 end_num = len(multifasta_list) - 1
151 target_list = [
152 x.strip() for x in multifasta_list[start_num:end_num]
153 ]
154 target_line = "".join(target_list)
155 target_mers += [
156 k1 for k1 in createKmerDict_reads([str(target_line)], k)
157 ] ##renamed k to k1 to avoid confusing this loop variable (a k-mer) with the "k" above (the k-mer length)
158 else:
159 pass
160 return set(target_mers)
161
162
163 def target_read_kmerizer(file, k, kmerDict):
164 i = 1
165 n_reads = 0
166 total_coverage = 0
167 target_mers = []
168 if file.endswith(".gz"):
169 file_content = io.BufferedReader(gzip.open(file))
170 else:
171 file_content = open(file, "r").readlines()
172 for line in file_content:
173 start = int((len(line) - k) // 2)
174 if i % 4 == 2:
175 if file.endswith(".gz"):
176 s1 = line[start:k + start].decode()
177 line = line.decode()
178 else:
179 s1 = line[start:k + start]
180 if s1 in kmerDict: #check whether this read is a potential target (using its middle k-mer)
181 n_reads += 1
182 total_coverage += len(line)
183 target_mers += [
184 k1 for k1 in createKmerDict_reads([str(line)], k)
185 ] #renamed k to k1 to avoid confusing this loop variable (a k-mer) with the "k" above (the k-mer length)
186 i += 1
187 if total_coverage >= 4000000:
188 break
189 return set(target_mers)
190
191
192 def minion_fasta_kmerizer(file, k, kmerDict):
193 i = 1
194 n_reads = 0
195 total_coverage = 0
196 target_mers = {}
197 for line in open(file):
198 if i % 2 == 0:
199 for kmer, rc_kmer in kmers(line.strip().upper(), k):
200 if (kmer in kmerDict) or (rc_kmer in kmerDict):
201 if kmer in target_mers:
202 target_mers[kmer] += 1
203 else:
204 target_mers[kmer] = 1
205 if rc_kmer in target_mers:
206 target_mers[rc_kmer] += 1
207 else:
208 target_mers[rc_kmer] = 1
209 i += 1
210 return set([h for h in target_mers])
211
212
213 def minion_fastq_kmerizer(file, k, kmerDict):
214 i = 1
215 n_reads = 0
216 total_coverage = 0
217 target_mers = {}
218 for line in open(file):
219 if i % 4 == 2:
220 for kmer, rc_kmer in kmers(line.strip().upper(), k):
221 if (kmer in kmerDict) or (rc_kmer in kmerDict):
222 if kmer in target_mers:
223 target_mers[kmer] += 1
224 else:
225 target_mers[kmer] = 1
226 if rc_kmer in target_mers:
227 target_mers[rc_kmer] += 1
228 else:
229 target_mers[rc_kmer] = 1
230 i += 1
231 return set([h for h in target_mers])
232
233
234 def multifasta_single_string2(multifasta):
235 single_string = ''
236 with open(multifasta, 'r') as f:
237 for line in f:
238 if line.strip()[0] == '>':
239 pass
240 else:
241 single_string += line.strip()
242 return single_string
243
244
245 def kmers(seq, k):
246 rev_comp = reverse_complement(seq)
247 for start in range(1, len(seq) - k + 1):
248 yield seq[start:start + k], rev_comp[-(start + k):-start]
249
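# Illustrative note (not part of the original script): the generator pairs each
# k-mer of seq with its reverse complement, starting from position 1 (the k-mer
# at position 0 is skipped by range(1, ...) above):
#   list(kmers("ATGCA", 3))  -> [('TGC', 'GCA'), ('GCA', 'TGC')]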
250
251 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
252 multi_seq_dict = multifasta_dict(multifasta)
253 lib_dict = {}
254 for h in multi_seq_dict:
255 lib_dict[h] = set(
256 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
257 return lib_dict
258
259
260 def Combine(b, c):
261 fliC_combinations = []
262 fliC_combinations.append(",".join(c))
263 temp_combinations = []
264 for i in range(len(b)):
265 for x in itertools.combinations(b, i + 1):
266 temp_combinations.append(",".join(x))
267 for x in temp_combinations:
268 temp = []
269 for y in c:
270 temp.append(y)
271 temp.append(x)
272 temp = ",".join(temp)
273 temp = temp.split(",")
274 temp.sort()
275 temp = ",".join(temp)
276 fliC_combinations.append(temp)
277 return fliC_combinations
278
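# Illustrative usage (not part of the original script): for a formula such as
# f,[g],t the optional antigen "g" goes into b and the fixed antigens into c:
#   Combine(["g"], ["f", "t"])  -> ['f,t', 'f,g,t']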
279
280 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
281 #like test_output_06012017.txt
282 #more variables like sdf-type and sub-species-type can be added in the future (they can be folded into a special-gene-list)
283 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
284 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
285 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used to decide whether a serotype has already been merged/renamed
286 seronames = []
287 seronames_none_subspecies=[]
288 for i in range(len(phase1)):
289 fliC_combine = []
290 fljB_combine = []
291 if phaseO[i] == Otype: # no VII in KW, but it's there
292 ### for fliC, detect every possible combinations to avoid the effect of "["
293 if phase1[i].count("[") == 0:
294 fliC_combine.append(phase1[i])
295 elif phase1[i].count("[") >= 1:
296 c = []
297 b = []
298 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
299 "[") == 1:
300 content = phase1[i].replace("[", "").replace("]", "")
301 fliC_combine.append(content)
302 fliC_combine.append("-")
303 else:
304 for x in phase1[i].split(","):
305 if "[" in x:
306 b.append(x.replace("[", "").replace("]", ""))
307 else:
308 c.append(x)
309 fliC_combine = Combine(
310 b, c
311 ) #Combine returns every possible combination of the formula, e.g. f,[g],t gives f,t and f,g,t
312 ### end of fliC "[" detect
313 ### for fljB, detect every possible combinations to avoid the effect of "["
314 if phase2[i].count("[") == 0:
315 fljB_combine.append(phase2[i])
316 elif phase2[i].count("[") >= 1:
317 d = []
318 e = []
319 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
320 "[") == 1:
321 content = phase2[i].replace("[", "").replace("]", "")
322 fljB_combine.append(content)
323 fljB_combine.append("-")
324 else:
325 for x in phase2[i].split(","):
326 if "[" in x:
327 d.append(x.replace("[", "").replace("]", ""))
328 else:
329 e.append(x)
330 fljB_combine = Combine(d, e)
331 ### end of fljB "[" detect
332 new_fliC = fliC.split(
333 ","
334 ) #some antigens like r,[i] do not follow alphabetical order, so sort before comparing to avoid missed matches
335 new_fliC.sort()
336 new_fliC = ",".join(new_fliC)
337 new_fljB = fljB.split(",")
338 new_fljB.sort()
339 new_fljB = ",".join(new_fljB)
340 if (new_fliC in fliC_combine
341 or fliC in fliC_combine) and (new_fljB in fljB_combine
342 or fljB in fljB_combine):
343 ######start, remove_list,rename_dict, added on 11/11/2018
344 if sero[i] not in remove_list:
345 temp_sero=sero[i]
346 if temp_sero in rename_dict:
347 temp_sero=rename_dict[temp_sero] #rename if in the rename list
348 if temp_sero not in seronames:#the new serotype may already be included; if so, skip it
349 if subs[i] == subspecies:
350 seronames.append(temp_sero)
351 seronames_none_subspecies.append(temp_sero)
352 else:
353 pass
354 else:
355 pass
356 ######end, added on 11/11/2018
357 #analyze seronames
358 subspecies_pointer=""
359 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
360 seronames=seronames_none_subspecies
361 subspecies_pointer="1"
362 if len(seronames) == 0:
363 seronames = [
364 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
365 ]
366 star = ""
367 star_line = ""
368 if len(seronames) > 1: #there are two possible predictions for serotypes
369 star = "*"
370 #changed 04072019
371 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
372 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
373 star="*"
374 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
375 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
376 if Otype=="":
377 Otype="-"
378 predict_form = Otype + ":" + fliC + ":" + fljB
379 predict_sero = (" or ").join(seronames)
380 ###special test for Enteritidis
381 if predict_form == "9:g,m:-":
382 sdf = "-"
383 for x in special_gene_list:
384 if x.startswith("sdf"):
385 sdf = "+"
386 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
387 star_line=" sdf gene detected." # ed_SL_04152019: new output format
388 #predict_form = predict_form + " Sdf prediction:" + sdf
389 predict_form = predict_form #changed 04072019
390 if sdf == "-":
391 star = "*"
392 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
393 star_line=" sdf gene not detected." # ed_SL_04152019: new output format
394 #changed in 04072019, for new output
395 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
396 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
397 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
398 ###end of special test for Enteritidis
399 elif predict_form == "4:i:-":
400 predict_sero = "I 4,[5],12:i:-" # ed_SL_09242019: change serotype name
401 elif predict_form == "4:r:-":
402 predict_sero = "4:r:-"
403 elif predict_form == "4:b:-": # ed_SL_09272019: change for new output format
404 predict_sero = "N/A (4:b:-)"
405 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
406 #predict_sero = "Newport"
407 #star = "*"
408 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
409 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
410 if "N/A" in predict_sero:
411 claim = ""
412 #special test for Typhimurium
413 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
414 normal = 0
415 mutation = 0
416 for x in special_gene_list:
417 if "oafA-O-4_full" in x:
418 normal = float(special_gene_list[x])
419 elif "oafA-O-4_5-" in x:
420 mutation = float(special_gene_list[x])
421 if normal > mutation:
422 pass
423 elif normal < mutation:
424 #predict_sero = predict_sero.strip() + "(O5-)"
425 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
426 star = "*"
427 #star_line = "Detected the deletion of O5-."
428 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
429 else:
430 pass
431 #special test for Paratyphi B
432 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
433 normal = 0
434 mutation = 0
435 for x in special_gene_list:
436 if "gntR-family-regulatory-protein_dt-positive" in x:
437 normal = float(special_gene_list[x])
438 elif "gntR-family-regulatory-protein_dt-negative" in x:
439 mutation = float(special_gene_list[x])
440 #print(normal,mutation)
441 if normal > mutation:
442 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
443 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
444 star = "*"
445 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
446 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype of Paratyphi B was not detected. " # ed_SL_04152019: new output format
447 elif normal < mutation:
448 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
449 predict_sero = predict_sero.strip()
450 star = "*"
451 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
452 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B." # ed_SL_04152019: new output format
453 else:
454 star = "*"
455 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
456 star_line = " " ## ed_SL_05152019: do not report this situation.
457 #special test for O13,22 and O13,23
458 if Otype=="13":
459 #ex_dir = os.path.dirname(os.path.realpath(__file__))
460 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019
461 f = open(ex_dir + '/special.pickle', 'rb')
462 special = pickle.load(f)
463 O22_O23=special['O22_O23']
464 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
465 O22_score=0
466 O23_score=0
467 for x in special_gene_list:
468 if "O:22" in x:
469 O22_score = O22_score+float(special_gene_list[x])
470 elif "O:23" in x:
471 O23_score = O23_score+float(special_gene_list[x])
472 #print(O22_score,O23_score)
473 for z in O22_O23[0]:
474 if predict_sero.split(" or ")[0] in z:
475 if O22_score > O23_score:
476 star = "*"
477 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
478 predict_sero = z[0]
479 elif O22_score < O23_score:
480 star = "*"
481 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
482 predict_sero = z[1]
483 else:
484 star = "*"
485 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
486 if " or " in predict_sero:
487 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
488 #special test for O6,8
489 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
490 #for x in merge_O68_list:
491 # if x in predict_sero:
492 # predict_sero=x
493 # star=""
494 # star_line=""
495 #special test for Montevideo; most of them are monophasic
496 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
497 #star="*"
498 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
499 return predict_form, predict_sero, star, star_line, claim
500 ### End of SeqSero Kmer part
501
502 ### Begin of SeqSero2 allele prediction and output
503 def xml_parse_score_comparision_seqsero(xmlfile):
504 #used to do seqsero xml analysis
505 from Bio.Blast import NCBIXML
506 handle=open(xmlfile)
507 handle=NCBIXML.parse(handle)
508 handle=list(handle)
509 List=[]
510 List_score=[]
511 List_ids=[]
512 List_query_region=[]
513 for i in range(len(handle)):
514 if len(handle[i].alignments)>0:
515 for j in range(len(handle[i].alignments)):
516 score=0
517 ids=0
518 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
519 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
520 for z in range(len(handle[i].alignments[j].hsps)):
521 hsp=handle[i].alignments[j].hsps[z]
522 temp=set(range(hsp.query_start,hsp.query_end))
523 est_bit=(handle[i].ka_params[0]*hsp.score-math.log(handle[i].ka_params[1]))/(math.log(2))
524 if len(cover_region)==0:
525 cover_region=cover_region|temp
526 fraction=1
527 else:
528 fraction=1-len(cover_region&temp)/float(len(temp))
529 cover_region=cover_region|temp
530 if "last" in handle[i].query or "first" in handle[i].query:
531 #score+=hsp.bits*fraction
532 score+=est_bit*fraction
533 ids+=float(hsp.identities)/handle[i].query_length*fraction
534 else:
535 #score+=hsp.bits*fraction
536 score+=est_bit*fraction
537 ids+=float(hsp.identities)/handle[i].query_length*fraction
538 List_score.append(score)
539 List_ids.append(ids)
540 List_query_region.append(cover_region)
541 temp=zip(List,List_score,List_ids,List_query_region)
542 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
543 return Final_list
544
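# Note on the est_bit calculation above (added annotation, not in the original):
# it is the standard Karlin-Altschul bit-score conversion, bits = (lambda*S - ln K)/ln 2,
# where ka_params[0] is lambda, ka_params[1] is K and hsp.score is the raw alignment score S.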
545
546 def Uniq(L,sort_on_fre="none"): #return the unique list and the count of each element
547 Old=L
548 L.sort()
549 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
550 count=[]
551 for j in range(len(L)):
552 y=0
553 for x in Old:
554 if L[j]==x:
555 y+=1
556 count.append(y)
557 if sort_on_fre!="none":
558 d=list(zip(*sorted(zip(count, L)))) #list() needed in Python 3 so d can be indexed below
559 L=d[1]
560 count=d[0]
561 return (L,count)
562
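# Illustrative usage (not part of the original script): the list is sorted and
# de-duplicated, and the count of each element is returned alongside it:
#   Uniq(["b", "a", "b", "c"])  -> (['a', 'b', 'c'], [1, 2, 1])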
563 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
564 #predict whether one contig is fliC or fljB from the head and tail scores, and also return the score difference; if the difference is very small the call is not reliable, so the whole-contig blast score is used instead
565 #this is mainly used by fliC_or_fljB_judge_from_head_tail_sequence below
566 a=nodes_vs_score_list
567 fliC_score=0
568 fljB_score=0
569 for z in a:
570 if "fliC" in z[0]:
571 fliC_score+=z[1]
572 elif "fljB" in z[0]:
573 fljB_score+=z[1]
574 if fliC_score>=fljB_score:
575 role="fliC"
576 else:
577 role="fljB"
578 return (role,abs(fliC_score-fljB_score))
579
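# Illustrative usage (not part of the original script): scores accumulate per
# role, and the absolute difference indicates how reliable the call is:
#   judge_fliC_or_fljB_from_head_tail_for_one_contig(
#       [("fliC_first", 120.0), ("fljB_last", 95.0)])  -> ('fliC', 25.0)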
580 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
581 #used to predict whether a contig is fliC or fljB when the head-and-tail score difference above is small (less than 20)
582 #also used when no head or tail got blasted score for the contig
583 role=""
584 for z in Final_list_passed:
585 if node_name in z[0]:
586 role=z[0].split("_")[0]
587 break
588 return role
589
590 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
591 #nodes_list is the c created by c,d=Uniq(nodes) in the function below
592 first_target=""
593 role_list=[]
594 for x in nodes_list:
595 a=[]
596 role=""
597 for y in tail_head_list:
598 if x in y[0]:
599 a.append(y)
600 if len(a)==4:
601 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
602 if diff<20:
603 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
604 elif len(a)==3:
605 ###however, if the one with the highest score is the less frequent one, compare their accumulated scores
606 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
607 if diff<20:
608 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
609 ###end of above score comparison
610 elif len(a)==2:
611 #both hits must be on the same node; if not, decide with the unit blast score, blast_score/length_of_special_sequence (30 or 37)
612 temp=[]
613 for z in a:
614 temp.append(z[0].split("_")[0])
615 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
616 if len(m)==1:
617 pass
618 else:
619 pass
620 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
621 if diff<20:
622 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
623 ###need to design an algorithm to guess the most likely situation for nodes_list; see the situations from test evaluation
624 elif len(a)==1:
625 #that one
626 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
627 if diff<20:
628 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
629 #needs further evaluation; in the future a cut-off could be added, and if it is not met, just use the Final_list_passed best match, as when a==0
630 else:#a==0
631 #use Final_list_passed best match
632 for z in Final_list_passed:
633 if x in z[0]:
634 role=z[0].split("_")[0]
635 break
636 #print x,role,len(a)
637 role_list.append((role,x))
638 if len(role_list)==2:
639 if role_list[0][0]==role_list[1][0]:#this is the most common error: two antigens assigned to the same phase
640 #just use score to do a final test
641 role_list=[]
642 for x in nodes_list:
643 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
644 role_list.append((role,x))
645 return role_list
646
647 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
648 #used to decide which contig is FliC and which one is fljB
649 contigs=[]
650 nodes=[]
651 for x in Final_list_passed:
652 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
653 nodes.append(x[0].split("___")[1].strip())
654 c,d=Uniq(nodes)#c is node_list
655 #print c
656 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
657 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
658 return roles
659
660 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
661 #decide O based on Final_list
662 O_choice="?"
663 O_list=[]
664 special_genes={}
665 nodes=[]
666 for x in Final_list_passed:
667 if x[0].startswith("O-"):
668 nodes.append(x[0].split("___")[1].strip())
669 elif not x[0].startswith("fl"):
670 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
671 #print "special_genes:",special_genes
672 c,d=Uniq(nodes)
673 #print "potential O antigen contig",c
674 final_O=[]
675 O_nodes_list=[]
676 for x in c:#c is the list for contigs
677 temp=0
678 for y in Final_list_passed:
679 if x in y[0] and y[0].startswith("O-"):
680 final_O.append(y)
681 break
682 ### O contig has the problem of two genes on same contig, so do additional test
683 potenial_new_gene=""
684 for x in final_O:
685 pointer=0 #for genes merged or not
686 #not consider O-1,3,19_not_in_3,10, too short compared with others
687 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
688 pointer=x[0].split("___")[1].strip()#store the contig name
689 print(pointer)
690 if pointer!=0:#it has potential merge event
691 for y in Final_list:
692 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a relatively strict filter now; if passed, there is a merge event, so add one more gene to final_O
693 potenial_new_gene=y
694 #print(potenial_new_gene)
695 break
696 if potenial_new_gene!="":
697 print("two differnt genes in same contig, fix it for O antigen")
698 print(potenial_new_gene[:3])
699 pointer=0
700 for y in final_O:
701 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
702 pointer=1
703 if pointer!=0: #changed to consider two genes in same contig
704 final_O.append(potenial_new_gene)
705 ### end of the two genes on same contig test
706 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
707 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
708 #print "$$$No Otype, due to no hit"#may need to be changed
709 O_choice="-"
710 else:
711 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
712 O_list=[]
713 O_list_less_contamination=[]
714 for x in final_O:
715 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
716 O_list.append(x[0].split("__")[0])
717 O_nodes_list.append(x[0].split("___")[1])
718 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
719 O_list_less_contamination.append(x[0].split("__")[0])
720 ### special test for O9,46 and O3,10 family
721 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
722 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
723 O_choice="O-9,46"
724 #print "$$$Most possilble Otype: O-9,46"
725 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
726 O_choice="O-9,46,27"
727 #print "$$$Most possilble Otype: O-9,46,27"
728 else:
729 O_choice="O-9"#next, detect O9 vs O2?
730 O2=0
731 O9=0
732 for z in special_genes:
733 if "tyr-O-9" in z:
734 O9=special_genes[z]
735 elif "tyr-O-2" in z:
736 O2=special_genes[z]
737 if O2>O9:
738 O_choice="O-2"
739 elif O2<O9:
740 pass
741 else:
742 pass
743 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
744 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
745 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
746 O_choice="O-3,10"
747 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
748 else:
749 O_choice="O-1,3,19"
750 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
751 ### end of special test for O9,46 and O3,10 family
752 else:
753 try:
754 max_score=0
755 for x in final_O:
756 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
757 max_score=x[2]#change from x[-1] to x[2],08172018
758 O_choice=x[0].split("_")[0]
759 if O_choice=="O-1,3,19":
760 O_choice=final_O[1][0].split("_")[0]
761 #print "$$$Most possilble Otype: ",O_choice
762 except:
763 pass
764 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
765 #print "O:",O_choice,O_nodes_list
766 Otypes=[]
767 for x in O_list:
768 if x!="O-1,3,19_not_in_3,10":
769 if "O-9,46_" not in x:
770 Otypes.append(x.split("_")[0])
771 else:
772 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
773 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
774 Otypes_uniq,Otypes_fre=Uniq(Otypes)
775 contamination_O=""
776 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
777 if len(Otypes_uniq)>2:
778 contamination_O="potential contamination from O antigen signals"
779 else:
780 if len(Otypes_uniq)>1:
781 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
782 contamination_O=""
783 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
784 contamination_O=""
785 else:
786 contamination_O="potential contamination from O antigen signals"
787 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
788 ### End of SeqSero2 allele prediction and output
789
790 def get_input_files(make_dir,input_file,data_type,dirpath):
791 #tell input files from datatype
792 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
793 for_fq=""
794 rev_fq=""
795 os.chdir(make_dir)
796 if data_type=="1":
797 input_file=input_file[0].split("/")[-1]
798 if input_file.endswith(".sra"):
799 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
800 for_fq=input_file.replace(".sra","_1.fastq")
801 rev_fq=input_file.replace(".sra","_2.fastq")
802 else:
803 core_id=input_file.split(".fastq")[0].split(".fq")[0]
804 for_fq=core_id+"_1.fastq"
805 rev_fq=core_id+"_2.fastq"
806 if input_file.endswith(".gz"):
807 subprocess.check_call("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
808 else:
809 subprocess.check_call("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
810 elif data_type=="2":
811 for_fq=input_file[0].split("/")[-1]
812 rev_fq=input_file[1].split("/")[-1]
813 elif data_type=="3":
814 input_file=input_file[0].split("/")[-1]
815 if input_file.endswith(".sra"):
816 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
817 for_fq=input_file.replace(".sra","_1.fastq")
818 else:
819 for_fq=input_file
820 elif data_type in ["4","5","6"]:
821 for_fq=input_file[0].split("/")[-1]
822 os.chdir("..")
823 return for_fq,rev_fq
824
825 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
826 #get O and H types from Final_list from blast parsing; allele mode
827 from Bio import SeqIO
828 fliC_choice="-"
829 fljB_choice="-"
830 fliC_contig="NA"
831 fljB_contig="NA"
832 fliC_region=set([0])
833 fljB_region=set([0,])
834 fliC_length=0 #can be changed to coverage in the future; on 03292019, changed to aligned length
835 fljB_length=0 #can be changed to coverage in the future; on 03292019, changed to aligned length
836 O_choice="-"#no need to decide O contig for now, should be only one
837 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
838 O_choice=O_choice.split("-")[-1].strip()
839 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
840 O_choice="-"
841 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
842 #add alignment locations, used for further selection, 03312019
843 for i in range(len(H_contig_roles)):
844 x=H_contig_roles[i]
845 for y in Final_list_passed:
846 if x[1] in y[0] and y[0].startswith(x[0]):
847 H_contig_roles[i]=H_contig_roles[i]+(y[-1],) #append the alignment region to the (role, contig) tuple
848 break
849 log_file=open("SeqSero_log.txt","a")
850 extract_file=open("Extracted_antigen_alleles.fasta","a")
851 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
852
853 #print("O_contigs:")
854 log_file.write("O_contigs:\n")
855 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
856 extract_file.write("#O_contigs:\n")
857 for x in O_nodes_roles:
858 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
859 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
860 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
861 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
862 seqs=""
863 for z in handle_fasta:
864 if x[0].split("___")[-1]==z.description:
865 seqs=str(z.seq)
866 extract_file.write(title+seqs+"\n")
867 if len(H_contig_roles)!=0:
868 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #anything below highest*0.1 is regarded as contamination or noise; it is still considered in contamination detection and logs, but not used for the final serotype output
869 else:
870 highest_H_coverage=0
871 for x in H_contig_roles:
872 #if there are multiple choices, temporarily select the one with the longest length for now; will revise in a future change
873 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
874 fliC_contig=x[1]
875 fliC_length=len(x[-1])
876 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
877 fljB_contig=x[1]
878 fljB_length=len(x[-1])
879 for x in Final_list_passed:
880 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
881 fliC_choice=x[0].split("_")[1]
882 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
883 fljB_choice=x[0].split("_")[1]
884 elif fliC_choice!="-" and fljB_choice!="-":
885 break
886 #now remove contigs not in middle core part
887 first_allele="NA"
888 first_allele_percentage=0
889 for x in Final_list:
890 if x[0].startswith("fliC") or x[0].startswith("fljB"):
891 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
892 first_allele_percentage=x[2]
893 break
894 additional_contigs=[]
895 for x in Final_list:
896 if first_allele in x[0]:
897 if (fliC_contig == x[0].split("___")[-1]):
898 fliC_region=x[3]
899 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
900 fljB_region=x[3]
901 else:
902 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
903 additional_contigs.append(x)
904 #else:
905 #print x[:3]
906 #we can just use the fljB region (or fliC, depending on size), whether it is an empty set() or covers a large range (without the middle part); however, if neither is fully assembled, use 500 and 1200 as conservative cut-offs
907 if first_allele_percentage>0.9:
908 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
909 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
910 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
911 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
912 else:
913 target_region=set()#doesn't do anything
914 else:
915 target_region=set()#doesn't do anything
916 #print(target_region)
917 #print(additional_contigs)
918 target_region2=set(list(range(0,525))+list(range(1200,1700)))#using 500 to 1200 as the special region was found to work best
919 target_region=target_region2|target_region
920 for x in additional_contigs:
921 removal=0
922 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
923 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
924 removal=1
925 else:
926 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similarity with the middle part of the first allele (first allele >0.9, already covers the middle part)
927 removal=1
928 else:
929 pass
930 if removal==1:
931 for y in H_contig_roles:
932 if y[1] in x[0]:
933 H_contig_roles.remove(y)
934 else:
935 pass
936 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
937 #end of removing none-middle contigs
938 #print("H_contigs:")
939 log_file.write("H_contigs:\n")
940 extract_file.write("#H_contigs:\n")
941 H_contig_stat=[]
942 H1_cont_stat={}
943 H2_cont_stat={}
944 for i in range(len(H_contig_roles)):
945 x=H_contig_roles[i]
946 a=0
947 for y in Final_list_passed:
948 if x[1] in y[0] and y[0].startswith(x[0]):
949 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
950 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
951 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
952 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
953 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
954 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
955 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
956 seqs=""
957 for z in handle_fasta:
958 if x[1]==z.description:
959 seqs=str(z.seq)
960 extract_file.write(title+seqs+"\n")
961 break
962 else:
963 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
964 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
965 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
966 seqs=""
967 for z in handle_fasta:
968 if x[1]==z.description:
969 seqs=str(z.seq)
970 extract_file.write(title+seqs+"\n")
971 if x[0]=="fliC":
972 if y[0].split("_")[1] not in H1_cont_stat:
973 H1_cont_stat[y[0].split("_")[1]]=y[2]
974 else:
975 H1_cont_stat[y[0].split("_")[1]]+=y[2]
976 if x[0]=="fljB":
977 if y[0].split("_")[1] not in H2_cont_stat:
978 H2_cont_stat[y[0].split("_")[1]]=y[2]
979 else:
980 H2_cont_stat[y[0].split("_")[1]]+=y[2]
981 break
982 #detect contaminations
983 #print(H1_cont_stat)
984 #print(H2_cont_stat)
985 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
986 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
987 contamination_H=""
988 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
989 contamination_H="potential contamination from H antigen signals"
990 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
991 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
992 #get additional antigens
993 """
994 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
995 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
996 O_choice="O-9,46"
997 #print "$$$Most possilble Otype: O-9,46"
998 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
999 O_choice="O-9,46,27"
1000 #print "$$$Most possilble Otype: O-9,46,27"
1001 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
1002 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
1003 O_choice="O-3,10"
1004 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
1005 else:
1006 O_choice="O-1,3,19"
1007 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
1008 ### end of special test for O9,46 and O3,10 family
1009
1010 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
1011 if len(Otypes_uniq)>2:
1012 contamination_O="potential contamination from O antigen signals"
1013 else:
1014 if len(Otypes_uniq)>1:
1015 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
1016 contamination_O=""
1017 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
1018 contamination_O=""
1019 """
1020 additional_antigens=[] #currently unused
1021 #print(contamination_O)
1022 #print(contamination_H)
1023 log_file.write(contamination_O+"\n")
1024 log_file.write(contamination_H+"\n")
1025 log_file.close()
1026 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
1027
1028 def get_input_K(input_file,lib_dict,data_type,k_size):
1029 #kmer mode; get input_Ks from dict and data_type
1030 kmers = []
1031 for h in lib_dict:
1032 kmers += lib_dict[h]
1033 if data_type == '4':
1034 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
1035 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
1036 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
1037 elif data_type == '5':#minion_2d_fasta
1038 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
1039 if data_type == '6':#minion_2d_fastq
1040 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
1041 return input_Ks
1042
1043 def get_kmer_dict(lib_dict,input_Ks):
1044 #kmer mode; get predicted types
1045 O_dict = {}
1046 H_dict = {}
1047 Special_dict = {}
1048 for h in lib_dict:
1049 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
1050 if score > 1: # Arbitrary cut-off; the similarity score is very low but seems necessary to detect O-3,10 in some cases
1051 if h.startswith('O-') and score > 25:
1052 O_dict[h] = score
1053 if h.startswith('fl') and score > 40:
1054 H_dict[h] = score
1055 if (h[:2] != 'fl') and (h[:2] != 'O-'):
1056 Special_dict[h] = score
1057 return O_dict,H_dict,Special_dict
1058
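# Worked example of the score above (not part of the original script): an allele
# whose database set holds 500 k-mers, 450 of which are found in input_Ks,
# scores (450/500)*100 = 90, passing both the O (>25) and H (>40) thresholds.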
1059 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
1060 log_file=open("SeqSero_log.txt","a")
1061 log_file.write("O_scores:\n")
1062 #call O:
1063 highest_O = '-'
1064 if len(O_dict) == 0:
1065 pass
1066 else:
1067 for x in O_dict:
1068 log_file.write(x+"\t"+str(O_dict[x])+"\n")
1069 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
1070 if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
1071 highest_O = "O-9,46"
1072 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
1073 highest_O = "O-9,46,27"
1074 else:
1075 highest_O = "O-9" # next, detect O9 vs O2?
1076 O2 = 0
1077 O9 = 0
1078 for z in Special_dict:
1079 if "tyr-O-9" in z:
1080 O9 = float(Special_dict[z])
1081 if "tyr-O-2" in z:
1082 O2 = float(Special_dict[z])
1083 if O2 > O9:
1084 highest_O = "O-2"
1085 elif ("O-3,10_wzx__1539" in O_dict) and (
1086 "O-9,46_wzy__1191" in O_dict
1087 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
1088 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
1089 highest_O = "O-3,10"
1090 else:
1091 highest_O = "O-1,3,19"
1092 ### end of special test for O9,46 and O3,10 family
1093 else:
1094 try:
1095 max_score = 0
1096 for x in O_dict:
1097 if float(O_dict[x]) >= max_score:
1098 max_score = float(O_dict[x])
1099 highest_O = x.split("_")[0]
1100 if highest_O == "O-1,3,19":
1101 highest_O = '-'
1102 max_score = 0
1103 for x in O_dict:
1104 if x == 'O-1,3,19_not_in_3,10__130':
1105 pass
1106 else:
1107 if float(O_dict[x]) >= max_score:
1108 max_score = float(O_dict[x])
1109 highest_O = x.split("_")[0]
1110 except:
1111 pass
1112 #call_fliC:
1113 if len(H_dict)!=0:
1114 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
1115 else:
1116 highest_H_score_both_BC=0
1117 highest_fliC = '-'
1118 highest_fliC_raw = '-'
1119 highest_Score = 0
1120 log_file.write("\nH_scores:\n")
1121 for s in H_dict:
1122 log_file.write(s+"\t"+str(H_dict[s])+"\n")
1123 if s.startswith('fliC'):
1124 if float(H_dict[s]) > highest_Score:
1125 highest_fliC = s.split('_')[1]
1126 highest_fliC_raw = s
1127 highest_Score = float(H_dict[s])
1128 #call_fljB
1129 highest_fljB = '-'
1130 highest_fljB_raw = '-'
1131 highest_Score = 0
1132 for s in H_dict:
1133 if s.startswith('fljB'):
1134 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC as a general estimate of coverage; 0.65 currently works well, and a high cut-off (0.65) is used because some fliC and fljB alleles share sequence with each other
1135 highest_fljB = s.split('_')[1]
1136 highest_fljB_raw = s
1137 highest_Score = float(H_dict[s])
1138 log_file.write("\nSpecial_scores:\n")
1139 for s in Special_dict:
1140 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
1141 log_file.close()
1142 return highest_O,highest_fliC,highest_fljB
1143
1144 def get_temp_file_names(for_fq,rev_fq):
1145 #seqsero2 -a; get temp file names
1146 sam=for_fq+".sam"
1147 bam=for_fq+".bam"
1148 sorted_bam=for_fq+"_sorted.bam"
1149 mapped_fq1=for_fq+"_mapped.fq"
1150 mapped_fq2=rev_fq+"_mapped.fq"
1151 combined_fq=for_fq+"_combined.fq"
1152 for_sai=for_fq+".sai"
1153 rev_sai=rev_fq+".sai"
1154 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
1155
1156 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
1157 #seqsero2 -a; do mapping and sort
1158 print("building database...")
1159 subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True)
1160 print("mapping...")
1161 if mapping_mode=="mem":
1162 subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
1163 elif mapping_mode=="sam":
1164 if fnameB!="":
1165 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
1166 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True)
1167 subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
1168 else:
1169 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
1170 subprocess.check_call("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam)
1171 subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
1172 ### check the version of samtools then use different commands
1173 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
1174 out, err = samtools_version.communicate()
1175 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
1176 print("check samtools version:",version)
1177 ### end of samtools version check and its analysis
1178 if LooseVersion(version)<=LooseVersion("1.2"):
1179 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
1180 else:
1181 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
1182
1183 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
1184 #seqsero2 -a; extract, assembly and blast
1185 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True)
1186 #print("fnameA:",fnameA)
1187 #print("fnameB:",fnameB)
1188 if fnameB!="":
1189 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output
1190 else:
1191 pass
1192 outdir=current_time+"_temp"
1193 print("assembling...")
1194 if int(threads)>4:
1195 t="4"
1196 else:
1197 t=threads
1198 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
1199 if fnameB!="":
1200 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
1201 else:
1202 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
1203 #new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
1204 new_fasta=fnameA+"_"+database.split('/')[-1]+"_"+mapping_mode+".fasta" # ed_SL_09152019: change path to database for packaging
1205 subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True)
1206 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
1207 subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True)
1208 print("blasting...","\n")
1209 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
1210 subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #redirect to data_log.txt so the makeblastdb output does not interrupt the output of our program###1/27/2015
1211 subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10"
1212 else:
1213 xmlfile="NA"
1214 return xmlfile,new_fasta
1215
1216 def judge_subspecies(fnameA,dirpath):
1217 #seqsero2 -a; judge subspecies on just forward raw reads fastq
1218 salmID_output=subprocess.Popen("python " + dirpath + "/SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
1219 out, err = salmID_output.communicate()
1220 out=out.decode("utf-8")
1221 file=open("data_log.txt","a")
1222 file.write(out)
1223 file.close()
1224 salm_species_scores=out.split("\n")[1].split("\t")[6:]
1225 salm_species_results=out.split("\n")[0].split("\t")[6:]
1226 max_score=0
1227 max_score_index=1 #default is 1, means "I"
1228 for i in range(len(salm_species_scores)):
1229 if max_score<float(salm_species_scores[i]):
1230 max_score=float(salm_species_scores[i])
1231 max_score_index=i
1232 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
1233 if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
1234 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
1235 if max_score<10:
1236 prediction="-"
1237 return prediction
1238
1239 def judge_subspecies_Kmer(Special_dict):
1240 #seqsero2 -k;
1241 max_score=0
1242 prediction="-" #default should be I
1243 for x in Special_dict:
1244 if "mer" in x:
1245 if max_score<float(Special_dict[x]):
1246 max_score=float(Special_dict[x])
1247 prediction=x.split("_")[-1].strip()
1248 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
1249 prediction="bongori"
1250 break
1251 return prediction
1252
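# Minimal illustrative calls (the dictionary keys below are hypothetical; real keys come
# from the k-mer score dictionary built elsewhere in this script, and only need to contain
# "mer" and end with the subspecies token):
#   judge_subspecies_Kmer({"invA_mers_I": "96.0", "invA_mers_bongori": "12.5"})  # -> "I"
#   judge_subspecies_Kmer({"invA_mers_bongori": "97.2"})                         # -> "bongori"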
1253 def main():
1254 #combine SeqSeroK and SeqSero2, also with SalmID
1255 args = parse_args()
1256 input_file = args.i
1257 data_type = args.t
1258 analysis_mode = args.m
1259 mapping_mode=args.b
1260 threads=args.p
1261 make_dir=args.d
1262 clean_mode=args.c
1263 k_size=27 #k-mer size for the k-mer workflow; may change in bug fixes
1264 #database="H_and_O_and_specific_genes.fasta"
1265 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
1266 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: add ex_dir for packaging
1267 #database=ex_dir+"/seqsero2_db/H_and_O_and_specific_genes.fasta" # ed_SL_09152019: change path to database for packaging
1268 database="H_and_O_and_specific_genes.fasta"
1269 note="Note:"
1270 NA_note=" This predicted serotype is not in the Kauffmann-White scheme." # ed_SL_09272019: add for new output format
1271 if len(sys.argv)==1:
1272 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file
1273 else:
1274 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
1275 request_id += str(random.randint(1, 10000000))
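# For example (hypothetical timestamp and random suffix), request_id could be
# "03_15_2020_09_30_00" plus a random integer, giving an output directory named
# "SeqSero_result_03_15_2020_09_30_004321" when -d is not supplied.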
1276 if make_dir is None:
1277 make_dir="SeqSero_result_"+request_id
1278 if os.path.isdir(make_dir):
1279 pass
1280 else:
1281 subprocess.check_call(["mkdir",make_dir])
1282 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
1283 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
1284 #subprocess.check_call("ln -f -s "+database+" "+" ".join(input_file)+" "+make_dir,shell=True) # ed_SL_09152019: change path to database for packaging
1285 subprocess.check_call("ln -f -s "+dirpath+"/seqsero2_db/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### ed_SL_05282019: use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument).
1286 ############################begin the real analysis
1287 if analysis_mode=="a":
1288 if data_type in ["1","2","3"]:#use allele mode
1289 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
1290 os.chdir(make_dir)
1291 ###add a function to tell input files
1292 fnameA=for_fq.split("/")[-1]
1293 fnameB=rev_fq.split("/")[-1]
1294 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
1295 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
1296 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
1297 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
1298 if xmlfile=="NA":
1299 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
1300 else:
1301 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
1302 file=open("data_log.txt","a")
1303 for x in Final_list:
1304 file.write("\t".join(str(y) for y in x)+"\n")
1305 file.close()
1306 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
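# Editor's note: the filter above parses SPAdes-style names in the hit title; the number
# following "_cov_" must be >= 0.9 (k-mer coverage), and the alignment score x[1] must
# reach one of the length-derived thresholds embedded in the title (or exceed 1000). The
# exact title layout is produced upstream by xml_parse_score_comparision_seqsero().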
1307 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
1308 subspecies=judge_subspecies(fnameA,dirpath) #predict subspecies
1309 ###output
1310 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
1311 claim="" #04132019, disable claim for new report requirement
1312 contamination_report=""
1313 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
1314 if contamination_O!="" and contamination_H=="":
1315 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
1316 elif contamination_O=="" and contamination_H!="":
1317 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
1318 elif contamination_O!="" and contamination_H!="":
1319 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
1320 if contamination_report!="":
1321 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
1322 contamination_report=" Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles."
1323 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequences #04132019, need to change if using web-version
1324 ## ed_SL_09272019: change for new output format
1325 #if contamination_report+star_line+claim=="": #0413, new output style
1326 # note=""
1327 #else:
1328 # note="Note:"
1329 if clean_mode:
1330 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
1331 make_dir="none-output-directory due to '-c' flag"
1332 else:
1333 new_file=open("SeqSero_result.txt","w")
1334 if O_choice=="":
1335 O_choice="-"
1336 if "N/A" not in predict_sero:
1337 new_file.write("Output_directory:\t"+make_dir+"\n"+
1338 "Input files:\t"+"\t".join(input_file)+"\n"+
1339 "O antigen prediction:\t"+O_choice+"\n"+
1340 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1341 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1342 "Predicted subspecies:\t"+subspecies+"\n"+
1343 "Predicted antigenic profile:\t"+predict_form+"\n"+
1344 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
1345 note+contamination_report+star_line+claim+"\n")#+##
1346 else:
1347 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
1348 star_line="" #04132019, for new output requirement, diable star_line if "NA" in output
1349 new_file.write("Output_directory:\t"+make_dir+"\n"+
1350 "Input files:\t"+"\t".join(input_file)+"\n"+
1351 "O antigen prediction:\t"+O_choice+"\n"+
1352 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1353 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1354 "Predicted subspecies:\t"+subspecies+"\n"+
1355 "Predicted antigenic profile:\t"+predict_form+"\n"+
1356 "Predicted serotype:\t"+predict_form+"\n"+ # ed_SL_09242019: add serotype output for "N/A" prediction
1357 note+NA_note+contamination_report+star_line+claim+"\n")#+##
1358 new_file.close()
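# Illustrative SeqSero_result.txt for a hypothetical Enteritidis-like sample (values are
# examples only, not output produced by this repository):
#   Output_directory:              SeqSero_result_<timestamp><random>
#   Input files:                   sample_R1.fastq.gz  sample_R2.fastq.gz
#   O antigen prediction:          O-9
#   H1 antigen prediction(fliC):   g,m
#   H2 antigen prediction(fljB):   -
#   Predicted subspecies:          I
#   Predicted antigenic profile:   9:g,m:-
#   Predicted serotype:            Enteritidis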
1359 print("\n")
1360 #subprocess.check_call("cat Seqsero_result.txt",shell=True)
1361 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True)
1362 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True)
1363 if "N/A" not in predict_sero:
1364 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+##
1365 print("Output_directory:\t"+make_dir+"\n"+
1366 "Input files:\t"+"\t".join(input_file)+"\n"+
1367 "O antigen prediction:\t"+O_choice+"\n"+
1368 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1369 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1370 "Predicted subspecies:\t"+subspecies+"\n"+
1371 "Predicted antigenic profile:\t"+predict_form+"\n"+
1372 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
1373 note+contamination_report+star_line+claim+"\n")#+##
1374 else:
1375 print("Output_directory:\t"+make_dir+"\n"+
1376 "Input files:\t"+"\t".join(input_file)+"\n"+
1377 "O antigen prediction:\t"+O_choice+"\n"+
1378 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1379 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1380 "Predicted subspecies:\t"+subspecies+"\n"+
1381 "Predicted antigenic profile:\t"+predict_form+"\n"+
1382 "Predicted serotype:\t"+predict_form+"\n"+ # ed_SL_09242019: add serotype output for "N/A" prediction
1383 note+NA_note+contamination_report+star_line+claim+"\n")
1384 else:
1385 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
1386 elif analysis_mode=="k":
1387 #ex_dir = os.path.dirname(os.path.realpath(__file__))
1388 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: change ex_dir for packaging
1389 #output_mode = args.mode
1390 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
1391 input_file = for_fq #k-mer mode uses only the forward reads, since not all reads are needed
1392 os.chdir(make_dir)
1393 f = open(dirpath + '/seqsero2_db/antigens.pickle', 'rb')
1394 lib_dict = pickle.load(f)
1395 f.close()
1396 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size)
1397 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks)
1398 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
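# Sketch of the k-mer workflow above (the example values are hypothetical, and the
# description of lib_dict is an assumption based on how it is used): antigens.pickle is
# loaded as the antigen k-mer library, get_input_K() tallies which library k-mers
# (k_size=27) occur in the reads, and call_O_and_H_type() returns the best-scoring
# antigens, e.g. highest_O="O-4", highest_fliC="i", highest_fljB="1,2", which
# seqsero_from_formula_to_serotypes() would map to serotype Typhimurium.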
1399 subspecies=judge_subspecies_Kmer(Special_dict)
1400 if subspecies=="IIb" or subspecies=="IIa":
1401 subspecies="II"
1402 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
1403 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
1404 claim="" #no claim any more based on new output requirement
1405 ## ed_SL_09272019: change for new output format
1406 #if star_line+claim=="": #0413, new output style
1407 # note=""
1408 #else:
1409 # note="Note:"
1410 if clean_mode:
1411 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
1412 make_dir="none-output-directory due to '-c' flag"
1413 ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c"
1414 if highest_O.split('-')[-1]=="":
1415 O_choice="-"
1416 else:
1417 O_choice=highest_O.split('-')[-1]
1418 ###
1419 else:
1420 if highest_O.split('-')[-1]=="":
1421 O_choice="-"
1422 else:
1423 O_choice=highest_O.split('-')[-1]
1424 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
1425 new_file=open("SeqSero_result.txt","w")
1426 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
1427 if "N/A" not in predict_sero:
1428 new_file.write("Output_directory:\t"+make_dir+"\n"+
1429 "Input files:\t"+input_file+"\n"+
1430 "O antigen prediction:\t"+O_choice+"\n"+
1431 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1432 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1433 "Predicted subspecies:\t"+subspecies+"\n"+
1434 "Predicted antigenic profile:\t"+predict_form+"\n"+
1435 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
1436 note+star_line+claim+"\n")#+##
1437 else:
1438 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
1439 star_line = "" #changed for new output requirement, 04132019
1440 new_file.write("Output_directory:\t"+make_dir+"\n"+
1441 "Input files:\t"+input_file+"\n"+
1442 "O antigen prediction:\t"+O_choice+"\n"+
1443 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1444 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1445 "Predicted subspecies:\t"+subspecies+"\n"+
1446 "Predicted antigenic profile:\t"+predict_form+"\n"+
1447 "Predicted serotype:\t"+predict_form+"\n"+ # ed_SL_09242019: add serotype output for "N/A" prediction
1448 note+NA_note+star_line+claim+"\n")#+##
1449 new_file.close()
1450 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True)
1451 if "N/A" not in predict_sero:
1452 print("Output_directory:\t"+make_dir+"\n"+
1453 "Input files:\t"+input_file+"\n"+
1454 "O antigen prediction:\t"+O_choice+"\n"+
1455 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1456 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1457 "Predicted subspecies:\t"+subspecies+"\n"+
1458 "Predicted antigenic profile:\t"+predict_form+"\n"+
1459 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
1460 note+star_line+claim+"\n")#+##
1461 else:
1462 print("Output_directory:\t"+make_dir+"\n"+
1463 "Input files:\t"+input_file+"\n"+
1464 "O antigen prediction:\t"+O_choice+"\n"+
1465 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1466 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1467 "Predicted subspecies:\t"+subspecies+"\n"+
1468 "Predicted antigenic profile:\t"+predict_form+"\n"+
1469 "Predicted serotype:\t"+predict_form+"\n"+ # ed_SL_09242019: add serotype output for "N/A" prediction
1470 note+NA_note+star_line+claim+"\n")#+##
1471
1472 if __name__ == '__main__':
1473 main()