comparison: SeqSero2_package.py @ 0:fc22ec8e924e (draft)
planemo upload commit 6b0a9d0f0ef4bdb0c2e2c54070b510ff28125f7a

author: cstrittmatter
date: Tue, 21 Apr 2020 12:45:34 -0400
compared revisions: -1:000000000000 -> 0:fc22ec8e924e
1 #!/usr/bin/env python3 | |
2 | |
3 import sys | |
4 import time | |
5 import random | |
6 import os | |
7 import subprocess | |
8 import gzip | |
9 import io | |
10 import pickle | |
11 import argparse | |
12 import itertools | |
13 import math | |
14 from distutils.version import LooseVersion | |
15 from distutils.spawn import find_executable | |
16 | |
17 try: | |
18 from .version import SeqSero2_version | |
19 except Exception: #ImportError | |
20 from version import SeqSero2_version | |
21 | |
22 ### SeqSero Kmer | |
23 def parse_args(): | |
24 "Parse the input arguments, use '-h' for help." | |
    parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDeveloper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email: seqsero@gmail.com\n\nVersion: v1.0.2')#add "-m <data_type>" in future
26 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data",type=os.path.abspath) ### ed_SL_05282019: add 'type=os.path.abspath' to generate absolute path of input data. | |
27 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore fasta, '6' for nanopore fastq") | |
28 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode") | |
29 parser.add_argument("-p",default="1",help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1") | |
30 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a") | |
31 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number") | |
32 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files") | |
33 parser.add_argument("--check",action="store_true",help="<flag>: use '--check' flag to check the required dependencies") | |
34 parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SeqSero2_version) | |
35 return parser.parse_args() | |
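# Example invocation (illustrative sketch; the file names below are hypothetical):
#   SeqSero2_package.py -m a -t 2 -i sample_R1.fastq sample_R2.fastq -p 4 -d SeqSero_result_sample
# '-t 2' declares separated paired-end reads and '-m a' selects the raw-read allele micro-assembly workflow.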
36 | |
37 ### ed_SL_05282019: check paths of dependencies | |
38 check_dependencies = parse_args().check | |
39 dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py'] | |
40 if check_dependencies: | |
41 for item in dependencies: | |
42 ext_path = find_executable(item) | |
43 if ext_path is not None: | |
44 print ("Using "+item+" - "+ext_path) | |
45 else: | |
46 print ("ERROR: can not find "+item+" in PATH") | |
47 sys.exit() | |
48 ### end of --check | |
49 | |
50 def reverse_complement(sequence): | |
51 complement = { | |
52 'A': 'T', | |
53 'C': 'G', | |
54 'G': 'C', | |
55 'T': 'A', | |
56 'N': 'N', | |
57 'M': 'K', | |
58 'R': 'Y', | |
59 'W': 'W', | |
60 'S': 'S', | |
61 'Y': 'R', | |
62 'K': 'M', | |
63 'V': 'B', | |
64 'H': 'D', | |
65 'D': 'H', | |
66 'B': 'V' | |
67 } | |
68 return "".join(complement[base] for base in reversed(sequence)) | |
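# Quick sanity check of the IUPAC-aware reverse complement above (illustrative only):
#   >>> reverse_complement("ATGC")
#   'GCAT'
#   >>> reverse_complement("AWRN")
#   'NYWT'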
69 | |
70 | |
71 def createKmerDict_reads(list_of_strings, kmer): | |
72 kmer_table = {} | |
73 for string in list_of_strings: | |
74 sequence = string.strip('\n') | |
75 for i in range(len(sequence) - kmer + 1): | |
76 new_mer = sequence[i:i + kmer].upper() | |
77 new_mer_rc = reverse_complement(new_mer) | |
78 if new_mer in kmer_table: | |
79 kmer_table[new_mer.upper()] += 1 | |
80 else: | |
81 kmer_table[new_mer.upper()] = 1 | |
82 if new_mer_rc in kmer_table: | |
83 kmer_table[new_mer_rc.upper()] += 1 | |
84 else: | |
85 kmer_table[new_mer_rc.upper()] = 1 | |
86 return kmer_table | |
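# Example of the double-stranded k-mer counting above (illustrative only):
#   >>> createKmerDict_reads(["ATGCA"], 4)
#   {'ATGC': 1, 'GCAT': 1, 'TGCA': 2}
# 'TGCA' equals its own reverse complement, so it is counted from both strands.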
87 | |
88 | |
89 def multifasta_dict(multifasta): | |
90 multifasta_list = [ | |
91 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0 | |
92 ] | |
93 headers = [i for i in multifasta_list if i[0] == '>'] | |
94 multifasta_dict = {} | |
95 for h in headers: | |
96 start = multifasta_list.index(h) | |
97 for element in multifasta_list[start + 1:]: | |
98 if element[0] == '>': | |
99 break | |
100 else: | |
101 if h[1:] in multifasta_dict: | |
102 multifasta_dict[h[1:]] += element | |
103 else: | |
104 multifasta_dict[h[1:]] = element | |
105 return multifasta_dict | |
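# Illustrative behaviour with a hypothetical two-record FASTA (">recA" followed by "ACGT" and
# "TTAA", then ">recB" followed by "GGCC"):
#   multifasta_dict(path) -> {'recA': 'ACGTTTAA', 'recB': 'GGCC'}
# Multi-line records are concatenated per header and the leading '>' is dropped from the keys.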
106 | |
107 | |
108 def multifasta_single_string(multifasta): | |
109 multifasta_list = [ | |
110 line.strip() for line in open(multifasta, 'r') | |
111 if (len(line.strip()) > 0) and (line.strip()[0] != '>') | |
112 ] | |
113 return ''.join(multifasta_list) | |
114 | |
115 | |
def chunk_a_long_sequence(long_sequence, chunk_size=60):
    chunk_list = []
    steps = len(long_sequence) // chunk_size  #how many full chunks (use chunk_size rather than a hard-coded 60)
    for i in range(steps):
        chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
    chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
    return chunk_list
123 | |
124 | |
def target_multifasta_kmerizer(multifasta, k, kmerDict):
    forward_length = 300  #if the target is found, extend 300 bases upstream
    reverse_length = 2200  #if the target is found, extend 2200 bases downstream
    chunk_size = 60  #the single long sequence is first chunked into smaller sequences; this controls their size
    target_mers = []
    long_single_string = multifasta_single_string(multifasta)
    multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
    unit_length = len(multifasta_list[0])
    forward_lines = int(forward_length / unit_length) + 1
    reverse_lines = int(reverse_length / unit_length) + 1  #assumes the intended downstream window is reverse_length (2200), matching the comment above
    start_num = 0
    end_num = 0
137 for i in range(len(multifasta_list)): | |
138 if i not in range(start_num, end_num): #avoid computational repetition | |
139 line = multifasta_list[i] | |
140 start = int((len(line) - k) // 2) | |
141 s1 = line[start:k + start] | |
142 if s1 in kmerDict: #detect it is a potential read or not (use the middle part) | |
143 if i - forward_lines >= 0: | |
144 start_num = i - forward_lines | |
145 else: | |
146 start_num = 0 | |
147 if i + reverse_lines <= len(multifasta_list) - 1: | |
148 end_num = i + reverse_lines | |
149 else: | |
150 end_num = len(multifasta_list) - 1 | |
151 target_list = [ | |
152 x.strip() for x in multifasta_list[start_num:end_num] | |
153 ] | |
154 target_line = "".join(target_list) | |
155 target_mers += [ | |
156 k1 for k1 in createKmerDict_reads([str(target_line)], k) | |
157 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length) | |
158 else: | |
159 pass | |
160 return set(target_mers) | |
161 | |
162 | |
163 def target_read_kmerizer(file, k, kmerDict): | |
164 i = 1 | |
165 n_reads = 0 | |
166 total_coverage = 0 | |
167 target_mers = [] | |
168 if file.endswith(".gz"): | |
169 file_content = io.BufferedReader(gzip.open(file)) | |
170 else: | |
171 file_content = open(file, "r").readlines() | |
172 for line in file_content: | |
173 start = int((len(line) - k) // 2) | |
174 if i % 4 == 2: | |
175 if file.endswith(".gz"): | |
176 s1 = line[start:k + start].decode() | |
177 line = line.decode() | |
178 else: | |
179 s1 = line[start:k + start] | |
180 if s1 in kmerDict: #detect it is a potential read or not (use the middle part) | |
181 n_reads += 1 | |
182 total_coverage += len(line) | |
183 target_mers += [ | |
184 k1 for k1 in createKmerDict_reads([str(line)], k) | |
185 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length) | |
186 i += 1 | |
187 if total_coverage >= 4000000: | |
188 break | |
189 return set(target_mers) | |
190 | |
191 | |
192 def minion_fasta_kmerizer(file, k, kmerDict): | |
193 i = 1 | |
194 n_reads = 0 | |
195 total_coverage = 0 | |
196 target_mers = {} | |
197 for line in open(file): | |
198 if i % 2 == 0: | |
199 for kmer, rc_kmer in kmers(line.strip().upper(), k): | |
200 if (kmer in kmerDict) or (rc_kmer in kmerDict): | |
201 if kmer in target_mers: | |
202 target_mers[kmer] += 1 | |
203 else: | |
204 target_mers[kmer] = 1 | |
205 if rc_kmer in target_mers: | |
206 target_mers[rc_kmer] += 1 | |
207 else: | |
208 target_mers[rc_kmer] = 1 | |
209 i += 1 | |
210 return set([h for h in target_mers]) | |
211 | |
212 | |
213 def minion_fastq_kmerizer(file, k, kmerDict): | |
214 i = 1 | |
215 n_reads = 0 | |
216 total_coverage = 0 | |
217 target_mers = {} | |
218 for line in open(file): | |
219 if i % 4 == 2: | |
220 for kmer, rc_kmer in kmers(line.strip().upper(), k): | |
221 if (kmer in kmerDict) or (rc_kmer in kmerDict): | |
222 if kmer in target_mers: | |
223 target_mers[kmer] += 1 | |
224 else: | |
225 target_mers[kmer] = 1 | |
226 if rc_kmer in target_mers: | |
227 target_mers[rc_kmer] += 1 | |
228 else: | |
229 target_mers[rc_kmer] = 1 | |
230 i += 1 | |
231 return set([h for h in target_mers]) | |
232 | |
233 | |
def multifasta_single_string2(multifasta):
    single_string = ''
    with open(multifasta, 'r') as f:
        for line in f:
            if line.strip() == '' or line.strip()[0] == '>':  #skip blank lines and headers (a blank line would otherwise raise IndexError)
                pass
            else:
                single_string += line.strip()
    return single_string
243 | |
244 | |
245 def kmers(seq, k): | |
246 rev_comp = reverse_complement(seq) | |
247 for start in range(1, len(seq) - k + 1): | |
248 yield seq[start:start + k], rev_comp[-(start + k):-start] | |
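# Illustrative trace of the generator above (note that it starts at offset 1, not 0):
#   >>> list(kmers("AAACCC", 3))
#   [('AAC', 'GTT'), ('ACC', 'GGT'), ('CCC', 'GGG')]
# Each pair is (forward k-mer, matching reverse-complement k-mer taken from the precomputed rev_comp string).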
249 | |
250 | |
251 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set | |
252 multi_seq_dict = multifasta_dict(multifasta) | |
253 lib_dict = {} | |
254 for h in multi_seq_dict: | |
255 lib_dict[h] = set( | |
256 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)]) | |
257 return lib_dict | |
258 | |
259 | |
260 def Combine(b, c): | |
261 fliC_combinations = [] | |
262 fliC_combinations.append(",".join(c)) | |
263 temp_combinations = [] | |
264 for i in range(len(b)): | |
265 for x in itertools.combinations(b, i + 1): | |
266 temp_combinations.append(",".join(x)) | |
267 for x in temp_combinations: | |
268 temp = [] | |
269 for y in c: | |
270 temp.append(y) | |
271 temp.append(x) | |
272 temp = ",".join(temp) | |
273 temp = temp.split(",") | |
274 temp.sort() | |
275 temp = ",".join(temp) | |
276 fliC_combinations.append(temp) | |
277 return fliC_combinations | |
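# Example matching the "f,[g],t -> f,t / f,g,t" usage noted further below (illustrative only):
#   >>> Combine(["g"], ["f", "t"])
#   ['f,t', 'f,g,t']
# i.e. the fixed factors in c plus every combination of the optional (bracketed) factors in b.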
278 | |
279 | |
280 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies): | |
281 #like test_output_06012017.txt | |
    #can add more variables like sdf-type, sub-species-type in future (these can be folded into a special-gene-list)
283 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict | |
284 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict] | |
285 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to | |
286 seronames = [] | |
287 seronames_none_subspecies=[] | |
288 for i in range(len(phase1)): | |
289 fliC_combine = [] | |
290 fljB_combine = [] | |
291 if phaseO[i] == Otype: # no VII in KW, but it's there | |
292 ### for fliC, detect every possible combinations to avoid the effect of "[" | |
293 if phase1[i].count("[") == 0: | |
294 fliC_combine.append(phase1[i]) | |
295 elif phase1[i].count("[") >= 1: | |
296 c = [] | |
297 b = [] | |
298 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count( | |
299 "[") == 1: | |
300 content = phase1[i].replace("[", "").replace("]", "") | |
301 fliC_combine.append(content) | |
302 fliC_combine.append("-") | |
303 else: | |
304 for x in phase1[i].split(","): | |
305 if "[" in x: | |
306 b.append(x.replace("[", "").replace("]", "")) | |
307 else: | |
308 c.append(x) | |
309 fliC_combine = Combine( | |
310 b, c | |
311 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t | |
312 ### end of fliC "[" detect | |
313 ### for fljB, detect every possible combinations to avoid the effect of "[" | |
314 if phase2[i].count("[") == 0: | |
315 fljB_combine.append(phase2[i]) | |
316 elif phase2[i].count("[") >= 1: | |
317 d = [] | |
318 e = [] | |
319 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count( | |
320 "[") == 1: | |
321 content = phase2[i].replace("[", "").replace("]", "") | |
322 fljB_combine.append(content) | |
323 fljB_combine.append("-") | |
324 else: | |
325 for x in phase2[i].split(","): | |
326 if "[" in x: | |
327 d.append(x.replace("[", "").replace("]", "")) | |
328 else: | |
329 e.append(x) | |
330 fljB_combine = Combine(d, e) | |
331 ### end of fljB "[" detect | |
332 new_fliC = fliC.split( | |
333 "," | |
334 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings | |
335 new_fliC.sort() | |
336 new_fliC = ",".join(new_fliC) | |
337 new_fljB = fljB.split(",") | |
338 new_fljB.sort() | |
339 new_fljB = ",".join(new_fljB) | |
340 if (new_fliC in fliC_combine | |
341 or fliC in fliC_combine) and (new_fljB in fljB_combine | |
342 or fljB in fljB_combine): | |
343 ######start, remove_list,rename_dict, added on 11/11/2018 | |
344 if sero[i] not in remove_list: | |
345 temp_sero=sero[i] | |
346 if temp_sero in rename_dict: | |
347 temp_sero=rename_dict[temp_sero] #rename if in the rename list | |
348 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider | |
349 if subs[i] == subspecies: | |
350 seronames.append(temp_sero) | |
351 seronames_none_subspecies.append(temp_sero) | |
352 else: | |
353 pass | |
354 else: | |
355 pass | |
356 ######end, added on 11/11/2018 | |
357 #analyze seronames | |
358 subspecies_pointer="" | |
359 if len(seronames) == 0 and len(seronames_none_subspecies)!=0: | |
360 seronames=seronames_none_subspecies | |
361 subspecies_pointer="1" | |
362 if len(seronames) == 0: | |
363 seronames = [ | |
364 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)" | |
365 ] | |
366 star = "" | |
367 star_line = "" | |
368 if len(seronames) > 1: #there are two possible predictions for serotypes | |
369 star = "*" | |
370 #changed 04072019 | |
371 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" | |
372 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0: | |
373 star="*" | |
374 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line | |
375 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line | |
376 if Otype=="": | |
377 Otype="-" | |
378 predict_form = Otype + ":" + fliC + ":" + fljB | |
379 predict_sero = (" or ").join(seronames) | |
380 ###special test for Enteritidis | |
381 if predict_form == "9:g,m:-": | |
382 sdf = "-" | |
383 for x in special_gene_list: | |
384 if x.startswith("sdf"): | |
385 sdf = "+" | |
386 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis" | |
387 star_line=" sdf gene detected." # ed_SL_04152019: new output format | |
388 #predict_form = predict_form + " Sdf prediction:" + sdf | |
389 predict_form = predict_form #changed 04072019 | |
390 if sdf == "-": | |
391 star = "*" | |
392 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis" | |
393 star_line=" sdf gene not detected." # ed_SL_04152019: new output format | |
394 #changed in 04072019, for new output | |
395 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n" | |
396 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement | |
397 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format | |
398 ###end of special test for Enteritidis | |
399 elif predict_form == "4:i:-": | |
400 predict_sero = "I 4,[5],12:i:-" # ed_SL_09242019: change serotype name | |
401 elif predict_form == "4:r:-": | |
402 predict_sero = "4:r:-" | |
403 elif predict_form == "4:b:-": # ed_SL_09272019: change for new output format | |
404 predict_sero = "N/A (4:b:-)" | |
405 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo | |
406 #predict_sero = "Newport" | |
407 #star = "*" | |
408 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare." | |
409 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n" | |
410 if "N/A" in predict_sero: | |
411 claim = "" | |
412 #special test for Typhimurium | |
413 if "Typhimurium" in predict_sero or predict_form == "4:i:-": | |
414 normal = 0 | |
415 mutation = 0 | |
416 for x in special_gene_list: | |
417 if "oafA-O-4_full" in x: | |
418 normal = float(special_gene_list[x]) | |
419 elif "oafA-O-4_5-" in x: | |
420 mutation = float(special_gene_list[x]) | |
421 if normal > mutation: | |
422 pass | |
423 elif normal < mutation: | |
424 #predict_sero = predict_sero.strip() + "(O5-)" | |
425 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019 | |
426 star = "*" | |
427 #star_line = "Detected the deletion of O5-." | |
428 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format | |
429 else: | |
430 pass | |
431 #special test for Paratyphi B | |
432 if "Paratyphi B" in predict_sero or predict_form == "4:b:-": | |
433 normal = 0 | |
434 mutation = 0 | |
435 for x in special_gene_list: | |
436 if "gntR-family-regulatory-protein_dt-positive" in x: | |
437 normal = float(special_gene_list[x]) | |
438 elif "gntR-family-regulatory-protein_dt-negative" in x: | |
439 mutation = float(special_gene_list[x]) | |
440 #print(normal,mutation) | |
441 if normal > mutation: | |
442 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019 | |
443 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format | |
444 star = "*" | |
445 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)." | |
446 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype of Paratyphi B was not detected. " # ed_SL_04152019: new output format | |
447 elif normal < mutation: | |
448 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019 | |
449 predict_sero = predict_sero.strip() | |
450 star = "*" | |
451 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B." | |
452 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B." # ed_SL_04152019: new output format | |
453 else: | |
454 star = "*" | |
455 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not." | |
456 star_line = " " ## ed_SL_05152019: do not report this situation. | |
457 #special test for O13,22 and O13,23 | |
458 if Otype=="13": | |
459 #ex_dir = os.path.dirname(os.path.realpath(__file__)) | |
460 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019 | |
461 f = open(ex_dir + '/special.pickle', 'rb') | |
462 special = pickle.load(f) | |
463 O22_O23=special['O22_O23'] | |
464 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze | |
465 O22_score=0 | |
466 O23_score=0 | |
467 for x in special_gene_list: | |
468 if "O:22" in x: | |
469 O22_score = O22_score+float(special_gene_list[x]) | |
470 elif "O:23" in x: | |
471 O23_score = O23_score+float(special_gene_list[x]) | |
472 #print(O22_score,O23_score) | |
473 for z in O22_O23[0]: | |
474 if predict_sero.split(" or ")[0] in z: | |
475 if O22_score > O23_score: | |
476 star = "*" | |
477 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019 | |
478 predict_sero = z[0] | |
479 elif O22_score < O23_score: | |
480 star = "*" | |
481 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019 | |
482 predict_sero = z[1] | |
483 else: | |
484 star = "*" | |
485 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019 | |
486 if " or " in predict_sero: | |
487 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n" | |
488 #special test for O6,8 | |
489 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list | |
490 #for x in merge_O68_list: | |
491 # if x in predict_sero: | |
492 # predict_sero=x | |
493 # star="" | |
494 # star_line="" | |
495 #special test for Montevideo; most of them are monophasic | |
496 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list | |
497 #star="*" | |
498 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination." | |
499 return predict_form, predict_sero, star, star_line, claim | |
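# Worked example for the special Enteritidis/Gallinarum branch above (illustrative; assumes the
# bundled Initial_Conditions tables import cleanly):
#   seqsero_from_formula_to_serotypes("9", "g,m", "-", {}, "I")
#   builds predict_form "9:g,m:-"; with no sdf entry in the empty special_gene_list, sdf stays "-",
#   so predict_sero is reported as "Gallinarum or Enteritidis" with a "*" flag and a star_line
#   beginning with " sdf gene not detected."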
500 ### End of SeqSero Kmer part | |
501 | |
502 ### Begin of SeqSero2 allele prediction and output | |
503 def xml_parse_score_comparision_seqsero(xmlfile): | |
504 #used to do seqsero xml analysis | |
505 from Bio.Blast import NCBIXML | |
506 handle=open(xmlfile) | |
507 handle=NCBIXML.parse(handle) | |
508 handle=list(handle) | |
509 List=[] | |
510 List_score=[] | |
511 List_ids=[] | |
512 List_query_region=[] | |
513 for i in range(len(handle)): | |
514 if len(handle[i].alignments)>0: | |
515 for j in range(len(handle[i].alignments)): | |
516 score=0 | |
517 ids=0 | |
518 cover_region=set() #fixed problem that repeated calculation leading percentage > 1 | |
519 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def) | |
520 for z in range(len(handle[i].alignments[j].hsps)): | |
521 hsp=handle[i].alignments[j].hsps[z] | |
522 temp=set(range(hsp.query_start,hsp.query_end)) | |
523 est_bit=(handle[i].ka_params[0]*hsp.score-math.log(handle[i].ka_params[1]))/(math.log(2)) | |
524 if len(cover_region)==0: | |
525 cover_region=cover_region|temp | |
526 fraction=1 | |
527 else: | |
528 fraction=1-len(cover_region&temp)/float(len(temp)) | |
529 cover_region=cover_region|temp | |
530 if "last" in handle[i].query or "first" in handle[i].query: | |
531 #score+=hsp.bits*fraction | |
532 score+=est_bit*fraction | |
533 ids+=float(hsp.identities)/handle[i].query_length*fraction | |
534 else: | |
535 #score+=hsp.bits*fraction | |
536 score+=est_bit*fraction | |
537 ids+=float(hsp.identities)/handle[i].query_length*fraction | |
538 List_score.append(score) | |
539 List_ids.append(ids) | |
540 List_query_region.append(cover_region) | |
541 temp=zip(List,List_score,List_ids,List_query_region) | |
542 Final_list=sorted(temp, key=lambda d:d[1], reverse = True) | |
543 return Final_list | |
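# Note on est_bit above: it is the standard Karlin-Altschul conversion of a raw HSP score S to
# a bit score,
#   bits = (lambda * S - ln K) / ln 2,
# with lambda and K taken from record.ka_params as parsed by Biopython's NCBIXML.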
544 | |
545 | |
546 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number | |
547 Old=L | |
548 L.sort() | |
549 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]] | |
550 count=[] | |
551 for j in range(len(L)): | |
552 y=0 | |
553 for x in Old: | |
554 if L[j]==x: | |
555 y+=1 | |
556 count.append(y) | |
    if sort_on_fre!="none":
        d=list(zip(*sorted(zip(count, L))))  #materialize the zip object so it can be indexed under Python 3
        L=list(d[1])
        count=list(d[0])
561 return (L,count) | |
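# Illustrative behaviour of Uniq with the default sort_on_fre:
#   >>> Uniq(["b", "a", "b"])
#   (['a', 'b'], [1, 2])
# Note that the caller's list is also sorted in place as a side effect.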
562 | |
563 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list): | |
564 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test | |
565 #this is mainly used for | |
566 a=nodes_vs_score_list | |
567 fliC_score=0 | |
568 fljB_score=0 | |
569 for z in a: | |
570 if "fliC" in z[0]: | |
571 fliC_score+=z[1] | |
572 elif "fljB" in z[0]: | |
573 fljB_score+=z[1] | |
574 if fliC_score>=fljB_score: | |
575 role="fliC" | |
576 else: | |
577 role="fljB" | |
578 return (role,abs(fliC_score-fljB_score)) | |
579 | |
580 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed): | |
581 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small) | |
582 #also used when no head or tail got blasted score for the contig | |
583 role="" | |
584 for z in Final_list_passed: | |
585 if node_name in z[0]: | |
586 role=z[0].split("_")[0] | |
587 break | |
588 return role | |
589 | |
590 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed): | |
591 #nodes_list is the c created by c,d=Uniq(nodes) in below function | |
592 first_target="" | |
593 role_list=[] | |
594 for x in nodes_list: | |
595 a=[] | |
596 role="" | |
597 for y in tail_head_list: | |
598 if x in y[0]: | |
599 a.append(y) | |
600 if len(a)==4: | |
601 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
602 if diff<20: | |
603 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
604 elif len(a)==3: | |
605 ###however, if the one with highest score is the fewer one, compare their accumulation score | |
606 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
607 if diff<20: | |
608 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
609 ###end of above score comparison | |
610 elif len(a)==2: | |
611 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37) | |
612 temp=[] | |
613 for z in a: | |
614 temp.append(z[0].split("_")[0]) | |
615 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too | |
616 if len(m)==1: | |
617 pass | |
618 else: | |
619 pass | |
620 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
621 if diff<20: | |
622 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
            ###need to design an algorithm to guess the most likely situation for nodes_list; see the situations in the test evaluation
624 elif len(a)==1: | |
625 #that one | |
626 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a) | |
627 if diff<20: | |
628 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
629 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0" | |
630 else:#a==0 | |
631 #use Final_list_passed best match | |
632 for z in Final_list_passed: | |
633 if x in z[0]: | |
634 role=z[0].split("_")[0] | |
635 break | |
636 #print x,role,len(a) | |
637 role_list.append((role,x)) | |
638 if len(role_list)==2: | |
        if role_list[0][0]==role_list[1][0]:#this is the most common error: two antigens were assigned to the same phase
640 #just use score to do a final test | |
641 role_list=[] | |
642 for x in nodes_list: | |
643 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed) | |
644 role_list.append((role,x)) | |
645 return role_list | |
646 | |
647 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed): | |
648 #used to decide which contig is FliC and which one is fljB | |
649 contigs=[] | |
650 nodes=[] | |
651 for x in Final_list_passed: | |
652 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]: | |
653 nodes.append(x[0].split("___")[1].strip()) | |
654 c,d=Uniq(nodes)#c is node_list | |
655 #print c | |
656 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])] | |
657 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed) | |
658 return roles | |
659 | |
660 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed): | |
661 #decide O based on Final_list | |
662 O_choice="?" | |
663 O_list=[] | |
664 special_genes={} | |
665 nodes=[] | |
666 for x in Final_list_passed: | |
667 if x[0].startswith("O-"): | |
668 nodes.append(x[0].split("___")[1].strip()) | |
669 elif not x[0].startswith("fl"): | |
670 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1] | |
671 #print "special_genes:",special_genes | |
672 c,d=Uniq(nodes) | |
673 #print "potential O antigen contig",c | |
674 final_O=[] | |
675 O_nodes_list=[] | |
676 for x in c:#c is the list for contigs | |
677 temp=0 | |
678 for y in Final_list_passed: | |
679 if x in y[0] and y[0].startswith("O-"): | |
680 final_O.append(y) | |
681 break | |
682 ### O contig has the problem of two genes on same contig, so do additional test | |
683 potenial_new_gene="" | |
684 for x in final_O: | |
685 pointer=0 #for genes merged or not | |
686 #not consider O-1,3,19_not_in_3,10, too short compared with others | |
687 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region | |
688 pointer=x[0].split("___")[1].strip()#store the contig name | |
689 print(pointer) | |
690 if pointer!=0:#it has potential merge event | |
691 for y in Final_list: | |
692 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O | |
693 potenial_new_gene=y | |
694 #print(potenial_new_gene) | |
695 break | |
696 if potenial_new_gene!="": | |
697 print("two differnt genes in same contig, fix it for O antigen") | |
698 print(potenial_new_gene[:3]) | |
699 pointer=0 | |
700 for y in final_O: | |
701 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]: | |
702 pointer=1 | |
703 if pointer!=0: #changed to consider two genes in same contig | |
704 final_O.append(potenial_new_gene) | |
705 ### end of the two genes on same contig test | |
706 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted | |
707 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]): | |
708 #print "$$$No Otype, due to no hit"#may need to be changed | |
709 O_choice="-" | |
710 else: | |
711 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]]) | |
712 O_list=[] | |
713 O_list_less_contamination=[] | |
714 for x in final_O: | |
715 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off | |
716 O_list.append(x[0].split("__")[0]) | |
717 O_nodes_list.append(x[0].split("___")[1]) | |
718 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15: | |
719 O_list_less_contamination.append(x[0].split("__")[0]) | |
720 ### special test for O9,46 and O3,10 family | |
721 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 | |
722 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 | |
723 O_choice="O-9,46" | |
724 #print "$$$Most possilble Otype: O-9,46" | |
725 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1 | |
726 O_choice="O-9,46,27" | |
727 #print "$$$Most possilble Otype: O-9,46,27" | |
728 else: | |
729 O_choice="O-9"#next, detect O9 vs O2? | |
730 O2=0 | |
731 O9=0 | |
732 for z in special_genes: | |
733 if "tyr-O-9" in z: | |
734 O9=special_genes[z] | |
735 elif "tyr-O-2" in z: | |
736 O2=special_genes[z] | |
737 if O2>O9: | |
738 O_choice="O-2" | |
739 elif O2<O9: | |
740 pass | |
741 else: | |
742 pass | |
743 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility." | |
744 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1 | |
745 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1 | |
746 O_choice="O-3,10" | |
747 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)" | |
748 else: | |
749 O_choice="O-1,3,19" | |
750 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)" | |
751 ### end of special test for O9,46 and O3,10 family | |
752 else: | |
753 try: | |
754 max_score=0 | |
755 for x in final_O: | |
756 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold | |
757 max_score=x[2]#change from x[-1] to x[2],08172018 | |
758 O_choice=x[0].split("_")[0] | |
759 if O_choice=="O-1,3,19": | |
760 O_choice=final_O[1][0].split("_")[0] | |
761 #print "$$$Most possilble Otype: ",O_choice | |
762 except: | |
763 pass | |
764 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)" | |
765 #print "O:",O_choice,O_nodes_list | |
766 Otypes=[] | |
767 for x in O_list: | |
768 if x!="O-1,3,19_not_in_3,10": | |
769 if "O-9,46_" not in x: | |
770 Otypes.append(x.split("_")[0]) | |
771 else: | |
772 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254 | |
773 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"] | |
774 Otypes_uniq,Otypes_fre=Uniq(Otypes) | |
775 contamination_O="" | |
776 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19": | |
777 if len(Otypes_uniq)>2: | |
778 contamination_O="potential contamination from O antigen signals" | |
779 else: | |
780 if len(Otypes_uniq)>1: | |
781 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund | |
782 contamination_O="" | |
783 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund | |
784 contamination_O="" | |
785 else: | |
786 contamination_O="potential contamination from O antigen signals" | |
787 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq | |
788 ### End of SeqSero2 allele prediction and output | |
789 | |
790 def get_input_files(make_dir,input_file,data_type,dirpath): | |
791 #tell input files from datatype | |
    #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, separated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
793 for_fq="" | |
794 rev_fq="" | |
795 os.chdir(make_dir) | |
796 if data_type=="1": | |
797 input_file=input_file[0].split("/")[-1] | |
798 if input_file.endswith(".sra"): | |
799 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True) | |
800 for_fq=input_file.replace(".sra","_1.fastq") | |
801 rev_fq=input_file.replace(".sra","_2.fastq") | |
802 else: | |
803 core_id=input_file.split(".fastq")[0].split(".fq")[0] | |
804 for_fq=core_id+"_1.fastq" | |
805 rev_fq=core_id+"_2.fastq" | |
806 if input_file.endswith(".gz"): | |
807 subprocess.check_call("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True) | |
808 else: | |
809 subprocess.check_call("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True) | |
810 elif data_type=="2": | |
811 for_fq=input_file[0].split("/")[-1] | |
812 rev_fq=input_file[1].split("/")[-1] | |
813 elif data_type=="3": | |
814 input_file=input_file[0].split("/")[-1] | |
815 if input_file.endswith(".sra"): | |
816 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True) | |
817 for_fq=input_file.replace(".sra","_1.fastq") | |
818 else: | |
819 for_fq=input_file | |
820 elif data_type in ["4","5","6"]: | |
821 for_fq=input_file[0].split("/")[-1] | |
822 os.chdir("..") | |
823 return for_fq,rev_fq | |
824 | |
825 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta): | |
826 #get O and H types from Final_list from blast parsing; allele mode | |
827 from Bio import SeqIO | |
828 fliC_choice="-" | |
829 fljB_choice="-" | |
830 fliC_contig="NA" | |
831 fljB_contig="NA" | |
832 fliC_region=set([0]) | |
833 fljB_region=set([0,]) | |
    fliC_length=0 #can be changed to coverage in future; in 03292019, changed to aligned length
    fljB_length=0 #can be changed to coverage in future; in 03292019, changed to aligned length
836 O_choice="-"#no need to decide O contig for now, should be only one | |
837 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification | |
838 O_choice=O_choice.split("-")[-1].strip() | |
839 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="": | |
840 O_choice="-" | |
841 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB | |
842 #add alignment locations, used for further selection, 03312019 | |
843 for i in range(len(H_contig_roles)): | |
844 x=H_contig_roles[i] | |
845 for y in Final_list_passed: | |
846 if x[1] in y[0] and y[0].startswith(x[0]): | |
                H_contig_roles[i]=H_contig_roles[i]+(y[-1],) #append the alignment region to the (role, contig) tuple
848 break | |
849 log_file=open("SeqSero_log.txt","a") | |
850 extract_file=open("Extracted_antigen_alleles.fasta","a") | |
851 handle_fasta=list(SeqIO.parse(new_fasta,"fasta")) | |
852 | |
853 #print("O_contigs:") | |
854 log_file.write("O_contigs:\n") | |
855 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n") | |
856 extract_file.write("#O_contigs:\n") | |
857 for x in O_nodes_roles: | |
858 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker | |
859 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1]))) | |
860 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n") | |
861 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n" | |
862 seqs="" | |
863 for z in handle_fasta: | |
864 if x[0].split("___")[-1]==z.description: | |
865 seqs=str(z.seq) | |
866 extract_file.write(title+seqs+"\n") | |
867 if len(H_contig_roles)!=0: | |
868 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output | |
869 else: | |
870 highest_H_coverage=0 | |
871 for x in H_contig_roles: | |
872 #if multiple choices, temporately select the one with longest length for now, will revise in further change | |
873 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list | |
874 fliC_contig=x[1] | |
875 fliC_length=len(x[-1]) | |
876 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13: | |
877 fljB_contig=x[1] | |
878 fljB_length=len(x[-1]) | |
879 for x in Final_list_passed: | |
880 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]: | |
881 fliC_choice=x[0].split("_")[1] | |
882 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]: | |
883 fljB_choice=x[0].split("_")[1] | |
884 elif fliC_choice!="-" and fljB_choice!="-": | |
885 break | |
886 #now remove contigs not in middle core part | |
887 first_allele="NA" | |
888 first_allele_percentage=0 | |
889 for x in Final_list: | |
890 if x[0].startswith("fliC") or x[0].startswith("fljB"): | |
891 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs | |
892 first_allele_percentage=x[2] | |
893 break | |
894 additional_contigs=[] | |
895 for x in Final_list: | |
896 if first_allele in x[0]: | |
897 if (fliC_contig == x[0].split("___")[-1]): | |
898 fliC_region=x[3] | |
899 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]): | |
900 fljB_region=x[3] | |
901 else: | |
902 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1 | |
903 additional_contigs.append(x) | |
904 #else: | |
905 #print x[:3] | |
906 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off | |
907 if first_allele_percentage>0.9: | |
908 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000: | |
909 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) | |
910 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000: | |
911 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) | |
912 else: | |
913 target_region=set()#doesn't do anything | |
914 else: | |
915 target_region=set()#doesn't do anything | |
916 #print(target_region) | |
917 #print(additional_contigs) | |
918 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best | |
919 target_region=target_region2|target_region | |
920 for x in additional_contigs: | |
921 removal=0 | |
922 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0]) | |
923 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off | |
924 removal=1 | |
925 else: | |
926 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part) | |
927 removal=1 | |
928 else: | |
929 pass | |
930 if removal==1: | |
931 for y in H_contig_roles: | |
932 if y[1] in x[0]: | |
933 H_contig_roles.remove(y) | |
934 else: | |
935 pass | |
936 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5) | |
937 #end of removing none-middle contigs | |
938 #print("H_contigs:") | |
939 log_file.write("H_contigs:\n") | |
940 extract_file.write("#H_contigs:\n") | |
941 H_contig_stat=[] | |
942 H1_cont_stat={} | |
943 H2_cont_stat={} | |
944 for i in range(len(H_contig_roles)): | |
945 x=H_contig_roles[i] | |
946 a=0 | |
947 for y in Final_list_passed: | |
948 if x[1] in y[0] and y[0].startswith(x[0]): | |
949 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide | |
950 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it | |
951 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB | |
952 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) | |
953 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n") | |
954 H_contig_roles[i]="can't decide fliC or fljB, may be third phase" | |
                            title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
956 seqs="" | |
957 for z in handle_fasta: | |
958 if x[1]==z.description: | |
959 seqs=str(z.seq) | |
960 extract_file.write(title+seqs+"\n") | |
961 break | |
962 else: | |
963 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1]))) | |
964 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n") | |
965 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n" | |
966 seqs="" | |
967 for z in handle_fasta: | |
968 if x[1]==z.description: | |
969 seqs=str(z.seq) | |
970 extract_file.write(title+seqs+"\n") | |
971 if x[0]=="fliC": | |
972 if y[0].split("_")[1] not in H1_cont_stat: | |
973 H1_cont_stat[y[0].split("_")[1]]=y[2] | |
974 else: | |
975 H1_cont_stat[y[0].split("_")[1]]+=y[2] | |
976 if x[0]=="fljB": | |
977 if y[0].split("_")[1] not in H2_cont_stat: | |
978 H2_cont_stat[y[0].split("_")[1]]=y[2] | |
979 else: | |
980 H2_cont_stat[y[0].split("_")[1]]+=y[2] | |
981 break | |
982 #detect contaminations | |
983 #print(H1_cont_stat) | |
984 #print(H2_cont_stat) | |
985 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2] | |
986 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2] | |
987 contamination_H="" | |
988 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1: | |
989 contamination_H="potential contamination from H antigen signals" | |
990 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA": | |
991 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected" | |
992 #get additional antigens | |
993 """ | |
994 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1 | |
995 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1 | |
996 O_choice="O-9,46" | |
997 #print "$$$Most possilble Otype: O-9,46" | |
998 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1 | |
999 O_choice="O-9,46,27" | |
1000 #print "$$$Most possilble Otype: O-9,46,27" | |
1001 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1 | |
1002 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1 | |
1003 O_choice="O-3,10" | |
1004 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)" | |
1005 else: | |
1006 O_choice="O-1,3,19" | |
1007 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)" | |
1008 ### end of special test for O9,46 and O3,10 family | |
1009 | |
1010 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19": | |
1011 if len(Otypes_uniq)>2: | |
1012 contamination_O="potential contamination from O antigen signals" | |
1013 else: | |
1014 if len(Otypes_uniq)>1: | |
1015 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund | |
1016 contamination_O="" | |
1017 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund | |
1018 contamination_O="" | |
1019 """ | |
1020 additonal_antigents=[] | |
1021 #print(contamination_O) | |
1022 #print(contamination_H) | |
1023 log_file.write(contamination_O+"\n") | |
1024 log_file.write(contamination_H+"\n") | |
1025 log_file.close() | |
1026 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list | |
1027 | |
1028 def get_input_K(input_file,lib_dict,data_type,k_size): | |
1029 #kmer mode; get input_Ks from dict and data_type | |
1030 kmers = [] | |
1031 for h in lib_dict: | |
1032 kmers += lib_dict[h] | |
1033 if data_type == '4': | |
1034 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers)) | |
1035 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later | |
1036 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers)) | |
1037 elif data_type == '5':#minion_2d_fasta | |
1038 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers)) | |
1039 if data_type == '6':#minion_2d_fastq | |
1040 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers)) | |
1041 return input_Ks | |
1042 | |
1043 def get_kmer_dict(lib_dict,input_Ks): | |
1044 #kmer mode; get predicted types | |
1045 O_dict = {} | |
1046 H_dict = {} | |
1047 Special_dict = {} | |
1048 for h in lib_dict: | |
1049 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100 | |
1050 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases | |
1051 if h.startswith('O-') and score > 25: | |
1052 O_dict[h] = score | |
1053 if h.startswith('fl') and score > 40: | |
1054 H_dict[h] = score | |
1055 if (h[:2] != 'fl') and (h[:2] != 'O-'): | |
1056 Special_dict[h] = score | |
1057 return O_dict,H_dict,Special_dict | |
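# Scoring sketch for get_kmer_dict above (hypothetical numbers): if a database allele h has
# 1500 unique k-mers and 1200 of them also occur in input_Ks, its score is
# 1200 / 1500 * 100 = 80, which clears both the O (>25) and H (>40) cut-offs applied above.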
1058 | |
1059 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir): | |
1060 log_file=open("SeqSero_log.txt","a") | |
1061 log_file.write("O_scores:\n") | |
1062 #call O: | |
1063 highest_O = '-' | |
1064 if len(O_dict) == 0: | |
1065 pass | |
1066 else: | |
1067 for x in O_dict: | |
1068 log_file.write(x+"\t"+str(O_dict[x])+"\n") | |
1069 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1 | |
1070 if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1 | |
1071 highest_O = "O-9,46" | |
1072 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1 | |
1073 highest_O = "O-9,46,27" | |
1074 else: | |
1075 highest_O = "O-9" # next, detect O9 vs O2? | |
1076 O2 = 0 | |
1077 O9 = 0 | |
1078 for z in Special_dict: | |
1079 if "tyr-O-9" in z: | |
1080 O9 = float(Special_dict[z]) | |
1081 if "tyr-O-2" in z: | |
1082 O2 = float(Special_dict[z]) | |
1083 if O2 > O9: | |
1084 highest_O = "O-2" | |
1085 elif ("O-3,10_wzx__1539" in O_dict) and ( | |
1086 "O-9,46_wzy__1191" in O_dict | |
1087 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1 | |
1088 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1 | |
1089 highest_O = "O-3,10" | |
1090 else: | |
1091 highest_O = "O-1,3,19" | |
1092 ### end of special test for O9,46 and O3,10 family | |
1093 else: | |
1094 try: | |
1095 max_score = 0 | |
1096 for x in O_dict: | |
1097 if float(O_dict[x]) >= max_score: | |
1098 max_score = float(O_dict[x]) | |
1099 highest_O = x.split("_")[0] | |
1100 if highest_O == "O-1,3,19": | |
1101 highest_O = '-' | |
1102 max_score = 0 | |
1103 for x in O_dict: | |
1104 if x == 'O-1,3,19_not_in_3,10__130': | |
1105 pass | |
1106 else: | |
1107 if float(O_dict[x]) >= max_score: | |
1108 max_score = float(O_dict[x]) | |
1109 highest_O = x.split("_")[0] | |
1110 except: | |
1111 pass | |
1112 #call_fliC: | |
1113 if len(H_dict)!=0: | |
1114 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not | |
1115 else: | |
1116 highest_H_score_both_BC=0 | |
1117 highest_fliC = '-' | |
1118 highest_fliC_raw = '-' | |
1119 highest_Score = 0 | |
1120 log_file.write("\nH_scores:\n") | |
1121 for s in H_dict: | |
1122 log_file.write(s+"\t"+str(H_dict[s])+"\n") | |
1123 if s.startswith('fliC'): | |
1124 if float(H_dict[s]) > highest_Score: | |
1125 highest_fliC = s.split('_')[1] | |
1126 highest_fliC_raw = s | |
1127 highest_Score = float(H_dict[s]) | |
1128 #call_fljB | |
1129 highest_fljB = '-' | |
1130 highest_fljB_raw = '-' | |
1131 highest_Score = 0 | |
1132 for s in H_dict: | |
1133 if s.startswith('fljB'): | |
1134 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other | |
1135 highest_fljB = s.split('_')[1] | |
1136 highest_fljB_raw = s | |
1137 highest_Score = float(H_dict[s]) | |
1138 log_file.write("\nSpecial_scores:\n") | |
1139 for s in Special_dict: | |
1140 log_file.write(s+"\t"+str(Special_dict[s])+"\n") | |
1141 log_file.close() | |
1142 return highest_O,highest_fliC,highest_fljB | |
1143 | |
1144 def get_temp_file_names(for_fq,rev_fq): | |
1145 #seqsero2 -a; get temp file names | |
1146 sam=for_fq+".sam" | |
1147 bam=for_fq+".bam" | |
1148 sorted_bam=for_fq+"_sorted.bam" | |
1149 mapped_fq1=for_fq+"_mapped.fq" | |
1150 mapped_fq2=rev_fq+"_mapped.fq" | |
1151 combined_fq=for_fq+"_combined.fq" | |
1152 for_sai=for_fq+".sai" | |
1153 rev_sai=rev_fq+".sai" | |
1154 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai | |
1155 | |
1156 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode): | |
1157 #seqsero2 -a; do mapping and sort | |
1158 print("building database...") | |
1159 subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True) | |
1160 print("mapping...") | |
1161 if mapping_mode=="mem": | |
1162 subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True) | |
1163 elif mapping_mode=="sam": | |
1164 if fnameB!="": | |
1165 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True) | |
1166 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True) | |
1167 subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True) | |
1168 else: | |
1169 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True) | |
1170 subprocess.check_call("bwa samse "+database+" "+for_sai+" "+fnameA+" > "+sam+ " 2>> data_log.txt",shell=True) #fixed: use fnameA ('for_fq' is undefined in this function) and pass shell=True so the redirect works | |
1171 subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True) | |
1172 ### check the version of samtools, then use different sort commands | |
1173 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE) | |
1174 out, err = samtools_version.communicate() | |
1175 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip() | |
1176 print("check samtools version:",version) | |
1177 ### end of samtools version check and its analysis | |
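# Note on the version check above: it relies on bare "samtools" printing a
# "Version: x.y" line to stderr. samtools <= 1.2 still used the legacy
# "samtools sort <in.bam> <out.prefix>" syntax, while newer releases write the
# sorted BAM to stdout unless -o is given, hence the two branches below.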
1178 if LooseVersion(version)<=LooseVersion("1.2"): | |
1179 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True) | |
1180 else: | |
1181 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True) | |
1182 | |
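# Approximate pipeline run by map_and_sort() in the default 'mem' mode
# (paraphrased from the commands above; names in angle brackets are placeholders):
#   bwa index <database>
#   bwa mem -k 17 -t <threads> <database> <R1> <R2> > <sam>
#   samtools view -@ <threads> -F 4 -Sh <sam> > <bam>    # drop unmapped reads
#   samtools sort -@ <threads> -n <bam> > <sorted_bam>   # name-sorted for bamToFastq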
1183 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode): | |
1184 #seqsero2 -a; extract, assembly and blast | |
1185 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True) | |
1186 #print("fnameA:",fnameA) | |
1187 #print("fnameB:",fnameB) | |
1188 if fnameB!="": | |
1189 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output | |
1190 else: | |
1191 pass | |
1192 outdir=current_time+"_temp" | |
1193 print("assembling...") | |
1194 if int(threads)>4: | |
1195 t="4" | |
1196 else: | |
1197 t=threads | |
1198 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-" | |
1199 if fnameB!="": | |
1200 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True) | |
1201 else: | |
1202 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True) | |
1203 #new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta" | |
1204 new_fasta=fnameA+"_"+database.split('/')[-1]+"_"+mapping_mode+".fasta" # ed_SL_09152019: change path to database for packaging | |
1205 subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True) | |
1206 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta | |
1207 subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True) | |
1208 print("blasting...","\n") | |
1209 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml" | |
1210 subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #redirected to data_log.txt so the makeblastdb output does not interrupt the program's own output ###1/27/2015 | |
1211 subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10" | |
1212 else: | |
1213 xmlfile,new_fasta="NA","NA" #also define new_fasta here, otherwise the return below raises UnboundLocalError when no assembly was produced | |
1214 return xmlfile,new_fasta | |
1215 | |
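# Illustrative sketch (not called by SeqSero2): SPAdes names its contigs like
# "NODE_1_length_1234_cov_15.6", and the "_cov_" field embedded in that name is
# what the allele-mode filter in main() uses as a coverage cutoff. The helper
# below is a hypothetical convenience wrapper documenting that parsing.
def _example_spades_contig_coverage(contig_name):
    # return the k-mer coverage encoded in a SPAdes contig name
    return float(contig_name.split("_cov_")[1].split("_")[0])
# e.g. _example_spades_contig_coverage("NODE_1_length_1234_cov_15.6") == 15.6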
1216 def judge_subspecies(fnameA,dirpath): | |
1217 #seqsero2 -a; judge subspecies on just forward raw reads fastq | |
1218 salmID_output=subprocess.Popen("python " + dirpath + "/SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE) | |
1219 out, err = salmID_output.communicate() | |
1220 out=out.decode("utf-8") | |
1221 file=open("data_log.txt","a") | |
1222 file.write(out) | |
1223 file.close() | |
1224 salm_species_scores=out.split("\n")[1].split("\t")[6:] | |
1225 salm_species_results=out.split("\n")[0].split("\t")[6:] | |
1226 max_score=0 | |
1227 max_score_index=1 #default is 1, means "I" | |
1228 for i in range(len(salm_species_scores)): | |
1229 if max_score<float(salm_species_scores[i]): | |
1230 max_score=float(salm_species_scores[i]) | |
1231 max_score_index=i | |
1232 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0] | |
1233 if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #compare the bongori and enterica species-level scores | |
1234 prediction="bongori" #without this check the prediction would always be an enterica subspecies, since only enterica subspecies scores are parsed above | |
1235 if max_score<10: | |
1236 prediction="-" | |
1237 return prediction | |
1238 | |
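# judge_subspecies() above assumes SalmID prints a two-line tab-separated report:
# a header line followed by a score line, with the species-level scores (bongori,
# enterica) in columns 5 and 6 and the enterica subspecies scores from column 7
# onward; a best subspecies score below 10 is reported as '-'. This reading is
# inferred from the parsing code itself, not from SalmID documentation.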
1239 def judge_subspecies_Kmer(Special_dict): | |
1240 #seqsero2 -k; | |
1241 max_score=0 | |
1242 prediction="-" #default is '-'; note the allele-mode default is subspecies "I" | |
1243 for x in Special_dict: | |
1244 if "mer" in x: | |
1245 if max_score<float(Special_dict[x]): | |
1246 max_score=float(Special_dict[x]) | |
1247 prediction=x.split("_")[-1].strip() | |
1248 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica | |
1249 prediction="bongori" | |
1250 break | |
1251 return prediction | |
1252 | |
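# judge_subspecies_Kmer() above expects Special_dict keys whose name contains
# "mer" and ends in "_<subspecies>" (a hypothetical example: "sseL_mers_IIIa"),
# returning that suffix for the best-scoring entry; a bongori entry scoring above
# 95 short-circuits the scan.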
1253 def main(): | |
1254 #combine SeqSeroK and SeqSero2, also with SalmID | |
1255 args = parse_args() | |
1256 input_file = args.i | |
1257 data_type = args.t | |
1258 analysis_mode = args.m | |
1259 mapping_mode=args.b | |
1260 threads=args.p | |
1261 make_dir=args.d | |
1262 clean_mode=args.c | |
1263 k_size=27 #will change for bug fixing | |
1264 #database="H_and_O_and_specific_genes.fasta" | |
1265 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__))) | |
1266 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: add ex_dir for packaging | |
1267 #database=ex_dir+"/seqsero2_db/H_and_O_and_specific_genes.fasta" # ed_SL_09152019: change path to database for packaging | |
1268 database="H_and_O_and_specific_genes.fasta" | |
1269 note="Note:" | |
1270 NA_note=" This predicted serotype is not in the Kauffman-White scheme." # ed_SL_09272019: add for new output format | |
1271 if len(sys.argv)==1: | |
1272 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file | |
1273 else: | |
1274 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime()) | |
1275 request_id += str(random.randint(1, 10000000)) | |
1276 if make_dir is None: | |
1277 make_dir="SeqSero_result_"+request_id | |
1278 if os.path.isdir(make_dir): | |
1279 pass | |
1280 else: | |
1281 subprocess.check_call(["mkdir",make_dir]) | |
1282 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) | |
1283 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) | |
1284 #subprocess.check_call("ln -f -s "+database+" "+" ".join(input_file)+" "+make_dir,shell=True) # ed_SL_09152019: change path to database for packaging | |
1285 subprocess.check_call("ln -f -s "+dirpath+"/seqsero2_db/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### ed_SL_05282019: use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument). | |
1286 ############################begin the real analysis | |
1287 if analysis_mode=="a": | |
1288 if data_type in ["1","2","3"]:#use allele mode | |
1289 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath) | |
1290 os.chdir(make_dir) | |
1291 ###add a function to tell input files | |
1292 fnameA=for_fq.split("/")[-1] | |
1293 fnameB=rev_fq.split("/")[-1] | |
1294 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) | |
1295 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id | |
1296 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort | |
1297 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast | |
1298 if xmlfile=="NA": | |
1299 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","") | |
1300 else: | |
1301 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results | |
1302 file=open("data_log.txt","a") | |
1303 for x in Final_list: | |
1304 file.write("\t".join(str(y) for y in x)+"\n") | |
1305 file.close() | |
1306 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)] | |
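# The filter above keeps BLAST hits whose assembled contig reports a SPAdes
# k-mer coverage of at least 0.9 and whose alignment score reaches the per-allele
# threshold encoded in the record name (parsed after '__' or '___') or exceeds 1000;
# everything else is treated as an absent antigen.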
1307 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB | |
1308 subspecies=judge_subspecies(fnameA,dirpath) #predict subspecies | |
1309 ###output | |
1310 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies) | |
1311 claim="" #04132019, disable claim for new report requirement | |
1312 contamination_report="" | |
1313 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0] | |
1314 if contamination_O!="" and contamination_H=="": | |
1315 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"." | |
1316 elif contamination_O=="" and contamination_H!="": | |
1317 contamination_report="#Potential inter-serotype contamination or a potential third H phase detected from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"." | |
1318 elif contamination_O!="" and contamination_H!="": | |
1319 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"." | |
1320 if contamination_report!="": | |
1321 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019 | |
1322 contamination_report=" Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles." | |
1323 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version | |
1324 ## ed_SL_09272019: change for new output format | |
1325 #if contamination_report+star_line+claim=="": #0413, new output style | |
1326 # note="" | |
1327 #else: | |
1328 # note="Note:" | |
1329 if clean_mode: | |
1330 subprocess.check_call("rm -rf ../"+make_dir,shell=True) | |
1331 make_dir="no output directory (due to '-c' flag)" | |
1332 else: | |
1333 new_file=open("SeqSero_result.txt","w") | |
1334 if O_choice=="": | |
1335 O_choice="-" | |
1336 if "N/A" not in predict_sero: | |
1337 new_file.write("Output_directory:\t"+make_dir+"\n"+ | |
1338 "Input files:\t"+"\t".join(input_file)+"\n"+ | |
1339 "O antigen prediction:\t"+O_choice+"\n"+ | |
1340 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+ | |
1341 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+ | |
1342 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1343 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1344 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype | |
1345 note+contamination_report+star_line+claim+"\n")#+## | |
1346 else: | |
1347 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n" | |
1348 star_line="" #04132019, for new output requirement, disable star_line if "N/A" in output | |
1349 new_file.write("Output_directory:\t"+make_dir+"\n"+ | |
1350 "Input files:\t"+"\t".join(input_file)+"\n"+ | |
1351 "O antigen prediction:\t"+O_choice+"\n"+ | |
1352 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+ | |
1353 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+ | |
1354 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1355 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1356 "Predicted serotype:\t"+predict_form+"\n"+ # ed_SL_09242019: add serotype output for "N/A" prediction | |
1357 note+NA_note+contamination_report+star_line+claim+"\n")#+## | |
1358 new_file.close() | |
1359 print("\n") | |
1360 #subprocess.check_call("cat Seqsero_result.txt",shell=True) | |
1361 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True) | |
1362 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True) | |
1363 if "N/A" not in predict_sero: | |
1364 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+## | |
1365 print("Output_directory:\t"+make_dir+"\n"+ | |
1366 "Input files:\t"+"\t".join(input_file)+"\n"+ | |
1367 "O antigen prediction:\t"+O_choice+"\n"+ | |
1368 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+ | |
1369 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+ | |
1370 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1371 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1372 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype | |
1373 note+contamination_report+star_line+claim+"\n")#+## | |
1374 else: | |
1375 print("Output_directory:\t"+make_dir+"\n"+ | |
1376 "Input files:\t"+"\t".join(input_file)+"\n"+ | |
1377 "O antigen prediction:\t"+O_choice+"\n"+ | |
1378 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+ | |
1379 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+ | |
1380 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1381 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1382 "Predicted serotype:\t"+predict_form+"\n"+ # ed_SL_09242019: add serotype output for "N/A" prediction | |
1383 note+NA_note+contamination_report+star_line+claim+"\n") | |
1384 else: | |
1385 print("Allele mode only supports raw-read data types, i.e. '-t 1', '-t 2' or '-t 3'; please use '-m k' instead") | |
1386 elif analysis_mode=="k": | |
1387 #ex_dir = os.path.dirname(os.path.realpath(__file__)) | |
1388 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: change ex_dir for packaging | |
1389 #output_mode = args.mode | |
1390 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath) | |
1391 input_file = for_fq #-k mode only uses the forward reads; the full read set is not needed for k-mer typing | |
1392 os.chdir(make_dir) | |
1393 f = open(dirpath + '/seqsero2_db/antigens.pickle', 'rb') | |
1394 lib_dict = pickle.load(f) | |
1395 f.close() #fixed: f.close without parentheses never actually closes the file | |
1396 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size) | |
1397 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks) | |
1398 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir) | |
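# K-mer workflow above: load the pre-built antigen k-mer library (antigens.pickle),
# derive k-mers (k_size=27) from the forward reads or the assembly, score them
# against the library to build the O/H/special dictionaries, then call the
# antigens from those scores (as the function names above suggest).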
1399 subspecies=judge_subspecies_Kmer(Special_dict) | |
1400 if subspecies=="IIb" or subspecies=="IIa": | |
1401 subspecies="II" | |
1402 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes( | |
1403 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies) | |
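# call_O_and_H_type() returns highest_O in the form "O-<group>" (e.g. "O-3,10"),
# so split('-')[1] passes just the group to the serotype lookup; the split('-')[-1]
# idiom below recovers the same group for reporting and falls back to '-' when
# no O antigen was called.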
1404 claim="" #claim is disabled under the new output requirement | |
1405 ## ed_SL_09272019: change for new output format | |
1406 #if star_line+claim=="": #0413, new output style | |
1407 # note="" | |
1408 #else: | |
1409 # note="Note:" | |
1410 if clean_mode: | |
1411 subprocess.check_call("rm -rf ../"+make_dir,shell=True) | |
1412 make_dir="no output directory (due to '-c' flag)" | |
1413 ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c" | |
1414 if highest_O.split('-')[-1]=="": | |
1415 O_choice="-" | |
1416 else: | |
1417 O_choice=highest_O.split('-')[-1] | |
1418 ### | |
1419 else: | |
1420 if highest_O.split('-')[-1]=="": | |
1421 O_choice="-" | |
1422 else: | |
1423 O_choice=highest_O.split('-')[-1] | |
1424 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero) | |
1425 new_file=open("SeqSero_result.txt","w") | |
1426 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+## | |
1427 if "N/A" not in predict_sero: | |
1428 new_file.write("Output_directory:\t"+make_dir+"\n"+ | |
1429 "Input files:\t"+input_file+"\n"+ | |
1430 "O antigen prediction:\t"+O_choice+"\n"+ | |
1431 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+ | |
1432 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+ | |
1433 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1434 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1435 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype | |
1436 note+star_line+claim+"\n")#+## | |
1437 else: | |
1438 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n" | |
1439 star_line = "" #changed for new output requirement, 04132019 | |
1440 new_file.write("Output_directory:\t"+make_dir+"\n"+ | |
1441 "Input files:\t"+input_file+"\n"+ | |
1442 "O antigen prediction:\t"+O_choice+"\n"+ | |
1443 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+ | |
1444 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+ | |
1445 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1446 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1447 "Predicted serotype:\t"+predict_form+"\n"+ # ed_SL_09242019: add serotype output for "N/A" prediction | |
1448 note+NA_note+star_line+claim+"\n")#+## | |
1449 new_file.close() | |
1450 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True) | |
1451 if "N/A" not in predict_sero: | |
1452 print("Output_directory:\t"+make_dir+"\n"+ | |
1453 "Input files:\t"+input_file+"\n"+ | |
1454 "O antigen prediction:\t"+O_choice+"\n"+ | |
1455 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+ | |
1456 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+ | |
1457 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1458 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1459 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype | |
1460 note+star_line+claim+"\n")#+## | |
1461 else: | |
1462 print("Output_directory:\t"+make_dir+"\n"+ | |
1463 "Input files:\t"+input_file+"\n"+ | |
1464 "O antigen prediction:\t"+O_choice+"\n"+ | |
1465 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+ | |
1466 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+ | |
1467 "Predicted subspecies:\t"+subspecies+"\n"+ | |
1468 "Predicted antigenic profile:\t"+predict_form+"\n"+ | |
1469 "Predicted serotype:\t"+predict_form+"\n"+ # ed_SL_09242019: add serotype output for "N/A" prediction | |
1470 note+NA_note+star_line+claim+"\n")#+## | |
1471 | |
1472 if __name__ == '__main__': | |
1473 main() |