0
|
1 #!/usr/bin/env python3
|
|
2
|
|
3 import os, sys, re, time, pprint, io, shutil
|
|
4 import argparse, subprocess
|
|
5
|
|
6 from cgecore.alignment import extended_cigar
|
|
7 from cgecore.blaster.blaster import Blaster
|
|
8 from cgecore.cgefinder import CGEFinder
|
|
9 import json, gzip
|
|
10 from tabulate import tabulate
|
|
11
|
|
12
|
|
def get_read_filename(infiles):
    ''' Infiles must be a list with 1 or 2 input files.

    Removes the path from the given string and removes the extensions:
    .fq .fastq .gz and .trim
    Extracts the common sample name if 2 files are given.
    '''
    # Remove common fastq extensions from the first file name
    seq_path = infiles[0]
    seq_file = os.path.basename(seq_path)
    for ext in (".fq", ".fastq", ".gz", ".trim"):
        seq_file = seq_file.replace(ext, "")

    if len(infiles) == 1:
        return seq_file.rstrip()

    # If two files are given get the common sample name
    # (the longest common prefix of the two basenames)
    sample_name = ""
    seq_file_2 = os.path.basename(infiles[1])
    # BUGFIX: zip stops at the shorter name; the original indexed
    # seq_file_2 with seq_file's range and could raise IndexError.
    for char_1, char_2 in zip(seq_file, seq_file_2):
        if char_1 == char_2:
            sample_name += char_1
        else:
            break

    if sample_name == "":
        # BUGFIX: sys.error does not exist; sys.exit aborts with the message.
        sys.exit("Input error: sample names of input files, {} and {}, \
does not share a common sample name. If these files \
are paired end reads from the same sample, please rename \
them with a common sample name (e.g. 's22_R1.fq', 's22_R2.fq') \
or input them seperately.".format(infiles[0], infiles[1]))

    # Strip trailing pair separators (e.g. "s22_" -> "s22")
    return sample_name.rstrip("-").rstrip("_")
|
|
45
|
|
def is_gzipped(file_path):
    ''' Returns True if file is gzipped and False otherwise.

    The result is inferred from the first two bytes of the file read
    from the input path. On unix systems these should be: 1f 8b.
    Theoretically there could be exceptions to this test but it is
    unlikely and impossible if the input files are otherwise expected
    to be encoded in utf-8.
    '''
    with open(file_path, mode='rb') as fh:
        magic = fh.read(2)
    return magic == b'\x1f\x8b'
|
|
61
|
|
def get_file_format(input_files):
    """
    Takes all input files and checks their first character to assess
    the file format. Returns one of the following strings; fasta, fastq,
    other or mixed. fasta and fastq indicates that all input files are
    of the same format, either fasta or fastq. other indicates that all
    files are neither fasta nor fastq files. mixed indicates that the
    input files are a mix of different file formats.
    """
    # Open all input files and get the first character
    file_format = []
    invalid_files = []
    for infile in input_files:
        # Read the first byte, transparently decompressing gzipped input
        opener = gzip.open if is_gzipped(infile) else open
        with opener(infile, "rb") as f:
            fst_char = f.read(1)

        # Assess the first character
        if fst_char == b"@":
            file_format.append("fastq")
        elif fst_char == b">":
            file_format.append("fasta")
        else:
            # BUGFIX: record the offending file, not the constant "other"
            invalid_files.append(infile)

    if not file_format:
        # BUGFIX: previously this case fell through to "mixed"; when no
        # file is fasta/fastq the documented return value is "other"
        return "other"
    if len(set(file_format)) != 1 or invalid_files:
        # BUGFIX: a mix of valid and invalid files is also "mixed"
        return "mixed"
    return file_format[0]
|
|
93
|
|
def import_profile(database, scheme, loci_list):
    """Import all possible allele profiles with corresponding st's
    for the scheme into a dict. The profiles are stored in a dict
    of dicts, to easily look up what st types are associated with
    a specific allele number of each loci.
    """
    n_loci = len(loci_list)
    # Open allele profile file from the database
    with open("{0}/{1}.txt.clean".format(database, scheme), "r") as profile_file:
        header = profile_file.readline().strip().split("\t")[1:n_loci + 1]

        # locus -> allele number -> list of ST names seen with that allele
        st_profiles = {locus: {} for locus in loci_list}

        # Each profile line: st name followed by one allele per locus
        for line in profile_file:
            fields = line.strip().split("\t")
            st_name = fields[0]
            alleles = fields[1:n_loci + 1]

            # Record the st under every locus/allele combination it contains
            for locus, allele in zip(header, alleles):
                st_profiles[locus].setdefault(allele, []).append(st_name)

    return st_profiles
|
|
127
|
|
def st_typing(st_profiles, allele_matches, loci_list):
    """
    Find the sequence type (ST) that best matches the called alleles.

    Parameters:
      st_profiles:    dict of dicts, locus -> allele no -> list of ST names
                      (as built by import_profile).
      allele_matches: dict, locus -> hit info; only the "allele" string
                      (possibly marked with '?', '*' and/or '!') and the
                      optional "alternative_hit" dict are read here.
      loci_list:      list of loci in the scheme.

    Returns a tuple (st, notes, nearest_sts):
      st:          best matching ST (with quality marks), or "Unknown".
      notes:       human-readable notes about imperfect/multiple hits.
      nearest_sts: comma-separated nearest STs when no perfect profile match.
    """
    st_hits = []   # every ST seen with one of the called alleles
    st_marks = []  # quality marks ('?', '*', '!') to transfer onto the ST
    note = ""      # per-locus notes

    # Check the quality of the allele hits
    for locus in allele_matches:
        allele = allele_matches[locus]["allele"]

        # Check if allele is marked as a non-perfect match. Save mark and write note.
        if "?*" in allele:
            note += "?* {}: Imperfect hit, ST can not be trusted!\n".format(locus)
            st_marks = ["?", "*"]
        elif "?" in allele:
            note += "? {}: Uncertain hit, ST can not be trusted.\n".format(locus)
            st_marks.append("?")
        elif "*" in allele:
            note += "* {}: Novel allele, ST may indicate nearest ST.\n".format(locus)
            st_marks.append("*")

        # Remove mark from allele so it can be used to look up nearest st types
        allele = allele.rstrip("*?!")

        # Get all STs that have the allele in their profile;
        # "None" is the sentinel for alleles unknown to the scheme
        st_hits += st_profiles[locus].get(allele, ["None"])

        if "alternative_hit" in allele_matches[locus] and allele_matches[locus]["alternative_hit"] != {}:
            note += "! {}: Multiple perfect hits found\n".format(locus)
            st_marks.append("!")
            for allele_name, hit_info in allele_matches[locus]["alternative_hit"].items():
                allele = hit_info["allele"].rstrip("!")
                st_hits += st_profiles[locus].get(allele, ["None"])

    # Save allele marks to be transferred to the ST
    st_mark = "".join(set(st_marks))
    notes = st_mark
    # Add marks information to notes
    if "!" in st_mark:
        notes += " alleles with multiple perfect hits found, multiple STs might be found\n"
    if "*" in st_mark and "?" in st_mark:
        notes += " alleles with less than 100% identity and 100% coverages found\n"
    elif st_mark == "*":
        notes = st_mark + " alleles with less than 100% identity found\n"
    elif st_mark == "?":
        notes = st_mark + " alleles with less than 100% coverage found\n"
    notes += note

    # Find the most frequent ST in st_hits
    st_hits_counter = {}
    max_count = 0
    best_hit = ""
    for hit in st_hits:
        # BUGFIX: was 'hit is not "None"' -- an identity comparison with a
        # string literal, which is implementation-dependent (and raises a
        # SyntaxWarning on modern Pythons). Use equality instead.
        if hit != "None":
            if hit in st_hits_counter:
                st_hits_counter[hit] += 1
            else:
                st_hits_counter[hit] = 1
            if max_count < st_hits_counter[hit]:
                max_count = st_hits_counter[hit]
                best_hit = hit

    # Check if the allele profile matched a known ST 100 %
    similarity = round(float(max_count)/len(loci_list)*100, 2)

    if similarity != 100:
        st = "Unknown"
        nearest_sts = []
        # If st is not perfect find the nearest st's (all with the top score)
        for st_hit, allele_score in sorted(st_hits_counter.items(), key=lambda x: x[1], reverse=True):
            if allele_score < max_count:
                break
            nearest_sts.append(st_hit)
        nearest_sts = ",".join(nearest_sts)
    else:
        # The allele profile has a perfect ST hit, but the marks given to the
        # alleles might still indicate imperfect hits; several STs may share
        # the same profile, so join them all, carrying the marks along.
        sts = [st for st, no in st_hits_counter.items() if no == max_count]
        st = "{},".format(st_mark).join(sts) + st_mark
        nearest_sts = ""

    return st, notes, nearest_sts
|
|
220
|
|
def make_aln(scheme, file_handle, allele_matches, query_aligns, homol_aligns, sbjct_aligns):
    """Write the template/homology/query alignment of every matched locus
    (including any alternative perfect hits) to the given file handle."""
    for locus, locus_info in allele_matches.items():
        allele_name = locus_info["allele_name"]
        # Loci without any hit have nothing to align
        if allele_name == "No hit found":
            continue

        hit_name = locus_info["hit_name"]
        seqs = [
            sbjct_aligns[scheme][hit_name],
            homol_aligns[scheme][hit_name],
            query_aligns[scheme][hit_name],
        ]
        write_align(seqs, allele_name, file_handle)

        # Also dump alignments for alternative perfect hits, if present
        for alt_name in locus_info.get("alternative_hit", {}):
            alt_hit = locus_info["alternative_hit"][alt_name]["hit_name"]
            alt_seqs = [
                sbjct_aligns[scheme][alt_hit],
                homol_aligns[scheme][alt_hit],
                query_aligns[scheme][alt_hit],
            ]
            write_align(alt_seqs, alt_name, file_handle)
|
|
246
|
|
def write_align(seq, seq_name, file_handle):
    """Write one alignment (template, homology, query strings in that
    order in *seq*) to *file_handle* in 60-column chunks."""
    file_handle.write("# {}".format(seq_name) + "\n")
    sbjct_seq, homol_seq, query_seq = seq[0], seq[1], seq[2]
    for start in range(0, len(sbjct_seq), 60):
        stop = start + 60
        file_handle.write("%-10s\t%s\n" % ("template:", sbjct_seq[start:stop]))
        file_handle.write("%-10s\t%s\n" % ("", homol_seq[start:stop]))
        file_handle.write("%-10s\t%s\n\n" % ("query:", query_seq[start:stop]))
|
|
256
|
|
def text_table(headers, rows, empty_replace='-'):
    ''' Render rows as a tabulate "simple" table framed by a '*' title
    banner on top and a '=' rule at the bottom.

    Empty cells are shown as *empty_replace*.
    '''
    # Substitute the placeholder for falsy cells
    filled = map(lambda row: map(lambda cell: cell if cell else empty_replace, row), rows)
    # Render the body with tabulate, then decorate it
    lines = tabulate(filled, headers, tablefmt='simple').split('\n')
    width = len(lines[0])
    banner = '*' * (width + 2)
    # Swap tabulate's horizontal rule for the banner style
    lines[1] = banner
    # Title banner on top, '=' rule at the bottom
    return ("%s\n" * 3) % (banner, '\n'.join(lines), '=' * (width + 2))
|
|
283
|
|
284
|
|
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="")
# Arguments
parser.add_argument("-i", "--infile",
                    help="FASTA or FASTQ files to do pMLST on.",
                    nargs="+", required=True)
parser.add_argument("-o", "--outdir",
                    help="Output directory.",
                    default=".")
parser.add_argument("-s", "--scheme",
                    help="scheme database used for pMLST prediction", required=True)
parser.add_argument("-p", "--database",
                    help="Directory containing the databases.", default="/database")
parser.add_argument("-t", "--tmp_dir",
                    help="Temporary directory for storage of the results\
                          from the external software.",
                    default="tmp_pMLST")
parser.add_argument("-mp", "--method_path",
                    help="Path to the method to use (kma or blastn)\
                          if assembled contigs are inputted the path to executable blastn should be given,\
                          if fastq files are given path to executable kma should be given")
parser.add_argument("-x", "--extented_output",
                    help="Give extented output with allignment files, template and query hits in fasta and\
                          a tab seperated file with allele profile results", action="store_true")
parser.add_argument("-q", "--quiet", action="store_true")


#parser.add_argument("-c", "--coverage",
#                    help="Minimum template coverage required", default = 0.6)
#parser.add_argument("-i", "--identity",
#                    help="Minimum template identity required", default = 0.9)
args = parser.parse_args()

# Quiet mode: silence all stdout output by redirecting it to /dev/null
# (the handle is closed again at the very end of the script)
if args.quiet:
    f = open(os.devnull, 'w')
    sys.stdout = f
|
|
320
|
|
321
|
|
#TODO what are the clonal complex data used for??

# TODO error handling
infile = args.infile
# Check that outdir is an existing dir...
outdir = os.path.abspath(args.outdir)
scheme = args.scheme
database = os.path.abspath(args.database)
tmp_dir = os.path.abspath(args.tmp_dir)
# Check if method path is executable
method_path = args.method_path
extented_output = args.extented_output

# Hard-coded alignment thresholds (the corresponding CLI flags are
# commented out above)
min_cov = 0.6  # args.coverage
threshold = 0.95  # args.identity

# Check file format of the input (fasta, fastq or other format)
file_format = get_file_format(infile)

# NOTE(review): the second format() argument (scheme) is ignored because
# there is only one "{}" placeholder -- possibly "{}/{}" was intended;
# confirm the expected KMA/BLAST database layout.
db_path = "{}/".format(database, scheme)

config_file = open(database + "/config", "r")

# Get profile_name from the config file and collect every scheme name
# so the requested scheme can be validated below
scheme_list = []
for line in config_file:
    if line.startswith("#"):
        continue
    line = line.split("\t")
    scheme_list.append(line[0])
    if line[0] == scheme:
        profile_name = line[1]

config_file.close()

if scheme not in scheme_list:
    sys.exit("{}, is not a valid scheme. \n\nPlease choose a scheme available in the database:\n{}".format(scheme, ", ".join(scheme_list)))

# Get the loci list from the allele profile file header
# (the first column is the ST itself)
with open("{0}/{1}.txt.clean".format(database, scheme), "r") as st_file:
    file_header = st_file.readline().strip().split("\t")
    loci_list = file_header[1:]
|
|
364
|
|
# Call appropriate method (kma or blastn) based on file format
if file_format == "fastq":
    # Default to "kma" on the PATH when no explicit binary was given
    if not method_path:
        method_path = "kma"
        if shutil.which(method_path) == None:
            sys.exit("No valid path to a kma program was provided. Use the -mp flag to provide the path.")
    # Check the number of files (single-end or paired-end reads)
    if len(infile) == 1:
        infile_1 = infile[0]
        infile_2 = None
    elif len(infile) == 2:
        infile_1 = infile[0]
        infile_2 = infile[1]
    else:
        sys.exit("Only 2 input file accepted for raw read data,\
                  if data from more runs is avaliable for the same\
                  sample, please concatinate the reads into two files")

    sample_name = get_read_filename(infile)
    method = "kma"

    # Call KMA
    method_obj = CGEFinder.kma(infile_1, outdir, [scheme], db_path, min_cov=min_cov,
                               threshold=threshold, kma_path=method_path, sample_name=sample_name,
                               inputfile_2=infile_2, kma_mrs=0.75, kma_gapopen=-5,
                               kma_gapextend=-1, kma_penalty=-3, kma_reward=1)

elif file_format == "fasta":
    # Default to "blastn" on the PATH when no explicit binary was given
    if not method_path:
        method_path = "blastn"
        if shutil.which(method_path) == None:
            sys.exit("No valid path to a blastn program was provided. Use the -mp flag to provide the path.")
    # Assert that only one fasta file is inputted
    assert len(infile) == 1, "Only one input file accepted for assembled data."
    infile = infile[0]
    method = "blast"

    # Call BLASTn
    method_obj = Blaster(infile, [scheme], db_path, tmp_dir,
                         min_cov, threshold, method_path, cut_off=False)
    #allewed_overlap=50)
else:
    sys.exit("Input file must be fastq or fasta format, not " + file_format)

# Alignment results and the three per-hit alignment string dicts
results = method_obj.results
query_aligns = method_obj.gene_align_query
homol_aligns = method_obj.gene_align_homo
sbjct_aligns = method_obj.gene_align_sbjct

# Check that the results dict is not empty
warning = ""
if results[scheme] == "No hit found":
    results[scheme] = {}
    warning = ("No MLST loci was found in the input data, "
               "make sure that the correct pMLST scheme was chosen.")
|
|
420
|
|
421
|
|
allele_matches = {}

# Get the found allele profile contained in the results dict
for hit, locus_hit in results[scheme].items():

    # Get allele number for locus: the subject header is expected to be
    # "<locus>_<allele>" or "<locus>-<allele>"
    allele_name = locus_hit["sbjct_header"]
    allele_obj = re.search("(\w+)[_|-](\w+$)", allele_name)

    # Get variables to later store in the results dict
    locus = allele_obj.group(1)
    allele = allele_obj.group(2)
    coverage = float(locus_hit["perc_coverage"])
    identity = float(locus_hit["perc_ident"])
    score = float(locus_hit["cal_score"])
    gaps = int(locus_hit["gaps"])
    align_len = locus_hit["HSP_length"]
    sbj_len = int(locus_hit["sbjct_length"])
    sbjct_seq = locus_hit["sbjct_string"]
    query_seq = locus_hit["query_string"]
    homol_seq = locus_hit["homo_string"]
    cigar = extended_cigar(sbjct_aligns[scheme][hit], query_aligns[scheme][hit])

    # Check for perfect hits (match_priority 1)
    if coverage == 100 and identity == 100:
        # If a perfect hit was already saved for this locus, the lookup
        # succeeds and this extra perfect hit is stored under
        # "alternative_hit"; the allele gets a '!' mark
        try:
            allele_matches[locus]["alternative_hit"][allele_name] = {"allele": allele + "!", "align_len": align_len, "sbj_len": sbj_len,
                                                                     "coverage": coverage, "identity": identity, "hit_name": hit}
            if allele_matches[locus]["allele"][-1] != "!":
                allele_matches[locus]["allele"] += "!"
        except KeyError:
            # Overwrite alleles already saved, save the perfect match
            allele_matches[locus] = {"score": score, "allele": allele, "coverage": coverage,
                                     "identity": identity, "match_priority": 1, "align_len": align_len,
                                     "gaps": gaps, "sbj_len": sbj_len, "allele_name": allele_name,
                                     "sbjct_seq": sbjct_seq, "query_seq": query_seq, "homol_seq": homol_seq,
                                     "hit_name": hit, "cigar": cigar, "alternative_hit": {}}
    else:
        # If no hit has yet been stored initialize dict variables that are looked up below
        if locus not in allele_matches:
            allele_matches[locus] = {"score": 0, "match_priority": 4}

        # We weight full coverage higher than perfect identity match:
        # priority 2 = full coverage / imperfect identity ('*'),
        # priority 3 = full identity / partial coverage ('?'),
        # priority 4 = neither ('?*'); lower number wins
        if coverage == 100 and identity != 100:
            # Check that a better (higher prioritized) 100% coverage hit has not been stored yet
            if allele_matches[locus]["match_priority"] > 2 or (allele_matches[locus]["match_priority"] == 2 and score > allele_matches[locus]["score"]):
                allele_matches[locus] = {"score": score, "allele": allele + "*", "coverage": coverage,
                                         "identity": identity, "match_priority": 2, "align_len": align_len,
                                         "gaps": gaps, "sbj_len": sbj_len, "allele_name": allele_name,
                                         "sbjct_seq": sbjct_seq, "query_seq": query_seq, "homol_seq": homol_seq,
                                         "hit_name": hit, "cigar": cigar}
        elif coverage != 100 and identity == 100:
            # Check that higher prioritized hit was not already stored
            if allele_matches[locus]["match_priority"] > 3 or (allele_matches[locus]["match_priority"] == 3 and score > allele_matches[locus]["score"]):
                allele_matches[locus] = {"score": score, "allele": allele + "?", "coverage": coverage,
                                         "identity": identity, "match_priority": 3, "align_len": align_len,
                                         "gaps": gaps, "sbj_len": sbj_len, "allele_name": allele_name,
                                         "sbjct_seq": sbjct_seq, "query_seq": query_seq, "homol_seq": homol_seq,
                                         "hit_name": hit, "cigar": cigar}
        else:  # coverage != 100 and identity != 100:
            if allele_matches[locus]["match_priority"] == 4 and score > allele_matches[locus]["score"]:
                allele_matches[locus] = {"score": score, "allele": allele + "?*", "coverage": coverage,
                                         "identity": identity, "match_priority": 4, "align_len": align_len,
                                         "gaps": gaps, "sbj_len": sbj_len, "allele_name": allele_name,
                                         "sbjct_seq": sbjct_seq, "query_seq": query_seq, "homol_seq": homol_seq,
                                         "hit_name": hit, "cigar": cigar}

# Fill in placeholder entries for scheme loci without any hit
for locus in loci_list:
    if locus not in allele_matches:
        allele_matches[locus] = {"identity": "", "coverage": "", "allele": "", "allele_name": "No hit found", "align_len": "", "gaps": "", "sbj_len": ""}
|
|
492
|
|
# Import all possible st profiles into dict
st_profiles = import_profile(database, scheme, loci_list)

# Find st or nearest sts
st, note, nearest_sts = st_typing(st_profiles, allele_matches, loci_list)

# Give warning about the pMLST scheme if no loci were found
if note == "" and warning != "":
    note = warning

# Set ST for incF: the "FAB formula" combines the replicon allele calls
# (FII-family, FIA, FIB) into one bracketed string instead of a numeric ST
if scheme.lower() == "incf":
    st = ["F", "A", "B"]
    # First slot: the FII-family replicon; only hits with 100% identity count
    if "FII" in allele_matches and allele_matches["FII"]["identity"] == 100.0:
        st[0] += allele_matches["FII"]["allele_name"].split("_")[-1]
    elif "FIC" in allele_matches and allele_matches["FIC"]["identity"] == 100.0:
        st[0] = "C" + allele_matches["FIC"]["allele_name"].split("_")[-1]
    elif "FIIK" in allele_matches and allele_matches["FIIK"]["identity"] == 100.0:
        st[0] = "K" + allele_matches["FIIK"]["allele_name"].split("_")[-1]
    elif "FIIS" in allele_matches and allele_matches["FIIS"]["identity"] == 100.0:
        st[0] = "S" + allele_matches["FIIS"]["allele_name"].split("_")[-1]
    elif "FIIY" in allele_matches and allele_matches["FIIY"]["identity"] == 100.0:
        st[0] = "Y" + allele_matches["FIIY"]["allele_name"].split("_")[-1]
    else:
        st[0] += "-"

    # Second slot: FIA allele (or '-' when absent/imperfect)
    if "FIA" in allele_matches and allele_matches["FIA"]["identity"] == 100.0:
        st[1] += allele_matches["FIA"]["allele_name"].split("_")[-1]
    else:
        st[1] += "-"

    # Third slot: FIB allele (or '-' when absent/imperfect)
    if "FIB" in allele_matches and allele_matches["FIB"]["identity"] == 100.0:
        st[2] += allele_matches["FIB"]["allele_name"].split("_")[-1]
    else:
        st[2] += "-"

    st = "[" + ":".join(st) + "]"
|
|
530
|
|
531
|
|
# Check if ST is associated with a clonal complex.
clpx = ""
if st != "Unknown" or nearest_sts != "":
    with open("{0}/{1}.clpx".format(database, scheme), "r") as clpx_file:
        for line in clpx_file:
            line = line.split("\t")
            # NOTE(review): st[0] is the first *character* of the st string;
            # comparing a single character against the whole first column
            # looks suspicious -- confirm whether 'st == line[0]' was intended.
            if st[0] == line[0] or nearest_sts == line[0]:
                clpx = line[1].strip()


# Get run info for JSON file
service = os.path.basename(__file__).replace(".py", "")
date = time.strftime("%d.%m.%Y")
# NOTE: this rebinding shadows the 'time' module for the rest of the script
time = time.strftime("%H:%M:%S")

# TODO find a system to show the database and service version using git

# Make JSON output file
data = {service: {}}
allele_results = {}
for locus, locus_info in allele_matches.items():
    # Keep only the reporting-relevant keys (plus any alternative hits)
    allele_results[locus] = {"identity": 0, "coverage": 0, "allele": [], "allele_name": [], "align_len": [], "gaps": 0, "sbj_len": []}
    for (key, value) in locus_info.items():
        if key in allele_results[locus] or (key == "alternative_hit" and value != {}):
            allele_results[locus][key] = value

userinput = {"filename": args.infile, "scheme": args.scheme, "profile": profile_name, "file_format": file_format}
run_info = {"date": date, "time": time}  # , "database":{"remote_db":remote_db, "last_commit_hash":head_hash}}
server_results = {"sequence_type": st, "allele_profile": allele_results,
                  "nearest_sts": nearest_sts, "clonal_complex": clpx, "notes": note}

data[service]["user_input"] = userinput
data[service]["run_info"] = run_info
data[service]["results"] = server_results

# Echo the full result structure to stdout (suppressed in quiet mode)
pprint.pprint(data)

# Save json output
result_file = "{}/data.json".format(outdir)
with open(result_file, "w") as outfile:
    json.dump(data, outfile)
|
|
573
|
|
if extented_output:
    # Define extented output files: tsv allele table, query/template fasta
    # and a human-readable results report
    table_filename = "{}/results_tab.tsv".format(outdir)
    query_filename = "{}/Hit_in_genome_seq.fsa".format(outdir)
    sbjct_filename = "{}/pMLST_allele_seq.fsa".format(outdir)
    result_filename = "{}/results.txt".format(outdir)
    table_file = open(table_filename, "w")
    query_file = open(query_filename, "w")
    sbjct_file = open(sbjct_filename, "w")
    result_file = open(result_filename, "w")

    # Make results file
    result_file.write("{0} Results\n\n".format(service))
    result_file.write("pMLST profile: {}\n\nSequence Type: {}\n".format(profile_name, st))
    # If ST is unknown report nearest ST
    if st == "Unknown" and nearest_sts != "":
        if len(nearest_sts.split(",")) == 1:
            result_file.write("Nearest ST: {}\n".format(nearest_sts))
        else:
            result_file.write("Nearest STs: {}\n".format(nearest_sts))

    # Report clonal complex if one was associated with ST:
    if clpx != "":
        result_file.write("Clonal complex: {}\n".format(clpx))

    # Write tsv table header
    table_header = ["Locus", "Identity", "Coverage", "Alignment Length", "Allele Length", "Gaps", "Allele"]
    table_file.write("\t".join(table_header) + "\n")
    rows = []
    for locus, allele_info in allele_matches.items():

        identity = str(allele_info["identity"])
        coverage = str(allele_info["coverage"])
        allele = allele_info["allele"]
        allele_name = allele_info["allele_name"]
        align_len = str(allele_info["align_len"])
        sbj_len = str(allele_info["sbj_len"])
        gaps = str(allele_info["gaps"])

        # Write alleles names with indications of imperfect hits
        if allele_name != "No hit found":
            allele_name_w_mark = locus + "_" + allele
        else:
            allele_name_w_mark = allele_name

        # Write allele results to tsv table
        row = [locus, identity, coverage, align_len, sbj_len, gaps, allele_name_w_mark]
        rows.append(row)
        if "alternative_hit" in allele_info:
            for allele_name, dic in allele_info["alternative_hit"].items():
                row = [locus, identity, coverage, str(dic["align_len"]), str(dic["sbj_len"]), "0", allele_name + "!"]
                rows.append(row)

        # NOTE(review): allele_name may have been rebound by the
        # alternative-hit loop just above -- confirm this check is meant
        # to see the locus' own allele_name
        if allele_name == "No hit found":
            continue

        # Write query fasta output
        hit_name = allele_info["hit_name"]
        query_seq = query_aligns[scheme][hit_name]
        sbjct_seq = sbjct_aligns[scheme][hit_name]
        homol_seq = homol_aligns[scheme][hit_name]

        if allele_info["match_priority"] == 1:
            match = "PERFECT MATCH"
        else:
            match = "WARNING"
        header = ">{}:{} ID:{}% COV:{}% Best_match:{}\n".format(locus, match, allele_info["identity"],
                                                                allele_info["coverage"], allele_info["allele_name"])
        query_file.write(header)
        for i in range(0, len(query_seq), 60):
            query_file.write(query_seq[i:i+60] + "\n")

        # Write template fasta output
        header = ">{}\n".format(allele_info["allele_name"])
        sbjct_file.write(header)
        for i in range(0, len(sbjct_seq), 60):
            sbjct_file.write(sbjct_seq[i:i+60] + "\n")

        # Repeat both fasta outputs for every alternative perfect hit
        if "alternative_hit" in allele_info:
            for allele_name in allele_info["alternative_hit"]:
                header = ">{}:{} ID:{}% COV:{}% Best_match:{}\n".format(locus, "PERFECT MATCH", 100,
                                                                       100, allele_name)
                hit_name = allele_info["alternative_hit"][allele_name]["hit_name"]
                query_seq = query_aligns[scheme][hit_name]
                sbjct_seq = sbjct_aligns[scheme][hit_name]
                homol_seq = homol_aligns[scheme][hit_name]
                query_file.write(header)
                for i in range(0, len(query_seq), 60):
                    query_file.write(query_seq[i:i+60] + "\n")

                # Write template fasta output
                header = ">{}\n".format(allele_name)
                sbjct_file.write(header)
                for i in range(0, len(sbjct_seq), 60):
                    sbjct_file.write(sbjct_seq[i:i+60] + "\n")

    # Write Allele profile results tables in results file and table file
    rows.sort(key=lambda x: x[0])
    result_file.write(text_table(table_header, rows))
    for row in rows:
        table_file.write("\t".join(row) + "\n")
    # Write any notes
    if note != "":
        result_file.write("\nNotes: {}\n\n".format(note))

    # Write allignment output
    result_file.write("\n\nExtended Output:\n\n")
    make_aln(scheme, result_file, allele_matches, query_aligns, homol_aligns, sbjct_aligns)

    # Close all files
    query_file.close()
    sbjct_file.close()
    table_file.close()
    result_file.close()

# Close the /dev/null handle opened for quiet mode
if args.quiet:
    f.close()
|