comparison pmlst/pmlst.py @ 0:cfab64885f66 draft default tip

Uploaded
author dcouvin
date Mon, 06 Sep 2021 18:27:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:cfab64885f66
1 #!/usr/bin/env python3
2
3 import os, sys, re, time, pprint, io, shutil
4 import argparse, subprocess
5
6 from cgecore.alignment import extended_cigar
7 from cgecore.blaster.blaster import Blaster
8 from cgecore.cgefinder import CGEFinder
9 import json, gzip
10 from tabulate import tabulate
11
12
13 def get_read_filename(infiles):
14 ''' Infiles must be a list with 1 or 2 input files.
15 Removes path from given string and removes extensions:
16 .fq .fastq .gz and .trim
17 extract the common sample name i 2 files are given.
18 '''
19 # Remove common fastq extensions
20 seq_path = infiles[0]
21 seq_file = os.path.basename(seq_path)
22 seq_file = seq_file.replace(".fq", "")
23 seq_file = seq_file.replace(".fastq", "")
24 seq_file = seq_file.replace(".gz", "")
25 seq_file = seq_file.replace(".trim", "")
26 if len(infiles) == 1:
27 return seq_file.rstrip()
28
29 # If two files are given get the common sample name
30 sample_name = ""
31 seq_file_2 = os.path.basename(infiles[1])
32 for i in range(len(seq_file)):
33 if seq_file_2[i] == seq_file[i]:
34 sample_name += seq_file[i]
35 else:
36 break
37 if sample_name == "":
38 sys.error("Input error: sample names of input files, {} and {}, \
39 does not share a common sample name. If these files \
40 are paired end reads from the same sample, please rename \
41 them with a common sample name (e.g. 's22_R1.fq', 's22_R2.fq') \
42 or input them seperately.".format(infiles[0], infiles[1]))
43
44 return sample_name.rstrip("-").rstrip("_")
45
46 def is_gzipped(file_path):
47 ''' Returns True if file is gzipped and False otherwise.
48 The result is inferred from the first two bits in the file read
49 from the input path.
50 On unix systems this should be: 1f 8b
51 Theoretically there could be exceptions to this test but it is
52 unlikely and impossible if the input files are otherwise expected
53 to be encoded in utf-8.
54 '''
55 with open(file_path, mode='rb') as fh:
56 bit_start = fh.read(2)
57 if(bit_start == b'\x1f\x8b'):
58 return True
59 else:
60 return False
61
62 def get_file_format(input_files):
63 """
64 Takes all input files and checks their first character to assess
65 the file format. Returns one of the following strings; fasta, fastq,
66 other or mixed. fasta and fastq indicates that all input files are
67 of the same format, either fasta or fastq. other indiates that all
68 files are not fasta nor fastq files. mixed indicates that the inputfiles
69 are a mix of different file formats.
70 """
71
72 # Open all input files and get the first character
73 file_format = []
74 invalid_files = []
75 for infile in input_files:
76 if is_gzipped(infile):#[-3:] == ".gz":
77 f = gzip.open(infile, "rb")
78 fst_char = f.read(1);
79 else:
80 f = open(infile, "rb")
81 fst_char = f.read(1);
82 f.close()
83 # Assess the first character
84 if fst_char == b"@":
85 file_format.append("fastq")
86 elif fst_char == b">":
87 file_format.append("fasta")
88 else:
89 invalid_files.append("other")
90 if len(set(file_format)) != 1:
91 return "mixed"
92 return ",".join(set(file_format))
93
94 def import_profile(database, scheme, loci_list):
95 """Import all possible allele profiles with corresponding st's
96 for the scheme into a dict. The profiles are stored in a dict
97 of dicts, to easily look up what st types are accosiated with
98 a specific allele number of each loci.
99 """
100 # Open allele profile file from databaseloci
101 profile_file = open("{0}/{1}.txt.clean".format(database, scheme), "r")
102 profile_header = profile_file.readline().strip().split("\t")[1:len(loci_list)+1]
103
104 # Create dict for looking up st-types with locus/allele combinations
105 st_profiles = {}
106 # For each locus initate make an inner dict to store allele and st's
107 for locus in loci_list:
108 st_profiles[locus] = {}
109
110 # Fill inner dict with allele no as key and st-types seen with the allele as value
111 for line in profile_file:
112 profile = line.strip().split("\t")
113 st_name = profile[0]
114 allele_list = profile[1:len(loci_list)+1]
115
116 # Go through all allele profiles. Save locus-allele combination with the st-type
117 for i in range(len(allele_list)):
118 allele = allele_list[i]
119 locus = profile_header[i]
120 if allele in st_profiles[locus]:
121 st_profiles[locus][allele] += [st_name]
122 else:
123 st_profiles[locus][allele] = [st_name]
124 profile_file.close()
125
126 return st_profiles
127
128 def st_typing(st_profiles, allele_matches, loci_list):
129 """
130 Takes the path to a dictionary, the inp list of the allele
131 number that each loci has been assigned, and an output file string
132 where the found st type and similaity is written into it.
133 """
134
135 # Find best ST type for all allele profiles
136 st_output = ""
137 note = ""
138
139 # First line contains matrix column headers, which are the specific loci
140 st_hits = []
141 st_marks = []
142 note = ""
143
144 # Check the quality of the alle hits
145 for locus in allele_matches:
146 allele = allele_matches[locus]["allele"]
147
148 # Check if allele is marked as a non-perfect match. Save mark and write note.
149 if "?*" in allele:
150 note += "?* {}: Imperfect hit, ST can not be trusted!\n".format(locus)
151 st_marks = ["?","*"]
152 elif "?" in allele:
153 note += "? {}: Uncertain hit, ST can not be trusted.\n".format(locus)
154 st_marks.append("?")
155 elif "*" in allele:
156 note += "* {}: Novel allele, ST may indicate nearest ST.\n".format(locus)
157 st_marks.append("*")
158
159 # Remove mark from allele so it can be used to look up nearest st types
160 allele = allele.rstrip("*?!")
161
162 # Get all st's that have the alleles in it's allele profile
163 st_hits += st_profiles[locus].get(allele, ["None"])
164 if "alternative_hit" in allele_matches[locus] and allele_matches[locus]["alternative_hit"] != {}:
165 note += "! {}: Multiple perfect hits found\n".format(locus)
166 st_marks.append("!")
167 for allele_name, hit_info in allele_matches[locus]["alternative_hit"].items():
168 allele = hit_info["allele"].rstrip("!")
169 st_hits += st_profiles[locus].get(allele, ["None"])
170
171 # Save allele marks to be transfered to the ST
172 st_mark = "".join(set(st_marks))
173 notes = st_mark
174 # Add marks information to notes
175 if "!" in st_mark:
176 notes += " alleles with multiple perfect hits found, multiple STs might be found\n"
177 if "*" in st_mark and "?" in st_mark:
178 notes += " alleles with less than 100% identity and 100% coverages found\n"
179 elif st_mark == "*":
180 notes = st_mark + " alleles with less than 100% identity found\n"
181 elif st_mark == "?":
182 notes = st_mark + " alleles with less than 100% coverage found\n"
183 notes += note
184
185 # Find most frequent st in st_hits
186 st_hits_counter = {}
187 max_count = 0
188 best_hit = ""
189 for hit in st_hits:
190 if hit is not "None":
191 if hit in st_hits_counter:
192 st_hits_counter[hit] += 1
193 else:
194 st_hits_counter[hit] = 1
195 if max_count < st_hits_counter[hit]:
196 max_count = st_hits_counter[hit]
197 best_hit = hit
198
199 # Check if allele profile match found st 100 %
200 similarity = round(float(max_count)/len(loci_list)*100, 2)
201
202 if similarity != 100:
203 st = "Unknown"
204 nearest_sts = []
205 # If st is not perfect find nearest st's
206 for st_hit, allele_score in sorted(st_hits_counter.items(), key=lambda x: x[1], reverse=True):
207 if allele_score < max_count:
208 break
209 nearest_sts.append(st_hit)
210 nearest_sts = ",".join(nearest_sts) #+ st_mark
211 else:
212 # allele profile has a perfect ST hit but the st marks given to the alleles might indicate imperfect hits
213 sts = [st for st, no in st_hits_counter.items() if no == max_count]
214 #if len(sts) > 1:
215 st = "{},".format(st_mark).join(sts) + st_mark
216 #st = best_hit + st_mark
217 nearest_sts = ""
218
219 return st, notes, nearest_sts
220
221 def make_aln(scheme, file_handle, allele_matches, query_aligns, homol_aligns, sbjct_aligns):
222 for locus, locus_info in allele_matches.items():
223 allele_name = locus_info["allele_name"]
224 if allele_name == "No hit found":
225 continue
226 hit_name = locus_info["hit_name"]
227
228 seqs = ["","",""]
229 seqs[0] = sbjct_aligns[scheme][hit_name]
230 seqs[1] = homol_aligns[scheme][hit_name]
231 seqs[2] = query_aligns[scheme][hit_name]
232
233 write_align(seqs, allele_name, file_handle)
234
235
236 # write alternative seq
237 if "alternative_hit" in locus_info:
238 for allele_name in locus_info["alternative_hit"]:
239 hit_name = locus_info["alternative_hit"][allele_name]["hit_name"]
240 seqs = ["","",""]
241 seqs[0] = sbjct_aligns[scheme][hit_name]
242 seqs[1] = homol_aligns[scheme][hit_name]
243 seqs[2] = query_aligns[scheme][hit_name]
244
245 write_align(seqs, allele_name, file_handle)
246
247 def write_align(seq, seq_name, file_handle):
248 file_handle.write("# {}".format(seq_name) + "\n")
249 sbjct_seq = seq[0]
250 homol_seq = seq[1]
251 query_seq = seq[2]
252 for i in range(0,len(sbjct_seq),60):
253 file_handle.write("%-10s\t%s\n"%("template:", sbjct_seq[i:i+60]))
254 file_handle.write("%-10s\t%s\n"%("", homol_seq[i:i+60]))
255 file_handle.write("%-10s\t%s\n\n"%("query:", query_seq[i:i+60]))
256
257 def text_table(headers, rows, empty_replace='-'):
258 ''' Create text table
259
260 USAGE:
261 >>> from tabulate import tabulate
262 >>> headers = ['A','B']
263 >>> rows = [[1,2],[3,4]]
264 >>> print(text_table(headers, rows))
265 **********
266 A B
267 **********
268 1 2
269 3 4
270 ==========
271 '''
272 # Replace empty cells with placeholder
273 rows = map(lambda row: map(lambda x: x if x else empty_replace, row), rows)
274 # Create table
275 table = tabulate(rows, headers, tablefmt='simple').split('\n')
276 # Prepare title injection
277 width = len(table[0])
278 # Switch horisontal line
279 table[1] = '*'*(width+2)
280 # Update table with title
281 table = ("%s\n"*3)%('*'*(width+2), '\n'.join(table), '='*(width+2))
282 return table
283
284
285 parser = argparse.ArgumentParser(description="")
286 # Arguments
287 parser.add_argument("-i", "--infile",
288 help="FASTA or FASTQ files to do pMLST on.",
289 nargs="+", required=True)
290 parser.add_argument("-o", "--outdir",
291 help="Output directory.",
292 default=".")
293 parser.add_argument("-s", "--scheme",
294 help="scheme database used for pMLST prediction", required=True)
295 parser.add_argument("-p", "--database",
296 help="Directory containing the databases.", default="/database")
297 parser.add_argument("-t", "--tmp_dir",
298 help="Temporary directory for storage of the results\
299 from the external software.",
300 default="tmp_pMLST")
301 parser.add_argument("-mp", "--method_path",
302 help="Path to the method to use (kma or blastn)\
303 if assembled contigs are inputted the path to executable blastn should be given,\
304 if fastq files are given path to executable kma should be given")
305 parser.add_argument("-x", "--extented_output",
306 help="Give extented output with allignment files, template and query hits in fasta and\
307 a tab seperated file with allele profile results", action="store_true")
308 parser.add_argument("-q", "--quiet", action="store_true")
309
310
311 #parser.add_argument("-c", "--coverage",
312 # help="Minimum template coverage required", default = 0.6)
313 #parser.add_argument("-i", "--identity",
314 # help="Minimum template identity required", default = 0.9)
315 args = parser.parse_args()
316
317 if args.quiet:
318 f = open(os.devnull, 'w')
319 sys.stdout = f
320
321
322 #TODO what are the clonal complex data used for??
323
324 # TODO error handling
325 infile = args.infile
326 # Check that outdir is an existing dir...
327 outdir = os.path.abspath(args.outdir)
328 scheme = args.scheme
329 database = os.path.abspath(args.database)
330 tmp_dir = os.path.abspath(args.tmp_dir)
331 # Check if method path is executable
332 method_path = args.method_path
333 extented_output = args.extented_output
334
335 min_cov = 0.6 # args.coverage
336 threshold = 0.95 # args.identity
337
338 # Check file format (fasta, fastq or other format)
339 file_format = get_file_format(infile)
340
341 db_path = "{}/".format(database, scheme)
342
343 config_file = open(database + "/config","r")
344
345 # Get profile_name from config file
346 scheme_list = []
347 for line in config_file:
348 if line.startswith("#"):
349 continue
350 line = line.split("\t")
351 scheme_list.append(line[0])
352 if line[0] == scheme:
353 profile_name = line[1]
354
355 config_file.close()
356
357 if scheme not in scheme_list:
358 sys.exit("{}, is not a valid scheme. \n\nPlease choose a scheme available in the database:\n{}".format(scheme, ", ".join(scheme_list)))
359
360 # Get loci list from allele profile file
361 with open("{0}/{1}.txt.clean".format(database, scheme), "r") as st_file:
362 file_header = st_file.readline().strip().split("\t")
363 loci_list = file_header[1:]
364
365 # Call appropriate method (kma or blastn) based on file format
366 if file_format == "fastq":
367 if not method_path:
368 method_path = "kma"
369 if shutil.which(method_path) == None:
370 sys.exit("No valid path to a kma program was provided. Use the -mp flag to provide the path.")
371 # Check the number of files
372 if len(infile) == 1:
373 infile_1 = infile[0]
374 infile_2 = None
375 elif len(infile) == 2:
376 infile_1 = infile[0]
377 infile_2 = infile[1]
378 else:
379 sys.exit("Only 2 input file accepted for raw read data,\
380 if data from more runs is avaliable for the same\
381 sample, please concatinate the reads into two files")
382
383 sample_name = get_read_filename(infile)
384 method = "kma"
385
386 # Call KMA
387 method_obj = CGEFinder.kma(infile_1, outdir, [scheme], db_path, min_cov=min_cov,
388 threshold=threshold, kma_path=method_path, sample_name=sample_name,
389 inputfile_2=infile_2, kma_mrs=0.75, kma_gapopen=-5,
390 kma_gapextend=-1, kma_penalty=-3, kma_reward=1)
391
392 elif file_format == "fasta":
393 if not method_path:
394 method_path = "blastn"
395 if shutil.which(method_path) == None:
396 sys.exit("No valid path to a blastn program was provided. Use the -mp flag to provide the path.")
397 # Assert that only one fasta file is inputted
398 assert len(infile) == 1, "Only one input file accepted for assembled data."
399 infile = infile[0]
400 method = "blast"
401
402 # Call BLASTn
403 method_obj = Blaster(infile, [scheme], db_path, tmp_dir,
404 min_cov, threshold, method_path, cut_off=False)
405 #allewed_overlap=50)
406 else:
407 sys.exit("Input file must be fastq or fasta format, not "+ file_format)
408
409 results = method_obj.results
410 query_aligns = method_obj.gene_align_query
411 homol_aligns = method_obj.gene_align_homo
412 sbjct_aligns = method_obj.gene_align_sbjct
413
414 # Check that the results dict is not empty
415 warning = ""
416 if results[scheme] == "No hit found":
417 results[scheme] = {}
418 warning = ("No MLST loci was found in the input data, "
419 "make sure that the correct pMLST scheme was chosen.")
420
421
422 allele_matches = {}
423
424 # Get the found allele profile contained in the results dict
425 for hit, locus_hit in results[scheme].items():
426
427 # Get allele number for locus
428 allele_name = locus_hit["sbjct_header"]
429 allele_obj = re.search("(\w+)[_|-](\w+$)", allele_name)
430
431 # Get variable to later storage in the results dict
432 locus = allele_obj.group(1)
433 allele = allele_obj.group(2)
434 coverage = float(locus_hit["perc_coverage"])
435 identity = float(locus_hit["perc_ident"])
436 score = float(locus_hit["cal_score"])
437 gaps = int(locus_hit["gaps"])
438 align_len = locus_hit["HSP_length"]
439 sbj_len = int(locus_hit["sbjct_length"])
440 sbjct_seq = locus_hit["sbjct_string"]
441 query_seq = locus_hit["query_string"]
442 homol_seq = locus_hit["homo_string"]
443 cigar = extended_cigar(sbjct_aligns[scheme][hit], query_aligns[scheme][hit])
444
445 # Check for perfect hits
446 if coverage == 100 and identity == 100:
447 # If a perfect hit was already found the list more_perfect hits will exist this new hit is appended to this list
448 try:
449 allele_matches[locus]["alternative_hit"][allele_name] = {"allele":allele+"!", "align_len":align_len, "sbj_len":sbj_len,
450 "coverage":coverage, "identity":identity, "hit_name":hit}
451 if allele_matches[locus]["allele"][-1] != "!":
452 allele_matches[locus]["allele"] += "!"
453 except KeyError:
454 # Overwrite alleles already saved, save the perfect match and break to go to next locus
455 allele_matches[locus] = {"score":score, "allele":allele, "coverage":coverage,
456 "identity":identity, "match_priority": 1, "align_len":align_len,
457 "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
458 "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
459 "hit_name":hit, "cigar":cigar, "alternative_hit":{}}
460 else:
461 # If no hit has yet been stored initialize dict variables that are looked up below
462 if locus not in allele_matches:
463 allele_matches[locus] = {"score":0, "match_priority": 4}
464
465 # We weight full coverage higher than perfect identity match
466 if coverage == 100 and identity != 100:
467 # Check that better (higher prioritized) 100% coverage hit has not been stored yet
468 if allele_matches[locus]["match_priority"] > 2 or (allele_matches[locus]["match_priority"] == 2 and score > allele_matches[locus]["score"]):
469 allele_matches[locus] = {"score":score, "allele":allele+"*", "coverage":coverage,
470 "identity":identity, "match_priority": 2, "align_len":align_len,
471 "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
472 "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
473 "hit_name":hit, "cigar":cigar}
474 elif coverage != 100 and identity == 100:
475 # Check that higher prioritized hit was not already stored
476 if allele_matches[locus]["match_priority"] > 3 or (allele_matches[locus]["match_priority"] == 3 and score > allele_matches[locus]["score"]):
477 allele_matches[locus] = {"score":score, "allele":allele + "?", "coverage":coverage,
478 "identity":identity, "match_priority": 3, "align_len":align_len,
479 "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
480 "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
481 "hit_name":hit, "cigar":cigar}
482 else: # coverage != 100 and identity != 100:
483 if allele_matches[locus]["match_priority"] == 4 and score > allele_matches[locus]["score"]:
484 allele_matches[locus] = {"score":score, "allele":allele + "?*", "coverage":coverage,
485 "identity":identity, "match_priority": 4, "align_len":align_len,
486 "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
487 "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
488 "hit_name":hit, "cigar":cigar}
489 for locus in loci_list:
490 if locus not in allele_matches:
491 allele_matches[locus] = {"identity":"", "coverage":"", "allele":"", "allele_name":"No hit found", "align_len":"", "gaps":"", "sbj_len":""}
492
493 # Import all possible st profiles into dict
494 st_profiles = import_profile(database, scheme,loci_list)
495
496 # Find st or neatest sts
497 st, note, nearest_sts = st_typing(st_profiles, allele_matches, loci_list)
498
499 # Give warning of mlst schene if no loci were found
500 if note == "" and warning != "":
501 note = warning
502
503 # Set ST for incF
504 if scheme.lower() == "incf":
505 st = ["F","A", "B"]
506 if "FII" in allele_matches and allele_matches["FII"]["identity"] == 100.0:
507 st[0] += allele_matches["FII"]["allele_name"].split("_")[-1]
508 elif "FIC" in allele_matches and allele_matches["FIC"]["identity"] == 100.0:
509 st[0] = "C" + allele_matches["FIC"]["allele_name"].split("_")[-1]
510 elif "FIIK" in allele_matches and allele_matches["FIIK"]["identity"] == 100.0:
511 st[0] = "K" + allele_matches["FIIK"]["allele_name"].split("_")[-1]
512 elif "FIIS" in allele_matches and allele_matches["FIIS"]["identity"] == 100.0:
513 st[0] = "S" + allele_matches["FIIS"]["allele_name"].split("_")[-1]
514 elif "FIIY" in allele_matches and allele_matches["FIIY"]["identity"] == 100.0:
515 st[0] = "Y" + allele_matches["FIIY"]["allele_name"].split("_")[-1]
516 else:
517 st[0] += "-"
518
519 if "FIA" in allele_matches and allele_matches["FIA"]["identity"] == 100.0:
520 st[1] += allele_matches["FIA"]["allele_name"].split("_")[-1]
521 else:
522 st[1] += "-"
523
524 if "FIB" in allele_matches and allele_matches["FIB"]["identity"] == 100.0:
525 st[2] += allele_matches["FIB"]["allele_name"].split("_")[-1]
526 else:
527 st[2] += "-"
528
529 st = "["+":".join(st)+"]"
530
531
532 # Check if ST is associated with a clonal complex.
533 clpx = ""
534 if st != "Unknown" or nearest_sts != "":
535 with open("{0}/{1}.clpx".format(database,scheme), "r") as clpx_file:
536 for line in clpx_file:
537 line = line.split("\t")
538 if st[0] == line[0] or nearest_sts == line[0]:
539 clpx = line[1].strip()
540
541
542 # Get run info for JSON file
543 service = os.path.basename(__file__).replace(".py", "")
544 date = time.strftime("%d.%m.%Y")
545 time = time.strftime("%H:%M:%S")
546
547 # TODO find a system to show the database and service version using git
548
549 # Make JSON output file
550 data = {service:{}}
551 allele_results = {}
552 for locus, locus_info in allele_matches.items():
553 allele_results[locus] = {"identity":0, "coverage":0, "allele":[], "allele_name":[], "align_len":[], "gaps":0, "sbj_len":[]}
554 for (key, value) in locus_info.items():
555 if key in allele_results[locus] or (key == "alternative_hit" and value != {}):
556 allele_results[locus][key] = value
557
558 userinput = {"filename":args.infile, "scheme":args.scheme, "profile":profile_name,"file_format":file_format}
559 run_info = {"date":date, "time":time}#, "database":{"remote_db":remote_db, "last_commit_hash":head_hash}}
560 server_results = {"sequence_type":st, "allele_profile": allele_results,
561 "nearest_sts":nearest_sts,"clonal_complex":clpx, "notes":note}
562
563 data[service]["user_input"] = userinput
564 data[service]["run_info"] = run_info
565 data[service]["results"] = server_results
566
567 pprint.pprint(data)
568
569 # Save json output
570 result_file = "{}/data.json".format(outdir)
571 with open(result_file, "w") as outfile:
572 json.dump(data, outfile)
573
574 if extented_output:
575 # Define extented output
576 table_filename = "{}/results_tab.tsv".format(outdir)
577 query_filename = "{}/Hit_in_genome_seq.fsa".format(outdir)
578 sbjct_filename = "{}/pMLST_allele_seq.fsa".format(outdir)
579 result_filename = "{}/results.txt".format(outdir)
580 table_file = open(table_filename, "w")
581 query_file = open(query_filename, "w")
582 sbjct_file = open(sbjct_filename, "w")
583 result_file = open(result_filename, "w")
584
585 # Make results file
586 result_file.write("{0} Results\n\n".format(service))
587 result_file.write("pMLST profile: {}\n\nSequence Type: {}\n".format(profile_name, st))
588 # If ST is unknown report nearest ST
589 if st == "Unknown" and nearest_sts != "":
590 if len(nearest_sts.split(",")) == 1:
591 result_file.write("Nearest ST: {}\n".format(nearest_sts))
592 else:
593 result_file.write("Nearest STs: {}\n".format(nearest_sts))
594
595 # Report clonal complex if one was associated with ST:
596 if clpx != "":
597 result_file.write("Clonal complex: {}\n".format(clpx))
598
599 # Write tsv table header
600 table_header = ["Locus", "Identity", "Coverage", "Alignment Length", "Allele Length", "Gaps", "Allele"]
601 table_file.write("\t".join(table_header) + "\n")
602 rows = []
603 for locus, allele_info in allele_matches.items():
604
605 identity = str(allele_info["identity"])
606 coverage = str(allele_info["coverage"])
607 allele = allele_info["allele"]
608 allele_name = allele_info["allele_name"]
609 align_len = str(allele_info["align_len"])
610 sbj_len = str(allele_info["sbj_len"])
611 gaps = str(allele_info["gaps"])
612
613 # Write alleles names with indications of imperfect hits
614 if allele_name != "No hit found":
615 allele_name_w_mark = locus + "_" + allele
616 else:
617 allele_name_w_mark = allele_name
618
619 # Write allele results to tsv table
620 row = [locus, identity, coverage, align_len, sbj_len, gaps, allele_name_w_mark]
621 rows.append(row)
622 if "alternative_hit" in allele_info:
623 for allele_name, dic in allele_info["alternative_hit"].items():
624 row = [locus, identity, coverage, str(dic["align_len"]), str(dic["sbj_len"]), "0", allele_name + "!"]
625 rows.append(row)
626 #
627
628 if allele_name == "No hit found":
629 continue
630
631 # Write query fasta output
632 hit_name = allele_info["hit_name"]
633 query_seq = query_aligns[scheme][hit_name]
634 sbjct_seq = sbjct_aligns[scheme][hit_name]
635 homol_seq = homol_aligns[scheme][hit_name]
636
637 if allele_info["match_priority"] == 1:
638 match = "PERFECT MATCH"
639 else:
640 match = "WARNING"
641 header = ">{}:{} ID:{}% COV:{}% Best_match:{}\n".format(locus, match, allele_info["identity"],
642 allele_info["coverage"], allele_info["allele_name"])
643 query_file.write(header)
644 for i in range(0,len(query_seq),60):
645 query_file.write(query_seq[i:i+60] + "\n")
646
647 # Write template fasta output
648 header = ">{}\n".format(allele_info["allele_name"])
649 sbjct_file.write(header)
650 for i in range(0,len(sbjct_seq),60):
651 sbjct_file.write(sbjct_seq[i:i+60] + "\n")
652
653 if "alternative_hit" in allele_info:
654 for allele_name in allele_info["alternative_hit"]:
655 header = ">{}:{} ID:{}% COV:{}% Best_match:{}\n".format(locus, "PERFECT MATCH", 100,
656 100, allele_name)
657 hit_name = allele_info["alternative_hit"][allele_name]["hit_name"]
658 query_seq = query_aligns[scheme][hit_name]
659 sbjct_seq = sbjct_aligns[scheme][hit_name]
660 homol_seq = homol_aligns[scheme][hit_name]
661 query_file.write(header)
662 for i in range(0,len(query_seq),60):
663 query_file.write(query_seq[i:i+60] + "\n")
664
665 # Write template fasta output
666 header = ">{}\n".format(allele_name)
667 sbjct_file.write(header)
668 for i in range(0,len(sbjct_seq),60):
669 sbjct_file.write(sbjct_seq[i:i+60] + "\n")
670
671 # Write Allele profile results tables in results file and table file
672 rows.sort(key=lambda x: x[0])
673 result_file.write(text_table(table_header, rows))
674 for row in rows:
675 table_file.write("\t".join(row) + "\n")
676 # Write any notes
677 if note != "":
678 result_file.write("\nNotes: {}\n\n".format(note))
679
680 # Write allignment output
681 result_file.write("\n\nExtended Output:\n\n")
682 make_aln(scheme, result_file, allele_matches, query_aligns, homol_aligns, sbjct_aligns)
683
684 # Close all files
685 query_file.close()
686 sbjct_file.close()
687 table_file.close()
688 result_file.close()
689
690 if args.quiet:
691 f.close()