comparison resfinder/cge/pointfinder.py @ 0:a16d245332d6 draft default tip

Uploaded
author dcouvin
date Wed, 08 Dec 2021 01:46:07 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a16d245332d6
1 #!/usr/bin/env python3
2 #
3 # Program: PointFinder-3.0
4 # Author: Camilla Hundahl Johnsen
5 #
6 # Dependencies: KMA or NCBI-blast together with BioPython.
7
8 import os
9 import re
10 import sys
11 import math
12 import argparse
13 import subprocess
14 import random
15
16 from cgecore.blaster import Blaster
17 from cgecore.cgefinder import CGEFinder
18 from .output.table import TableResults
19 from .phenotype2genotype.feature import ResMutation
20 from .phenotype2genotype.res_profile import PhenoDB
21
22
23 def eprint(*args, **kwargs):
24 print(*args, file=sys.stderr, **kwargs)
25
26
27 class GeneListError(Exception):
28 """ Raise when a specified gene is not found within the gene list.
29 """
30 def __init__(self, message, *args):
31 self.message = message
32 # allow users initialize misc. arguments as any other builtin Error
33 super(PanelNameError, self).__init__(message, *args)
34
35
36 class PointFinder(CGEFinder):
37
38 def __init__(self, db_path, species, gene_list=None):
39 """
40 """
41 self.species = species
42 self.specie_path = db_path
43 self.RNA_gene_list = []
44
45 self.gene_list = PointFinder.get_file_content(
46 self.specie_path + "/genes.txt")
47
48 if os.path.isfile(self.specie_path + "/RNA_genes.txt"):
49 self.RNA_gene_list = PointFinder.get_file_content(
50 self.specie_path + "/RNA_genes.txt")
51
52 # Creat user defined gene_list if given
53 if(gene_list is not None):
54 self.gene_list = get_user_defined_gene_list(gene_list)
55
56 self.known_mutations, self.drug_genes, self.known_stop_codon = (
57 self.get_db_mutations(self.specie_path + "/resistens-overview.txt",
58 self.gene_list))
59
60 def get_user_defined_gene_list(self, gene_list):
61 genes_specified = []
62 for gene in gene_list:
63 # Check that the genes are valid
64 if gene not in self.gene_list:
65 raise(GeneListError(
66 "Input Error: Specified gene not recognised "
67 "(%s)\nChoose one or more of the following genes:"
68 "\n%s" % (gene, "\n".join(self.gene_list))))
69 genes_specified.append(gene)
70 # Change the gene_list to the user defined gene_list
71 return genes_specified
72
73 #DELETE
74 def old_results_to_standard_output(self, result, software, version,
75 run_date, run_cmd, id,
76 unknown_mut=False, tableresults=None):
77 """
78 """
79 std_results = TableResults(software, version, run_date, run_cmd, id)
80 headers = [
81 "template_name",
82 "template_length",
83 "aln_length",
84 "aln_identity",
85 "aln_gaps",
86 "aln_template_string",
87 "aln_query_string",
88 "aln_homology_string",
89 "query_id",
90 "query_start_pos",
91 "query_end_pos",
92 "template_variant",
93 "query_depth",
94 "blast_eval",
95 "blast_bitscore",
96 "pval",
97 "software_score",
98 "mutation",
99 "ref_codon",
100 "alt_codon",
101 "ref_aa",
102 "alt_aa",
103 "insertion",
104 "deletion"
105 ]
106 for db_name, db in result.items():
107 if(db_name == "excluded"):
108 continue
109
110 if(db == "No hit found"):
111 continue
112
113 ####Added to solve PointFinder####
114 if(isinstance(db, str)):
115 if db.startswith("Gene found with coverage"):
116 continue
117 ####
118
119 # Start writing output string (to HTML tab file)
120 gene_name = PhenoDB.if_promoter_rename(db_name)
121
122 # Find and save mis_matches in gene
123 sbjct_start = db["sbjct_start"]
124 sbjct_seq = db["sbjct_string"]
125 qry_seq = db["query_string"]
126 db["mis_matches"] = self.find_mismatches(gene=db_name,
127 sbjct_start=sbjct_start,
128 sbjct_seq=sbjct_seq,
129 qry_seq=qry_seq)
130
131 known_muts = []
132 unknown_muts = []
133 if(len(db["mis_matches"]) > 0):
134 known_muts, unknown_muts = self.get_mutations(
135 db_name, gene_name, db['mis_matches'], True, db)
136
137 # No mutations found
138 if(not (unknown_muts or known_muts)):
139 continue
140
141 std_results.add_table(db_name)
142 std_db = std_results.long[db_name]
143 std_db.add_headers(headers)
144 std_db.lock_headers = True
145
146 for mut in known_muts:
147 unique_id = mut.unique_id
148 std_db[unique_id] = {
149 "template_name": db_name,
150 "template_length": db["sbjct_length"],
151 "aln_length": db["HSP_length"],
152 "aln_identity": db["perc_ident"],
153 "aln_gaps": db["gaps"],
154 "aln_template_string": db["sbjct_string"],
155 "aln_query_string": db["query_string"],
156 "aln_homology_string": db["homo_string"],
157 "query_id": db["contig_name"],
158 "query_start_pos": mut.pos,
159 "query_end_pos": mut.end,
160 "query_depth": db.get("depth", "NA"),
161 "pval": db.get("p_value", "NA"),
162 "software_score": db["cal_score"],
163 "mutation": mut.mut_string_short,
164 "ref_codon": mut.ref_codon,
165 "alt_codon": mut.mut_codon,
166 "ref_aa": mut.ref_aa,
167 "alt_aa": mut.mut_aa,
168 "insertion": mut.insertion,
169 "deletion": mut.deletion
170 }
171
172 return std_results
173
174 @staticmethod
175 def get_db_names(db_root_path):
176 out_lst = []
177 for entry in os.scandir(db_root_path):
178 if not entry.name.startswith('.') and entry.is_dir():
179 out_lst.append(entry.name)
180 return tuple(out_lst)
181
182 def results_to_str(self, res_type, results, unknown_flag, min_cov,
183 perc_iden):
184 # Initiate output stings with headers
185 gene_list = ', '.join(self.gene_list)
186 output_strings = [
187 "Mutation\tNucleotide change\tAmino acid change\tResistance\tPMID",
188 "Chromosomal point mutations - Results\nSpecies: %s\nGenes: %s\n"
189 "Mapping methode: %s\n\n\nKnown Mutations\n" % (self.species, gene_list, res_type), ""
190 ]
191 # Get all drug names and add header of all drugs to prediction file
192 drug_lst = [drug for drug in self.drug_genes.keys()]
193 output_strings[2] = "\t".join(drug_lst) + "\n"
194
195 # Define variables to write temporary output into
196 total_unknown_str = ""
197 unique_drug_list = []
198 excluded_hits = {}
199
200 if res_type == PointFinder.TYPE_BLAST:
201 # Patch together partial sequence hits (BLASTER does not do this)
202 # and removes dummy_hit_id
203 GENES = self.find_best_seqs(results, min_cov)
204
205 else:
206 # TODO: Some databases are either genes or species,
207 # depending on the method applied (BLAST/KMA).
208 GENES = results[self.species]
209
210 # If not hits found insert genes not found
211 if GENES == 'No hit found':
212 GENES = {}
213
214 # KMA only gives the genes found, however all genes should be
215 # included
216 for gene in self.gene_list:
217 if gene not in GENES:
218 GENES[gene] = 'No hit found'
219
220 # Included excluded
221 GENES['excluded'] = results['excluded']
222
223 for gene, hit in GENES.items():
224 # Start writing output string (to HTML tab file)
225 gene_name = gene # Not perfeft can differ from specific mutation
226 regex = r"promoter_size_(\d+)(?:bp)"
227 promtr_gene_objt = re.search(regex, gene)
228
229 if promtr_gene_objt:
230 gene_name = gene.split("_")[0]
231
232 output_strings[1] += "\n%s\n" % (gene_name)
233
234 # Ignore excluded genes
235 # TODO: Should all not found genes be moved to this dict?
236 if gene == "excluded":
237 continue
238
239 # Write exclusion reason for genes not found
240 if isinstance(GENES[gene], str):
241 output_strings[1] += GENES[gene] + "\n"
242 continue
243
244 if hit['perc_ident'] < perc_iden and hit['perc_coverage'] < min_cov:
245 GENES[gene] = ('Gene found with identity, %s, below minimum '
246 'identity threshold: %s. And with coverage, %s,'
247 ' below minimum coverage threshold: %s.'
248 % (hit['perc_ident'], perc_iden,
249 hit['perc_coverage'], min_cov))
250 output_strings[1] += GENES[gene] + "\n"
251 continue
252
253 if hit['perc_ident'] < perc_iden:
254 GENES[gene] = ('Gene found with identity, %s, below minimum '
255 'identity threshold: %s' % (hit['perc_ident'],
256 perc_iden))
257 output_strings[1] += GENES[gene] + "\n"
258 continue
259
260 if hit['perc_coverage'] < min_cov:
261 # Gene not found above given coverage
262 GENES[gene] = ('Gene found with coverage, %s, below minimum '
263 'coverage threshold: %s' % (hit['perc_coverage'],
264 min_cov))
265 output_strings[1] += GENES[gene] + "\n"
266 continue
267
268 sbjct_start = hit['sbjct_start']
269 sbjct_seq = hit['sbjct_string']
270 qry_seq = hit['query_string']
271
272 # Find and save mis_matches in gene
273 hit['mis_matches'] = self.find_mismatches(gene, sbjct_start,
274 sbjct_seq, qry_seq)
275
276 # Check if no mutations was found
277 if len(hit['mis_matches']) < 1:
278 output_strings[1] += ("No mutations found in {}\n"
279 .format(gene_name))
280 else:
281 # Write mutations found to output file
282 total_unknown_str += "\n%s\n" % (gene_name)
283
284 str_tuple = self.mut2str(gene, gene_name, hit['mis_matches'],
285 unknown_flag, hit)
286
287 all_results = str_tuple[0]
288 total_known = str_tuple[1]
289 total_unknown = str_tuple[2]
290 drug_list = str_tuple[3]
291
292 # Add results to output strings
293 if(all_results):
294 output_strings[0] += "\n" + all_results
295 output_strings[1] += total_known + "\n"
296
297 # Add unknown mutations the total results of
298 # unknown mutations
299 total_unknown_str += total_unknown + "\n"
300
301 # Add drugs to druglist
302 unique_drug_list += [drug.lower() for drug in drug_list]
303
304 if unknown_flag is True:
305 output_strings[1] += ("\n\nUnknown Mutations \n"
306 + total_unknown_str)
307
308 # Make Resistance Prediction output
309
310 # Go throug all drugs in the database and see if prediction
311 # can be called.
312 pred_output = []
313 for drug in drug_lst:
314 drug = drug.lower()
315 # Check if resistance to drug was found
316 if drug in unique_drug_list:
317 pred_output.append("1")
318 else:
319 # Check at all genes associated with the drug
320 # resistance where found
321 all_genes_found = True
322 for gene in self.drug_genes[drug]:
323 if gene not in GENES:
324 all_genes_found = False
325
326 if all_genes_found is False:
327 pred_output.append("?")
328 else:
329 pred_output.append("0")
330
331 output_strings[2] += "\t".join(pred_output) + "\n"
332
333 return output_strings
334
335 def write_results(self, out_path, result, res_type, unknown_flag, min_cov,
336 perc_iden):
337 """
338 """
339
340 result_str = self.results_to_str(res_type=res_type,
341 results=result,
342 unknown_flag=unknown_flag,
343 min_cov=min_cov,
344 perc_iden=perc_iden)
345
346 with open(out_path + "/PointFinder_results.txt", "w") as fh:
347 fh.write(result_str[0])
348 with open(out_path + "/PointFinder_table.txt", "w") as fh:
349 fh.write(result_str[1])
350 with open(out_path + "/PointFinder_prediction.txt", "w") as fh:
351 fh.write(result_str[2])
352
353 @staticmethod
354 def discard_unwanted_results(results, wanted):
355 """
356 Takes a dict and a list of keys.
357 Returns a dict containing only the keys from the list.
358 """
359 cleaned_results = dict()
360 for key, val in results.items():
361 if(key in wanted):
362 cleaned_results[key] = val
363 return cleaned_results
364
365 def blast(self, inputfile, out_path, min_cov=0.6, threshold=0.9,
366 blast="blastn", cut_off=True):
367 """
368 """
369 blast_run = Blaster(inputfile=inputfile, databases=self.gene_list,
370 db_path=self.specie_path, out_path=out_path,
371 min_cov=min_cov, threshold=threshold, blast=blast,
372 cut_off=cut_off)
373
374 self.blast_results = blast_run.results # TODO Is this used?
375 return blast_run
376
377 @staticmethod
378 def get_db_mutations(mut_db_path, gene_list):
379 """
380 This function opens the file resistenss-overview.txt, and reads
381 the content into a dict of dicts. The dict will contain
382 information about all known mutations given in the database.
383 This dict is returned.
384 """
385
386 # Initiate variables
387 known_mutations = dict()
388 known_stop_codon = dict()
389 drug_genes = dict()
390 indelflag = False
391 stopcodonflag = False
392
393 # Go throug mutation file line by line
394
395 with open(mut_db_path, "r") as fh:
396 mutdb_file = fh.readlines()
397 mutdb_file = [line.strip() for line in mutdb_file]
398
399 for line in mutdb_file:
400 # Ignore headers and check where the indel section starts
401 if line.startswith("#"):
402 if "indel" in line.lower():
403 indelflag = True
404 elif "stop codon" in line.lower():
405 stopcodonflag = True
406 else:
407 stopcodonflag = False
408 continue
409
410 mutation = [data.strip() for data in line.split("\t")]
411
412 gene_ID = mutation[0]
413
414 # Only consider mutations in genes found in the gene list
415 if gene_ID in gene_list:
416 gene_name = mutation[1]
417 mut_pos = int(mutation[2])
418 ref_codon = mutation[3] # Ref_nuc (1 or 3?)
419 ref_aa = mutation[4] # Ref_codon
420 alt_aa = mutation[5].split(",") # Res_codon
421 res_drug = mutation[6].replace("\t", " ")
422 pmid = mutation[7].split(",")
423
424 # Check if stop codons are predictive for resistance
425 if stopcodonflag is True:
426 if gene_ID not in known_stop_codon:
427 known_stop_codon[gene_ID] = {"pos": [],
428 "drug": res_drug}
429 known_stop_codon[gene_ID]["pos"].append(mut_pos)
430
431 # Add genes associated with drug resistance to drug_genes dict
432 drug_lst = res_drug.split(",")
433 drug_lst = [d.strip().lower() for d in drug_lst]
434 for drug in drug_lst:
435 if drug not in drug_genes:
436 drug_genes[drug] = []
437 if gene_ID not in drug_genes[drug]:
438 drug_genes[drug].append(gene_ID)
439
440 # Initiate empty dict to store relevant mutation information
441 mut_info = dict()
442
443 # Save need mutation info with pmid cooresponding to the amino
444 # acid change
445 for i in range(len(alt_aa)):
446 try:
447 mut_info[alt_aa[i]] = {"gene_name": gene_name,
448 "drug": res_drug,
449 "pmid": pmid[i]}
450 except IndexError:
451 mut_info[alt_aa[i]] = {"gene_name": gene_name,
452 "drug": res_drug,
453 "pmid": "-"}
454
455 # Add all possible types of mutations to the dict
456 if gene_ID not in known_mutations:
457 known_mutations[gene_ID] = {"sub": dict(), "ins": dict(),
458 "del": dict()}
459 # Check for the type of mutation
460 if indelflag is False:
461 mutation_type = "sub"
462 else:
463 mutation_type = ref_aa
464
465 # Save mutations positions with required information given in
466 # mut_info
467 if mut_pos not in known_mutations[gene_ID][mutation_type]:
468 known_mutations[gene_ID][mutation_type][mut_pos] = dict()
469 for amino in alt_aa:
470 if (amino in known_mutations[gene_ID][mutation_type]
471 [mut_pos]):
472 stored_mut_info = (known_mutations[gene_ID]
473 [mutation_type]
474 [mut_pos][amino])
475 if stored_mut_info["drug"] != mut_info[amino]["drug"]:
476 stored_mut_info["drug"] += "," + (mut_info[amino]
477 ["drug"])
478 if stored_mut_info["pmid"] != mut_info[amino]["pmid"]:
479 stored_mut_info["pmid"] += ";" + (mut_info[amino]
480 ["pmid"])
481
482 (known_mutations[gene_ID][mutation_type]
483 [mut_pos][amino]) = stored_mut_info
484 else:
485 (known_mutations[gene_ID][mutation_type]
486 [mut_pos][amino]) = mut_info[amino]
487
488 # Check that all genes in the gene list has known mutations
489 for gene in gene_list:
490 if gene not in known_mutations:
491 known_mutations[gene] = {"sub": dict(), "ins": dict(),
492 "del": dict()}
493 return known_mutations, drug_genes, known_stop_codon
494
495 def find_best_seqs(self, blast_results, min_cov):
496 """
497 This function finds gene sequences with the largest coverage based on
498 the blast results. If more hits covering different sequences parts
499 exists it concatinates partial gene hits into one hit.
500 If different overlap sequences occurs they are saved in the list
501 alternative_overlaps. The function returns a new results dict where
502 each gene has an inner dict with hit information corresponding to
503 the new concatinated hit. If no gene is found the value is a string
504 with info of why the gene wasn't found.
505 """
506
507 GENES = dict()
508 for gene, hits in blast_results.items():
509
510 # Save all hits in the list 'hits_found'
511 hits_found = []
512 GENES[gene] = {}
513
514 # Check for gene has any hits in blast results
515 if gene == 'excluded':
516 GENES[gene] = hits
517 continue
518 elif isinstance(hits, dict) and len(hits) > 0:
519 GENES[gene]['found'] = 'partially'
520 GENES[gene]['hits'] = hits
521 else:
522 # Gene not found! go to next gene
523 GENES[gene] = 'No hit found'
524 continue
525
526 # Check coverage for each hit, patch together partial genes hits
527 for hit_id, hit in hits.items():
528
529 # Save hits start and end positions in subject, total subject
530 # len, and subject and query sequences of each hit
531 hits_found += [(hit['sbjct_start'], hit['sbjct_end'],
532 hit['sbjct_string'], hit['query_string'],
533 hit['sbjct_length'], hit['homo_string'],
534 hit_id)]
535
536 # If coverage is 100% change found to yes
537 hit_coverage = hit['perc_coverage']
538 if hit_coverage == 1.0:
539 GENES[gene]['found'] = 'yes'
540
541 # Sort positions found
542 hits_found = sorted(hits_found, key=lambda x: x[0])
543
544 # Find best hit by concatenating sequences if more hits exist
545
546 # Get information from the first hit found
547 all_start = hits_found[0][0]
548 current_end = hits_found[0][1]
549 final_sbjct = hits_found[0][2]
550 final_qry = hits_found[0][3]
551 sbjct_len = hits_found[0][4]
552 final_homol = hits_found[0][5]
553 first_hit_id = hits_found[0][6]
554
555 alternative_overlaps = []
556 contigs = [hit['contig_name']]
557 scores = [str(hit['cal_score'])]
558
559 # Check if more then one hit was found within the same gene
560 for i in range(len(hits_found) - 1):
561
562 # Save information from previous hit
563 pre_block_start = hits_found[i][0]
564 pre_block_end = hits_found[i][1]
565 pre_sbjct = hits_found[i][2]
566 pre_qry = hits_found[i][3]
567 pre_homol = hits_found[i][5]
568 pre_id = hits_found[i][6]
569
570 # Save information from next hit
571 next_block_start = hits_found[i + 1][0]
572 next_block_end = hits_found[i + 1][1]
573 next_sbjct = hits_found[i + 1][2]
574 next_qry = hits_found[i + 1][3]
575 next_homol = hits_found[i + 1][5]
576 next_id = hits_found[i + 1][6]
577
578 contigs.append(hits[next_id]['contig_name'])
579 scores.append(str(hits[next_id]['cal_score']))
580
581 # Check for overlapping sequences, collaps them and save
582 # alternative overlaps if any
583 if next_block_start <= current_end:
584
585 # Find overlap start and take gaps into account
586 pos_count = 0
587 overlap_pos = pre_block_start
588 for i in range(len(pre_sbjct)):
589
590 # Stop loop if overlap_start position is reached
591 if overlap_pos == next_block_start:
592 overlap_start = pos_count
593 break
594 if pre_sbjct[i] != "-":
595 overlap_pos += 1
596 pos_count += 1
597
598 # Find overlap len and add next sequence to final sequence
599 if len(pre_sbjct[overlap_start:]) > len(next_sbjct):
600 # <--------->
601 # <--->
602 overlap_len = len(next_sbjct)
603 overlap_end_pos = next_block_end
604 else:
605 # <--------->
606 # <--------->
607 overlap_len = len(pre_sbjct[overlap_start:])
608 overlap_end_pos = pre_block_end
609
610 # Update current end
611 current_end = next_block_end
612
613 # Use the entire previous sequence and add the last
614 # part of the next sequence
615 final_sbjct += next_sbjct[overlap_len:]
616 final_qry += next_qry[overlap_len:]
617 final_homol += next_homol[overlap_len:]
618
619 # Find query overlap sequences
620 pre_qry_overlap = pre_qry[overlap_start: (overlap_start
621 + overlap_len)]
622 next_qry_overlap = next_qry[:overlap_len]
623 sbjct_overlap = next_sbjct[:overlap_len]
624
625 # If alternative query overlap excist save it
626 if pre_qry_overlap != next_qry_overlap:
627 eprint("OVERLAP WARNING:")
628 eprint("{}\n{}"
629 .format(pre_qry_overlap, next_qry_overlap))
630
631 # Save alternative overlaps
632 alternative_overlaps += [(next_block_start,
633 overlap_end_pos,
634 sbjct_overlap,
635 next_qry_overlap)]
636
637 elif next_block_start > current_end:
638 # <------->
639 # <------->
640 gap_size = next_block_start - current_end - 1
641 final_qry += "N" * gap_size
642 final_sbjct += "N" * gap_size
643 final_homol += "-" * gap_size
644 current_end = next_block_end
645 final_sbjct += next_sbjct
646 final_qry += next_qry
647 final_homol += next_homol
648
649 # Calculate new coverage
650 no_call = final_qry.upper().count("N")
651 coverage = ((current_end - all_start + 1 - no_call)
652 / float(sbjct_len))
653
654 # Calculate identity
655 equal = 0
656 not_equal = 0
657 for i in range(len(final_qry)):
658 if final_qry[i].upper() != "N":
659 if final_qry[i].upper() == final_sbjct[i].upper():
660 equal += 1
661 else:
662 not_equal += 1
663 identity = equal / float(equal + not_equal)
664 if coverage >= min_cov:
665 GENES[gene]['perc_coverage'] = coverage
666 GENES[gene]['perc_ident'] = identity
667 GENES[gene]['sbjct_string'] = final_sbjct
668 GENES[gene]['query_string'] = final_qry
669 GENES[gene]['homo_string'] = final_homol
670 GENES[gene]['contig_name'] = ", ".join(contigs)
671 GENES[gene]['HSP_length'] = len(final_qry)
672 GENES[gene]['sbjct_start'] = all_start
673 GENES[gene]['sbjct_end'] = current_end
674 GENES[gene]['sbjct_length'] = sbjct_len
675 GENES[gene]['cal_score'] = ", ".join(scores)
676 GENES[gene]['gaps'] = no_call
677 GENES[gene]['alternative_overlaps'] = alternative_overlaps
678 GENES[gene]['mis_matches'] = []
679
680 else:
681 # Gene not found above given coverage
682 GENES[gene] = ('Gene found with coverage, %f, below '
683 'minimum coverage threshold: %s' % (coverage,
684 min_cov))
685 return GENES
686
687 def find_mismatches(self, gene, sbjct_start, sbjct_seq, qry_seq):
688 """
689 This function finds mis matches between two sequeces. Depending
690 on the the sequence type either the function
691 find_codon_mismatches or find_nucleotid_mismatches are called,
692 if the sequences contains both a promoter and a coding region
693 both functions are called. The function can also call itself if
694 alternative overlaps is give. All found mismatches are returned
695 """
696
697 # Initiate the mis_matches list that will store all found mis matcehs
698 mis_matches = []
699
700 # Find mis matches in RNA genes
701 if gene in self.RNA_gene_list:
702 mis_matches += PointFinder.find_nucleotid_mismatches(sbjct_start,
703 sbjct_seq,
704 qry_seq)
705 else:
706 # Check if the gene sequence is with a promoter
707 regex = r"promoter_size_(\d+)(?:bp)"
708 promtr_gene_objt = re.search(regex, gene)
709
710 # Check for promoter sequences
711 if promtr_gene_objt:
712 # Get promoter length
713 promtr_len = int(promtr_gene_objt.group(1))
714
715 # Extract promoter sequence, while considering gaps
716 # --------agt-->----
717 # ---->?
718 if sbjct_start <= promtr_len:
719 # Find position in sbjct sequence where promoter ends
720 promtr_end = 0
721 nuc_count = sbjct_start - 1
722
723 for i in range(len(sbjct_seq)):
724 promtr_end += 1
725
726 if sbjct_seq[i] != "-":
727 nuc_count += 1
728
729 if nuc_count == promtr_len:
730 break
731
732 # Check if only a part of the promoter is found
733 # --------agt-->----
734 # ----
735 promtr_sbjct_start = -1
736 if nuc_count < promtr_len:
737 promtr_sbjct_start = nuc_count - promtr_len
738
739 # Get promoter part of subject and query
740 sbjct_promtr_seq = sbjct_seq[:promtr_end]
741 qry_promtr_seq = qry_seq[:promtr_end]
742
743 # For promoter part find nucleotide mis matches
744 mis_matches += PointFinder.find_nucleotid_mismatches(
745 promtr_sbjct_start, sbjct_promtr_seq, qry_promtr_seq,
746 promoter=True)
747
748 # Check if gene is also found
749 # --------agt-->----
750 # -----------
751 if((sbjct_start + len(sbjct_seq.replace("-", "")))
752 > promtr_len):
753 sbjct_gene_seq = sbjct_seq[promtr_end:]
754 qry_gene_seq = qry_seq[promtr_end:]
755 sbjct_gene_start = 1
756
757 # Find mismatches in gene part
758 mis_matches += PointFinder.find_codon_mismatches(
759 sbjct_gene_start, sbjct_gene_seq, qry_gene_seq)
760
761 # No promoter, only gene is found
762 # --------agt-->----
763 # -----
764 else:
765 sbjct_gene_start = sbjct_start - promtr_len
766
767 # Find mismatches in gene part
768 mis_matches += PointFinder.find_codon_mismatches(
769 sbjct_gene_start, sbjct_seq, qry_seq)
770
771 else:
772 # Find mismatches in gene
773 mis_matches += PointFinder.find_codon_mismatches(
774 sbjct_start, sbjct_seq, qry_seq)
775
776 return mis_matches
777
778 @staticmethod
779 def find_nucleotid_mismatches(sbjct_start, sbjct_seq, qry_seq,
780 promoter=False):
781 """
782 This function takes two alligned sequence (subject and query),
783 and the position on the subject where the alignment starts. The
784 sequences are compared one nucleotide at a time. If mis matches
785 are found they are saved. If a gap is found the function
786 find_nuc_indel is called to find the entire indel and it is
787 also saved into the list mis_matches. If promoter sequences are
788 given as arguments, these are reversed the and the absolut
789 value of the sequence position used, but when mutations are
790 saved the negative value and det reverse sequences are saved in
791 mis_mathces.
792 """
793
794 # Initiate the mis_matches list that will store all found
795 # mismatcehs
796 mis_matches = []
797
798 sbjct_start = abs(sbjct_start)
799 seq_pos = sbjct_start
800
801 # Set variables depending on promoter status
802 factor = 1
803 mut_prefix = "r."
804
805 if promoter is True:
806 factor = (-1)
807 mut_prefix = "n."
808 # Reverse promoter sequences
809 sbjct_seq = sbjct_seq[::-1]
810 qry_seq = qry_seq[::-1]
811
812 # Go through sequences one nucleotide at a time
813 shift = 0
814 for index in range(sbjct_start - 1, len(sbjct_seq)):
815 mut_name = mut_prefix
816 mut = ""
817 # Shift index according to gaps
818 i = index + shift
819
820 # If the end of the sequence is reached, stop
821 if i == len(sbjct_seq):
822 break
823
824 sbjct_nuc = sbjct_seq[i]
825 qry_nuc = qry_seq[i]
826
827 # Check for mis matches
828 if sbjct_nuc.upper() != qry_nuc.upper():
829
830 # check for insertions and deletions
831 if sbjct_nuc == "-" or qry_nuc == "-":
832 if sbjct_nuc == "-":
833 mut = "ins"
834 indel_start_pos = (seq_pos - 1) * factor
835 indel_end_pos = seq_pos * factor
836 indel = PointFinder.find_nuc_indel(sbjct_seq[i:],
837 qry_seq[i:])
838 else:
839 mut = "del"
840 indel_start_pos = seq_pos * factor
841 indel = PointFinder.find_nuc_indel(qry_seq[i:],
842 sbjct_seq[i:])
843 indel_end_pos = (seq_pos + len(indel) - 1) * factor
844 seq_pos += len(indel) - 1
845
846 # Shift the index to the end of the indel
847 shift += len(indel) - 1
848
849 # Write mutation name, depending on sequnce
850 if len(indel) == 1 and mut == "del":
851 mut_name += str(indel_start_pos) + mut + indel
852 else:
853 if promoter is True:
854 # Reverse the sequence and the start and
855 # end positions
856 indel = indel[::-1]
857 temp = indel_start_pos
858 indel_start_pos = indel_end_pos
859 indel_end_pos = temp
860
861 mut_name += (str(indel_start_pos) + "_"
862 + str(indel_end_pos) + mut + indel)
863
864 mis_matches += [[mut, seq_pos * factor, seq_pos * factor,
865 indel, mut_name, mut, indel]]
866
867 # Check for substitutions mutations
868 else:
869 mut = "sub"
870 mut_name += (str(seq_pos * factor) + sbjct_nuc + ">"
871 + qry_nuc)
872
873 mis_matches += [[mut, seq_pos * factor, seq_pos * factor,
874 qry_nuc, mut_name, sbjct_nuc, qry_nuc]]
875
876 # Increment sequence position
877 if mut != "ins":
878 seq_pos += 1
879
880 return mis_matches
881
882 @staticmethod
883 def find_nuc_indel(gapped_seq, indel_seq):
884 """
885 This function finds the entire indel missing in from a gapped
886 sequence compared to the indel_seqeunce. It is assumes that the
887 sequences start with the first position of the gap.
888 """
889 ref_indel = indel_seq[0]
890 for j in range(1, len(gapped_seq)):
891 if gapped_seq[j] == "-":
892 ref_indel += indel_seq[j]
893 else:
894 break
895 return ref_indel
896
897 @staticmethod
898 def aa(codon):
899 """
900 This function converts a codon to an amino acid. If the codon
901 is not valid an error message is given, or else, the amino acid
902 is returned.
903
904 Potential future issue: If species are added that utilizes
905 alternative translation tables.
906 """
907 codon = codon.upper()
908 aa = {"ATT": "I", "ATC": "I", "ATA": "I",
909 "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", "TTA": "L",
910 "TTG": "L",
911 "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
912 "TTT": "F", "TTC": "F",
913 "ATG": "M",
914 "TGT": "C", "TGC": "C",
915 "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
916 "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
917 "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
918 "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
919 "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S",
920 "AGC": "S",
921 "TAT": "Y", "TAC": "Y",
922 "TGG": "W",
923 "CAA": "Q", "CAG": "Q",
924 "AAT": "N", "AAC": "N",
925 "CAT": "H", "CAC": "H",
926 "GAA": "E", "GAG": "E",
927 "GAT": "D", "GAC": "D",
928 "AAA": "K", "AAG": "K",
929 "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R",
930 "AGG": "R",
931 "TAA": "*", "TAG": "*", "TGA": "*"}
932 # Translate valid codon
933 try:
934 amino_a = aa[codon]
935 except KeyError:
936 amino_a = "?"
937 return amino_a
938
939 @staticmethod
940 def get_codon(seq, codon_no, start_offset):
941 """
942 This function takes a sequece and a codon number and returns
943 the codon found in the sequence at that position
944 """
945 seq = seq.replace("-", "")
946 codon_start_pos = int(codon_no - 1) * 3 - start_offset
947 codon = seq[codon_start_pos:codon_start_pos + 3]
948 return codon
949
950 @staticmethod
951 def name_insertion(sbjct_seq, codon_no, sbjct_nucs, aa_alt, start_offset):
952 """
953 This function is used to name a insertion mutation based on the
954 HGVS recommendation.
955 """
956 start_codon_no = codon_no - 1
957
958 if len(sbjct_nucs) == 3:
959 start_codon_no = codon_no
960
961 start_codon = PointFinder.get_codon(sbjct_seq, start_codon_no,
962 start_offset)
963 end_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset)
964 pos_name = "p.%s%d_%s%dins%s" % (PointFinder.aa(start_codon),
965 start_codon_no,
966 PointFinder.aa(end_codon),
967 codon_no, aa_alt)
968 return pos_name
969
970 @staticmethod
971 def name_deletion(sbjct_seq, sbjct_rf_indel, sbjct_nucs, codon_no, aa_alt,
972 start_offset, mutation="del"):
973 """
974 This function is used to name a deletion mutation based on the
975 HGVS recommendation. If combination of a deletion and an
976 insertion is identified the argument 'mutation' is set to
977 'delins' and the mutation name will indicate that the mutation
978 is a delins mutation.
979 """
980 del_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset)
981 pos_name = "p.%s%d" % (PointFinder.aa(del_codon), codon_no)
982
983 # This has been changed
984 if len(sbjct_rf_indel) == 3 and mutation == "del":
985 return pos_name + mutation
986
987 end_codon_no = codon_no + math.ceil(len(sbjct_nucs) / 3) - 1
988 end_codon = PointFinder.get_codon(sbjct_seq, end_codon_no,
989 start_offset)
990 pos_name += "_%s%d%s" % (PointFinder.aa(end_codon), end_codon_no,
991 mutation)
992 if mutation == "delins":
993 pos_name += aa_alt
994 return pos_name
995
996 @staticmethod
997 def name_indel_mutation(sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
998 codon_no, mut, start_offset):
999 """
1000 This function serves to name the individual mutations
1001 dependently on the type of the mutation.
1002 """
1003 # Get the subject and query sequences without gaps
1004 sbjct_nucs = sbjct_rf_indel.replace("-", "")
1005 qry_nucs = qry_rf_indel.replace("-", "")
1006
1007 # Translate nucleotides to amino acids
1008 aa_ref = ""
1009 aa_alt = ""
1010
1011 for i in range(0, len(sbjct_nucs), 3):
1012 aa_ref += PointFinder.aa(sbjct_nucs[i:i + 3])
1013
1014 for i in range(0, len(qry_nucs), 3):
1015 aa_alt += PointFinder.aa(qry_nucs[i:i + 3])
1016
1017 # Identify the gapped sequence
1018 if mut == "ins":
1019 gapped_seq = sbjct_rf_indel
1020 else:
1021 gapped_seq = qry_rf_indel
1022
1023 gap_size = gapped_seq.count("-")
1024
1025 # Write mutation names
1026 if gap_size < 3 and len(sbjct_nucs) == 3 and len(qry_nucs) == 3:
1027 # Write mutation name for substitution mutation
1028 mut_name = "p.%s%d%s" % (PointFinder.aa(sbjct_nucs), codon_no,
1029 PointFinder.aa(qry_nucs))
1030 elif len(gapped_seq) == gap_size:
1031
1032 if mut == "ins":
1033 # Write mutation name for insertion mutation
1034 mut_name = PointFinder.name_insertion(sbjct_seq, codon_no,
1035 sbjct_nucs, aa_alt,
1036 start_offset)
1037 aa_ref = mut
1038 else:
1039 # Write mutation name for deletion mutation
1040 mut_name = PointFinder.name_deletion(sbjct_seq, sbjct_rf_indel,
1041 sbjct_nucs, codon_no,
1042 aa_alt, start_offset,
1043 mutation="del")
1044 aa_alt = mut
1045 # Check for delins - mix of insertion and deletion
1046 else:
1047 # Write mutation name for a mixed insertion and deletion
1048 # mutation
1049 mut_name = PointFinder.name_deletion(sbjct_seq,
1050 sbjct_rf_indel, sbjct_nucs,
1051 codon_no, aa_alt,
1052 start_offset,
1053 mutation="delins")
1054
1055 # Check for frameshift
1056 if gapped_seq.count("-") % 3 != 0:
1057 # Add the frameshift tag to mutation name
1058 mut_name += " - Frameshift"
1059
1060 return mut_name, aa_ref, aa_alt
1061
1062 @staticmethod
1063 def get_inframe_gap(seq, nucs_needed=3):
1064 """
1065 This funtion takes a sequnece starting with a gap or the
1066 complementary seqeuence to the gap, and the number of
1067 nucleotides that the seqeunce should contain in order to
1068 maintain the correct reading frame. The sequence is gone
1069 through and the number of non-gap characters are counted. When
1070 the number has reach the number of needed nucleotides the indel
1071 is returned. If the indel is a 'clean' insert or deletion that
1072 starts in the start of a codon and can be divided by 3, then
1073 only the gap is returned.
1074 """
1075 nuc_count = 0
1076 gap_indel = ""
1077 nucs = ""
1078
1079 for i in range(len(seq)):
1080 # Check if the character is not a gap
1081 if seq[i] != "-":
1082 # Check if the indel is a 'clean'
1083 # i.e. if the insert or deletion starts at the first
1084 # nucleotide in the codon and can be divided by 3
1085
1086 if(gap_indel.count("-") == len(gap_indel)
1087 and gap_indel.count("-") >= 3 and len(gap_indel) != 0):
1088 return gap_indel
1089
1090 nuc_count += 1
1091 gap_indel += seq[i]
1092
1093 # if the number of nucleotides in the indel equals the amount
1094 # needed for the indel, the indel is returned.
1095 if nuc_count == nucs_needed:
1096 return gap_indel
1097
1098 # This will only happen if the gap is in the very end of a sequence
1099 return gap_indel
1100
1101 @staticmethod
1102 def get_indels(sbjct_seq, qry_seq, start_pos):
1103 """
1104 This function uses regex to find inserts and deletions in
1105 sequences given as arguments. A list of these indels are
1106 returned. The list includes, type of mutations(ins/del),
1107 subject codon no of found mutation, subject sequence position,
1108 insert/deletions nucleotide sequence, and the affected qry
1109 codon no.
1110 """
1111
1112 seqs = [sbjct_seq, qry_seq]
1113 indels = []
1114 gap_obj = re.compile(r"-+")
1115 for i in range(len(seqs)):
1116 for match in gap_obj.finditer(seqs[i]):
1117 pos = int(match.start())
1118 gap = match.group()
1119
1120 # Find position of the mutation corresponding to the
1121 # subject sequence
1122 sbj_pos = len(sbjct_seq[:pos].replace("-", "")) + start_pos
1123
1124 # Get indel sequence and the affected sequences in
1125 # sbjct and qry in the reading frame
1126 indel = seqs[abs(i - 1)][pos:pos + len(gap)]
1127
1128 # Find codon number for mutation
1129 codon_no = int(math.ceil((sbj_pos) / 3))
1130 qry_pos = len(qry_seq[:pos].replace("-", "")) + start_pos
1131 qry_codon = int(math.ceil((qry_pos) / 3))
1132
1133 if i == 0:
1134 mut = "ins"
1135 else:
1136 mut = "del"
1137
1138 indels.append([mut, codon_no, sbj_pos, indel, qry_codon])
1139
1140 # Sort indels based on codon position and sequence position
1141 indels = sorted(indels, key=lambda x: (x[1], x[2]))
1142
1143 return indels
1144
1145 @staticmethod
1146 def find_codon_mismatches(sbjct_start, sbjct_seq, qry_seq):
1147 """
1148 This function takes two alligned sequence (subject and query),
1149 and the position on the subject where the alignment starts. The
1150 sequences are compared codon by codon. If a mis matches is
1151 found it is saved in 'mis_matches'. If a gap is found the
1152 function get_inframe_gap is used to find the indel sequence and
1153 keep the sequence in the correct reading frame. The function
1154 translate_indel is used to name indel mutations and translate
1155 the indels to amino acids The function returns a list of tuples
1156 containing all needed informations about the mutation in order
1157 to look it up in the database dict known mutation and the with
1158 the output files the the user.
1159 """
1160 mis_matches = []
1161
1162 # Find start pos of first codon in frame, i_start
1163 codon_offset = (sbjct_start - 1) % 3
1164 i_start = 0
1165
1166 if codon_offset != 0:
1167 i_start = 3 - codon_offset
1168
1169 sbjct_start = sbjct_start + i_start
1170
1171 # Set sequences in frame
1172 sbjct_seq = sbjct_seq[i_start:]
1173 qry_seq = qry_seq[i_start:]
1174
1175 # Find codon number of the first codon in the sequence, start
1176 # at 0
1177 codon_no = int((sbjct_start - 1) / 3) # 1,2,3 start on 0
1178
1179 # s_shift and q_shift are used when gaps appears
1180 q_shift = 0
1181 s_shift = 0
1182 mut_no = 0
1183
1184 # Find inserts and deletions in sequence
1185 indel_no = 0
1186 indels = PointFinder.get_indels(sbjct_seq, qry_seq, sbjct_start)
1187
1188 # Go through sequence and save mutations when found
1189 for index in range(0, len(sbjct_seq), 3):
1190 # Count codon number
1191 codon_no += 1
1192
1193 # Shift index according to gaps
1194 s_i = index + s_shift
1195 q_i = index + q_shift
1196
1197 # Get codons
1198 sbjct_codon = sbjct_seq[s_i:s_i + 3]
1199 qry_codon = qry_seq[q_i:q_i + 3]
1200
1201 if(len(sbjct_seq[s_i:].replace("-", ""))
1202 + len(qry_codon[q_i:].replace("-", "")) < 6):
1203 break
1204
1205 # Check for mutations
1206 if sbjct_codon.upper() != qry_codon.upper():
1207
1208 # Check for codon insertions and deletions and
1209 # frameshift mutations
1210 if "-" in sbjct_codon or "-" in qry_codon:
1211
1212 # Get indel info
1213 try:
1214 indel_data = indels[indel_no]
1215 except IndexError:
1216 sys.exit("indel_data list is out of range, bug!")
1217
1218 mut = indel_data[0]
1219 codon_no_indel = indel_data[1]
1220 seq_pos = indel_data[2] + sbjct_start - 1
1221 indel = indel_data[3]
1222 indel_no += 1
1223
1224 # Get the affected sequence in frame for both for
1225 # sbjct and qry
1226 if mut == "ins":
1227 sbjct_rf_indel = PointFinder.get_inframe_gap(
1228 sbjct_seq[s_i:], 3)
1229 qry_rf_indel = PointFinder.get_inframe_gap(
1230 qry_seq[q_i:],
1231 int(math.floor(len(sbjct_rf_indel) / 3) * 3))
1232 else:
1233 qry_rf_indel = PointFinder.get_inframe_gap(
1234 qry_seq[q_i:], 3)
1235 sbjct_rf_indel = PointFinder.get_inframe_gap(
1236 sbjct_seq[s_i:],
1237 int(math.floor(len(qry_rf_indel) / 3) * 3))
1238
1239 mut_name, aa_ref, aa_alt = PointFinder.name_indel_mutation(
1240 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
1241 codon_no, mut, sbjct_start - 1)
1242
1243 # Set index to the correct reading frame after the
1244 # indel gap
1245 shift_diff_before = abs(s_shift - q_shift)
1246 s_shift += len(sbjct_rf_indel) - 3
1247 q_shift += len(qry_rf_indel) - 3
1248 shift_diff = abs(s_shift - q_shift)
1249
1250 if shift_diff_before != 0 and shift_diff % 3 == 0:
1251
1252 if s_shift > q_shift:
1253 nucs_needed = (int((len(sbjct_rf_indel) / 3) * 3)
1254 + shift_diff)
1255 pre_qry_indel = qry_rf_indel
1256 qry_rf_indel = PointFinder.get_inframe_gap(
1257 qry_seq[q_i:], nucs_needed)
1258 q_shift += len(qry_rf_indel) - len(pre_qry_indel)
1259
1260 elif q_shift > s_shift:
1261 nucs_needed = (int((len(qry_rf_indel) / 3) * 3)
1262 + shift_diff)
1263 pre_sbjct_indel = sbjct_rf_indel
1264 sbjct_rf_indel = PointFinder.get_inframe_gap(
1265 sbjct_seq[s_i:], nucs_needed)
1266 s_shift += (len(sbjct_rf_indel)
1267 - len(pre_sbjct_indel))
1268
1269 mut_name, aa_ref, aa_alt = (
1270 PointFinder.name_indel_mutation(
1271 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
1272 codon_no, mut, sbjct_start - 1)
1273 )
1274 if "Frameshift" in mut_name:
1275 mut_name = (mut_name.split("-")[0]
1276 + "- Frame restored")
1277 if mut_name is "p.V940delins - Frame restored":
1278 sys.exit()
1279 mis_matches += [[mut, codon_no_indel, seq_pos, indel,
1280 mut_name, sbjct_rf_indel, qry_rf_indel,
1281 aa_ref, aa_alt]]
1282
1283 # Check if the next mutation in the indels list is
1284 # in the current codon.
1285 # Find the number of individul gaps in the
1286 # evaluated sequence.
1287 no_of_indels = (len(re.findall("\-\w", sbjct_rf_indel))
1288 + len(re.findall("\-\w", qry_rf_indel)))
1289
1290 if no_of_indels > 1:
1291
1292 for j in range(indel_no, indel_no + no_of_indels - 1):
1293 try:
1294 indel_data = indels[j]
1295 except IndexError:
1296 sys.exit("indel_data list is out of range, "
1297 "bug!")
1298 mut = indel_data[0]
1299 codon_no_indel = indel_data[1]
1300 seq_pos = indel_data[2] + sbjct_start - 1
1301 indel = indel_data[3]
1302 indel_no += 1
1303 mis_matches += [[mut, codon_no_indel, seq_pos,
1304 indel, mut_name, sbjct_rf_indel,
1305 qry_rf_indel, aa_ref, aa_alt]]
1306
1307 # Set codon number, and save nucleotides from out
1308 # of frame mutations
1309 if mut == "del":
1310 codon_no += int((len(sbjct_rf_indel) - 3) / 3)
1311 # If evaluated insert is only gaps codon_no should
1312 # not increment
1313 elif sbjct_rf_indel.count("-") == len(sbjct_rf_indel):
1314 codon_no -= 1
1315
1316 # Check of point mutations
1317 else:
1318 mut = "sub"
1319 aa_ref = PointFinder.aa(sbjct_codon)
1320 aa_alt = PointFinder.aa(qry_codon)
1321
1322 if aa_ref != aa_alt:
1323 # End search for mutation if a premature stop
1324 # codon is found
1325 mut_name = "p." + aa_ref + str(codon_no) + aa_alt
1326
1327 mis_matches += [[mut, codon_no, codon_no, aa_alt,
1328 mut_name, sbjct_codon, qry_codon,
1329 aa_ref, aa_alt]]
1330 # If a Premature stop codon occur report it an stop the
1331 # loop
1332
1333 try:
1334 if mis_matches[-1][-1] == "*":
1335 mut_name += " - Premature stop codon"
1336 mis_matches[-1][4] = (mis_matches[-1][4].split("-")[0]
1337 + " - Premature stop codon")
1338 break
1339 except IndexError:
1340 pass
1341
1342 # Sort mutations on position
1343 mis_matches = sorted(mis_matches, key=lambda x: x[1])
1344
1345 return mis_matches
1346
1347 @staticmethod
1348 def mutstr2mutdict(m):
1349 out_dict = {}
1350
1351 # Protein / Amino acid mutations
1352 # Ex.: "p.T83I"
1353 if(m.startswith("p.")):
1354 out_dict["nucleotide"] = False
1355
1356 # Remove frameshift tag
1357 frameshift_match = re.search(r"(.+) - Frameshift.*$", m)
1358 if(frameshift_match):
1359 m = frameshift_match.group(1)
1360 out_dict["frameshift"] = True
1361
1362 # Remove frame restored tag
1363 framerestored_match = re.search(r"(.+) - Frame restored.*$", m)
1364 if(framerestored_match):
1365 m = framerestored_match.group(1)
1366 out_dict["frame restored"] = True
1367
1368 # Remove premature stop tag
1369 prem_stop_match = re.search(r"(.+) - Premature stop codon.*$", m)
1370 if(prem_stop_match):
1371 m = prem_stop_match.group(1)
1372 out_dict["prem_stop"] = True
1373
1374 # TODO: premature or frameshift tag adds too many whitespaces
1375 m = m.strip()
1376
1377 # Delins
1378 multi_delins_match = re.search(
1379 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins(\S+)$", m)
1380 single_delins_match = re.search(
1381 r"^p.(\D{1})(\d+)delins(\S+)$", m)
1382 # TODO: is both necessary?
1383 multi_delins_match2 = re.search(
1384 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins$", m)
1385 single_delins_match2 = re.search(
1386 r"^p.(\D{1})(\d+)delins$", m)
1387 multi_ins_match = re.search(
1388 r"^p.(\D{1})(\d+)_(\D{1})(\d+)ins(\D*)$", m)
1389 if(multi_delins_match or single_delins_match):
1390 out_dict["deletion"] = True
1391 out_dict["insertion"] = True
1392 if(single_delins_match):
1393 out_dict["ref_aa"] = single_delins_match.group(1)
1394 out_dict["pos"] = single_delins_match.group(2)
1395 out_dict["mut_aa"] = single_delins_match.group(3)
1396 else:
1397 out_dict["ref_aa"] = multi_delins_match.group(1)
1398 out_dict["pos"] = multi_delins_match.group(2)
1399 out_dict["ref_aa_right"] = multi_delins_match.group(3)
1400 out_dict["mut_end"] = multi_delins_match.group(4)
1401 out_dict["mut_aa"] = multi_delins_match.group(5)
1402 # Deletions
1403 elif(m[-3:] == "del"):
1404 single_del_match = re.search(
1405 r"^p.(\D{1})(\d+)del$", m)
1406 multi_del_match = re.search(
1407 r"^p.(\D{1})(\d+)_(\D{1})(\d+)del$", m)
1408 out_dict["deletion"] = True
1409 if(single_del_match):
1410 out_dict["ref_aa"] = single_del_match.group(1)
1411 out_dict["pos"] = single_del_match.group(2)
1412 else:
1413 out_dict["ref_aa"] = multi_del_match.group(1)
1414 out_dict["pos"] = multi_del_match.group(2)
1415 out_dict["ref_aa_right"] = multi_del_match.group(3)
1416 out_dict["mut_end"] = multi_del_match.group(4)
1417 # Insertions
1418 elif(multi_ins_match):
1419 out_dict["insertion"] = True
1420 out_dict["ref_aa"] = multi_ins_match.group(1).lower()
1421 out_dict["pos"] = multi_ins_match.group(2)
1422 out_dict["ref_aa_right"] = multi_ins_match.group(3).lower()
1423 if(multi_ins_match.group(4)):
1424 out_dict["mut_aa"] = multi_ins_match.group(4).lower()
1425 # Substitutions
1426 else:
1427 sub_match = re.search(
1428 r"^p.(\D{1})(\d+)(\D{1})$", m)
1429 out_dict["ref_aa"] = sub_match.group(1).lower()
1430 out_dict["pos"] = sub_match.group(2)
1431 out_dict["mut_aa"] = sub_match.group(3).lower()
1432
1433 # Nucleotide mutations
1434 # Ex. sub: n.-42T>C
1435 # Ex. ins: n.-13_-14insG (TODO: Verify)
1436 # Ex. del: n.42delT (TODO: Verify)
1437 # Ex. del: n.42_45del (TODO: Verify)
1438 elif(m.startswith("n.") or m.startswith("r.")):
1439 out_dict["nucleotide"] = True
1440
1441 sub_match = re.search(
1442 r"^[nr]{1}\.(-{0,1}\d+)(\D{1})>(\D{1})$", m)
1443 ins_match = re.search(
1444 r"^[nr]{1}\.(-{0,1}\d+)_(-{0,1}\d+)ins(\S+)$", m)
1445 # r.541delA
1446 del_match = re.search((
1447 r"^[nr]{1}\.(-{0,1}\d+)_{0,1}(-{0,1}\d*)del(\S*)$"), m)
1448 if(sub_match):
1449 out_dict["pos"] = sub_match.group(1)
1450 out_dict["ref_nuc"] = sub_match.group(2)
1451 out_dict["mut_nuc"] = sub_match.group(3)
1452 elif(ins_match):
1453 out_dict["insertion"] = True
1454 out_dict["pos"] = ins_match.group(1)
1455 out_dict["mut_end"] = ins_match.group(2)
1456 out_dict["mut_nuc"] = ins_match.group(3)
1457 elif(del_match):
1458 out_dict["deletion"] = True
1459 out_dict["pos"] = del_match.group(1)
1460 if(del_match.group(2)):
1461 out_dict["mut_end"] = del_match.group(2)
1462 if(del_match.group(3)):
1463 out_dict["ref_nuc"] = del_match.group(3)
1464 else:
1465 sys.exit("ERROR: Nucleotide deletion did not contain any "
1466 "reference sequence. mut string: {}".format(m))
1467
1468 return out_dict
1469
1470 def get_mutations(self, gene, gene_name, mis_matches, unknown_flag, hit):
1471 RNA = False
1472 if gene in self.RNA_gene_list:
1473 RNA = True
1474
1475 known_muts = []
1476 unknown_muts = []
1477
1478 # Go through each mutation
1479 for i in range(len(mis_matches)):
1480 m_type = mis_matches[i][0]
1481 pos = mis_matches[i][1] # sort on pos?
1482 look_up_pos = mis_matches[i][2]
1483 look_up_mut = mis_matches[i][3]
1484 mut_name = mis_matches[i][4]
1485 # nuc_ref = mis_matches[i][5]
1486 # nuc_alt = mis_matches[i][6]
1487 ref = mis_matches[i][-2]
1488 alt = mis_matches[i][-1]
1489 mut_dict = PointFinder.mutstr2mutdict(mut_name)
1490
1491 mut_id = ("{gene}_{pos}_{alt}"
1492 .format(gene=gene_name, pos=pos, alt=alt.lower()))
1493
1494 ref_aa = mut_dict.get("ref_aa", None)
1495 ref_aa_right = mut_dict.get("ref_aa_right", None)
1496 mut_aa = mut_dict.get("mut_aa", None)
1497 ref_nuc = mut_dict.get("ref_nuc", None)
1498 mut_nuc = mut_dict.get("mut_nuc", None)
1499 is_nuc = mut_dict.get("nucleotide", None)
1500 is_ins = mut_dict.get("insertion", None)
1501 is_del = mut_dict.get("deletion", None)
1502 mut_end = mut_dict.get("mut_end", None)
1503 prem_stop = mut_dict.get("prem_stop", False)
1504
1505 mut = ResMutation(unique_id=mut_id, seq_region=gene_name, pos=pos,
1506 hit=hit, ref_codon=ref_nuc, mut_codon=mut_nuc,
1507 ref_aa=ref_aa, ref_aa_right=ref_aa_right,
1508 mut_aa=mut_aa, insertion=is_ins, deletion=is_del,
1509 end=mut_end, nuc=is_nuc,
1510 premature_stop=prem_stop, ab_class=None)
1511
1512 if "Premature stop codon" in mut_name:
1513 sbjct_len = hit['sbjct_length']
1514 qry_len = pos * 3
1515 perc_trunc = round(
1516 ((float(sbjct_len) - qry_len)
1517 / float(sbjct_len))
1518 * 100, 2
1519 )
1520 mut.premature_stop = perc_trunc
1521
1522 # Check if mutation is known
1523 gene_mut_name, resistence, pmid = self.look_up_known_muts(
1524 gene, look_up_pos, look_up_mut, m_type, gene_name)
1525
1526 # Collect known mutations
1527 if resistence != "Unknown":
1528 known_muts.append(mut)
1529 # Collect unknown mutations
1530 else:
1531 unknown_muts.append(mut)
1532
1533 # TODO: Use ResMutation class to make sure identical mutations are
1534 # not kept.
1535
1536 return (known_muts, unknown_muts)
1537
1538 def mut2str(self, gene, gene_name, mis_matches, unknown_flag, hit):
1539 """
1540 This function takes a gene name a list of mis matches found
1541 between subject and query of this gene, the dictionary of
1542 known mutation in the point finder database, and the flag
1543 telling weather the user wants unknown mutations to be
1544 reported. All mis matches are looked up in the known
1545 mutation dict to se if the mutation is known, and in this
1546 case what drug resistence it causes. The funtions return 3
1547 strings that are used as output to the users. One string is
1548 only tab seperated and contains the mutations listed line
1549 by line. If the unknown flag is set to true it will contain
1550 both known and unknown mutations. The next string contains
1551 only known mutation and are given in in a format that is
1552 easy to convert to HTML. The last string is the HTML tab
1553 sting from the unknown mutations.
1554 """
1555 known_header = ("Mutation\tNucleotide change\tAmino acid change\t"
1556 "Resistance\tPMID\n")
1557
1558 unknown_header = "Mutation\tNucleotide change\tAmino acid change\n"
1559
1560 RNA = False
1561 if gene in self.RNA_gene_list:
1562 RNA = True
1563 known_header = "Mutation\tNucleotide change\tResistance\tPMID\n"
1564 unknown_header = "Mutation\tNucleotide change\n"
1565
1566 known_lst = []
1567 unknown_lst = []
1568 all_results_lst = []
1569 output_mut = []
1570 stop_codons = []
1571
1572 # Go through each mutation
1573 for i in range(len(mis_matches)):
1574 m_type = mis_matches[i][0]
1575 pos = mis_matches[i][1] # sort on pos?
1576 look_up_pos = mis_matches[i][2]
1577 look_up_mut = mis_matches[i][3]
1578 mut_name = mis_matches[i][4]
1579 nuc_ref = mis_matches[i][5]
1580 nuc_alt = mis_matches[i][6]
1581 ref = mis_matches[i][-2]
1582 alt = mis_matches[i][-1]
1583
1584 # First index in list indicates if mutation is known
1585 output_mut += [[]]
1586
1587 # Define output vaiables
1588 codon_change = nuc_ref + " -> " + nuc_alt
1589 aa_change = ref + " -> " + alt
1590
1591 if RNA is True:
1592 aa_change = "RNA mutations"
1593 elif pos < 0:
1594 aa_change = "Promoter mutations"
1595
1596 # Check if mutation is known
1597 gene_mut_name, resistence, pmid = self.look_up_known_muts(
1598 gene, look_up_pos, look_up_mut, m_type, gene_name)
1599
1600 gene_mut_name = gene_mut_name + " " + mut_name
1601
1602 output_mut[i] = [gene_mut_name, codon_change, aa_change,
1603 resistence, pmid]
1604
1605 # Add mutation to output strings for known mutations
1606 if resistence != "Unknown":
1607 if RNA is True:
1608 # don't include the amino acid change field for
1609 # RNA mutations
1610 known_lst += ["\t".join(output_mut[i][:2]) + "\t"
1611 + "\t".join(output_mut[i][3:])]
1612 else:
1613 known_lst += ["\t".join(output_mut[i])]
1614
1615 all_results_lst += ["\t".join(output_mut[i])]
1616
1617 # Add mutation to output strings for unknown mutations
1618 else:
1619 if RNA is True:
1620 unknown_lst += ["\t".join(output_mut[i][:2])]
1621 else:
1622 unknown_lst += ["\t".join(output_mut[i][:3])]
1623
1624 if unknown_flag is True:
1625 all_results_lst += ["\t".join(output_mut[i])]
1626
1627 # Check that you do not print two equal lines (can happen
1628 # if two indels occure in the same codon)
1629 if len(output_mut) > 1:
1630 if output_mut[i] == output_mut[i - 1]:
1631
1632 if resistence != "Unknown":
1633 known_lst = known_lst[:-1]
1634 all_results_lst = all_results_lst[:-1]
1635 else:
1636 unknown_lst = unknown_lst[:-1]
1637
1638 if unknown_flag is True:
1639 all_results_lst = all_results_lst[:-1]
1640
1641 if "Premature stop codon" in mut_name:
1642 sbjct_len = hit['sbjct_length']
1643 qry_len = pos * 3
1644 prec_truckat = round(
1645 ((float(sbjct_len) - qry_len)
1646 / float(sbjct_len))
1647 * 100, 2
1648 )
1649 perc = "%"
1650 stop_codons.append("Premature stop codon in %s, %.2f%s lost"
1651 % (gene, prec_truckat, perc))
1652
1653 # Creat final strings
1654 if(all_results_lst):
1655 all_results = "\n".join(all_results_lst)
1656 else:
1657 all_results = ""
1658 total_known_str = ""
1659 total_unknown_str = ""
1660
1661 # Check if there are only unknown mutations
1662 resistence_lst = []
1663 for mut in output_mut:
1664 for res in mut[3].split(","):
1665 resistence_lst.append(res)
1666
1667 # Save known mutations
1668 unknown_no = resistence_lst.count("Unknown")
1669 if unknown_no < len(resistence_lst):
1670 total_known_str = known_header + "\n".join(known_lst)
1671 else:
1672 total_known_str = "No known mutations found in %s" % gene_name
1673
1674 # Save unknown mutations
1675 if unknown_no > 0:
1676 total_unknown_str = unknown_header + "\n".join(unknown_lst)
1677 else:
1678 total_unknown_str = "No unknown mutations found in %s" % gene_name
1679
1680 return (all_results, total_known_str, total_unknown_str,
1681 resistence_lst + stop_codons)
1682
1683 @staticmethod
1684 def get_file_content(file_path, fst_char_only=False):
1685 """
1686 This function opens a file, given as the first argument
1687 file_path and returns the lines of the file in a list or the
1688 first character of the file if fst_char_only is set to True.
1689 """
1690 with open(file_path, "r") as infile:
1691 line_lst = []
1692 for line in infile:
1693 line = line.strip()
1694 if line != "":
1695 line_lst.append(line)
1696 if fst_char_only:
1697 return line_lst[0][0]
1698 return line_lst
1699
1700 def look_up_known_muts(self, gene, pos, found_mut, mut, gene_name):
1701 """
1702 This function uses the known_mutations dictionay, a gene a
1703 string with the gene key name, a gene position as integer,
1704 found_mut is given as amino acid or nucleotides, the
1705 mutation type (mut) is either "del", "ins", or "sub", and
1706 gene_name is the gene name that should be returned to the
1707 user. The function looks up if the found_mut defined by the
1708 gene, position and the found_mut string is given in the
1709 known_mutations dictionary, if it is, the resistance and
1710 the pmid are returned together with the gene_name given in
1711 the known_mutations dict. If the mutation type is "del" the
1712 deleted nucleotids are checked to be contained in any of
1713 the deletion described in the known_mutation dict.
1714 """
1715 resistence = "Unknown"
1716 pmid = "-"
1717 found_mut = found_mut.upper()
1718
1719 if mut == "del":
1720 for i, i_pos in enumerate(range(pos, pos + len(found_mut))):
1721
1722 known_indels = self.known_mutations[gene]["del"].get(i_pos, [])
1723 for known_indel in known_indels:
1724 partial_mut = found_mut[i:len(known_indel) + i]
1725
1726 # Check if part of found mut is known and check if
1727 # found mut and known mut is in the same reading
1728 # frame
1729 if(partial_mut == known_indel
1730 and len(found_mut) % 3 == len(known_indel) % 3):
1731
1732 resistence = (self.known_mutations[gene]["del"][i_pos]
1733 [known_indel]['drug'])
1734
1735 pmid = (self.known_mutations[gene]["del"][i_pos]
1736 [known_indel]['pmid'])
1737
1738 gene_name = (self.known_mutations[gene]["del"][i_pos]
1739 [known_indel]['gene_name'])
1740 break
1741 else:
1742 if pos in self.known_mutations[gene][mut]:
1743 if found_mut in self.known_mutations[gene][mut][pos]:
1744 resistence = (self.known_mutations[gene][mut][pos]
1745 [found_mut]['drug'])
1746
1747 pmid = (self.known_mutations[gene][mut][pos][found_mut]
1748 ['pmid'])
1749
1750 gene_name = (self.known_mutations[gene][mut][pos]
1751 [found_mut]['gene_name'])
1752
1753 # Check if stop codons refer resistance
1754 if "*" in found_mut and gene in self.known_stop_codon:
1755 if resistence == "Unknown":
1756 resistence = self.known_stop_codon[gene]["drug"]
1757 else:
1758 resistence += "," + self.known_stop_codon[gene]["drug"]
1759
1760 return (gene_name, resistence, pmid)
1761
1762
1763 if __name__ == '__main__':
1764
1765 ##########################################################################
1766 # PARSE COMMAND LINE OPTIONS
1767 ##########################################################################
1768
1769 parser = argparse.ArgumentParser(
1770 description="This program predicting resistance associated with \
1771 chromosomal mutations based on WGS data",
1772 prog="pointfinder.py")
1773
1774 # required arguments
1775 parser.add_argument("-i",
1776 dest="inputfiles",
1777 metavar="INFILE",
1778 nargs='+',
1779 help="Input file. fastq file(s) from one sample for \
1780 KMA or one fasta file for blastn.",
1781 required=True)
1782 parser.add_argument("-o",
1783 dest="out_path",
1784 metavar="OUTFOLDER",
1785 help="Output folder, output files will be stored \
1786 here.",
1787 required=True)
1788 parser.add_argument("-s",
1789 dest="species",
1790 metavar="SPECIES",
1791 choices=['e.coli', 'gonorrhoeae', 'campylobacter',
1792 'salmonella', 'tuberculosis'],
1793 help="Species of choice, e.coli, tuberculosis, \
1794 salmonella, campylobactor, gonorrhoeae, \
1795 klebsiella, or malaria",
1796 required=True)
1797 parser.add_argument("-m",
1798 dest="method",
1799 metavar="METHOD",
1800 choices=["kma", "blastn"],
1801 help="Method of choice, kma or blastn",
1802 required=True)
1803 parser.add_argument("-m_p",
1804 dest="method_path",
1805 help="Path to the location of blastn or kma dependent \
1806 of the chosen method",
1807 required=True)
1808 parser.add_argument("-p",
1809 dest="db_path",
1810 metavar="DATABASE",
1811 help="Path to the location of the pointfinder \
1812 database",
1813 required=True)
1814
1815 # optional arguments
1816 parser.add_argument("-t",
1817 dest="threshold",
1818 metavar="IDENTITY",
1819 help="Minimum gene identity threshold, default = 0.9",
1820 type=float,
1821 default=0.9)
1822 parser.add_argument("-l",
1823 dest="min_cov",
1824 metavar="COVERAGE",
1825 help="Minimum gene coverage threshold, \
1826 threshold = 0.6",
1827 type=float,
1828 default=0.6)
1829 parser.add_argument("-u",
1830 dest="unknown_mutations",
1831 help="Show all mutations found even if it's unknown \
1832 to the resistance database.",
1833 action='store_true',
1834 default=False)
1835 parser.add_argument("-g",
1836 dest="specific_gene",
1837 nargs='+',
1838 help="Specify genes existing in the database to \
1839 search for - if none is specified all genes are \
1840 included in the search.",
1841 default=None)
1842
1843 args = parser.parse_args()
1844
1845 # If no arguments are given print usage message and exit
1846 if len(sys.argv) == 1:
1847 sys.exit("Usage: " + parser.usage)
1848
1849 if(args.method == "blastn"):
1850 method = PointFinder.TYPE_BLAST
1851 else:
1852 method = PointFinder.TYPE_KMA
1853
1854 # Get sample name
1855 filename = args.inputfiles[0].split("/")[-1]
1856 sample_name = "".join(filename.split(".")[0:-1]) # .split("_")[0]
1857 if sample_name == "":
1858 sample_name = filename
1859
1860 # TODO: Change ilogocal var name
1861 kma_db_path = args.db_path + "/" + args.species
1862
1863 finder = PointFinder(db_path=kma_db_path, species=args.species,
1864 gene_list=args.specific_gene)
1865
1866 if method == PointFinder.TYPE_BLAST:
1867
1868 # Check that only one input file is given
1869 if len(args.inputfiles) != 1:
1870 sys.exit("Input Error: Blast was chosen as mapping method only 1 "
1871 "input file requied, not %s" % (len(args.inputfiles)))
1872
1873 finder_run = finder.blast(inputfile=args.inputfiles[0],
1874 out_path=args.out_path,
1875 min_cov=0.01,
1876 threshold=args.threshold,
1877 blast=args.method_path,
1878 cut_off=False)
1879 else:
1880 inputfile_1 = args.inputfiles[0]
1881 inputfile_2 = None
1882
1883 if(len(args.inputfiles) == 2):
1884 inputfile_2 = args.inputfiles[1]
1885 finder_run = finder.kma(inputfile_1=inputfile_1,
1886 inputfile_2=inputfile_2,
1887 out_path=args.out_path,
1888 db_path_kma=kma_db_path,
1889 databases=[args.species],
1890 min_cov=args.min_cov,
1891 threshold=args.threshold,
1892 kma_path=args.method_path,
1893 sample_name=sample_name,
1894 kma_mrs=0.5, kma_gapopen=-5, kma_gapextend=-2,
1895 kma_penalty=-3, kma_reward=1)
1896
1897 results = finder_run.results
1898
1899 if(args.specific_gene):
1900 results = PointFinder.discard_unwanted_results(
1901 results=results, wanted=args.specific_gene)
1902
1903 finder.write_results(out_path=args.out_path, result=results, min_cov=args.min_cov,
1904 res_type=method, unknown_flag=args.unknown_mutations)