Mercurial > repos > dcouvin > resfinder4
comparison resfinder/cge/pointfinder.py @ 0:55051a9bc58d draft default tip
Uploaded
| author | dcouvin |
|---|---|
| date | Mon, 10 Jan 2022 20:06:07 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:55051a9bc58d |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 # | |
| 3 # Program: PointFinder-3.0 | |
| 4 # Author: Camilla Hundahl Johnsen | |
| 5 # | |
| 6 # Dependencies: KMA or NCBI-blast together with BioPython. | |
| 7 | |
| 8 import os | |
| 9 import re | |
| 10 import sys | |
| 11 import math | |
| 12 import argparse | |
| 13 import subprocess | |
| 14 import random | |
| 15 | |
| 16 from cgecore.blaster import Blaster | |
| 17 from cgecore.cgefinder import CGEFinder | |
| 18 from .output.table import TableResults | |
| 19 from .phenotype2genotype.feature import ResMutation | |
| 20 from .phenotype2genotype.res_profile import PhenoDB | |
| 21 | |
| 22 | |
| 23 def eprint(*args, **kwargs): | |
| 24 print(*args, file=sys.stderr, **kwargs) | |
| 25 | |
| 26 | |
| 27 class GeneListError(Exception): | |
| 28 """ Raise when a specified gene is not found within the gene list. | |
| 29 """ | |
| 30 def __init__(self, message, *args): | |
| 31 self.message = message | |
| 32 # allow users initialize misc. arguments as any other builtin Error | |
| 33 super(PanelNameError, self).__init__(message, *args) | |
| 34 | |
| 35 | |
| 36 class PointFinder(CGEFinder): | |
| 37 | |
| 38 def __init__(self, db_path, species, gene_list=None): | |
| 39 """ | |
| 40 """ | |
| 41 self.species = species | |
| 42 self.specie_path = db_path | |
| 43 self.RNA_gene_list = [] | |
| 44 | |
| 45 self.gene_list = PointFinder.get_file_content( | |
| 46 self.specie_path + "/genes.txt") | |
| 47 | |
| 48 if os.path.isfile(self.specie_path + "/RNA_genes.txt"): | |
| 49 self.RNA_gene_list = PointFinder.get_file_content( | |
| 50 self.specie_path + "/RNA_genes.txt") | |
| 51 | |
| 52 # Creat user defined gene_list if given | |
| 53 if(gene_list is not None): | |
| 54 self.gene_list = get_user_defined_gene_list(gene_list) | |
| 55 | |
| 56 self.known_mutations, self.drug_genes, self.known_stop_codon = ( | |
| 57 self.get_db_mutations(self.specie_path + "/resistens-overview.txt", | |
| 58 self.gene_list)) | |
| 59 | |
| 60 def get_user_defined_gene_list(self, gene_list): | |
| 61 genes_specified = [] | |
| 62 for gene in gene_list: | |
| 63 # Check that the genes are valid | |
| 64 if gene not in self.gene_list: | |
| 65 raise(GeneListError( | |
| 66 "Input Error: Specified gene not recognised " | |
| 67 "(%s)\nChoose one or more of the following genes:" | |
| 68 "\n%s" % (gene, "\n".join(self.gene_list)))) | |
| 69 genes_specified.append(gene) | |
| 70 # Change the gene_list to the user defined gene_list | |
| 71 return genes_specified | |
| 72 | |
| 73 #DELETE | |
| 74 def old_results_to_standard_output(self, result, software, version, | |
| 75 run_date, run_cmd, id, | |
| 76 unknown_mut=False, tableresults=None): | |
| 77 """ | |
| 78 """ | |
| 79 std_results = TableResults(software, version, run_date, run_cmd, id) | |
| 80 headers = [ | |
| 81 "template_name", | |
| 82 "template_length", | |
| 83 "aln_length", | |
| 84 "aln_identity", | |
| 85 "aln_gaps", | |
| 86 "aln_template_string", | |
| 87 "aln_query_string", | |
| 88 "aln_homology_string", | |
| 89 "query_id", | |
| 90 "query_start_pos", | |
| 91 "query_end_pos", | |
| 92 "template_variant", | |
| 93 "query_depth", | |
| 94 "blast_eval", | |
| 95 "blast_bitscore", | |
| 96 "pval", | |
| 97 "software_score", | |
| 98 "mutation", | |
| 99 "ref_codon", | |
| 100 "alt_codon", | |
| 101 "ref_aa", | |
| 102 "alt_aa", | |
| 103 "insertion", | |
| 104 "deletion" | |
| 105 ] | |
| 106 for db_name, db in result.items(): | |
| 107 if(db_name == "excluded"): | |
| 108 continue | |
| 109 | |
| 110 if(db == "No hit found"): | |
| 111 continue | |
| 112 | |
| 113 ####Added to solve PointFinder#### | |
| 114 if(isinstance(db, str)): | |
| 115 if db.startswith("Gene found with coverage"): | |
| 116 continue | |
| 117 #### | |
| 118 | |
| 119 # Start writing output string (to HTML tab file) | |
| 120 gene_name = PhenoDB.if_promoter_rename(db_name) | |
| 121 | |
| 122 # Find and save mis_matches in gene | |
| 123 sbjct_start = db["sbjct_start"] | |
| 124 sbjct_seq = db["sbjct_string"] | |
| 125 qry_seq = db["query_string"] | |
| 126 db["mis_matches"] = self.find_mismatches(gene=db_name, | |
| 127 sbjct_start=sbjct_start, | |
| 128 sbjct_seq=sbjct_seq, | |
| 129 qry_seq=qry_seq) | |
| 130 | |
| 131 known_muts = [] | |
| 132 unknown_muts = [] | |
| 133 if(len(db["mis_matches"]) > 0): | |
| 134 known_muts, unknown_muts = self.get_mutations( | |
| 135 db_name, gene_name, db['mis_matches'], True, db) | |
| 136 | |
| 137 # No mutations found | |
| 138 if(not (unknown_muts or known_muts)): | |
| 139 continue | |
| 140 | |
| 141 std_results.add_table(db_name) | |
| 142 std_db = std_results.long[db_name] | |
| 143 std_db.add_headers(headers) | |
| 144 std_db.lock_headers = True | |
| 145 | |
| 146 for mut in known_muts: | |
| 147 unique_id = mut.unique_id | |
| 148 std_db[unique_id] = { | |
| 149 "template_name": db_name, | |
| 150 "template_length": db["sbjct_length"], | |
| 151 "aln_length": db["HSP_length"], | |
| 152 "aln_identity": db["perc_ident"], | |
| 153 "aln_gaps": db["gaps"], | |
| 154 "aln_template_string": db["sbjct_string"], | |
| 155 "aln_query_string": db["query_string"], | |
| 156 "aln_homology_string": db["homo_string"], | |
| 157 "query_id": db["contig_name"], | |
| 158 "query_start_pos": mut.pos, | |
| 159 "query_end_pos": mut.end, | |
| 160 "query_depth": db.get("depth", "NA"), | |
| 161 "pval": db.get("p_value", "NA"), | |
| 162 "software_score": db["cal_score"], | |
| 163 "mutation": mut.mut_string_short, | |
| 164 "ref_codon": mut.ref_codon, | |
| 165 "alt_codon": mut.mut_codon, | |
| 166 "ref_aa": mut.ref_aa, | |
| 167 "alt_aa": mut.mut_aa, | |
| 168 "insertion": mut.insertion, | |
| 169 "deletion": mut.deletion | |
| 170 } | |
| 171 | |
| 172 return std_results | |
| 173 | |
| 174 @staticmethod | |
| 175 def get_db_names(db_root_path): | |
| 176 out_lst = [] | |
| 177 for entry in os.scandir(db_root_path): | |
| 178 if not entry.name.startswith('.') and entry.is_dir(): | |
| 179 out_lst.append(entry.name) | |
| 180 return tuple(out_lst) | |
| 181 | |
| 182 def results_to_str(self, res_type, results, unknown_flag, min_cov, | |
| 183 perc_iden): | |
| 184 # Initiate output stings with headers | |
| 185 gene_list = ', '.join(self.gene_list) | |
| 186 output_strings = [ | |
| 187 "Mutation\tNucleotide change\tAmino acid change\tResistance\tPMID", | |
| 188 "Chromosomal point mutations - Results\nSpecies: %s\nGenes: %s\n" | |
| 189 "Mapping methode: %s\n\n\nKnown Mutations\n" % (self.species, gene_list, res_type), "" | |
| 190 ] | |
| 191 # Get all drug names and add header of all drugs to prediction file | |
| 192 drug_lst = [drug for drug in self.drug_genes.keys()] | |
| 193 output_strings[2] = "\t".join(drug_lst) + "\n" | |
| 194 | |
| 195 # Define variables to write temporary output into | |
| 196 total_unknown_str = "" | |
| 197 unique_drug_list = [] | |
| 198 excluded_hits = {} | |
| 199 | |
| 200 if res_type == PointFinder.TYPE_BLAST: | |
| 201 # Patch together partial sequence hits (BLASTER does not do this) | |
| 202 # and removes dummy_hit_id | |
| 203 GENES = self.find_best_seqs(results, min_cov) | |
| 204 | |
| 205 else: | |
| 206 # TODO: Some databases are either genes or species, | |
| 207 # depending on the method applied (BLAST/KMA). | |
| 208 GENES = results[self.species] | |
| 209 | |
| 210 # If not hits found insert genes not found | |
| 211 if GENES == 'No hit found': | |
| 212 GENES = {} | |
| 213 | |
| 214 # KMA only gives the genes found, however all genes should be | |
| 215 # included | |
| 216 for gene in self.gene_list: | |
| 217 if gene not in GENES: | |
| 218 GENES[gene] = 'No hit found' | |
| 219 | |
| 220 # Included excluded | |
| 221 GENES['excluded'] = results['excluded'] | |
| 222 | |
| 223 for gene, hit in GENES.items(): | |
| 224 # Start writing output string (to HTML tab file) | |
| 225 gene_name = gene # Not perfeft can differ from specific mutation | |
| 226 regex = r"promoter_size_(\d+)(?:bp)" | |
| 227 promtr_gene_objt = re.search(regex, gene) | |
| 228 | |
| 229 if promtr_gene_objt: | |
| 230 gene_name = gene.split("_")[0] | |
| 231 | |
| 232 output_strings[1] += "\n%s\n" % (gene_name) | |
| 233 | |
| 234 # Ignore excluded genes | |
| 235 # TODO: Should all not found genes be moved to this dict? | |
| 236 if gene == "excluded": | |
| 237 continue | |
| 238 | |
| 239 # Write exclusion reason for genes not found | |
| 240 if isinstance(GENES[gene], str): | |
| 241 output_strings[1] += GENES[gene] + "\n" | |
| 242 continue | |
| 243 | |
| 244 if hit['perc_ident'] < perc_iden and hit['perc_coverage'] < min_cov: | |
| 245 GENES[gene] = ('Gene found with identity, %s, below minimum ' | |
| 246 'identity threshold: %s. And with coverage, %s,' | |
| 247 ' below minimum coverage threshold: %s.' | |
| 248 % (hit['perc_ident'], perc_iden, | |
| 249 hit['perc_coverage'], min_cov)) | |
| 250 output_strings[1] += GENES[gene] + "\n" | |
| 251 continue | |
| 252 | |
| 253 if hit['perc_ident'] < perc_iden: | |
| 254 GENES[gene] = ('Gene found with identity, %s, below minimum ' | |
| 255 'identity threshold: %s' % (hit['perc_ident'], | |
| 256 perc_iden)) | |
| 257 output_strings[1] += GENES[gene] + "\n" | |
| 258 continue | |
| 259 | |
| 260 if hit['perc_coverage'] < min_cov: | |
| 261 # Gene not found above given coverage | |
| 262 GENES[gene] = ('Gene found with coverage, %s, below minimum ' | |
| 263 'coverage threshold: %s' % (hit['perc_coverage'], | |
| 264 min_cov)) | |
| 265 output_strings[1] += GENES[gene] + "\n" | |
| 266 continue | |
| 267 | |
| 268 sbjct_start = hit['sbjct_start'] | |
| 269 sbjct_seq = hit['sbjct_string'] | |
| 270 qry_seq = hit['query_string'] | |
| 271 | |
| 272 # Find and save mis_matches in gene | |
| 273 hit['mis_matches'] = self.find_mismatches(gene, sbjct_start, | |
| 274 sbjct_seq, qry_seq) | |
| 275 | |
| 276 # Check if no mutations was found | |
| 277 if len(hit['mis_matches']) < 1: | |
| 278 output_strings[1] += ("No mutations found in {}\n" | |
| 279 .format(gene_name)) | |
| 280 else: | |
| 281 # Write mutations found to output file | |
| 282 total_unknown_str += "\n%s\n" % (gene_name) | |
| 283 | |
| 284 str_tuple = self.mut2str(gene, gene_name, hit['mis_matches'], | |
| 285 unknown_flag, hit) | |
| 286 | |
| 287 all_results = str_tuple[0] | |
| 288 total_known = str_tuple[1] | |
| 289 total_unknown = str_tuple[2] | |
| 290 drug_list = str_tuple[3] | |
| 291 | |
| 292 # Add results to output strings | |
| 293 if(all_results): | |
| 294 output_strings[0] += "\n" + all_results | |
| 295 output_strings[1] += total_known + "\n" | |
| 296 | |
| 297 # Add unknown mutations the total results of | |
| 298 # unknown mutations | |
| 299 total_unknown_str += total_unknown + "\n" | |
| 300 | |
| 301 # Add drugs to druglist | |
| 302 unique_drug_list += [drug.lower() for drug in drug_list] | |
| 303 | |
| 304 if unknown_flag is True: | |
| 305 output_strings[1] += ("\n\nUnknown Mutations \n" | |
| 306 + total_unknown_str) | |
| 307 | |
| 308 # Make Resistance Prediction output | |
| 309 | |
| 310 # Go throug all drugs in the database and see if prediction | |
| 311 # can be called. | |
| 312 pred_output = [] | |
| 313 for drug in drug_lst: | |
| 314 drug = drug.lower() | |
| 315 # Check if resistance to drug was found | |
| 316 if drug in unique_drug_list: | |
| 317 pred_output.append("1") | |
| 318 else: | |
| 319 # Check at all genes associated with the drug | |
| 320 # resistance where found | |
| 321 all_genes_found = True | |
| 322 for gene in self.drug_genes[drug]: | |
| 323 if gene not in GENES: | |
| 324 all_genes_found = False | |
| 325 | |
| 326 if all_genes_found is False: | |
| 327 pred_output.append("?") | |
| 328 else: | |
| 329 pred_output.append("0") | |
| 330 | |
| 331 output_strings[2] += "\t".join(pred_output) + "\n" | |
| 332 | |
| 333 return output_strings | |
| 334 | |
| 335 def write_results(self, out_path, result, res_type, unknown_flag, min_cov, | |
| 336 perc_iden): | |
| 337 """ | |
| 338 """ | |
| 339 | |
| 340 result_str = self.results_to_str(res_type=res_type, | |
| 341 results=result, | |
| 342 unknown_flag=unknown_flag, | |
| 343 min_cov=min_cov, | |
| 344 perc_iden=perc_iden) | |
| 345 | |
| 346 with open(out_path + "/PointFinder_results.txt", "w") as fh: | |
| 347 fh.write(result_str[0]) | |
| 348 with open(out_path + "/PointFinder_table.txt", "w") as fh: | |
| 349 fh.write(result_str[1]) | |
| 350 with open(out_path + "/PointFinder_prediction.txt", "w") as fh: | |
| 351 fh.write(result_str[2]) | |
| 352 | |
| 353 @staticmethod | |
| 354 def discard_unwanted_results(results, wanted): | |
| 355 """ | |
| 356 Takes a dict and a list of keys. | |
| 357 Returns a dict containing only the keys from the list. | |
| 358 """ | |
| 359 cleaned_results = dict() | |
| 360 for key, val in results.items(): | |
| 361 if(key in wanted): | |
| 362 cleaned_results[key] = val | |
| 363 return cleaned_results | |
| 364 | |
| 365 def blast(self, inputfile, out_path, min_cov=0.6, threshold=0.9, | |
| 366 blast="blastn", cut_off=True): | |
| 367 """ | |
| 368 """ | |
| 369 blast_run = Blaster(inputfile=inputfile, databases=self.gene_list, | |
| 370 db_path=self.specie_path, out_path=out_path, | |
| 371 min_cov=min_cov, threshold=threshold, blast=blast, | |
| 372 cut_off=cut_off) | |
| 373 | |
| 374 self.blast_results = blast_run.results # TODO Is this used? | |
| 375 return blast_run | |
| 376 | |
| 377 @staticmethod | |
| 378 def get_db_mutations(mut_db_path, gene_list): | |
| 379 """ | |
| 380 This function opens the file resistenss-overview.txt, and reads | |
| 381 the content into a dict of dicts. The dict will contain | |
| 382 information about all known mutations given in the database. | |
| 383 This dict is returned. | |
| 384 """ | |
| 385 | |
| 386 # Initiate variables | |
| 387 known_mutations = dict() | |
| 388 known_stop_codon = dict() | |
| 389 drug_genes = dict() | |
| 390 indelflag = False | |
| 391 stopcodonflag = False | |
| 392 | |
| 393 # Go throug mutation file line by line | |
| 394 | |
| 395 with open(mut_db_path, "r") as fh: | |
| 396 mutdb_file = fh.readlines() | |
| 397 mutdb_file = [line.strip() for line in mutdb_file] | |
| 398 | |
| 399 for line in mutdb_file: | |
| 400 # Ignore headers and check where the indel section starts | |
| 401 if line.startswith("#"): | |
| 402 if "indel" in line.lower(): | |
| 403 indelflag = True | |
| 404 elif "stop codon" in line.lower(): | |
| 405 stopcodonflag = True | |
| 406 else: | |
| 407 stopcodonflag = False | |
| 408 continue | |
| 409 | |
| 410 mutation = [data.strip() for data in line.split("\t")] | |
| 411 | |
| 412 gene_ID = mutation[0] | |
| 413 | |
| 414 # Only consider mutations in genes found in the gene list | |
| 415 if gene_ID in gene_list: | |
| 416 gene_name = mutation[1] | |
| 417 mut_pos = int(mutation[2]) | |
| 418 ref_codon = mutation[3] # Ref_nuc (1 or 3?) | |
| 419 ref_aa = mutation[4] # Ref_codon | |
| 420 alt_aa = mutation[5].split(",") # Res_codon | |
| 421 res_drug = mutation[6].replace("\t", " ") | |
| 422 pmid = mutation[7].split(",") | |
| 423 | |
| 424 # Check if stop codons are predictive for resistance | |
| 425 if stopcodonflag is True: | |
| 426 if gene_ID not in known_stop_codon: | |
| 427 known_stop_codon[gene_ID] = {"pos": [], | |
| 428 "drug": res_drug} | |
| 429 known_stop_codon[gene_ID]["pos"].append(mut_pos) | |
| 430 | |
| 431 # Add genes associated with drug resistance to drug_genes dict | |
| 432 drug_lst = res_drug.split(",") | |
| 433 drug_lst = [d.strip().lower() for d in drug_lst] | |
| 434 for drug in drug_lst: | |
| 435 if drug not in drug_genes: | |
| 436 drug_genes[drug] = [] | |
| 437 if gene_ID not in drug_genes[drug]: | |
| 438 drug_genes[drug].append(gene_ID) | |
| 439 | |
| 440 # Initiate empty dict to store relevant mutation information | |
| 441 mut_info = dict() | |
| 442 | |
| 443 # Save need mutation info with pmid cooresponding to the amino | |
| 444 # acid change | |
| 445 for i in range(len(alt_aa)): | |
| 446 try: | |
| 447 mut_info[alt_aa[i]] = {"gene_name": gene_name, | |
| 448 "drug": res_drug, | |
| 449 "pmid": pmid[i]} | |
| 450 except IndexError: | |
| 451 mut_info[alt_aa[i]] = {"gene_name": gene_name, | |
| 452 "drug": res_drug, | |
| 453 "pmid": "-"} | |
| 454 | |
| 455 # Add all possible types of mutations to the dict | |
| 456 if gene_ID not in known_mutations: | |
| 457 known_mutations[gene_ID] = {"sub": dict(), "ins": dict(), | |
| 458 "del": dict()} | |
| 459 # Check for the type of mutation | |
| 460 if indelflag is False: | |
| 461 mutation_type = "sub" | |
| 462 else: | |
| 463 mutation_type = ref_aa | |
| 464 | |
| 465 # Save mutations positions with required information given in | |
| 466 # mut_info | |
| 467 if mut_pos not in known_mutations[gene_ID][mutation_type]: | |
| 468 known_mutations[gene_ID][mutation_type][mut_pos] = dict() | |
| 469 for amino in alt_aa: | |
| 470 if (amino in known_mutations[gene_ID][mutation_type] | |
| 471 [mut_pos]): | |
| 472 stored_mut_info = (known_mutations[gene_ID] | |
| 473 [mutation_type] | |
| 474 [mut_pos][amino]) | |
| 475 if stored_mut_info["drug"] != mut_info[amino]["drug"]: | |
| 476 stored_mut_info["drug"] += "," + (mut_info[amino] | |
| 477 ["drug"]) | |
| 478 if stored_mut_info["pmid"] != mut_info[amino]["pmid"]: | |
| 479 stored_mut_info["pmid"] += ";" + (mut_info[amino] | |
| 480 ["pmid"]) | |
| 481 | |
| 482 (known_mutations[gene_ID][mutation_type] | |
| 483 [mut_pos][amino]) = stored_mut_info | |
| 484 else: | |
| 485 (known_mutations[gene_ID][mutation_type] | |
| 486 [mut_pos][amino]) = mut_info[amino] | |
| 487 | |
| 488 # Check that all genes in the gene list has known mutations | |
| 489 for gene in gene_list: | |
| 490 if gene not in known_mutations: | |
| 491 known_mutations[gene] = {"sub": dict(), "ins": dict(), | |
| 492 "del": dict()} | |
| 493 return known_mutations, drug_genes, known_stop_codon | |
| 494 | |
| 495 def find_best_seqs(self, blast_results, min_cov): | |
| 496 """ | |
| 497 This function finds gene sequences with the largest coverage based on | |
| 498 the blast results. If more hits covering different sequences parts | |
| 499 exists it concatinates partial gene hits into one hit. | |
| 500 If different overlap sequences occurs they are saved in the list | |
| 501 alternative_overlaps. The function returns a new results dict where | |
| 502 each gene has an inner dict with hit information corresponding to | |
| 503 the new concatinated hit. If no gene is found the value is a string | |
| 504 with info of why the gene wasn't found. | |
| 505 """ | |
| 506 | |
| 507 GENES = dict() | |
| 508 for gene, hits in blast_results.items(): | |
| 509 | |
| 510 # Save all hits in the list 'hits_found' | |
| 511 hits_found = [] | |
| 512 GENES[gene] = {} | |
| 513 | |
| 514 # Check for gene has any hits in blast results | |
| 515 if gene == 'excluded': | |
| 516 GENES[gene] = hits | |
| 517 continue | |
| 518 elif isinstance(hits, dict) and len(hits) > 0: | |
| 519 GENES[gene]['found'] = 'partially' | |
| 520 GENES[gene]['hits'] = hits | |
| 521 else: | |
| 522 # Gene not found! go to next gene | |
| 523 GENES[gene] = 'No hit found' | |
| 524 continue | |
| 525 | |
| 526 # Check coverage for each hit, patch together partial genes hits | |
| 527 for hit_id, hit in hits.items(): | |
| 528 | |
| 529 # Save hits start and end positions in subject, total subject | |
| 530 # len, and subject and query sequences of each hit | |
| 531 hits_found += [(hit['sbjct_start'], hit['sbjct_end'], | |
| 532 hit['sbjct_string'], hit['query_string'], | |
| 533 hit['sbjct_length'], hit['homo_string'], | |
| 534 hit_id)] | |
| 535 | |
| 536 # If coverage is 100% change found to yes | |
| 537 hit_coverage = hit['perc_coverage'] | |
| 538 if hit_coverage == 1.0: | |
| 539 GENES[gene]['found'] = 'yes' | |
| 540 | |
| 541 # Sort positions found | |
| 542 hits_found = sorted(hits_found, key=lambda x: x[0]) | |
| 543 | |
| 544 # Find best hit by concatenating sequences if more hits exist | |
| 545 | |
| 546 # Get information from the first hit found | |
| 547 all_start = hits_found[0][0] | |
| 548 current_end = hits_found[0][1] | |
| 549 final_sbjct = hits_found[0][2] | |
| 550 final_qry = hits_found[0][3] | |
| 551 sbjct_len = hits_found[0][4] | |
| 552 final_homol = hits_found[0][5] | |
| 553 first_hit_id = hits_found[0][6] | |
| 554 | |
| 555 alternative_overlaps = [] | |
| 556 contigs = [hit['contig_name']] | |
| 557 scores = [str(hit['cal_score'])] | |
| 558 | |
| 559 # Check if more then one hit was found within the same gene | |
| 560 for i in range(len(hits_found) - 1): | |
| 561 | |
| 562 # Save information from previous hit | |
| 563 pre_block_start = hits_found[i][0] | |
| 564 pre_block_end = hits_found[i][1] | |
| 565 pre_sbjct = hits_found[i][2] | |
| 566 pre_qry = hits_found[i][3] | |
| 567 pre_homol = hits_found[i][5] | |
| 568 pre_id = hits_found[i][6] | |
| 569 | |
| 570 # Save information from next hit | |
| 571 next_block_start = hits_found[i + 1][0] | |
| 572 next_block_end = hits_found[i + 1][1] | |
| 573 next_sbjct = hits_found[i + 1][2] | |
| 574 next_qry = hits_found[i + 1][3] | |
| 575 next_homol = hits_found[i + 1][5] | |
| 576 next_id = hits_found[i + 1][6] | |
| 577 | |
| 578 contigs.append(hits[next_id]['contig_name']) | |
| 579 scores.append(str(hits[next_id]['cal_score'])) | |
| 580 | |
| 581 # Check for overlapping sequences, collaps them and save | |
| 582 # alternative overlaps if any | |
| 583 if next_block_start <= current_end: | |
| 584 | |
| 585 # Find overlap start and take gaps into account | |
| 586 pos_count = 0 | |
| 587 overlap_pos = pre_block_start | |
| 588 for i in range(len(pre_sbjct)): | |
| 589 | |
| 590 # Stop loop if overlap_start position is reached | |
| 591 if overlap_pos == next_block_start: | |
| 592 overlap_start = pos_count | |
| 593 break | |
| 594 if pre_sbjct[i] != "-": | |
| 595 overlap_pos += 1 | |
| 596 pos_count += 1 | |
| 597 | |
| 598 # Find overlap len and add next sequence to final sequence | |
| 599 if len(pre_sbjct[overlap_start:]) > len(next_sbjct): | |
| 600 # <---------> | |
| 601 # <---> | |
| 602 overlap_len = len(next_sbjct) | |
| 603 overlap_end_pos = next_block_end | |
| 604 else: | |
| 605 # <---------> | |
| 606 # <---------> | |
| 607 overlap_len = len(pre_sbjct[overlap_start:]) | |
| 608 overlap_end_pos = pre_block_end | |
| 609 | |
| 610 # Update current end | |
| 611 current_end = next_block_end | |
| 612 | |
| 613 # Use the entire previous sequence and add the last | |
| 614 # part of the next sequence | |
| 615 final_sbjct += next_sbjct[overlap_len:] | |
| 616 final_qry += next_qry[overlap_len:] | |
| 617 final_homol += next_homol[overlap_len:] | |
| 618 | |
| 619 # Find query overlap sequences | |
| 620 pre_qry_overlap = pre_qry[overlap_start: (overlap_start | |
| 621 + overlap_len)] | |
| 622 next_qry_overlap = next_qry[:overlap_len] | |
| 623 sbjct_overlap = next_sbjct[:overlap_len] | |
| 624 | |
| 625 # If alternative query overlap excist save it | |
| 626 if pre_qry_overlap != next_qry_overlap: | |
| 627 eprint("OVERLAP WARNING:") | |
| 628 eprint("{}\n{}" | |
| 629 .format(pre_qry_overlap, next_qry_overlap)) | |
| 630 | |
| 631 # Save alternative overlaps | |
| 632 alternative_overlaps += [(next_block_start, | |
| 633 overlap_end_pos, | |
| 634 sbjct_overlap, | |
| 635 next_qry_overlap)] | |
| 636 | |
| 637 elif next_block_start > current_end: | |
| 638 # <-------> | |
| 639 # <-------> | |
| 640 gap_size = next_block_start - current_end - 1 | |
| 641 final_qry += "N" * gap_size | |
| 642 final_sbjct += "N" * gap_size | |
| 643 final_homol += "-" * gap_size | |
| 644 current_end = next_block_end | |
| 645 final_sbjct += next_sbjct | |
| 646 final_qry += next_qry | |
| 647 final_homol += next_homol | |
| 648 | |
| 649 # Calculate new coverage | |
| 650 no_call = final_qry.upper().count("N") | |
| 651 coverage = ((current_end - all_start + 1 - no_call) | |
| 652 / float(sbjct_len)) | |
| 653 | |
| 654 # Calculate identity | |
| 655 equal = 0 | |
| 656 not_equal = 0 | |
| 657 for i in range(len(final_qry)): | |
| 658 if final_qry[i].upper() != "N": | |
| 659 if final_qry[i].upper() == final_sbjct[i].upper(): | |
| 660 equal += 1 | |
| 661 else: | |
| 662 not_equal += 1 | |
| 663 identity = equal / float(equal + not_equal) | |
| 664 if coverage >= min_cov: | |
| 665 GENES[gene]['perc_coverage'] = coverage | |
| 666 GENES[gene]['perc_ident'] = identity | |
| 667 GENES[gene]['sbjct_string'] = final_sbjct | |
| 668 GENES[gene]['query_string'] = final_qry | |
| 669 GENES[gene]['homo_string'] = final_homol | |
| 670 GENES[gene]['contig_name'] = ", ".join(contigs) | |
| 671 GENES[gene]['HSP_length'] = len(final_qry) | |
| 672 GENES[gene]['sbjct_start'] = all_start | |
| 673 GENES[gene]['sbjct_end'] = current_end | |
| 674 GENES[gene]['sbjct_length'] = sbjct_len | |
| 675 GENES[gene]['cal_score'] = ", ".join(scores) | |
| 676 GENES[gene]['gaps'] = no_call | |
| 677 GENES[gene]['alternative_overlaps'] = alternative_overlaps | |
| 678 GENES[gene]['mis_matches'] = [] | |
| 679 | |
| 680 else: | |
| 681 # Gene not found above given coverage | |
| 682 GENES[gene] = ('Gene found with coverage, %f, below ' | |
| 683 'minimum coverage threshold: %s' % (coverage, | |
| 684 min_cov)) | |
| 685 return GENES | |
| 686 | |
| 687 def find_mismatches(self, gene, sbjct_start, sbjct_seq, qry_seq): | |
| 688 """ | |
| 689 This function finds mis matches between two sequeces. Depending | |
| 690 on the the sequence type either the function | |
| 691 find_codon_mismatches or find_nucleotid_mismatches are called, | |
| 692 if the sequences contains both a promoter and a coding region | |
| 693 both functions are called. The function can also call itself if | |
| 694 alternative overlaps is give. All found mismatches are returned | |
| 695 """ | |
| 696 | |
| 697 # Initiate the mis_matches list that will store all found mis matcehs | |
| 698 mis_matches = [] | |
| 699 | |
| 700 # Find mis matches in RNA genes | |
| 701 if gene in self.RNA_gene_list: | |
| 702 mis_matches += PointFinder.find_nucleotid_mismatches(sbjct_start, | |
| 703 sbjct_seq, | |
| 704 qry_seq) | |
| 705 else: | |
| 706 # Check if the gene sequence is with a promoter | |
| 707 regex = r"promoter_size_(\d+)(?:bp)" | |
| 708 promtr_gene_objt = re.search(regex, gene) | |
| 709 | |
| 710 # Check for promoter sequences | |
| 711 if promtr_gene_objt: | |
| 712 # Get promoter length | |
| 713 promtr_len = int(promtr_gene_objt.group(1)) | |
| 714 | |
| 715 # Extract promoter sequence, while considering gaps | |
| 716 # --------agt-->---- | |
| 717 # ---->? | |
| 718 if sbjct_start <= promtr_len: | |
| 719 # Find position in sbjct sequence where promoter ends | |
| 720 promtr_end = 0 | |
| 721 nuc_count = sbjct_start - 1 | |
| 722 | |
| 723 for i in range(len(sbjct_seq)): | |
| 724 promtr_end += 1 | |
| 725 | |
| 726 if sbjct_seq[i] != "-": | |
| 727 nuc_count += 1 | |
| 728 | |
| 729 if nuc_count == promtr_len: | |
| 730 break | |
| 731 | |
| 732 # Check if only a part of the promoter is found | |
| 733 # --------agt-->---- | |
| 734 # ---- | |
| 735 promtr_sbjct_start = -1 | |
| 736 if nuc_count < promtr_len: | |
| 737 promtr_sbjct_start = nuc_count - promtr_len | |
| 738 | |
| 739 # Get promoter part of subject and query | |
| 740 sbjct_promtr_seq = sbjct_seq[:promtr_end] | |
| 741 qry_promtr_seq = qry_seq[:promtr_end] | |
| 742 | |
| 743 # For promoter part find nucleotide mis matches | |
| 744 mis_matches += PointFinder.find_nucleotid_mismatches( | |
| 745 promtr_sbjct_start, sbjct_promtr_seq, qry_promtr_seq, | |
| 746 promoter=True) | |
| 747 | |
| 748 # Check if gene is also found | |
| 749 # --------agt-->---- | |
| 750 # ----------- | |
| 751 if((sbjct_start + len(sbjct_seq.replace("-", ""))) | |
| 752 > promtr_len): | |
| 753 sbjct_gene_seq = sbjct_seq[promtr_end:] | |
| 754 qry_gene_seq = qry_seq[promtr_end:] | |
| 755 sbjct_gene_start = 1 | |
| 756 | |
| 757 # Find mismatches in gene part | |
| 758 mis_matches += PointFinder.find_codon_mismatches( | |
| 759 sbjct_gene_start, sbjct_gene_seq, qry_gene_seq) | |
| 760 | |
| 761 # No promoter, only gene is found | |
| 762 # --------agt-->---- | |
| 763 # ----- | |
| 764 else: | |
| 765 sbjct_gene_start = sbjct_start - promtr_len | |
| 766 | |
| 767 # Find mismatches in gene part | |
| 768 mis_matches += PointFinder.find_codon_mismatches( | |
| 769 sbjct_gene_start, sbjct_seq, qry_seq) | |
| 770 | |
| 771 else: | |
| 772 # Find mismatches in gene | |
| 773 mis_matches += PointFinder.find_codon_mismatches( | |
| 774 sbjct_start, sbjct_seq, qry_seq) | |
| 775 | |
| 776 return mis_matches | |
| 777 | |
| 778 @staticmethod | |
| 779 def find_nucleotid_mismatches(sbjct_start, sbjct_seq, qry_seq, | |
| 780 promoter=False): | |
| 781 """ | |
| 782 This function takes two alligned sequence (subject and query), | |
| 783 and the position on the subject where the alignment starts. The | |
| 784 sequences are compared one nucleotide at a time. If mis matches | |
| 785 are found they are saved. If a gap is found the function | |
| 786 find_nuc_indel is called to find the entire indel and it is | |
| 787 also saved into the list mis_matches. If promoter sequences are | |
| 788 given as arguments, these are reversed the and the absolut | |
| 789 value of the sequence position used, but when mutations are | |
| 790 saved the negative value and det reverse sequences are saved in | |
| 791 mis_mathces. | |
| 792 """ | |
| 793 | |
| 794 # Initiate the mis_matches list that will store all found | |
| 795 # mismatcehs | |
| 796 mis_matches = [] | |
| 797 | |
| 798 sbjct_start = abs(sbjct_start) | |
| 799 seq_pos = sbjct_start | |
| 800 | |
| 801 # Set variables depending on promoter status | |
| 802 factor = 1 | |
| 803 mut_prefix = "r." | |
| 804 | |
| 805 if promoter is True: | |
| 806 factor = (-1) | |
| 807 mut_prefix = "n." | |
| 808 # Reverse promoter sequences | |
| 809 sbjct_seq = sbjct_seq[::-1] | |
| 810 qry_seq = qry_seq[::-1] | |
| 811 | |
| 812 # Go through sequences one nucleotide at a time | |
| 813 shift = 0 | |
| 814 for index in range(sbjct_start - 1, len(sbjct_seq)): | |
| 815 mut_name = mut_prefix | |
| 816 mut = "" | |
| 817 # Shift index according to gaps | |
| 818 i = index + shift | |
| 819 | |
| 820 # If the end of the sequence is reached, stop | |
| 821 if i == len(sbjct_seq): | |
| 822 break | |
| 823 | |
| 824 sbjct_nuc = sbjct_seq[i] | |
| 825 qry_nuc = qry_seq[i] | |
| 826 | |
| 827 # Check for mis matches | |
| 828 if sbjct_nuc.upper() != qry_nuc.upper(): | |
| 829 | |
| 830 # check for insertions and deletions | |
| 831 if sbjct_nuc == "-" or qry_nuc == "-": | |
| 832 if sbjct_nuc == "-": | |
| 833 mut = "ins" | |
| 834 indel_start_pos = (seq_pos - 1) * factor | |
| 835 indel_end_pos = seq_pos * factor | |
| 836 indel = PointFinder.find_nuc_indel(sbjct_seq[i:], | |
| 837 qry_seq[i:]) | |
| 838 else: | |
| 839 mut = "del" | |
| 840 indel_start_pos = seq_pos * factor | |
| 841 indel = PointFinder.find_nuc_indel(qry_seq[i:], | |
| 842 sbjct_seq[i:]) | |
| 843 indel_end_pos = (seq_pos + len(indel) - 1) * factor | |
| 844 seq_pos += len(indel) - 1 | |
| 845 | |
| 846 # Shift the index to the end of the indel | |
| 847 shift += len(indel) - 1 | |
| 848 | |
| 849 # Write mutation name, depending on sequnce | |
| 850 if len(indel) == 1 and mut == "del": | |
| 851 mut_name += str(indel_start_pos) + mut + indel | |
| 852 else: | |
| 853 if promoter is True: | |
| 854 # Reverse the sequence and the start and | |
| 855 # end positions | |
| 856 indel = indel[::-1] | |
| 857 temp = indel_start_pos | |
| 858 indel_start_pos = indel_end_pos | |
| 859 indel_end_pos = temp | |
| 860 | |
| 861 mut_name += (str(indel_start_pos) + "_" | |
| 862 + str(indel_end_pos) + mut + indel) | |
| 863 | |
| 864 mis_matches += [[mut, seq_pos * factor, seq_pos * factor, | |
| 865 indel, mut_name, mut, indel]] | |
| 866 | |
| 867 # Check for substitutions mutations | |
| 868 else: | |
| 869 mut = "sub" | |
| 870 mut_name += (str(seq_pos * factor) + sbjct_nuc + ">" | |
| 871 + qry_nuc) | |
| 872 | |
| 873 mis_matches += [[mut, seq_pos * factor, seq_pos * factor, | |
| 874 qry_nuc, mut_name, sbjct_nuc, qry_nuc]] | |
| 875 | |
| 876 # Increment sequence position | |
| 877 if mut != "ins": | |
| 878 seq_pos += 1 | |
| 879 | |
| 880 return mis_matches | |
| 881 | |
| 882 @staticmethod | |
| 883 def find_nuc_indel(gapped_seq, indel_seq): | |
| 884 """ | |
| 885 This function finds the entire indel missing in from a gapped | |
| 886 sequence compared to the indel_seqeunce. It is assumes that the | |
| 887 sequences start with the first position of the gap. | |
| 888 """ | |
| 889 ref_indel = indel_seq[0] | |
| 890 for j in range(1, len(gapped_seq)): | |
| 891 if gapped_seq[j] == "-": | |
| 892 ref_indel += indel_seq[j] | |
| 893 else: | |
| 894 break | |
| 895 return ref_indel | |
| 896 | |
| 897 @staticmethod | |
| 898 def aa(codon): | |
| 899 """ | |
| 900 This function converts a codon to an amino acid. If the codon | |
| 901 is not valid an error message is given, or else, the amino acid | |
| 902 is returned. | |
| 903 | |
| 904 Potential future issue: If species are added that utilizes | |
| 905 alternative translation tables. | |
| 906 """ | |
| 907 codon = codon.upper() | |
| 908 aa = {"ATT": "I", "ATC": "I", "ATA": "I", | |
| 909 "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", "TTA": "L", | |
| 910 "TTG": "L", | |
| 911 "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V", | |
| 912 "TTT": "F", "TTC": "F", | |
| 913 "ATG": "M", | |
| 914 "TGT": "C", "TGC": "C", | |
| 915 "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A", | |
| 916 "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G", | |
| 917 "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P", | |
| 918 "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T", | |
| 919 "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S", | |
| 920 "AGC": "S", | |
| 921 "TAT": "Y", "TAC": "Y", | |
| 922 "TGG": "W", | |
| 923 "CAA": "Q", "CAG": "Q", | |
| 924 "AAT": "N", "AAC": "N", | |
| 925 "CAT": "H", "CAC": "H", | |
| 926 "GAA": "E", "GAG": "E", | |
| 927 "GAT": "D", "GAC": "D", | |
| 928 "AAA": "K", "AAG": "K", | |
| 929 "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R", | |
| 930 "AGG": "R", | |
| 931 "TAA": "*", "TAG": "*", "TGA": "*"} | |
| 932 # Translate valid codon | |
| 933 try: | |
| 934 amino_a = aa[codon] | |
| 935 except KeyError: | |
| 936 amino_a = "?" | |
| 937 return amino_a | |
| 938 | |
| 939 @staticmethod | |
| 940 def get_codon(seq, codon_no, start_offset): | |
| 941 """ | |
| 942 This function takes a sequece and a codon number and returns | |
| 943 the codon found in the sequence at that position | |
| 944 """ | |
| 945 seq = seq.replace("-", "") | |
| 946 codon_start_pos = int(codon_no - 1) * 3 - start_offset | |
| 947 codon = seq[codon_start_pos:codon_start_pos + 3] | |
| 948 return codon | |
| 949 | |
| 950 @staticmethod | |
| 951 def name_insertion(sbjct_seq, codon_no, sbjct_nucs, aa_alt, start_offset): | |
| 952 """ | |
| 953 This function is used to name a insertion mutation based on the | |
| 954 HGVS recommendation. | |
| 955 """ | |
| 956 start_codon_no = codon_no - 1 | |
| 957 | |
| 958 if len(sbjct_nucs) == 3: | |
| 959 start_codon_no = codon_no | |
| 960 | |
| 961 start_codon = PointFinder.get_codon(sbjct_seq, start_codon_no, | |
| 962 start_offset) | |
| 963 end_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset) | |
| 964 pos_name = "p.%s%d_%s%dins%s" % (PointFinder.aa(start_codon), | |
| 965 start_codon_no, | |
| 966 PointFinder.aa(end_codon), | |
| 967 codon_no, aa_alt) | |
| 968 return pos_name | |
| 969 | |
| 970 @staticmethod | |
| 971 def name_deletion(sbjct_seq, sbjct_rf_indel, sbjct_nucs, codon_no, aa_alt, | |
| 972 start_offset, mutation="del"): | |
| 973 """ | |
| 974 This function is used to name a deletion mutation based on the | |
| 975 HGVS recommendation. If combination of a deletion and an | |
| 976 insertion is identified the argument 'mutation' is set to | |
| 977 'delins' and the mutation name will indicate that the mutation | |
| 978 is a delins mutation. | |
| 979 """ | |
| 980 del_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset) | |
| 981 pos_name = "p.%s%d" % (PointFinder.aa(del_codon), codon_no) | |
| 982 | |
| 983 # This has been changed | |
| 984 if len(sbjct_rf_indel) == 3 and mutation == "del": | |
| 985 return pos_name + mutation | |
| 986 | |
| 987 end_codon_no = codon_no + math.ceil(len(sbjct_nucs) / 3) - 1 | |
| 988 end_codon = PointFinder.get_codon(sbjct_seq, end_codon_no, | |
| 989 start_offset) | |
| 990 pos_name += "_%s%d%s" % (PointFinder.aa(end_codon), end_codon_no, | |
| 991 mutation) | |
| 992 if mutation == "delins": | |
| 993 pos_name += aa_alt | |
| 994 return pos_name | |
| 995 | |
| 996 @staticmethod | |
| 997 def name_indel_mutation(sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel, | |
| 998 codon_no, mut, start_offset): | |
| 999 """ | |
| 1000 This function serves to name the individual mutations | |
| 1001 dependently on the type of the mutation. | |
| 1002 """ | |
| 1003 # Get the subject and query sequences without gaps | |
| 1004 sbjct_nucs = sbjct_rf_indel.replace("-", "") | |
| 1005 qry_nucs = qry_rf_indel.replace("-", "") | |
| 1006 | |
| 1007 # Translate nucleotides to amino acids | |
| 1008 aa_ref = "" | |
| 1009 aa_alt = "" | |
| 1010 | |
| 1011 for i in range(0, len(sbjct_nucs), 3): | |
| 1012 aa_ref += PointFinder.aa(sbjct_nucs[i:i + 3]) | |
| 1013 | |
| 1014 for i in range(0, len(qry_nucs), 3): | |
| 1015 aa_alt += PointFinder.aa(qry_nucs[i:i + 3]) | |
| 1016 | |
| 1017 # Identify the gapped sequence | |
| 1018 if mut == "ins": | |
| 1019 gapped_seq = sbjct_rf_indel | |
| 1020 else: | |
| 1021 gapped_seq = qry_rf_indel | |
| 1022 | |
| 1023 gap_size = gapped_seq.count("-") | |
| 1024 | |
| 1025 # Write mutation names | |
| 1026 if gap_size < 3 and len(sbjct_nucs) == 3 and len(qry_nucs) == 3: | |
| 1027 # Write mutation name for substitution mutation | |
| 1028 mut_name = "p.%s%d%s" % (PointFinder.aa(sbjct_nucs), codon_no, | |
| 1029 PointFinder.aa(qry_nucs)) | |
| 1030 elif len(gapped_seq) == gap_size: | |
| 1031 | |
| 1032 if mut == "ins": | |
| 1033 # Write mutation name for insertion mutation | |
| 1034 mut_name = PointFinder.name_insertion(sbjct_seq, codon_no, | |
| 1035 sbjct_nucs, aa_alt, | |
| 1036 start_offset) | |
| 1037 aa_ref = mut | |
| 1038 else: | |
| 1039 # Write mutation name for deletion mutation | |
| 1040 mut_name = PointFinder.name_deletion(sbjct_seq, sbjct_rf_indel, | |
| 1041 sbjct_nucs, codon_no, | |
| 1042 aa_alt, start_offset, | |
| 1043 mutation="del") | |
| 1044 aa_alt = mut | |
| 1045 # Check for delins - mix of insertion and deletion | |
| 1046 else: | |
| 1047 # Write mutation name for a mixed insertion and deletion | |
| 1048 # mutation | |
| 1049 mut_name = PointFinder.name_deletion(sbjct_seq, | |
| 1050 sbjct_rf_indel, sbjct_nucs, | |
| 1051 codon_no, aa_alt, | |
| 1052 start_offset, | |
| 1053 mutation="delins") | |
| 1054 | |
| 1055 # Check for frameshift | |
| 1056 if gapped_seq.count("-") % 3 != 0: | |
| 1057 # Add the frameshift tag to mutation name | |
| 1058 mut_name += " - Frameshift" | |
| 1059 | |
| 1060 return mut_name, aa_ref, aa_alt | |
| 1061 | |
| 1062 @staticmethod | |
| 1063 def get_inframe_gap(seq, nucs_needed=3): | |
| 1064 """ | |
| 1065 This funtion takes a sequnece starting with a gap or the | |
| 1066 complementary seqeuence to the gap, and the number of | |
| 1067 nucleotides that the seqeunce should contain in order to | |
| 1068 maintain the correct reading frame. The sequence is gone | |
| 1069 through and the number of non-gap characters are counted. When | |
| 1070 the number has reach the number of needed nucleotides the indel | |
| 1071 is returned. If the indel is a 'clean' insert or deletion that | |
| 1072 starts in the start of a codon and can be divided by 3, then | |
| 1073 only the gap is returned. | |
| 1074 """ | |
| 1075 nuc_count = 0 | |
| 1076 gap_indel = "" | |
| 1077 nucs = "" | |
| 1078 | |
| 1079 for i in range(len(seq)): | |
| 1080 # Check if the character is not a gap | |
| 1081 if seq[i] != "-": | |
| 1082 # Check if the indel is a 'clean' | |
| 1083 # i.e. if the insert or deletion starts at the first | |
| 1084 # nucleotide in the codon and can be divided by 3 | |
| 1085 | |
| 1086 if(gap_indel.count("-") == len(gap_indel) | |
| 1087 and gap_indel.count("-") >= 3 and len(gap_indel) != 0): | |
| 1088 return gap_indel | |
| 1089 | |
| 1090 nuc_count += 1 | |
| 1091 gap_indel += seq[i] | |
| 1092 | |
| 1093 # if the number of nucleotides in the indel equals the amount | |
| 1094 # needed for the indel, the indel is returned. | |
| 1095 if nuc_count == nucs_needed: | |
| 1096 return gap_indel | |
| 1097 | |
| 1098 # This will only happen if the gap is in the very end of a sequence | |
| 1099 return gap_indel | |
| 1100 | |
| 1101 @staticmethod | |
| 1102 def get_indels(sbjct_seq, qry_seq, start_pos): | |
| 1103 """ | |
| 1104 This function uses regex to find inserts and deletions in | |
| 1105 sequences given as arguments. A list of these indels are | |
| 1106 returned. The list includes, type of mutations(ins/del), | |
| 1107 subject codon no of found mutation, subject sequence position, | |
| 1108 insert/deletions nucleotide sequence, and the affected qry | |
| 1109 codon no. | |
| 1110 """ | |
| 1111 | |
| 1112 seqs = [sbjct_seq, qry_seq] | |
| 1113 indels = [] | |
| 1114 gap_obj = re.compile(r"-+") | |
| 1115 for i in range(len(seqs)): | |
| 1116 for match in gap_obj.finditer(seqs[i]): | |
| 1117 pos = int(match.start()) | |
| 1118 gap = match.group() | |
| 1119 | |
| 1120 # Find position of the mutation corresponding to the | |
| 1121 # subject sequence | |
| 1122 sbj_pos = len(sbjct_seq[:pos].replace("-", "")) + start_pos | |
| 1123 | |
| 1124 # Get indel sequence and the affected sequences in | |
| 1125 # sbjct and qry in the reading frame | |
| 1126 indel = seqs[abs(i - 1)][pos:pos + len(gap)] | |
| 1127 | |
| 1128 # Find codon number for mutation | |
| 1129 codon_no = int(math.ceil((sbj_pos) / 3)) | |
| 1130 qry_pos = len(qry_seq[:pos].replace("-", "")) + start_pos | |
| 1131 qry_codon = int(math.ceil((qry_pos) / 3)) | |
| 1132 | |
| 1133 if i == 0: | |
| 1134 mut = "ins" | |
| 1135 else: | |
| 1136 mut = "del" | |
| 1137 | |
| 1138 indels.append([mut, codon_no, sbj_pos, indel, qry_codon]) | |
| 1139 | |
| 1140 # Sort indels based on codon position and sequence position | |
| 1141 indels = sorted(indels, key=lambda x: (x[1], x[2])) | |
| 1142 | |
| 1143 return indels | |
| 1144 | |
| 1145 @staticmethod | |
| 1146 def find_codon_mismatches(sbjct_start, sbjct_seq, qry_seq): | |
| 1147 """ | |
| 1148 This function takes two alligned sequence (subject and query), | |
| 1149 and the position on the subject where the alignment starts. The | |
| 1150 sequences are compared codon by codon. If a mis matches is | |
| 1151 found it is saved in 'mis_matches'. If a gap is found the | |
| 1152 function get_inframe_gap is used to find the indel sequence and | |
| 1153 keep the sequence in the correct reading frame. The function | |
| 1154 translate_indel is used to name indel mutations and translate | |
| 1155 the indels to amino acids The function returns a list of tuples | |
| 1156 containing all needed informations about the mutation in order | |
| 1157 to look it up in the database dict known mutation and the with | |
| 1158 the output files the the user. | |
| 1159 """ | |
| 1160 mis_matches = [] | |
| 1161 | |
| 1162 # Find start pos of first codon in frame, i_start | |
| 1163 codon_offset = (sbjct_start - 1) % 3 | |
| 1164 i_start = 0 | |
| 1165 | |
| 1166 if codon_offset != 0: | |
| 1167 i_start = 3 - codon_offset | |
| 1168 | |
| 1169 sbjct_start = sbjct_start + i_start | |
| 1170 | |
| 1171 # Set sequences in frame | |
| 1172 sbjct_seq = sbjct_seq[i_start:] | |
| 1173 qry_seq = qry_seq[i_start:] | |
| 1174 | |
| 1175 # Find codon number of the first codon in the sequence, start | |
| 1176 # at 0 | |
| 1177 codon_no = int((sbjct_start - 1) / 3) # 1,2,3 start on 0 | |
| 1178 | |
| 1179 # s_shift and q_shift are used when gaps appears | |
| 1180 q_shift = 0 | |
| 1181 s_shift = 0 | |
| 1182 mut_no = 0 | |
| 1183 | |
| 1184 # Find inserts and deletions in sequence | |
| 1185 indel_no = 0 | |
| 1186 indels = PointFinder.get_indels(sbjct_seq, qry_seq, sbjct_start) | |
| 1187 | |
| 1188 # Go through sequence and save mutations when found | |
| 1189 for index in range(0, len(sbjct_seq), 3): | |
| 1190 # Count codon number | |
| 1191 codon_no += 1 | |
| 1192 | |
| 1193 # Shift index according to gaps | |
| 1194 s_i = index + s_shift | |
| 1195 q_i = index + q_shift | |
| 1196 | |
| 1197 # Get codons | |
| 1198 sbjct_codon = sbjct_seq[s_i:s_i + 3] | |
| 1199 qry_codon = qry_seq[q_i:q_i + 3] | |
| 1200 | |
| 1201 if(len(sbjct_seq[s_i:].replace("-", "")) | |
| 1202 + len(qry_codon[q_i:].replace("-", "")) < 6): | |
| 1203 break | |
| 1204 | |
| 1205 # Check for mutations | |
| 1206 if sbjct_codon.upper() != qry_codon.upper(): | |
| 1207 | |
| 1208 # Check for codon insertions and deletions and | |
| 1209 # frameshift mutations | |
| 1210 if "-" in sbjct_codon or "-" in qry_codon: | |
| 1211 | |
| 1212 # Get indel info | |
| 1213 try: | |
| 1214 indel_data = indels[indel_no] | |
| 1215 except IndexError: | |
| 1216 sys.exit("indel_data list is out of range, bug!") | |
| 1217 | |
| 1218 mut = indel_data[0] | |
| 1219 codon_no_indel = indel_data[1] | |
| 1220 seq_pos = indel_data[2] + sbjct_start - 1 | |
| 1221 indel = indel_data[3] | |
| 1222 indel_no += 1 | |
| 1223 | |
| 1224 # Get the affected sequence in frame for both for | |
| 1225 # sbjct and qry | |
| 1226 if mut == "ins": | |
| 1227 sbjct_rf_indel = PointFinder.get_inframe_gap( | |
| 1228 sbjct_seq[s_i:], 3) | |
| 1229 qry_rf_indel = PointFinder.get_inframe_gap( | |
| 1230 qry_seq[q_i:], | |
| 1231 int(math.floor(len(sbjct_rf_indel) / 3) * 3)) | |
| 1232 else: | |
| 1233 qry_rf_indel = PointFinder.get_inframe_gap( | |
| 1234 qry_seq[q_i:], 3) | |
| 1235 sbjct_rf_indel = PointFinder.get_inframe_gap( | |
| 1236 sbjct_seq[s_i:], | |
| 1237 int(math.floor(len(qry_rf_indel) / 3) * 3)) | |
| 1238 | |
| 1239 mut_name, aa_ref, aa_alt = PointFinder.name_indel_mutation( | |
| 1240 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel, | |
| 1241 codon_no, mut, sbjct_start - 1) | |
| 1242 | |
| 1243 # Set index to the correct reading frame after the | |
| 1244 # indel gap | |
| 1245 shift_diff_before = abs(s_shift - q_shift) | |
| 1246 s_shift += len(sbjct_rf_indel) - 3 | |
| 1247 q_shift += len(qry_rf_indel) - 3 | |
| 1248 shift_diff = abs(s_shift - q_shift) | |
| 1249 | |
| 1250 if shift_diff_before != 0 and shift_diff % 3 == 0: | |
| 1251 | |
| 1252 if s_shift > q_shift: | |
| 1253 nucs_needed = (int((len(sbjct_rf_indel) / 3) * 3) | |
| 1254 + shift_diff) | |
| 1255 pre_qry_indel = qry_rf_indel | |
| 1256 qry_rf_indel = PointFinder.get_inframe_gap( | |
| 1257 qry_seq[q_i:], nucs_needed) | |
| 1258 q_shift += len(qry_rf_indel) - len(pre_qry_indel) | |
| 1259 | |
| 1260 elif q_shift > s_shift: | |
| 1261 nucs_needed = (int((len(qry_rf_indel) / 3) * 3) | |
| 1262 + shift_diff) | |
| 1263 pre_sbjct_indel = sbjct_rf_indel | |
| 1264 sbjct_rf_indel = PointFinder.get_inframe_gap( | |
| 1265 sbjct_seq[s_i:], nucs_needed) | |
| 1266 s_shift += (len(sbjct_rf_indel) | |
| 1267 - len(pre_sbjct_indel)) | |
| 1268 | |
| 1269 mut_name, aa_ref, aa_alt = ( | |
| 1270 PointFinder.name_indel_mutation( | |
| 1271 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel, | |
| 1272 codon_no, mut, sbjct_start - 1) | |
| 1273 ) | |
| 1274 if "Frameshift" in mut_name: | |
| 1275 mut_name = (mut_name.split("-")[0] | |
| 1276 + "- Frame restored") | |
| 1277 if mut_name is "p.V940delins - Frame restored": | |
| 1278 sys.exit() | |
| 1279 mis_matches += [[mut, codon_no_indel, seq_pos, indel, | |
| 1280 mut_name, sbjct_rf_indel, qry_rf_indel, | |
| 1281 aa_ref, aa_alt]] | |
| 1282 | |
| 1283 # Check if the next mutation in the indels list is | |
| 1284 # in the current codon. | |
| 1285 # Find the number of individul gaps in the | |
| 1286 # evaluated sequence. | |
| 1287 no_of_indels = (len(re.findall("\-\w", sbjct_rf_indel)) | |
| 1288 + len(re.findall("\-\w", qry_rf_indel))) | |
| 1289 | |
| 1290 if no_of_indels > 1: | |
| 1291 | |
| 1292 for j in range(indel_no, indel_no + no_of_indels - 1): | |
| 1293 try: | |
| 1294 indel_data = indels[j] | |
| 1295 except IndexError: | |
| 1296 sys.exit("indel_data list is out of range, " | |
| 1297 "bug!") | |
| 1298 mut = indel_data[0] | |
| 1299 codon_no_indel = indel_data[1] | |
| 1300 seq_pos = indel_data[2] + sbjct_start - 1 | |
| 1301 indel = indel_data[3] | |
| 1302 indel_no += 1 | |
| 1303 mis_matches += [[mut, codon_no_indel, seq_pos, | |
| 1304 indel, mut_name, sbjct_rf_indel, | |
| 1305 qry_rf_indel, aa_ref, aa_alt]] | |
| 1306 | |
| 1307 # Set codon number, and save nucleotides from out | |
| 1308 # of frame mutations | |
| 1309 if mut == "del": | |
| 1310 codon_no += int((len(sbjct_rf_indel) - 3) / 3) | |
| 1311 # If evaluated insert is only gaps codon_no should | |
| 1312 # not increment | |
| 1313 elif sbjct_rf_indel.count("-") == len(sbjct_rf_indel): | |
| 1314 codon_no -= 1 | |
| 1315 | |
| 1316 # Check of point mutations | |
| 1317 else: | |
| 1318 mut = "sub" | |
| 1319 aa_ref = PointFinder.aa(sbjct_codon) | |
| 1320 aa_alt = PointFinder.aa(qry_codon) | |
| 1321 | |
| 1322 if aa_ref != aa_alt: | |
| 1323 # End search for mutation if a premature stop | |
| 1324 # codon is found | |
| 1325 mut_name = "p." + aa_ref + str(codon_no) + aa_alt | |
| 1326 | |
| 1327 mis_matches += [[mut, codon_no, codon_no, aa_alt, | |
| 1328 mut_name, sbjct_codon, qry_codon, | |
| 1329 aa_ref, aa_alt]] | |
| 1330 # If a Premature stop codon occur report it an stop the | |
| 1331 # loop | |
| 1332 | |
| 1333 try: | |
| 1334 if mis_matches[-1][-1] == "*": | |
| 1335 mut_name += " - Premature stop codon" | |
| 1336 mis_matches[-1][4] = (mis_matches[-1][4].split("-")[0] | |
| 1337 + " - Premature stop codon") | |
| 1338 break | |
| 1339 except IndexError: | |
| 1340 pass | |
| 1341 | |
| 1342 # Sort mutations on position | |
| 1343 mis_matches = sorted(mis_matches, key=lambda x: x[1]) | |
| 1344 | |
| 1345 return mis_matches | |
| 1346 | |
| 1347 @staticmethod | |
| 1348 def mutstr2mutdict(m): | |
| 1349 out_dict = {} | |
| 1350 | |
| 1351 # Protein / Amino acid mutations | |
| 1352 # Ex.: "p.T83I" | |
| 1353 if(m.startswith("p.")): | |
| 1354 out_dict["nucleotide"] = False | |
| 1355 | |
| 1356 # Remove frameshift tag | |
| 1357 frameshift_match = re.search(r"(.+) - Frameshift.*$", m) | |
| 1358 if(frameshift_match): | |
| 1359 m = frameshift_match.group(1) | |
| 1360 out_dict["frameshift"] = True | |
| 1361 | |
| 1362 # Remove frame restored tag | |
| 1363 framerestored_match = re.search(r"(.+) - Frame restored.*$", m) | |
| 1364 if(framerestored_match): | |
| 1365 m = framerestored_match.group(1) | |
| 1366 out_dict["frame restored"] = True | |
| 1367 | |
| 1368 # Remove premature stop tag | |
| 1369 prem_stop_match = re.search(r"(.+) - Premature stop codon.*$", m) | |
| 1370 if(prem_stop_match): | |
| 1371 m = prem_stop_match.group(1) | |
| 1372 out_dict["prem_stop"] = True | |
| 1373 | |
| 1374 # TODO: premature or frameshift tag adds too many whitespaces | |
| 1375 m = m.strip() | |
| 1376 | |
| 1377 # Delins | |
| 1378 multi_delins_match = re.search( | |
| 1379 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins(\S+)$", m) | |
| 1380 single_delins_match = re.search( | |
| 1381 r"^p.(\D{1})(\d+)delins(\S+)$", m) | |
| 1382 # TODO: is both necessary? | |
| 1383 multi_delins_match2 = re.search( | |
| 1384 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins$", m) | |
| 1385 single_delins_match2 = re.search( | |
| 1386 r"^p.(\D{1})(\d+)delins$", m) | |
| 1387 multi_ins_match = re.search( | |
| 1388 r"^p.(\D{1})(\d+)_(\D{1})(\d+)ins(\D*)$", m) | |
| 1389 if(multi_delins_match or single_delins_match): | |
| 1390 out_dict["deletion"] = True | |
| 1391 out_dict["insertion"] = True | |
| 1392 if(single_delins_match): | |
| 1393 out_dict["ref_aa"] = single_delins_match.group(1) | |
| 1394 out_dict["pos"] = single_delins_match.group(2) | |
| 1395 out_dict["mut_aa"] = single_delins_match.group(3) | |
| 1396 else: | |
| 1397 out_dict["ref_aa"] = multi_delins_match.group(1) | |
| 1398 out_dict["pos"] = multi_delins_match.group(2) | |
| 1399 out_dict["ref_aa_right"] = multi_delins_match.group(3) | |
| 1400 out_dict["mut_end"] = multi_delins_match.group(4) | |
| 1401 out_dict["mut_aa"] = multi_delins_match.group(5) | |
| 1402 # Deletions | |
| 1403 elif(m[-3:] == "del"): | |
| 1404 single_del_match = re.search( | |
| 1405 r"^p.(\D{1})(\d+)del$", m) | |
| 1406 multi_del_match = re.search( | |
| 1407 r"^p.(\D{1})(\d+)_(\D{1})(\d+)del$", m) | |
| 1408 out_dict["deletion"] = True | |
| 1409 if(single_del_match): | |
| 1410 out_dict["ref_aa"] = single_del_match.group(1) | |
| 1411 out_dict["pos"] = single_del_match.group(2) | |
| 1412 else: | |
| 1413 out_dict["ref_aa"] = multi_del_match.group(1) | |
| 1414 out_dict["pos"] = multi_del_match.group(2) | |
| 1415 out_dict["ref_aa_right"] = multi_del_match.group(3) | |
| 1416 out_dict["mut_end"] = multi_del_match.group(4) | |
| 1417 # Insertions | |
| 1418 elif(multi_ins_match): | |
| 1419 out_dict["insertion"] = True | |
| 1420 out_dict["ref_aa"] = multi_ins_match.group(1).lower() | |
| 1421 out_dict["pos"] = multi_ins_match.group(2) | |
| 1422 out_dict["ref_aa_right"] = multi_ins_match.group(3).lower() | |
| 1423 if(multi_ins_match.group(4)): | |
| 1424 out_dict["mut_aa"] = multi_ins_match.group(4).lower() | |
| 1425 # Substitutions | |
| 1426 else: | |
| 1427 sub_match = re.search( | |
| 1428 r"^p.(\D{1})(\d+)(\D{1})$", m) | |
| 1429 out_dict["ref_aa"] = sub_match.group(1).lower() | |
| 1430 out_dict["pos"] = sub_match.group(2) | |
| 1431 out_dict["mut_aa"] = sub_match.group(3).lower() | |
| 1432 | |
| 1433 # Nucleotide mutations | |
| 1434 # Ex. sub: n.-42T>C | |
| 1435 # Ex. ins: n.-13_-14insG (TODO: Verify) | |
| 1436 # Ex. del: n.42delT (TODO: Verify) | |
| 1437 # Ex. del: n.42_45del (TODO: Verify) | |
| 1438 elif(m.startswith("n.") or m.startswith("r.")): | |
| 1439 out_dict["nucleotide"] = True | |
| 1440 | |
| 1441 sub_match = re.search( | |
| 1442 r"^[nr]{1}\.(-{0,1}\d+)(\D{1})>(\D{1})$", m) | |
| 1443 ins_match = re.search( | |
| 1444 r"^[nr]{1}\.(-{0,1}\d+)_(-{0,1}\d+)ins(\S+)$", m) | |
| 1445 # r.541delA | |
| 1446 del_match = re.search(( | |
| 1447 r"^[nr]{1}\.(-{0,1}\d+)_{0,1}(-{0,1}\d*)del(\S*)$"), m) | |
| 1448 if(sub_match): | |
| 1449 out_dict["pos"] = sub_match.group(1) | |
| 1450 out_dict["ref_nuc"] = sub_match.group(2) | |
| 1451 out_dict["mut_nuc"] = sub_match.group(3) | |
| 1452 elif(ins_match): | |
| 1453 out_dict["insertion"] = True | |
| 1454 out_dict["pos"] = ins_match.group(1) | |
| 1455 out_dict["mut_end"] = ins_match.group(2) | |
| 1456 out_dict["mut_nuc"] = ins_match.group(3) | |
| 1457 elif(del_match): | |
| 1458 out_dict["deletion"] = True | |
| 1459 out_dict["pos"] = del_match.group(1) | |
| 1460 if(del_match.group(2)): | |
| 1461 out_dict["mut_end"] = del_match.group(2) | |
| 1462 if(del_match.group(3)): | |
| 1463 out_dict["ref_nuc"] = del_match.group(3) | |
| 1464 else: | |
| 1465 sys.exit("ERROR: Nucleotide deletion did not contain any " | |
| 1466 "reference sequence. mut string: {}".format(m)) | |
| 1467 | |
| 1468 return out_dict | |
| 1469 | |
| 1470 def get_mutations(self, gene, gene_name, mis_matches, unknown_flag, hit): | |
| 1471 RNA = False | |
| 1472 if gene in self.RNA_gene_list: | |
| 1473 RNA = True | |
| 1474 | |
| 1475 known_muts = [] | |
| 1476 unknown_muts = [] | |
| 1477 | |
| 1478 # Go through each mutation | |
| 1479 for i in range(len(mis_matches)): | |
| 1480 m_type = mis_matches[i][0] | |
| 1481 pos = mis_matches[i][1] # sort on pos? | |
| 1482 look_up_pos = mis_matches[i][2] | |
| 1483 look_up_mut = mis_matches[i][3] | |
| 1484 mut_name = mis_matches[i][4] | |
| 1485 # nuc_ref = mis_matches[i][5] | |
| 1486 # nuc_alt = mis_matches[i][6] | |
| 1487 ref = mis_matches[i][-2] | |
| 1488 alt = mis_matches[i][-1] | |
| 1489 mut_dict = PointFinder.mutstr2mutdict(mut_name) | |
| 1490 | |
| 1491 mut_id = ("{gene}_{pos}_{alt}" | |
| 1492 .format(gene=gene_name, pos=pos, alt=alt.lower())) | |
| 1493 | |
| 1494 ref_aa = mut_dict.get("ref_aa", None) | |
| 1495 ref_aa_right = mut_dict.get("ref_aa_right", None) | |
| 1496 mut_aa = mut_dict.get("mut_aa", None) | |
| 1497 ref_nuc = mut_dict.get("ref_nuc", None) | |
| 1498 mut_nuc = mut_dict.get("mut_nuc", None) | |
| 1499 is_nuc = mut_dict.get("nucleotide", None) | |
| 1500 is_ins = mut_dict.get("insertion", None) | |
| 1501 is_del = mut_dict.get("deletion", None) | |
| 1502 mut_end = mut_dict.get("mut_end", None) | |
| 1503 prem_stop = mut_dict.get("prem_stop", False) | |
| 1504 | |
| 1505 mut = ResMutation(unique_id=mut_id, seq_region=gene_name, pos=pos, | |
| 1506 hit=hit, ref_codon=ref_nuc, mut_codon=mut_nuc, | |
| 1507 ref_aa=ref_aa, ref_aa_right=ref_aa_right, | |
| 1508 mut_aa=mut_aa, insertion=is_ins, deletion=is_del, | |
| 1509 end=mut_end, nuc=is_nuc, | |
| 1510 premature_stop=prem_stop, ab_class=None) | |
| 1511 | |
| 1512 if "Premature stop codon" in mut_name: | |
| 1513 sbjct_len = hit['sbjct_length'] | |
| 1514 qry_len = pos * 3 | |
| 1515 perc_trunc = round( | |
| 1516 ((float(sbjct_len) - qry_len) | |
| 1517 / float(sbjct_len)) | |
| 1518 * 100, 2 | |
| 1519 ) | |
| 1520 mut.premature_stop = perc_trunc | |
| 1521 | |
| 1522 # Check if mutation is known | |
| 1523 gene_mut_name, resistence, pmid = self.look_up_known_muts( | |
| 1524 gene, look_up_pos, look_up_mut, m_type, gene_name) | |
| 1525 | |
| 1526 # Collect known mutations | |
| 1527 if resistence != "Unknown": | |
| 1528 known_muts.append(mut) | |
| 1529 # Collect unknown mutations | |
| 1530 else: | |
| 1531 unknown_muts.append(mut) | |
| 1532 | |
| 1533 # TODO: Use ResMutation class to make sure identical mutations are | |
| 1534 # not kept. | |
| 1535 | |
| 1536 return (known_muts, unknown_muts) | |
| 1537 | |
| 1538 def mut2str(self, gene, gene_name, mis_matches, unknown_flag, hit): | |
| 1539 """ | |
| 1540 This function takes a gene name a list of mis matches found | |
| 1541 between subject and query of this gene, the dictionary of | |
| 1542 known mutation in the point finder database, and the flag | |
| 1543 telling weather the user wants unknown mutations to be | |
| 1544 reported. All mis matches are looked up in the known | |
| 1545 mutation dict to se if the mutation is known, and in this | |
| 1546 case what drug resistence it causes. The funtions return 3 | |
| 1547 strings that are used as output to the users. One string is | |
| 1548 only tab seperated and contains the mutations listed line | |
| 1549 by line. If the unknown flag is set to true it will contain | |
| 1550 both known and unknown mutations. The next string contains | |
| 1551 only known mutation and are given in in a format that is | |
| 1552 easy to convert to HTML. The last string is the HTML tab | |
| 1553 sting from the unknown mutations. | |
| 1554 """ | |
| 1555 known_header = ("Mutation\tNucleotide change\tAmino acid change\t" | |
| 1556 "Resistance\tPMID\n") | |
| 1557 | |
| 1558 unknown_header = "Mutation\tNucleotide change\tAmino acid change\n" | |
| 1559 | |
| 1560 RNA = False | |
| 1561 if gene in self.RNA_gene_list: | |
| 1562 RNA = True | |
| 1563 known_header = "Mutation\tNucleotide change\tResistance\tPMID\n" | |
| 1564 unknown_header = "Mutation\tNucleotide change\n" | |
| 1565 | |
| 1566 known_lst = [] | |
| 1567 unknown_lst = [] | |
| 1568 all_results_lst = [] | |
| 1569 output_mut = [] | |
| 1570 stop_codons = [] | |
| 1571 | |
| 1572 # Go through each mutation | |
| 1573 for i in range(len(mis_matches)): | |
| 1574 m_type = mis_matches[i][0] | |
| 1575 pos = mis_matches[i][1] # sort on pos? | |
| 1576 look_up_pos = mis_matches[i][2] | |
| 1577 look_up_mut = mis_matches[i][3] | |
| 1578 mut_name = mis_matches[i][4] | |
| 1579 nuc_ref = mis_matches[i][5] | |
| 1580 nuc_alt = mis_matches[i][6] | |
| 1581 ref = mis_matches[i][-2] | |
| 1582 alt = mis_matches[i][-1] | |
| 1583 | |
| 1584 # First index in list indicates if mutation is known | |
| 1585 output_mut += [[]] | |
| 1586 | |
| 1587 # Define output vaiables | |
| 1588 codon_change = nuc_ref + " -> " + nuc_alt | |
| 1589 aa_change = ref + " -> " + alt | |
| 1590 | |
| 1591 if RNA is True: | |
| 1592 aa_change = "RNA mutations" | |
| 1593 elif pos < 0: | |
| 1594 aa_change = "Promoter mutations" | |
| 1595 | |
| 1596 # Check if mutation is known | |
| 1597 gene_mut_name, resistence, pmid = self.look_up_known_muts( | |
| 1598 gene, look_up_pos, look_up_mut, m_type, gene_name) | |
| 1599 | |
| 1600 gene_mut_name = gene_mut_name + " " + mut_name | |
| 1601 | |
| 1602 output_mut[i] = [gene_mut_name, codon_change, aa_change, | |
| 1603 resistence, pmid] | |
| 1604 | |
| 1605 # Add mutation to output strings for known mutations | |
| 1606 if resistence != "Unknown": | |
| 1607 if RNA is True: | |
| 1608 # don't include the amino acid change field for | |
| 1609 # RNA mutations | |
| 1610 known_lst += ["\t".join(output_mut[i][:2]) + "\t" | |
| 1611 + "\t".join(output_mut[i][3:])] | |
| 1612 else: | |
| 1613 known_lst += ["\t".join(output_mut[i])] | |
| 1614 | |
| 1615 all_results_lst += ["\t".join(output_mut[i])] | |
| 1616 | |
| 1617 # Add mutation to output strings for unknown mutations | |
| 1618 else: | |
| 1619 if RNA is True: | |
| 1620 unknown_lst += ["\t".join(output_mut[i][:2])] | |
| 1621 else: | |
| 1622 unknown_lst += ["\t".join(output_mut[i][:3])] | |
| 1623 | |
| 1624 if unknown_flag is True: | |
| 1625 all_results_lst += ["\t".join(output_mut[i])] | |
| 1626 | |
| 1627 # Check that you do not print two equal lines (can happen | |
| 1628 # if two indels occure in the same codon) | |
| 1629 if len(output_mut) > 1: | |
| 1630 if output_mut[i] == output_mut[i - 1]: | |
| 1631 | |
| 1632 if resistence != "Unknown": | |
| 1633 known_lst = known_lst[:-1] | |
| 1634 all_results_lst = all_results_lst[:-1] | |
| 1635 else: | |
| 1636 unknown_lst = unknown_lst[:-1] | |
| 1637 | |
| 1638 if unknown_flag is True: | |
| 1639 all_results_lst = all_results_lst[:-1] | |
| 1640 | |
| 1641 if "Premature stop codon" in mut_name: | |
| 1642 sbjct_len = hit['sbjct_length'] | |
| 1643 qry_len = pos * 3 | |
| 1644 prec_truckat = round( | |
| 1645 ((float(sbjct_len) - qry_len) | |
| 1646 / float(sbjct_len)) | |
| 1647 * 100, 2 | |
| 1648 ) | |
| 1649 perc = "%" | |
| 1650 stop_codons.append("Premature stop codon in %s, %.2f%s lost" | |
| 1651 % (gene, prec_truckat, perc)) | |
| 1652 | |
| 1653 # Creat final strings | |
| 1654 if(all_results_lst): | |
| 1655 all_results = "\n".join(all_results_lst) | |
| 1656 else: | |
| 1657 all_results = "" | |
| 1658 total_known_str = "" | |
| 1659 total_unknown_str = "" | |
| 1660 | |
| 1661 # Check if there are only unknown mutations | |
| 1662 resistence_lst = [] | |
| 1663 for mut in output_mut: | |
| 1664 for res in mut[3].split(","): | |
| 1665 resistence_lst.append(res) | |
| 1666 | |
| 1667 # Save known mutations | |
| 1668 unknown_no = resistence_lst.count("Unknown") | |
| 1669 if unknown_no < len(resistence_lst): | |
| 1670 total_known_str = known_header + "\n".join(known_lst) | |
| 1671 else: | |
| 1672 total_known_str = "No known mutations found in %s" % gene_name | |
| 1673 | |
| 1674 # Save unknown mutations | |
| 1675 if unknown_no > 0: | |
| 1676 total_unknown_str = unknown_header + "\n".join(unknown_lst) | |
| 1677 else: | |
| 1678 total_unknown_str = "No unknown mutations found in %s" % gene_name | |
| 1679 | |
| 1680 return (all_results, total_known_str, total_unknown_str, | |
| 1681 resistence_lst + stop_codons) | |
| 1682 | |
| 1683 @staticmethod | |
| 1684 def get_file_content(file_path, fst_char_only=False): | |
| 1685 """ | |
| 1686 This function opens a file, given as the first argument | |
| 1687 file_path and returns the lines of the file in a list or the | |
| 1688 first character of the file if fst_char_only is set to True. | |
| 1689 """ | |
| 1690 with open(file_path, "r") as infile: | |
| 1691 line_lst = [] | |
| 1692 for line in infile: | |
| 1693 line = line.strip() | |
| 1694 if line != "": | |
| 1695 line_lst.append(line) | |
| 1696 if fst_char_only: | |
| 1697 return line_lst[0][0] | |
| 1698 return line_lst | |
| 1699 | |
| 1700 def look_up_known_muts(self, gene, pos, found_mut, mut, gene_name): | |
| 1701 """ | |
| 1702 This function uses the known_mutations dictionay, a gene a | |
| 1703 string with the gene key name, a gene position as integer, | |
| 1704 found_mut is given as amino acid or nucleotides, the | |
| 1705 mutation type (mut) is either "del", "ins", or "sub", and | |
| 1706 gene_name is the gene name that should be returned to the | |
| 1707 user. The function looks up if the found_mut defined by the | |
| 1708 gene, position and the found_mut string is given in the | |
| 1709 known_mutations dictionary, if it is, the resistance and | |
| 1710 the pmid are returned together with the gene_name given in | |
| 1711 the known_mutations dict. If the mutation type is "del" the | |
| 1712 deleted nucleotids are checked to be contained in any of | |
| 1713 the deletion described in the known_mutation dict. | |
| 1714 """ | |
| 1715 resistence = "Unknown" | |
| 1716 pmid = "-" | |
| 1717 found_mut = found_mut.upper() | |
| 1718 | |
| 1719 if mut == "del": | |
| 1720 for i, i_pos in enumerate(range(pos, pos + len(found_mut))): | |
| 1721 | |
| 1722 known_indels = self.known_mutations[gene]["del"].get(i_pos, []) | |
| 1723 for known_indel in known_indels: | |
| 1724 partial_mut = found_mut[i:len(known_indel) + i] | |
| 1725 | |
| 1726 # Check if part of found mut is known and check if | |
| 1727 # found mut and known mut is in the same reading | |
| 1728 # frame | |
| 1729 if(partial_mut == known_indel | |
| 1730 and len(found_mut) % 3 == len(known_indel) % 3): | |
| 1731 | |
| 1732 resistence = (self.known_mutations[gene]["del"][i_pos] | |
| 1733 [known_indel]['drug']) | |
| 1734 | |
| 1735 pmid = (self.known_mutations[gene]["del"][i_pos] | |
| 1736 [known_indel]['pmid']) | |
| 1737 | |
| 1738 gene_name = (self.known_mutations[gene]["del"][i_pos] | |
| 1739 [known_indel]['gene_name']) | |
| 1740 break | |
| 1741 else: | |
| 1742 if pos in self.known_mutations[gene][mut]: | |
| 1743 if found_mut in self.known_mutations[gene][mut][pos]: | |
| 1744 resistence = (self.known_mutations[gene][mut][pos] | |
| 1745 [found_mut]['drug']) | |
| 1746 | |
| 1747 pmid = (self.known_mutations[gene][mut][pos][found_mut] | |
| 1748 ['pmid']) | |
| 1749 | |
| 1750 gene_name = (self.known_mutations[gene][mut][pos] | |
| 1751 [found_mut]['gene_name']) | |
| 1752 | |
| 1753 # Check if stop codons refer resistance | |
| 1754 if "*" in found_mut and gene in self.known_stop_codon: | |
| 1755 if resistence == "Unknown": | |
| 1756 resistence = self.known_stop_codon[gene]["drug"] | |
| 1757 else: | |
| 1758 resistence += "," + self.known_stop_codon[gene]["drug"] | |
| 1759 | |
| 1760 return (gene_name, resistence, pmid) | |
| 1761 | |
| 1762 | |
| 1763 if __name__ == '__main__': | |
| 1764 | |
| 1765 ########################################################################## | |
| 1766 # PARSE COMMAND LINE OPTIONS | |
| 1767 ########################################################################## | |
| 1768 | |
| 1769 parser = argparse.ArgumentParser( | |
| 1770 description="This program predicting resistance associated with \ | |
| 1771 chromosomal mutations based on WGS data", | |
| 1772 prog="pointfinder.py") | |
| 1773 | |
| 1774 # required arguments | |
| 1775 parser.add_argument("-i", | |
| 1776 dest="inputfiles", | |
| 1777 metavar="INFILE", | |
| 1778 nargs='+', | |
| 1779 help="Input file. fastq file(s) from one sample for \ | |
| 1780 KMA or one fasta file for blastn.", | |
| 1781 required=True) | |
| 1782 parser.add_argument("-o", | |
| 1783 dest="out_path", | |
| 1784 metavar="OUTFOLDER", | |
| 1785 help="Output folder, output files will be stored \ | |
| 1786 here.", | |
| 1787 required=True) | |
| 1788 parser.add_argument("-s", | |
| 1789 dest="species", | |
| 1790 metavar="SPECIES", | |
| 1791 choices=['e.coli', 'gonorrhoeae', 'campylobacter', | |
| 1792 'salmonella', 'tuberculosis'], | |
| 1793 help="Species of choice, e.coli, tuberculosis, \ | |
| 1794 salmonella, campylobactor, gonorrhoeae, \ | |
| 1795 klebsiella, or malaria", | |
| 1796 required=True) | |
| 1797 parser.add_argument("-m", | |
| 1798 dest="method", | |
| 1799 metavar="METHOD", | |
| 1800 choices=["kma", "blastn"], | |
| 1801 help="Method of choice, kma or blastn", | |
| 1802 required=True) | |
| 1803 parser.add_argument("-m_p", | |
| 1804 dest="method_path", | |
| 1805 help="Path to the location of blastn or kma dependent \ | |
| 1806 of the chosen method", | |
| 1807 required=True) | |
| 1808 parser.add_argument("-p", | |
| 1809 dest="db_path", | |
| 1810 metavar="DATABASE", | |
| 1811 help="Path to the location of the pointfinder \ | |
| 1812 database", | |
| 1813 required=True) | |
| 1814 | |
| 1815 # optional arguments | |
| 1816 parser.add_argument("-t", | |
| 1817 dest="threshold", | |
| 1818 metavar="IDENTITY", | |
| 1819 help="Minimum gene identity threshold, default = 0.9", | |
| 1820 type=float, | |
| 1821 default=0.9) | |
| 1822 parser.add_argument("-l", | |
| 1823 dest="min_cov", | |
| 1824 metavar="COVERAGE", | |
| 1825 help="Minimum gene coverage threshold, \ | |
| 1826 threshold = 0.6", | |
| 1827 type=float, | |
| 1828 default=0.6) | |
| 1829 parser.add_argument("-u", | |
| 1830 dest="unknown_mutations", | |
| 1831 help="Show all mutations found even if it's unknown \ | |
| 1832 to the resistance database.", | |
| 1833 action='store_true', | |
| 1834 default=False) | |
| 1835 parser.add_argument("-g", | |
| 1836 dest="specific_gene", | |
| 1837 nargs='+', | |
| 1838 help="Specify genes existing in the database to \ | |
| 1839 search for - if none is specified all genes are \ | |
| 1840 included in the search.", | |
| 1841 default=None) | |
| 1842 | |
| 1843 args = parser.parse_args() | |
| 1844 | |
| 1845 # If no arguments are given print usage message and exit | |
| 1846 if len(sys.argv) == 1: | |
| 1847 sys.exit("Usage: " + parser.usage) | |
| 1848 | |
| 1849 if(args.method == "blastn"): | |
| 1850 method = PointFinder.TYPE_BLAST | |
| 1851 else: | |
| 1852 method = PointFinder.TYPE_KMA | |
| 1853 | |
| 1854 # Get sample name | |
| 1855 filename = args.inputfiles[0].split("/")[-1] | |
| 1856 sample_name = "".join(filename.split(".")[0:-1]) # .split("_")[0] | |
| 1857 if sample_name == "": | |
| 1858 sample_name = filename | |
| 1859 | |
| 1860 # TODO: Change ilogocal var name | |
| 1861 kma_db_path = args.db_path + "/" + args.species | |
| 1862 | |
| 1863 finder = PointFinder(db_path=kma_db_path, species=args.species, | |
| 1864 gene_list=args.specific_gene) | |
| 1865 | |
| 1866 if method == PointFinder.TYPE_BLAST: | |
| 1867 | |
| 1868 # Check that only one input file is given | |
| 1869 if len(args.inputfiles) != 1: | |
| 1870 sys.exit("Input Error: Blast was chosen as mapping method only 1 " | |
| 1871 "input file requied, not %s" % (len(args.inputfiles))) | |
| 1872 | |
| 1873 finder_run = finder.blast(inputfile=args.inputfiles[0], | |
| 1874 out_path=args.out_path, | |
| 1875 min_cov=0.01, | |
| 1876 threshold=args.threshold, | |
| 1877 blast=args.method_path, | |
| 1878 cut_off=False) | |
| 1879 else: | |
| 1880 inputfile_1 = args.inputfiles[0] | |
| 1881 inputfile_2 = None | |
| 1882 | |
| 1883 if(len(args.inputfiles) == 2): | |
| 1884 inputfile_2 = args.inputfiles[1] | |
| 1885 finder_run = finder.kma(inputfile_1=inputfile_1, | |
| 1886 inputfile_2=inputfile_2, | |
| 1887 out_path=args.out_path, | |
| 1888 db_path_kma=kma_db_path, | |
| 1889 databases=[args.species], | |
| 1890 min_cov=args.min_cov, | |
| 1891 threshold=args.threshold, | |
| 1892 kma_path=args.method_path, | |
| 1893 sample_name=sample_name, | |
| 1894 kma_mrs=0.5, kma_gapopen=-5, kma_gapextend=-2, | |
| 1895 kma_penalty=-3, kma_reward=1) | |
| 1896 | |
| 1897 results = finder_run.results | |
| 1898 | |
| 1899 if(args.specific_gene): | |
| 1900 results = PointFinder.discard_unwanted_results( | |
| 1901 results=results, wanted=args.specific_gene) | |
| 1902 | |
| 1903 finder.write_results(out_path=args.out_path, result=results, min_cov=args.min_cov, | |
| 1904 res_type=method, unknown_flag=args.unknown_mutations) |
