annotate resfinder/cge/pointfinder.py @ 0:a16d245332d6 draft default tip

Uploaded
author dcouvin
date Wed, 08 Dec 2021 01:46:07 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1 #!/usr/bin/env python3
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
2 #
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
3 # Program: PointFinder-3.0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
4 # Author: Camilla Hundahl Johnsen
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
5 #
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
6 # Dependencies: KMA or NCBI-blast together with BioPython.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
7
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
8 import os
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
9 import re
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
10 import sys
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
11 import math
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
12 import argparse
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
13 import subprocess
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
14 import random
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
15
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
16 from cgecore.blaster import Blaster
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
17 from cgecore.cgefinder import CGEFinder
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
18 from .output.table import TableResults
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
19 from .phenotype2genotype.feature import ResMutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
20 from .phenotype2genotype.res_profile import PhenoDB
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
21
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
22
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
23 def eprint(*args, **kwargs):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
24 print(*args, file=sys.stderr, **kwargs)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
25
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
26
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
27 class GeneListError(Exception):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
28 """ Raise when a specified gene is not found within the gene list.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
29 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
30 def __init__(self, message, *args):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
31 self.message = message
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
32 # allow users initialize misc. arguments as any other builtin Error
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
33 super(PanelNameError, self).__init__(message, *args)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
34
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
35
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
36 class PointFinder(CGEFinder):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
37
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
38 def __init__(self, db_path, species, gene_list=None):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
39 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
40 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
41 self.species = species
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
42 self.specie_path = db_path
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
43 self.RNA_gene_list = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
44
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
45 self.gene_list = PointFinder.get_file_content(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
46 self.specie_path + "/genes.txt")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
47
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
48 if os.path.isfile(self.specie_path + "/RNA_genes.txt"):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
49 self.RNA_gene_list = PointFinder.get_file_content(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
50 self.specie_path + "/RNA_genes.txt")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
51
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
52 # Creat user defined gene_list if given
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
53 if(gene_list is not None):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
54 self.gene_list = get_user_defined_gene_list(gene_list)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
55
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
56 self.known_mutations, self.drug_genes, self.known_stop_codon = (
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
57 self.get_db_mutations(self.specie_path + "/resistens-overview.txt",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
58 self.gene_list))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
59
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
60 def get_user_defined_gene_list(self, gene_list):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
61 genes_specified = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
62 for gene in gene_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
63 # Check that the genes are valid
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
64 if gene not in self.gene_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
65 raise(GeneListError(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
66 "Input Error: Specified gene not recognised "
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
67 "(%s)\nChoose one or more of the following genes:"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
68 "\n%s" % (gene, "\n".join(self.gene_list))))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
69 genes_specified.append(gene)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
70 # Change the gene_list to the user defined gene_list
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
71 return genes_specified
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
72
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
73 #DELETE
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
74 def old_results_to_standard_output(self, result, software, version,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
75 run_date, run_cmd, id,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
76 unknown_mut=False, tableresults=None):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
77 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
78 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
79 std_results = TableResults(software, version, run_date, run_cmd, id)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
80 headers = [
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
81 "template_name",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
82 "template_length",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
83 "aln_length",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
84 "aln_identity",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
85 "aln_gaps",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
86 "aln_template_string",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
87 "aln_query_string",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
88 "aln_homology_string",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
89 "query_id",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
90 "query_start_pos",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
91 "query_end_pos",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
92 "template_variant",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
93 "query_depth",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
94 "blast_eval",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
95 "blast_bitscore",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
96 "pval",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
97 "software_score",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
98 "mutation",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
99 "ref_codon",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
100 "alt_codon",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
101 "ref_aa",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
102 "alt_aa",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
103 "insertion",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
104 "deletion"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
105 ]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
106 for db_name, db in result.items():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
107 if(db_name == "excluded"):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
108 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
109
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
110 if(db == "No hit found"):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
111 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
112
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
113 ####Added to solve PointFinder####
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
114 if(isinstance(db, str)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
115 if db.startswith("Gene found with coverage"):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
116 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
117 ####
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
118
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
119 # Start writing output string (to HTML tab file)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
120 gene_name = PhenoDB.if_promoter_rename(db_name)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
121
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
122 # Find and save mis_matches in gene
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
123 sbjct_start = db["sbjct_start"]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
124 sbjct_seq = db["sbjct_string"]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
125 qry_seq = db["query_string"]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
126 db["mis_matches"] = self.find_mismatches(gene=db_name,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
127 sbjct_start=sbjct_start,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
128 sbjct_seq=sbjct_seq,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
129 qry_seq=qry_seq)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
130
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
131 known_muts = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
132 unknown_muts = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
133 if(len(db["mis_matches"]) > 0):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
134 known_muts, unknown_muts = self.get_mutations(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
135 db_name, gene_name, db['mis_matches'], True, db)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
136
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
137 # No mutations found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
138 if(not (unknown_muts or known_muts)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
139 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
140
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
141 std_results.add_table(db_name)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
142 std_db = std_results.long[db_name]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
143 std_db.add_headers(headers)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
144 std_db.lock_headers = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
145
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
146 for mut in known_muts:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
147 unique_id = mut.unique_id
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
148 std_db[unique_id] = {
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
149 "template_name": db_name,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
150 "template_length": db["sbjct_length"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
151 "aln_length": db["HSP_length"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
152 "aln_identity": db["perc_ident"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
153 "aln_gaps": db["gaps"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
154 "aln_template_string": db["sbjct_string"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
155 "aln_query_string": db["query_string"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
156 "aln_homology_string": db["homo_string"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
157 "query_id": db["contig_name"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
158 "query_start_pos": mut.pos,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
159 "query_end_pos": mut.end,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
160 "query_depth": db.get("depth", "NA"),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
161 "pval": db.get("p_value", "NA"),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
162 "software_score": db["cal_score"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
163 "mutation": mut.mut_string_short,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
164 "ref_codon": mut.ref_codon,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
165 "alt_codon": mut.mut_codon,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
166 "ref_aa": mut.ref_aa,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
167 "alt_aa": mut.mut_aa,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
168 "insertion": mut.insertion,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
169 "deletion": mut.deletion
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
170 }
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
171
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
172 return std_results
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
173
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
174 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
175 def get_db_names(db_root_path):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
176 out_lst = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
177 for entry in os.scandir(db_root_path):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
178 if not entry.name.startswith('.') and entry.is_dir():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
179 out_lst.append(entry.name)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
180 return tuple(out_lst)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
181
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
182 def results_to_str(self, res_type, results, unknown_flag, min_cov,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
183 perc_iden):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
184 # Initiate output stings with headers
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
185 gene_list = ', '.join(self.gene_list)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
186 output_strings = [
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
187 "Mutation\tNucleotide change\tAmino acid change\tResistance\tPMID",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
188 "Chromosomal point mutations - Results\nSpecies: %s\nGenes: %s\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
189 "Mapping methode: %s\n\n\nKnown Mutations\n" % (self.species, gene_list, res_type), ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
190 ]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
191 # Get all drug names and add header of all drugs to prediction file
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
192 drug_lst = [drug for drug in self.drug_genes.keys()]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
193 output_strings[2] = "\t".join(drug_lst) + "\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
194
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
195 # Define variables to write temporary output into
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
196 total_unknown_str = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
197 unique_drug_list = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
198 excluded_hits = {}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
199
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
200 if res_type == PointFinder.TYPE_BLAST:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
201 # Patch together partial sequence hits (BLASTER does not do this)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
202 # and removes dummy_hit_id
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
203 GENES = self.find_best_seqs(results, min_cov)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
204
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
205 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
206 # TODO: Some databases are either genes or species,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
207 # depending on the method applied (BLAST/KMA).
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
208 GENES = results[self.species]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
209
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
210 # If not hits found insert genes not found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
211 if GENES == 'No hit found':
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
212 GENES = {}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
213
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
214 # KMA only gives the genes found, however all genes should be
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
215 # included
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
216 for gene in self.gene_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
217 if gene not in GENES:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
218 GENES[gene] = 'No hit found'
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
219
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
220 # Included excluded
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
221 GENES['excluded'] = results['excluded']
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
222
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
223 for gene, hit in GENES.items():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
224 # Start writing output string (to HTML tab file)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
225 gene_name = gene # Not perfeft can differ from specific mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
226 regex = r"promoter_size_(\d+)(?:bp)"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
227 promtr_gene_objt = re.search(regex, gene)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
228
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
229 if promtr_gene_objt:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
230 gene_name = gene.split("_")[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
231
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
232 output_strings[1] += "\n%s\n" % (gene_name)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
233
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
234 # Ignore excluded genes
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
235 # TODO: Should all not found genes be moved to this dict?
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
236 if gene == "excluded":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
237 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
238
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
239 # Write exclusion reason for genes not found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
240 if isinstance(GENES[gene], str):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
241 output_strings[1] += GENES[gene] + "\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
242 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
243
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
244 if hit['perc_ident'] < perc_iden and hit['perc_coverage'] < min_cov:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
245 GENES[gene] = ('Gene found with identity, %s, below minimum '
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
246 'identity threshold: %s. And with coverage, %s,'
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
247 ' below minimum coverage threshold: %s.'
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
248 % (hit['perc_ident'], perc_iden,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
249 hit['perc_coverage'], min_cov))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
250 output_strings[1] += GENES[gene] + "\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
251 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
252
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
253 if hit['perc_ident'] < perc_iden:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
254 GENES[gene] = ('Gene found with identity, %s, below minimum '
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
255 'identity threshold: %s' % (hit['perc_ident'],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
256 perc_iden))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
257 output_strings[1] += GENES[gene] + "\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
258 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
259
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
260 if hit['perc_coverage'] < min_cov:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
261 # Gene not found above given coverage
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
262 GENES[gene] = ('Gene found with coverage, %s, below minimum '
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
263 'coverage threshold: %s' % (hit['perc_coverage'],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
264 min_cov))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
265 output_strings[1] += GENES[gene] + "\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
266 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
267
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
268 sbjct_start = hit['sbjct_start']
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
269 sbjct_seq = hit['sbjct_string']
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
270 qry_seq = hit['query_string']
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
271
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
272 # Find and save mis_matches in gene
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
273 hit['mis_matches'] = self.find_mismatches(gene, sbjct_start,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
274 sbjct_seq, qry_seq)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
275
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
276 # Check if no mutations was found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
277 if len(hit['mis_matches']) < 1:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
278 output_strings[1] += ("No mutations found in {}\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
279 .format(gene_name))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
280 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
281 # Write mutations found to output file
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
282 total_unknown_str += "\n%s\n" % (gene_name)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
283
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
284 str_tuple = self.mut2str(gene, gene_name, hit['mis_matches'],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
285 unknown_flag, hit)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
286
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
287 all_results = str_tuple[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
288 total_known = str_tuple[1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
289 total_unknown = str_tuple[2]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
290 drug_list = str_tuple[3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
291
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
292 # Add results to output strings
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
293 if(all_results):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
294 output_strings[0] += "\n" + all_results
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
295 output_strings[1] += total_known + "\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
296
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
297 # Add unknown mutations the total results of
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
298 # unknown mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
299 total_unknown_str += total_unknown + "\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
300
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
301 # Add drugs to druglist
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
302 unique_drug_list += [drug.lower() for drug in drug_list]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
303
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
304 if unknown_flag is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
305 output_strings[1] += ("\n\nUnknown Mutations \n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
306 + total_unknown_str)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
307
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
308 # Make Resistance Prediction output
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
309
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
310 # Go throug all drugs in the database and see if prediction
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
311 # can be called.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
312 pred_output = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
313 for drug in drug_lst:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
314 drug = drug.lower()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
315 # Check if resistance to drug was found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
316 if drug in unique_drug_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
317 pred_output.append("1")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
318 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
319 # Check at all genes associated with the drug
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
320 # resistance where found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
321 all_genes_found = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
322 for gene in self.drug_genes[drug]:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
323 if gene not in GENES:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
324 all_genes_found = False
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
325
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
326 if all_genes_found is False:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
327 pred_output.append("?")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
328 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
329 pred_output.append("0")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
330
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
331 output_strings[2] += "\t".join(pred_output) + "\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
332
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
333 return output_strings
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
334
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
335 def write_results(self, out_path, result, res_type, unknown_flag, min_cov,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
336 perc_iden):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
337 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
338 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
339
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
340 result_str = self.results_to_str(res_type=res_type,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
341 results=result,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
342 unknown_flag=unknown_flag,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
343 min_cov=min_cov,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
344 perc_iden=perc_iden)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
345
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
346 with open(out_path + "/PointFinder_results.txt", "w") as fh:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
347 fh.write(result_str[0])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
348 with open(out_path + "/PointFinder_table.txt", "w") as fh:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
349 fh.write(result_str[1])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
350 with open(out_path + "/PointFinder_prediction.txt", "w") as fh:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
351 fh.write(result_str[2])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
352
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
353 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
354 def discard_unwanted_results(results, wanted):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
355 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
356 Takes a dict and a list of keys.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
357 Returns a dict containing only the keys from the list.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
358 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
359 cleaned_results = dict()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
360 for key, val in results.items():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
361 if(key in wanted):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
362 cleaned_results[key] = val
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
363 return cleaned_results
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
364
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
365 def blast(self, inputfile, out_path, min_cov=0.6, threshold=0.9,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
366 blast="blastn", cut_off=True):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
367 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
368 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
369 blast_run = Blaster(inputfile=inputfile, databases=self.gene_list,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
370 db_path=self.specie_path, out_path=out_path,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
371 min_cov=min_cov, threshold=threshold, blast=blast,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
372 cut_off=cut_off)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
373
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
374 self.blast_results = blast_run.results # TODO Is this used?
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
375 return blast_run
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
376
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
377 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
378 def get_db_mutations(mut_db_path, gene_list):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
379 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
380 This function opens the file resistenss-overview.txt, and reads
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
381 the content into a dict of dicts. The dict will contain
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
382 information about all known mutations given in the database.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
383 This dict is returned.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
384 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
385
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
386 # Initiate variables
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
387 known_mutations = dict()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
388 known_stop_codon = dict()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
389 drug_genes = dict()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
390 indelflag = False
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
391 stopcodonflag = False
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
392
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
393 # Go throug mutation file line by line
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
394
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
395 with open(mut_db_path, "r") as fh:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
396 mutdb_file = fh.readlines()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
397 mutdb_file = [line.strip() for line in mutdb_file]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
398
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
399 for line in mutdb_file:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
400 # Ignore headers and check where the indel section starts
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
401 if line.startswith("#"):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
402 if "indel" in line.lower():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
403 indelflag = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
404 elif "stop codon" in line.lower():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
405 stopcodonflag = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
406 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
407 stopcodonflag = False
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
408 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
409
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
410 mutation = [data.strip() for data in line.split("\t")]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
411
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
412 gene_ID = mutation[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
413
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
414 # Only consider mutations in genes found in the gene list
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
415 if gene_ID in gene_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
416 gene_name = mutation[1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
417 mut_pos = int(mutation[2])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
418 ref_codon = mutation[3] # Ref_nuc (1 or 3?)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
419 ref_aa = mutation[4] # Ref_codon
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
420 alt_aa = mutation[5].split(",") # Res_codon
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
421 res_drug = mutation[6].replace("\t", " ")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
422 pmid = mutation[7].split(",")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
423
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
424 # Check if stop codons are predictive for resistance
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
425 if stopcodonflag is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
426 if gene_ID not in known_stop_codon:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
427 known_stop_codon[gene_ID] = {"pos": [],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
428 "drug": res_drug}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
429 known_stop_codon[gene_ID]["pos"].append(mut_pos)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
430
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
431 # Add genes associated with drug resistance to drug_genes dict
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
432 drug_lst = res_drug.split(",")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
433 drug_lst = [d.strip().lower() for d in drug_lst]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
434 for drug in drug_lst:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
435 if drug not in drug_genes:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
436 drug_genes[drug] = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
437 if gene_ID not in drug_genes[drug]:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
438 drug_genes[drug].append(gene_ID)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
439
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
440 # Initiate empty dict to store relevant mutation information
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
441 mut_info = dict()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
442
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
443 # Save need mutation info with pmid cooresponding to the amino
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
444 # acid change
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
445 for i in range(len(alt_aa)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
446 try:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
447 mut_info[alt_aa[i]] = {"gene_name": gene_name,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
448 "drug": res_drug,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
449 "pmid": pmid[i]}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
450 except IndexError:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
451 mut_info[alt_aa[i]] = {"gene_name": gene_name,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
452 "drug": res_drug,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
453 "pmid": "-"}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
454
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
455 # Add all possible types of mutations to the dict
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
456 if gene_ID not in known_mutations:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
457 known_mutations[gene_ID] = {"sub": dict(), "ins": dict(),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
458 "del": dict()}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
459 # Check for the type of mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
460 if indelflag is False:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
461 mutation_type = "sub"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
462 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
463 mutation_type = ref_aa
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
464
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
465 # Save mutations positions with required information given in
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
466 # mut_info
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
467 if mut_pos not in known_mutations[gene_ID][mutation_type]:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
468 known_mutations[gene_ID][mutation_type][mut_pos] = dict()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
469 for amino in alt_aa:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
470 if (amino in known_mutations[gene_ID][mutation_type]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
471 [mut_pos]):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
472 stored_mut_info = (known_mutations[gene_ID]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
473 [mutation_type]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
474 [mut_pos][amino])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
475 if stored_mut_info["drug"] != mut_info[amino]["drug"]:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
476 stored_mut_info["drug"] += "," + (mut_info[amino]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
477 ["drug"])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
478 if stored_mut_info["pmid"] != mut_info[amino]["pmid"]:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
479 stored_mut_info["pmid"] += ";" + (mut_info[amino]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
480 ["pmid"])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
481
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
482 (known_mutations[gene_ID][mutation_type]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
483 [mut_pos][amino]) = stored_mut_info
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
484 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
485 (known_mutations[gene_ID][mutation_type]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
486 [mut_pos][amino]) = mut_info[amino]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
487
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
488 # Check that all genes in the gene list has known mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
489 for gene in gene_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
490 if gene not in known_mutations:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
491 known_mutations[gene] = {"sub": dict(), "ins": dict(),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
492 "del": dict()}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
493 return known_mutations, drug_genes, known_stop_codon
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
494
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
495 def find_best_seqs(self, blast_results, min_cov):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
496 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
497 This function finds gene sequences with the largest coverage based on
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
498 the blast results. If more hits covering different sequences parts
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
499 exists it concatinates partial gene hits into one hit.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
500 If different overlap sequences occurs they are saved in the list
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
501 alternative_overlaps. The function returns a new results dict where
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
502 each gene has an inner dict with hit information corresponding to
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
503 the new concatinated hit. If no gene is found the value is a string
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
504 with info of why the gene wasn't found.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
505 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
506
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
507 GENES = dict()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
508 for gene, hits in blast_results.items():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
509
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
510 # Save all hits in the list 'hits_found'
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
511 hits_found = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
512 GENES[gene] = {}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
513
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
514 # Check for gene has any hits in blast results
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
515 if gene == 'excluded':
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
516 GENES[gene] = hits
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
517 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
518 elif isinstance(hits, dict) and len(hits) > 0:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
519 GENES[gene]['found'] = 'partially'
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
520 GENES[gene]['hits'] = hits
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
521 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
522 # Gene not found! go to next gene
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
523 GENES[gene] = 'No hit found'
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
524 continue
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
525
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
526 # Check coverage for each hit, patch together partial genes hits
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
527 for hit_id, hit in hits.items():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
528
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
529 # Save hits start and end positions in subject, total subject
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
530 # len, and subject and query sequences of each hit
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
531 hits_found += [(hit['sbjct_start'], hit['sbjct_end'],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
532 hit['sbjct_string'], hit['query_string'],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
533 hit['sbjct_length'], hit['homo_string'],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
534 hit_id)]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
535
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
536 # If coverage is 100% change found to yes
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
537 hit_coverage = hit['perc_coverage']
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
538 if hit_coverage == 1.0:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
539 GENES[gene]['found'] = 'yes'
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
540
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
541 # Sort positions found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
542 hits_found = sorted(hits_found, key=lambda x: x[0])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
543
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
544 # Find best hit by concatenating sequences if more hits exist
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
545
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
546 # Get information from the first hit found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
547 all_start = hits_found[0][0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
548 current_end = hits_found[0][1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
549 final_sbjct = hits_found[0][2]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
550 final_qry = hits_found[0][3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
551 sbjct_len = hits_found[0][4]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
552 final_homol = hits_found[0][5]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
553 first_hit_id = hits_found[0][6]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
554
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
555 alternative_overlaps = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
556 contigs = [hit['contig_name']]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
557 scores = [str(hit['cal_score'])]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
558
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
559 # Check if more then one hit was found within the same gene
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
560 for i in range(len(hits_found) - 1):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
561
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
562 # Save information from previous hit
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
563 pre_block_start = hits_found[i][0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
564 pre_block_end = hits_found[i][1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
565 pre_sbjct = hits_found[i][2]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
566 pre_qry = hits_found[i][3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
567 pre_homol = hits_found[i][5]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
568 pre_id = hits_found[i][6]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
569
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
570 # Save information from next hit
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
571 next_block_start = hits_found[i + 1][0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
572 next_block_end = hits_found[i + 1][1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
573 next_sbjct = hits_found[i + 1][2]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
574 next_qry = hits_found[i + 1][3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
575 next_homol = hits_found[i + 1][5]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
576 next_id = hits_found[i + 1][6]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
577
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
578 contigs.append(hits[next_id]['contig_name'])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
579 scores.append(str(hits[next_id]['cal_score']))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
580
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
581 # Check for overlapping sequences, collaps them and save
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
582 # alternative overlaps if any
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
583 if next_block_start <= current_end:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
584
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
585 # Find overlap start and take gaps into account
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
586 pos_count = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
587 overlap_pos = pre_block_start
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
588 for i in range(len(pre_sbjct)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
589
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
590 # Stop loop if overlap_start position is reached
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
591 if overlap_pos == next_block_start:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
592 overlap_start = pos_count
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
593 break
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
594 if pre_sbjct[i] != "-":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
595 overlap_pos += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
596 pos_count += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
597
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
598 # Find overlap len and add next sequence to final sequence
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
599 if len(pre_sbjct[overlap_start:]) > len(next_sbjct):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
600 # <--------->
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
601 # <--->
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
602 overlap_len = len(next_sbjct)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
603 overlap_end_pos = next_block_end
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
604 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
605 # <--------->
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
606 # <--------->
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
607 overlap_len = len(pre_sbjct[overlap_start:])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
608 overlap_end_pos = pre_block_end
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
609
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
610 # Update current end
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
611 current_end = next_block_end
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
612
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
613 # Use the entire previous sequence and add the last
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
614 # part of the next sequence
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
615 final_sbjct += next_sbjct[overlap_len:]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
616 final_qry += next_qry[overlap_len:]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
617 final_homol += next_homol[overlap_len:]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
618
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
619 # Find query overlap sequences
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
620 pre_qry_overlap = pre_qry[overlap_start: (overlap_start
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
621 + overlap_len)]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
622 next_qry_overlap = next_qry[:overlap_len]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
623 sbjct_overlap = next_sbjct[:overlap_len]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
624
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
625 # If alternative query overlap excist save it
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
626 if pre_qry_overlap != next_qry_overlap:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
627 eprint("OVERLAP WARNING:")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
628 eprint("{}\n{}"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
629 .format(pre_qry_overlap, next_qry_overlap))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
630
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
631 # Save alternative overlaps
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
632 alternative_overlaps += [(next_block_start,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
633 overlap_end_pos,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
634 sbjct_overlap,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
635 next_qry_overlap)]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
636
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
637 elif next_block_start > current_end:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
638 # <------->
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
639 # <------->
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
640 gap_size = next_block_start - current_end - 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
641 final_qry += "N" * gap_size
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
642 final_sbjct += "N" * gap_size
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
643 final_homol += "-" * gap_size
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
644 current_end = next_block_end
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
645 final_sbjct += next_sbjct
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
646 final_qry += next_qry
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
647 final_homol += next_homol
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
648
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
649 # Calculate new coverage
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
650 no_call = final_qry.upper().count("N")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
651 coverage = ((current_end - all_start + 1 - no_call)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
652 / float(sbjct_len))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
653
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
654 # Calculate identity
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
655 equal = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
656 not_equal = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
657 for i in range(len(final_qry)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
658 if final_qry[i].upper() != "N":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
659 if final_qry[i].upper() == final_sbjct[i].upper():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
660 equal += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
661 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
662 not_equal += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
663 identity = equal / float(equal + not_equal)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
664 if coverage >= min_cov:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
665 GENES[gene]['perc_coverage'] = coverage
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
666 GENES[gene]['perc_ident'] = identity
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
667 GENES[gene]['sbjct_string'] = final_sbjct
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
668 GENES[gene]['query_string'] = final_qry
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
669 GENES[gene]['homo_string'] = final_homol
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
670 GENES[gene]['contig_name'] = ", ".join(contigs)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
671 GENES[gene]['HSP_length'] = len(final_qry)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
672 GENES[gene]['sbjct_start'] = all_start
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
673 GENES[gene]['sbjct_end'] = current_end
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
674 GENES[gene]['sbjct_length'] = sbjct_len
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
675 GENES[gene]['cal_score'] = ", ".join(scores)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
676 GENES[gene]['gaps'] = no_call
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
677 GENES[gene]['alternative_overlaps'] = alternative_overlaps
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
678 GENES[gene]['mis_matches'] = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
679
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
680 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
681 # Gene not found above given coverage
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
682 GENES[gene] = ('Gene found with coverage, %f, below '
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
683 'minimum coverage threshold: %s' % (coverage,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
684 min_cov))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
685 return GENES
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
686
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
687 def find_mismatches(self, gene, sbjct_start, sbjct_seq, qry_seq):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
688 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
689 This function finds mis matches between two sequeces. Depending
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
690 on the the sequence type either the function
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
691 find_codon_mismatches or find_nucleotid_mismatches are called,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
692 if the sequences contains both a promoter and a coding region
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
693 both functions are called. The function can also call itself if
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
694 alternative overlaps is give. All found mismatches are returned
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
695 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
696
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
697 # Initiate the mis_matches list that will store all found mis matcehs
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
698 mis_matches = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
699
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
700 # Find mis matches in RNA genes
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
701 if gene in self.RNA_gene_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
702 mis_matches += PointFinder.find_nucleotid_mismatches(sbjct_start,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
703 sbjct_seq,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
704 qry_seq)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
705 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
706 # Check if the gene sequence is with a promoter
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
707 regex = r"promoter_size_(\d+)(?:bp)"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
708 promtr_gene_objt = re.search(regex, gene)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
709
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
710 # Check for promoter sequences
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
711 if promtr_gene_objt:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
712 # Get promoter length
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
713 promtr_len = int(promtr_gene_objt.group(1))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
714
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
715 # Extract promoter sequence, while considering gaps
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
716 # --------agt-->----
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
717 # ---->?
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
718 if sbjct_start <= promtr_len:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
719 # Find position in sbjct sequence where promoter ends
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
720 promtr_end = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
721 nuc_count = sbjct_start - 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
722
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
723 for i in range(len(sbjct_seq)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
724 promtr_end += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
725
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
726 if sbjct_seq[i] != "-":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
727 nuc_count += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
728
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
729 if nuc_count == promtr_len:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
730 break
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
731
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
732 # Check if only a part of the promoter is found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
733 # --------agt-->----
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
734 # ----
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
735 promtr_sbjct_start = -1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
736 if nuc_count < promtr_len:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
737 promtr_sbjct_start = nuc_count - promtr_len
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
738
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
739 # Get promoter part of subject and query
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
740 sbjct_promtr_seq = sbjct_seq[:promtr_end]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
741 qry_promtr_seq = qry_seq[:promtr_end]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
742
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
743 # For promoter part find nucleotide mis matches
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
744 mis_matches += PointFinder.find_nucleotid_mismatches(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
745 promtr_sbjct_start, sbjct_promtr_seq, qry_promtr_seq,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
746 promoter=True)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
747
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
748 # Check if gene is also found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
749 # --------agt-->----
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
750 # -----------
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
751 if((sbjct_start + len(sbjct_seq.replace("-", "")))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
752 > promtr_len):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
753 sbjct_gene_seq = sbjct_seq[promtr_end:]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
754 qry_gene_seq = qry_seq[promtr_end:]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
755 sbjct_gene_start = 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
756
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
757 # Find mismatches in gene part
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
758 mis_matches += PointFinder.find_codon_mismatches(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
759 sbjct_gene_start, sbjct_gene_seq, qry_gene_seq)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
760
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
761 # No promoter, only gene is found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
762 # --------agt-->----
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
763 # -----
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
764 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
765 sbjct_gene_start = sbjct_start - promtr_len
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
766
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
767 # Find mismatches in gene part
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
768 mis_matches += PointFinder.find_codon_mismatches(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
769 sbjct_gene_start, sbjct_seq, qry_seq)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
770
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
771 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
772 # Find mismatches in gene
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
773 mis_matches += PointFinder.find_codon_mismatches(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
774 sbjct_start, sbjct_seq, qry_seq)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
775
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
776 return mis_matches
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
777
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
778 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
779 def find_nucleotid_mismatches(sbjct_start, sbjct_seq, qry_seq,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
780 promoter=False):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
781 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
782 This function takes two alligned sequence (subject and query),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
783 and the position on the subject where the alignment starts. The
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
784 sequences are compared one nucleotide at a time. If mis matches
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
785 are found they are saved. If a gap is found the function
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
786 find_nuc_indel is called to find the entire indel and it is
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
787 also saved into the list mis_matches. If promoter sequences are
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
788 given as arguments, these are reversed the and the absolut
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
789 value of the sequence position used, but when mutations are
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
790 saved the negative value and det reverse sequences are saved in
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
791 mis_mathces.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
792 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
793
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
794 # Initiate the mis_matches list that will store all found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
795 # mismatcehs
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
796 mis_matches = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
797
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
798 sbjct_start = abs(sbjct_start)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
799 seq_pos = sbjct_start
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
800
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
801 # Set variables depending on promoter status
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
802 factor = 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
803 mut_prefix = "r."
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
804
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
805 if promoter is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
806 factor = (-1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
807 mut_prefix = "n."
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
808 # Reverse promoter sequences
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
809 sbjct_seq = sbjct_seq[::-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
810 qry_seq = qry_seq[::-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
811
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
812 # Go through sequences one nucleotide at a time
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
813 shift = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
814 for index in range(sbjct_start - 1, len(sbjct_seq)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
815 mut_name = mut_prefix
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
816 mut = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
817 # Shift index according to gaps
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
818 i = index + shift
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
819
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
820 # If the end of the sequence is reached, stop
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
821 if i == len(sbjct_seq):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
822 break
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
823
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
824 sbjct_nuc = sbjct_seq[i]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
825 qry_nuc = qry_seq[i]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
826
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
827 # Check for mis matches
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
828 if sbjct_nuc.upper() != qry_nuc.upper():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
829
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
830 # check for insertions and deletions
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
831 if sbjct_nuc == "-" or qry_nuc == "-":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
832 if sbjct_nuc == "-":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
833 mut = "ins"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
834 indel_start_pos = (seq_pos - 1) * factor
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
835 indel_end_pos = seq_pos * factor
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
836 indel = PointFinder.find_nuc_indel(sbjct_seq[i:],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
837 qry_seq[i:])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
838 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
839 mut = "del"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
840 indel_start_pos = seq_pos * factor
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
841 indel = PointFinder.find_nuc_indel(qry_seq[i:],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
842 sbjct_seq[i:])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
843 indel_end_pos = (seq_pos + len(indel) - 1) * factor
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
844 seq_pos += len(indel) - 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
845
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
846 # Shift the index to the end of the indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
847 shift += len(indel) - 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
848
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
849 # Write mutation name, depending on sequnce
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
850 if len(indel) == 1 and mut == "del":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
851 mut_name += str(indel_start_pos) + mut + indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
852 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
853 if promoter is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
854 # Reverse the sequence and the start and
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
855 # end positions
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
856 indel = indel[::-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
857 temp = indel_start_pos
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
858 indel_start_pos = indel_end_pos
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
859 indel_end_pos = temp
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
860
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
861 mut_name += (str(indel_start_pos) + "_"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
862 + str(indel_end_pos) + mut + indel)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
863
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
864 mis_matches += [[mut, seq_pos * factor, seq_pos * factor,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
865 indel, mut_name, mut, indel]]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
866
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
867 # Check for substitutions mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
868 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
869 mut = "sub"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
870 mut_name += (str(seq_pos * factor) + sbjct_nuc + ">"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
871 + qry_nuc)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
872
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
873 mis_matches += [[mut, seq_pos * factor, seq_pos * factor,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
874 qry_nuc, mut_name, sbjct_nuc, qry_nuc]]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
875
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
876 # Increment sequence position
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
877 if mut != "ins":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
878 seq_pos += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
879
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
880 return mis_matches
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
881
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
882 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
883 def find_nuc_indel(gapped_seq, indel_seq):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
884 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
885 This function finds the entire indel missing in from a gapped
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
886 sequence compared to the indel_seqeunce. It is assumes that the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
887 sequences start with the first position of the gap.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
888 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
889 ref_indel = indel_seq[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
890 for j in range(1, len(gapped_seq)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
891 if gapped_seq[j] == "-":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
892 ref_indel += indel_seq[j]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
893 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
894 break
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
895 return ref_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
896
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
897 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
898 def aa(codon):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
899 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
900 This function converts a codon to an amino acid. If the codon
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
901 is not valid an error message is given, or else, the amino acid
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
902 is returned.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
903
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
904 Potential future issue: If species are added that utilizes
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
905 alternative translation tables.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
906 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
907 codon = codon.upper()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
908 aa = {"ATT": "I", "ATC": "I", "ATA": "I",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
909 "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L", "TTA": "L",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
910 "TTG": "L",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
911 "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
912 "TTT": "F", "TTC": "F",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
913 "ATG": "M",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
914 "TGT": "C", "TGC": "C",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
915 "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
916 "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
917 "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
918 "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
919 "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
920 "AGC": "S",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
921 "TAT": "Y", "TAC": "Y",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
922 "TGG": "W",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
923 "CAA": "Q", "CAG": "Q",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
924 "AAT": "N", "AAC": "N",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
925 "CAT": "H", "CAC": "H",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
926 "GAA": "E", "GAG": "E",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
927 "GAT": "D", "GAC": "D",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
928 "AAA": "K", "AAG": "K",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
929 "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
930 "AGG": "R",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
931 "TAA": "*", "TAG": "*", "TGA": "*"}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
932 # Translate valid codon
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
933 try:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
934 amino_a = aa[codon]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
935 except KeyError:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
936 amino_a = "?"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
937 return amino_a
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
938
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
939 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
940 def get_codon(seq, codon_no, start_offset):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
941 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
942 This function takes a sequece and a codon number and returns
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
943 the codon found in the sequence at that position
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
944 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
945 seq = seq.replace("-", "")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
946 codon_start_pos = int(codon_no - 1) * 3 - start_offset
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
947 codon = seq[codon_start_pos:codon_start_pos + 3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
948 return codon
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
949
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
950 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
951 def name_insertion(sbjct_seq, codon_no, sbjct_nucs, aa_alt, start_offset):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
952 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
953 This function is used to name a insertion mutation based on the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
954 HGVS recommendation.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
955 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
956 start_codon_no = codon_no - 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
957
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
958 if len(sbjct_nucs) == 3:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
959 start_codon_no = codon_no
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
960
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
961 start_codon = PointFinder.get_codon(sbjct_seq, start_codon_no,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
962 start_offset)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
963 end_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
964 pos_name = "p.%s%d_%s%dins%s" % (PointFinder.aa(start_codon),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
965 start_codon_no,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
966 PointFinder.aa(end_codon),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
967 codon_no, aa_alt)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
968 return pos_name
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
969
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
970 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
971 def name_deletion(sbjct_seq, sbjct_rf_indel, sbjct_nucs, codon_no, aa_alt,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
972 start_offset, mutation="del"):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
973 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
974 This function is used to name a deletion mutation based on the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
975 HGVS recommendation. If combination of a deletion and an
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
976 insertion is identified the argument 'mutation' is set to
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
977 'delins' and the mutation name will indicate that the mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
978 is a delins mutation.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
979 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
980 del_codon = PointFinder.get_codon(sbjct_seq, codon_no, start_offset)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
981 pos_name = "p.%s%d" % (PointFinder.aa(del_codon), codon_no)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
982
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
983 # This has been changed
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
984 if len(sbjct_rf_indel) == 3 and mutation == "del":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
985 return pos_name + mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
986
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
987 end_codon_no = codon_no + math.ceil(len(sbjct_nucs) / 3) - 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
988 end_codon = PointFinder.get_codon(sbjct_seq, end_codon_no,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
989 start_offset)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
990 pos_name += "_%s%d%s" % (PointFinder.aa(end_codon), end_codon_no,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
991 mutation)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
992 if mutation == "delins":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
993 pos_name += aa_alt
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
994 return pos_name
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
995
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
996 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
997 def name_indel_mutation(sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
998 codon_no, mut, start_offset):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
999 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1000 This function serves to name the individual mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1001 dependently on the type of the mutation.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1002 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1003 # Get the subject and query sequences without gaps
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1004 sbjct_nucs = sbjct_rf_indel.replace("-", "")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1005 qry_nucs = qry_rf_indel.replace("-", "")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1006
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1007 # Translate nucleotides to amino acids
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1008 aa_ref = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1009 aa_alt = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1010
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1011 for i in range(0, len(sbjct_nucs), 3):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1012 aa_ref += PointFinder.aa(sbjct_nucs[i:i + 3])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1013
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1014 for i in range(0, len(qry_nucs), 3):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1015 aa_alt += PointFinder.aa(qry_nucs[i:i + 3])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1016
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1017 # Identify the gapped sequence
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1018 if mut == "ins":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1019 gapped_seq = sbjct_rf_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1020 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1021 gapped_seq = qry_rf_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1022
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1023 gap_size = gapped_seq.count("-")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1024
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1025 # Write mutation names
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1026 if gap_size < 3 and len(sbjct_nucs) == 3 and len(qry_nucs) == 3:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1027 # Write mutation name for substitution mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1028 mut_name = "p.%s%d%s" % (PointFinder.aa(sbjct_nucs), codon_no,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1029 PointFinder.aa(qry_nucs))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1030 elif len(gapped_seq) == gap_size:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1031
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1032 if mut == "ins":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1033 # Write mutation name for insertion mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1034 mut_name = PointFinder.name_insertion(sbjct_seq, codon_no,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1035 sbjct_nucs, aa_alt,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1036 start_offset)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1037 aa_ref = mut
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1038 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1039 # Write mutation name for deletion mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1040 mut_name = PointFinder.name_deletion(sbjct_seq, sbjct_rf_indel,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1041 sbjct_nucs, codon_no,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1042 aa_alt, start_offset,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1043 mutation="del")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1044 aa_alt = mut
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1045 # Check for delins - mix of insertion and deletion
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1046 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1047 # Write mutation name for a mixed insertion and deletion
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1048 # mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1049 mut_name = PointFinder.name_deletion(sbjct_seq,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1050 sbjct_rf_indel, sbjct_nucs,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1051 codon_no, aa_alt,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1052 start_offset,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1053 mutation="delins")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1054
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1055 # Check for frameshift
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1056 if gapped_seq.count("-") % 3 != 0:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1057 # Add the frameshift tag to mutation name
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1058 mut_name += " - Frameshift"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1059
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1060 return mut_name, aa_ref, aa_alt
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1061
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1062 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1063 def get_inframe_gap(seq, nucs_needed=3):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1064 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1065 This funtion takes a sequnece starting with a gap or the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1066 complementary seqeuence to the gap, and the number of
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1067 nucleotides that the seqeunce should contain in order to
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1068 maintain the correct reading frame. The sequence is gone
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1069 through and the number of non-gap characters are counted. When
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1070 the number has reach the number of needed nucleotides the indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1071 is returned. If the indel is a 'clean' insert or deletion that
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1072 starts in the start of a codon and can be divided by 3, then
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1073 only the gap is returned.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1074 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1075 nuc_count = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1076 gap_indel = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1077 nucs = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1078
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1079 for i in range(len(seq)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1080 # Check if the character is not a gap
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1081 if seq[i] != "-":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1082 # Check if the indel is a 'clean'
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1083 # i.e. if the insert or deletion starts at the first
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1084 # nucleotide in the codon and can be divided by 3
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1085
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1086 if(gap_indel.count("-") == len(gap_indel)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1087 and gap_indel.count("-") >= 3 and len(gap_indel) != 0):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1088 return gap_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1089
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1090 nuc_count += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1091 gap_indel += seq[i]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1092
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1093 # if the number of nucleotides in the indel equals the amount
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1094 # needed for the indel, the indel is returned.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1095 if nuc_count == nucs_needed:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1096 return gap_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1097
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1098 # This will only happen if the gap is in the very end of a sequence
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1099 return gap_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1100
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1101 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1102 def get_indels(sbjct_seq, qry_seq, start_pos):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1103 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1104 This function uses regex to find inserts and deletions in
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1105 sequences given as arguments. A list of these indels are
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1106 returned. The list includes, type of mutations(ins/del),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1107 subject codon no of found mutation, subject sequence position,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1108 insert/deletions nucleotide sequence, and the affected qry
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1109 codon no.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1110 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1111
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1112 seqs = [sbjct_seq, qry_seq]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1113 indels = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1114 gap_obj = re.compile(r"-+")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1115 for i in range(len(seqs)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1116 for match in gap_obj.finditer(seqs[i]):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1117 pos = int(match.start())
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1118 gap = match.group()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1119
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1120 # Find position of the mutation corresponding to the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1121 # subject sequence
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1122 sbj_pos = len(sbjct_seq[:pos].replace("-", "")) + start_pos
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1123
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1124 # Get indel sequence and the affected sequences in
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1125 # sbjct and qry in the reading frame
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1126 indel = seqs[abs(i - 1)][pos:pos + len(gap)]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1127
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1128 # Find codon number for mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1129 codon_no = int(math.ceil((sbj_pos) / 3))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1130 qry_pos = len(qry_seq[:pos].replace("-", "")) + start_pos
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1131 qry_codon = int(math.ceil((qry_pos) / 3))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1132
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1133 if i == 0:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1134 mut = "ins"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1135 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1136 mut = "del"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1137
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1138 indels.append([mut, codon_no, sbj_pos, indel, qry_codon])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1139
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1140 # Sort indels based on codon position and sequence position
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1141 indels = sorted(indels, key=lambda x: (x[1], x[2]))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1142
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1143 return indels
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1144
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1145 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1146 def find_codon_mismatches(sbjct_start, sbjct_seq, qry_seq):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1147 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1148 This function takes two alligned sequence (subject and query),
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1149 and the position on the subject where the alignment starts. The
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1150 sequences are compared codon by codon. If a mis matches is
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1151 found it is saved in 'mis_matches'. If a gap is found the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1152 function get_inframe_gap is used to find the indel sequence and
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1153 keep the sequence in the correct reading frame. The function
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1154 translate_indel is used to name indel mutations and translate
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1155 the indels to amino acids The function returns a list of tuples
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1156 containing all needed informations about the mutation in order
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1157 to look it up in the database dict known mutation and the with
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1158 the output files the the user.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1159 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1160 mis_matches = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1161
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1162 # Find start pos of first codon in frame, i_start
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1163 codon_offset = (sbjct_start - 1) % 3
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1164 i_start = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1165
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1166 if codon_offset != 0:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1167 i_start = 3 - codon_offset
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1168
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1169 sbjct_start = sbjct_start + i_start
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1170
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1171 # Set sequences in frame
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1172 sbjct_seq = sbjct_seq[i_start:]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1173 qry_seq = qry_seq[i_start:]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1174
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1175 # Find codon number of the first codon in the sequence, start
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1176 # at 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1177 codon_no = int((sbjct_start - 1) / 3) # 1,2,3 start on 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1178
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1179 # s_shift and q_shift are used when gaps appears
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1180 q_shift = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1181 s_shift = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1182 mut_no = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1183
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1184 # Find inserts and deletions in sequence
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1185 indel_no = 0
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1186 indels = PointFinder.get_indels(sbjct_seq, qry_seq, sbjct_start)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1187
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1188 # Go through sequence and save mutations when found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1189 for index in range(0, len(sbjct_seq), 3):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1190 # Count codon number
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1191 codon_no += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1192
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1193 # Shift index according to gaps
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1194 s_i = index + s_shift
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1195 q_i = index + q_shift
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1196
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1197 # Get codons
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1198 sbjct_codon = sbjct_seq[s_i:s_i + 3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1199 qry_codon = qry_seq[q_i:q_i + 3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1200
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1201 if(len(sbjct_seq[s_i:].replace("-", ""))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1202 + len(qry_codon[q_i:].replace("-", "")) < 6):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1203 break
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1204
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1205 # Check for mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1206 if sbjct_codon.upper() != qry_codon.upper():
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1207
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1208 # Check for codon insertions and deletions and
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1209 # frameshift mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1210 if "-" in sbjct_codon or "-" in qry_codon:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1211
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1212 # Get indel info
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1213 try:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1214 indel_data = indels[indel_no]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1215 except IndexError:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1216 sys.exit("indel_data list is out of range, bug!")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1217
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1218 mut = indel_data[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1219 codon_no_indel = indel_data[1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1220 seq_pos = indel_data[2] + sbjct_start - 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1221 indel = indel_data[3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1222 indel_no += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1223
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1224 # Get the affected sequence in frame for both for
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1225 # sbjct and qry
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1226 if mut == "ins":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1227 sbjct_rf_indel = PointFinder.get_inframe_gap(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1228 sbjct_seq[s_i:], 3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1229 qry_rf_indel = PointFinder.get_inframe_gap(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1230 qry_seq[q_i:],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1231 int(math.floor(len(sbjct_rf_indel) / 3) * 3))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1232 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1233 qry_rf_indel = PointFinder.get_inframe_gap(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1234 qry_seq[q_i:], 3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1235 sbjct_rf_indel = PointFinder.get_inframe_gap(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1236 sbjct_seq[s_i:],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1237 int(math.floor(len(qry_rf_indel) / 3) * 3))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1238
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1239 mut_name, aa_ref, aa_alt = PointFinder.name_indel_mutation(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1240 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1241 codon_no, mut, sbjct_start - 1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1242
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1243 # Set index to the correct reading frame after the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1244 # indel gap
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1245 shift_diff_before = abs(s_shift - q_shift)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1246 s_shift += len(sbjct_rf_indel) - 3
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1247 q_shift += len(qry_rf_indel) - 3
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1248 shift_diff = abs(s_shift - q_shift)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1249
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1250 if shift_diff_before != 0 and shift_diff % 3 == 0:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1251
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1252 if s_shift > q_shift:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1253 nucs_needed = (int((len(sbjct_rf_indel) / 3) * 3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1254 + shift_diff)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1255 pre_qry_indel = qry_rf_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1256 qry_rf_indel = PointFinder.get_inframe_gap(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1257 qry_seq[q_i:], nucs_needed)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1258 q_shift += len(qry_rf_indel) - len(pre_qry_indel)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1259
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1260 elif q_shift > s_shift:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1261 nucs_needed = (int((len(qry_rf_indel) / 3) * 3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1262 + shift_diff)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1263 pre_sbjct_indel = sbjct_rf_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1264 sbjct_rf_indel = PointFinder.get_inframe_gap(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1265 sbjct_seq[s_i:], nucs_needed)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1266 s_shift += (len(sbjct_rf_indel)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1267 - len(pre_sbjct_indel))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1268
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1269 mut_name, aa_ref, aa_alt = (
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1270 PointFinder.name_indel_mutation(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1271 sbjct_seq, indel, sbjct_rf_indel, qry_rf_indel,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1272 codon_no, mut, sbjct_start - 1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1273 )
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1274 if "Frameshift" in mut_name:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1275 mut_name = (mut_name.split("-")[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1276 + "- Frame restored")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1277 if mut_name is "p.V940delins - Frame restored":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1278 sys.exit()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1279 mis_matches += [[mut, codon_no_indel, seq_pos, indel,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1280 mut_name, sbjct_rf_indel, qry_rf_indel,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1281 aa_ref, aa_alt]]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1282
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1283 # Check if the next mutation in the indels list is
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1284 # in the current codon.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1285 # Find the number of individul gaps in the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1286 # evaluated sequence.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1287 no_of_indels = (len(re.findall("\-\w", sbjct_rf_indel))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1288 + len(re.findall("\-\w", qry_rf_indel)))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1289
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1290 if no_of_indels > 1:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1291
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1292 for j in range(indel_no, indel_no + no_of_indels - 1):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1293 try:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1294 indel_data = indels[j]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1295 except IndexError:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1296 sys.exit("indel_data list is out of range, "
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1297 "bug!")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1298 mut = indel_data[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1299 codon_no_indel = indel_data[1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1300 seq_pos = indel_data[2] + sbjct_start - 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1301 indel = indel_data[3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1302 indel_no += 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1303 mis_matches += [[mut, codon_no_indel, seq_pos,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1304 indel, mut_name, sbjct_rf_indel,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1305 qry_rf_indel, aa_ref, aa_alt]]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1306
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1307 # Set codon number, and save nucleotides from out
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1308 # of frame mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1309 if mut == "del":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1310 codon_no += int((len(sbjct_rf_indel) - 3) / 3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1311 # If evaluated insert is only gaps codon_no should
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1312 # not increment
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1313 elif sbjct_rf_indel.count("-") == len(sbjct_rf_indel):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1314 codon_no -= 1
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1315
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1316 # Check of point mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1317 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1318 mut = "sub"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1319 aa_ref = PointFinder.aa(sbjct_codon)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1320 aa_alt = PointFinder.aa(qry_codon)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1321
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1322 if aa_ref != aa_alt:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1323 # End search for mutation if a premature stop
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1324 # codon is found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1325 mut_name = "p." + aa_ref + str(codon_no) + aa_alt
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1326
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1327 mis_matches += [[mut, codon_no, codon_no, aa_alt,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1328 mut_name, sbjct_codon, qry_codon,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1329 aa_ref, aa_alt]]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1330 # If a Premature stop codon occur report it an stop the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1331 # loop
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1332
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1333 try:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1334 if mis_matches[-1][-1] == "*":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1335 mut_name += " - Premature stop codon"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1336 mis_matches[-1][4] = (mis_matches[-1][4].split("-")[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1337 + " - Premature stop codon")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1338 break
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1339 except IndexError:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1340 pass
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1341
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1342 # Sort mutations on position
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1343 mis_matches = sorted(mis_matches, key=lambda x: x[1])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1344
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1345 return mis_matches
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1346
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1347 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1348 def mutstr2mutdict(m):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1349 out_dict = {}
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1350
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1351 # Protein / Amino acid mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1352 # Ex.: "p.T83I"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1353 if(m.startswith("p.")):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1354 out_dict["nucleotide"] = False
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1355
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1356 # Remove frameshift tag
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1357 frameshift_match = re.search(r"(.+) - Frameshift.*$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1358 if(frameshift_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1359 m = frameshift_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1360 out_dict["frameshift"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1361
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1362 # Remove frame restored tag
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1363 framerestored_match = re.search(r"(.+) - Frame restored.*$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1364 if(framerestored_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1365 m = framerestored_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1366 out_dict["frame restored"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1367
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1368 # Remove premature stop tag
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1369 prem_stop_match = re.search(r"(.+) - Premature stop codon.*$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1370 if(prem_stop_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1371 m = prem_stop_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1372 out_dict["prem_stop"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1373
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1374 # TODO: premature or frameshift tag adds too many whitespaces
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1375 m = m.strip()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1376
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1377 # Delins
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1378 multi_delins_match = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1379 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins(\S+)$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1380 single_delins_match = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1381 r"^p.(\D{1})(\d+)delins(\S+)$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1382 # TODO: is both necessary?
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1383 multi_delins_match2 = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1384 r"^p.(\D{1})(\d+)_(\D{1})(\d+)delins$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1385 single_delins_match2 = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1386 r"^p.(\D{1})(\d+)delins$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1387 multi_ins_match = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1388 r"^p.(\D{1})(\d+)_(\D{1})(\d+)ins(\D*)$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1389 if(multi_delins_match or single_delins_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1390 out_dict["deletion"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1391 out_dict["insertion"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1392 if(single_delins_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1393 out_dict["ref_aa"] = single_delins_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1394 out_dict["pos"] = single_delins_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1395 out_dict["mut_aa"] = single_delins_match.group(3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1396 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1397 out_dict["ref_aa"] = multi_delins_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1398 out_dict["pos"] = multi_delins_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1399 out_dict["ref_aa_right"] = multi_delins_match.group(3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1400 out_dict["mut_end"] = multi_delins_match.group(4)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1401 out_dict["mut_aa"] = multi_delins_match.group(5)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1402 # Deletions
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1403 elif(m[-3:] == "del"):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1404 single_del_match = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1405 r"^p.(\D{1})(\d+)del$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1406 multi_del_match = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1407 r"^p.(\D{1})(\d+)_(\D{1})(\d+)del$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1408 out_dict["deletion"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1409 if(single_del_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1410 out_dict["ref_aa"] = single_del_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1411 out_dict["pos"] = single_del_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1412 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1413 out_dict["ref_aa"] = multi_del_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1414 out_dict["pos"] = multi_del_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1415 out_dict["ref_aa_right"] = multi_del_match.group(3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1416 out_dict["mut_end"] = multi_del_match.group(4)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1417 # Insertions
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1418 elif(multi_ins_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1419 out_dict["insertion"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1420 out_dict["ref_aa"] = multi_ins_match.group(1).lower()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1421 out_dict["pos"] = multi_ins_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1422 out_dict["ref_aa_right"] = multi_ins_match.group(3).lower()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1423 if(multi_ins_match.group(4)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1424 out_dict["mut_aa"] = multi_ins_match.group(4).lower()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1425 # Substitutions
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1426 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1427 sub_match = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1428 r"^p.(\D{1})(\d+)(\D{1})$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1429 out_dict["ref_aa"] = sub_match.group(1).lower()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1430 out_dict["pos"] = sub_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1431 out_dict["mut_aa"] = sub_match.group(3).lower()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1432
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1433 # Nucleotide mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1434 # Ex. sub: n.-42T>C
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1435 # Ex. ins: n.-13_-14insG (TODO: Verify)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1436 # Ex. del: n.42delT (TODO: Verify)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1437 # Ex. del: n.42_45del (TODO: Verify)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1438 elif(m.startswith("n.") or m.startswith("r.")):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1439 out_dict["nucleotide"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1440
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1441 sub_match = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1442 r"^[nr]{1}\.(-{0,1}\d+)(\D{1})>(\D{1})$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1443 ins_match = re.search(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1444 r"^[nr]{1}\.(-{0,1}\d+)_(-{0,1}\d+)ins(\S+)$", m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1445 # r.541delA
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1446 del_match = re.search((
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1447 r"^[nr]{1}\.(-{0,1}\d+)_{0,1}(-{0,1}\d*)del(\S*)$"), m)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1448 if(sub_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1449 out_dict["pos"] = sub_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1450 out_dict["ref_nuc"] = sub_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1451 out_dict["mut_nuc"] = sub_match.group(3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1452 elif(ins_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1453 out_dict["insertion"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1454 out_dict["pos"] = ins_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1455 out_dict["mut_end"] = ins_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1456 out_dict["mut_nuc"] = ins_match.group(3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1457 elif(del_match):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1458 out_dict["deletion"] = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1459 out_dict["pos"] = del_match.group(1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1460 if(del_match.group(2)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1461 out_dict["mut_end"] = del_match.group(2)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1462 if(del_match.group(3)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1463 out_dict["ref_nuc"] = del_match.group(3)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1464 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1465 sys.exit("ERROR: Nucleotide deletion did not contain any "
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1466 "reference sequence. mut string: {}".format(m))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1467
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1468 return out_dict
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1469
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1470 def get_mutations(self, gene, gene_name, mis_matches, unknown_flag, hit):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1471 RNA = False
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1472 if gene in self.RNA_gene_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1473 RNA = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1474
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1475 known_muts = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1476 unknown_muts = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1477
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1478 # Go through each mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1479 for i in range(len(mis_matches)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1480 m_type = mis_matches[i][0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1481 pos = mis_matches[i][1] # sort on pos?
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1482 look_up_pos = mis_matches[i][2]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1483 look_up_mut = mis_matches[i][3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1484 mut_name = mis_matches[i][4]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1485 # nuc_ref = mis_matches[i][5]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1486 # nuc_alt = mis_matches[i][6]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1487 ref = mis_matches[i][-2]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1488 alt = mis_matches[i][-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1489 mut_dict = PointFinder.mutstr2mutdict(mut_name)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1490
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1491 mut_id = ("{gene}_{pos}_{alt}"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1492 .format(gene=gene_name, pos=pos, alt=alt.lower()))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1493
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1494 ref_aa = mut_dict.get("ref_aa", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1495 ref_aa_right = mut_dict.get("ref_aa_right", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1496 mut_aa = mut_dict.get("mut_aa", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1497 ref_nuc = mut_dict.get("ref_nuc", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1498 mut_nuc = mut_dict.get("mut_nuc", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1499 is_nuc = mut_dict.get("nucleotide", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1500 is_ins = mut_dict.get("insertion", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1501 is_del = mut_dict.get("deletion", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1502 mut_end = mut_dict.get("mut_end", None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1503 prem_stop = mut_dict.get("prem_stop", False)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1504
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1505 mut = ResMutation(unique_id=mut_id, seq_region=gene_name, pos=pos,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1506 hit=hit, ref_codon=ref_nuc, mut_codon=mut_nuc,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1507 ref_aa=ref_aa, ref_aa_right=ref_aa_right,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1508 mut_aa=mut_aa, insertion=is_ins, deletion=is_del,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1509 end=mut_end, nuc=is_nuc,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1510 premature_stop=prem_stop, ab_class=None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1511
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1512 if "Premature stop codon" in mut_name:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1513 sbjct_len = hit['sbjct_length']
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1514 qry_len = pos * 3
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1515 perc_trunc = round(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1516 ((float(sbjct_len) - qry_len)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1517 / float(sbjct_len))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1518 * 100, 2
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1519 )
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1520 mut.premature_stop = perc_trunc
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1521
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1522 # Check if mutation is known
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1523 gene_mut_name, resistence, pmid = self.look_up_known_muts(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1524 gene, look_up_pos, look_up_mut, m_type, gene_name)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1525
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1526 # Collect known mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1527 if resistence != "Unknown":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1528 known_muts.append(mut)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1529 # Collect unknown mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1530 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1531 unknown_muts.append(mut)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1532
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1533 # TODO: Use ResMutation class to make sure identical mutations are
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1534 # not kept.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1535
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1536 return (known_muts, unknown_muts)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1537
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1538 def mut2str(self, gene, gene_name, mis_matches, unknown_flag, hit):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1539 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1540 This function takes a gene name a list of mis matches found
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1541 between subject and query of this gene, the dictionary of
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1542 known mutation in the point finder database, and the flag
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1543 telling weather the user wants unknown mutations to be
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1544 reported. All mis matches are looked up in the known
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1545 mutation dict to se if the mutation is known, and in this
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1546 case what drug resistence it causes. The funtions return 3
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1547 strings that are used as output to the users. One string is
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1548 only tab seperated and contains the mutations listed line
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1549 by line. If the unknown flag is set to true it will contain
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1550 both known and unknown mutations. The next string contains
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1551 only known mutation and are given in in a format that is
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1552 easy to convert to HTML. The last string is the HTML tab
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1553 sting from the unknown mutations.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1554 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1555 known_header = ("Mutation\tNucleotide change\tAmino acid change\t"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1556 "Resistance\tPMID\n")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1557
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1558 unknown_header = "Mutation\tNucleotide change\tAmino acid change\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1559
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1560 RNA = False
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1561 if gene in self.RNA_gene_list:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1562 RNA = True
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1563 known_header = "Mutation\tNucleotide change\tResistance\tPMID\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1564 unknown_header = "Mutation\tNucleotide change\n"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1565
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1566 known_lst = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1567 unknown_lst = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1568 all_results_lst = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1569 output_mut = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1570 stop_codons = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1571
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1572 # Go through each mutation
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1573 for i in range(len(mis_matches)):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1574 m_type = mis_matches[i][0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1575 pos = mis_matches[i][1] # sort on pos?
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1576 look_up_pos = mis_matches[i][2]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1577 look_up_mut = mis_matches[i][3]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1578 mut_name = mis_matches[i][4]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1579 nuc_ref = mis_matches[i][5]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1580 nuc_alt = mis_matches[i][6]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1581 ref = mis_matches[i][-2]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1582 alt = mis_matches[i][-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1583
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1584 # First index in list indicates if mutation is known
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1585 output_mut += [[]]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1586
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1587 # Define output vaiables
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1588 codon_change = nuc_ref + " -> " + nuc_alt
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1589 aa_change = ref + " -> " + alt
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1590
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1591 if RNA is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1592 aa_change = "RNA mutations"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1593 elif pos < 0:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1594 aa_change = "Promoter mutations"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1595
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1596 # Check if mutation is known
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1597 gene_mut_name, resistence, pmid = self.look_up_known_muts(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1598 gene, look_up_pos, look_up_mut, m_type, gene_name)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1599
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1600 gene_mut_name = gene_mut_name + " " + mut_name
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1601
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1602 output_mut[i] = [gene_mut_name, codon_change, aa_change,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1603 resistence, pmid]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1604
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1605 # Add mutation to output strings for known mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1606 if resistence != "Unknown":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1607 if RNA is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1608 # don't include the amino acid change field for
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1609 # RNA mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1610 known_lst += ["\t".join(output_mut[i][:2]) + "\t"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1611 + "\t".join(output_mut[i][3:])]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1612 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1613 known_lst += ["\t".join(output_mut[i])]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1614
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1615 all_results_lst += ["\t".join(output_mut[i])]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1616
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1617 # Add mutation to output strings for unknown mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1618 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1619 if RNA is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1620 unknown_lst += ["\t".join(output_mut[i][:2])]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1621 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1622 unknown_lst += ["\t".join(output_mut[i][:3])]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1623
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1624 if unknown_flag is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1625 all_results_lst += ["\t".join(output_mut[i])]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1626
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1627 # Check that you do not print two equal lines (can happen
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1628 # if two indels occure in the same codon)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1629 if len(output_mut) > 1:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1630 if output_mut[i] == output_mut[i - 1]:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1631
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1632 if resistence != "Unknown":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1633 known_lst = known_lst[:-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1634 all_results_lst = all_results_lst[:-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1635 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1636 unknown_lst = unknown_lst[:-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1637
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1638 if unknown_flag is True:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1639 all_results_lst = all_results_lst[:-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1640
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1641 if "Premature stop codon" in mut_name:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1642 sbjct_len = hit['sbjct_length']
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1643 qry_len = pos * 3
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1644 prec_truckat = round(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1645 ((float(sbjct_len) - qry_len)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1646 / float(sbjct_len))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1647 * 100, 2
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1648 )
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1649 perc = "%"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1650 stop_codons.append("Premature stop codon in %s, %.2f%s lost"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1651 % (gene, prec_truckat, perc))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1652
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1653 # Creat final strings
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1654 if(all_results_lst):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1655 all_results = "\n".join(all_results_lst)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1656 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1657 all_results = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1658 total_known_str = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1659 total_unknown_str = ""
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1660
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1661 # Check if there are only unknown mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1662 resistence_lst = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1663 for mut in output_mut:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1664 for res in mut[3].split(","):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1665 resistence_lst.append(res)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1666
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1667 # Save known mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1668 unknown_no = resistence_lst.count("Unknown")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1669 if unknown_no < len(resistence_lst):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1670 total_known_str = known_header + "\n".join(known_lst)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1671 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1672 total_known_str = "No known mutations found in %s" % gene_name
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1673
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1674 # Save unknown mutations
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1675 if unknown_no > 0:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1676 total_unknown_str = unknown_header + "\n".join(unknown_lst)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1677 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1678 total_unknown_str = "No unknown mutations found in %s" % gene_name
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1679
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1680 return (all_results, total_known_str, total_unknown_str,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1681 resistence_lst + stop_codons)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1682
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1683 @staticmethod
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1684 def get_file_content(file_path, fst_char_only=False):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1685 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1686 This function opens a file, given as the first argument
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1687 file_path and returns the lines of the file in a list or the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1688 first character of the file if fst_char_only is set to True.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1689 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1690 with open(file_path, "r") as infile:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1691 line_lst = []
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1692 for line in infile:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1693 line = line.strip()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1694 if line != "":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1695 line_lst.append(line)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1696 if fst_char_only:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1697 return line_lst[0][0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1698 return line_lst
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1699
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1700 def look_up_known_muts(self, gene, pos, found_mut, mut, gene_name):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1701 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1702 This function uses the known_mutations dictionay, a gene a
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1703 string with the gene key name, a gene position as integer,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1704 found_mut is given as amino acid or nucleotides, the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1705 mutation type (mut) is either "del", "ins", or "sub", and
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1706 gene_name is the gene name that should be returned to the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1707 user. The function looks up if the found_mut defined by the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1708 gene, position and the found_mut string is given in the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1709 known_mutations dictionary, if it is, the resistance and
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1710 the pmid are returned together with the gene_name given in
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1711 the known_mutations dict. If the mutation type is "del" the
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1712 deleted nucleotids are checked to be contained in any of
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1713 the deletion described in the known_mutation dict.
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1714 """
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1715 resistence = "Unknown"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1716 pmid = "-"
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1717 found_mut = found_mut.upper()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1718
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1719 if mut == "del":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1720 for i, i_pos in enumerate(range(pos, pos + len(found_mut))):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1721
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1722 known_indels = self.known_mutations[gene]["del"].get(i_pos, [])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1723 for known_indel in known_indels:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1724 partial_mut = found_mut[i:len(known_indel) + i]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1725
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1726 # Check if part of found mut is known and check if
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1727 # found mut and known mut is in the same reading
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1728 # frame
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1729 if(partial_mut == known_indel
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1730 and len(found_mut) % 3 == len(known_indel) % 3):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1731
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1732 resistence = (self.known_mutations[gene]["del"][i_pos]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1733 [known_indel]['drug'])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1734
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1735 pmid = (self.known_mutations[gene]["del"][i_pos]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1736 [known_indel]['pmid'])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1737
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1738 gene_name = (self.known_mutations[gene]["del"][i_pos]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1739 [known_indel]['gene_name'])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1740 break
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1741 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1742 if pos in self.known_mutations[gene][mut]:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1743 if found_mut in self.known_mutations[gene][mut][pos]:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1744 resistence = (self.known_mutations[gene][mut][pos]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1745 [found_mut]['drug'])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1746
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1747 pmid = (self.known_mutations[gene][mut][pos][found_mut]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1748 ['pmid'])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1749
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1750 gene_name = (self.known_mutations[gene][mut][pos]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1751 [found_mut]['gene_name'])
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1752
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1753 # Check if stop codons refer resistance
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1754 if "*" in found_mut and gene in self.known_stop_codon:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1755 if resistence == "Unknown":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1756 resistence = self.known_stop_codon[gene]["drug"]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1757 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1758 resistence += "," + self.known_stop_codon[gene]["drug"]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1759
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1760 return (gene_name, resistence, pmid)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1761
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1762
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1763 if __name__ == '__main__':
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1764
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1765 ##########################################################################
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1766 # PARSE COMMAND LINE OPTIONS
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1767 ##########################################################################
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1768
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1769 parser = argparse.ArgumentParser(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1770 description="This program predicting resistance associated with \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1771 chromosomal mutations based on WGS data",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1772 prog="pointfinder.py")
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1773
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1774 # required arguments
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1775 parser.add_argument("-i",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1776 dest="inputfiles",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1777 metavar="INFILE",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1778 nargs='+',
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1779 help="Input file. fastq file(s) from one sample for \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1780 KMA or one fasta file for blastn.",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1781 required=True)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1782 parser.add_argument("-o",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1783 dest="out_path",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1784 metavar="OUTFOLDER",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1785 help="Output folder, output files will be stored \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1786 here.",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1787 required=True)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1788 parser.add_argument("-s",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1789 dest="species",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1790 metavar="SPECIES",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1791 choices=['e.coli', 'gonorrhoeae', 'campylobacter',
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1792 'salmonella', 'tuberculosis'],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1793 help="Species of choice, e.coli, tuberculosis, \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1794 salmonella, campylobactor, gonorrhoeae, \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1795 klebsiella, or malaria",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1796 required=True)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1797 parser.add_argument("-m",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1798 dest="method",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1799 metavar="METHOD",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1800 choices=["kma", "blastn"],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1801 help="Method of choice, kma or blastn",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1802 required=True)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1803 parser.add_argument("-m_p",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1804 dest="method_path",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1805 help="Path to the location of blastn or kma dependent \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1806 of the chosen method",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1807 required=True)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1808 parser.add_argument("-p",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1809 dest="db_path",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1810 metavar="DATABASE",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1811 help="Path to the location of the pointfinder \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1812 database",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1813 required=True)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1814
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1815 # optional arguments
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1816 parser.add_argument("-t",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1817 dest="threshold",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1818 metavar="IDENTITY",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1819 help="Minimum gene identity threshold, default = 0.9",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1820 type=float,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1821 default=0.9)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1822 parser.add_argument("-l",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1823 dest="min_cov",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1824 metavar="COVERAGE",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1825 help="Minimum gene coverage threshold, \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1826 threshold = 0.6",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1827 type=float,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1828 default=0.6)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1829 parser.add_argument("-u",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1830 dest="unknown_mutations",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1831 help="Show all mutations found even if it's unknown \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1832 to the resistance database.",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1833 action='store_true',
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1834 default=False)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1835 parser.add_argument("-g",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1836 dest="specific_gene",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1837 nargs='+',
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1838 help="Specify genes existing in the database to \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1839 search for - if none is specified all genes are \
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1840 included in the search.",
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1841 default=None)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1842
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1843 args = parser.parse_args()
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1844
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1845 # If no arguments are given print usage message and exit
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1846 if len(sys.argv) == 1:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1847 sys.exit("Usage: " + parser.usage)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1848
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1849 if(args.method == "blastn"):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1850 method = PointFinder.TYPE_BLAST
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1851 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1852 method = PointFinder.TYPE_KMA
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1853
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1854 # Get sample name
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1855 filename = args.inputfiles[0].split("/")[-1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1856 sample_name = "".join(filename.split(".")[0:-1]) # .split("_")[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1857 if sample_name == "":
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1858 sample_name = filename
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1859
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1860 # TODO: Change ilogocal var name
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1861 kma_db_path = args.db_path + "/" + args.species
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1862
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1863 finder = PointFinder(db_path=kma_db_path, species=args.species,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1864 gene_list=args.specific_gene)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1865
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1866 if method == PointFinder.TYPE_BLAST:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1867
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1868 # Check that only one input file is given
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1869 if len(args.inputfiles) != 1:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1870 sys.exit("Input Error: Blast was chosen as mapping method only 1 "
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1871 "input file requied, not %s" % (len(args.inputfiles)))
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1872
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1873 finder_run = finder.blast(inputfile=args.inputfiles[0],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1874 out_path=args.out_path,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1875 min_cov=0.01,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1876 threshold=args.threshold,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1877 blast=args.method_path,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1878 cut_off=False)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1879 else:
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1880 inputfile_1 = args.inputfiles[0]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1881 inputfile_2 = None
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1882
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1883 if(len(args.inputfiles) == 2):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1884 inputfile_2 = args.inputfiles[1]
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1885 finder_run = finder.kma(inputfile_1=inputfile_1,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1886 inputfile_2=inputfile_2,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1887 out_path=args.out_path,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1888 db_path_kma=kma_db_path,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1889 databases=[args.species],
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1890 min_cov=args.min_cov,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1891 threshold=args.threshold,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1892 kma_path=args.method_path,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1893 sample_name=sample_name,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1894 kma_mrs=0.5, kma_gapopen=-5, kma_gapextend=-2,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1895 kma_penalty=-3, kma_reward=1)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1896
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1897 results = finder_run.results
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1898
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1899 if(args.specific_gene):
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1900 results = PointFinder.discard_unwanted_results(
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1901 results=results, wanted=args.specific_gene)
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1902
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1903 finder.write_results(out_path=args.out_path, result=results, min_cov=args.min_cov,
a16d245332d6 Uploaded
dcouvin
parents:
diff changeset
1904 res_type=method, unknown_flag=args.unknown_mutations)