annotate pmlst/pmlst.py @ 0:140d4f9e1f20 draft default tip

Uploaded
author dcouvin
date Mon, 06 Sep 2021 16:00:46 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
1 #!/usr/bin/env python3
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
2
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
3 import os, sys, re, time, pprint, io, shutil
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
4 import argparse, subprocess
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
5
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
6 from cgecore.alignment import extended_cigar
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
7 from cgecore.blaster.blaster import Blaster
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
8 from cgecore.cgefinder import CGEFinder
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
9 import json, gzip
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
10 from tabulate import tabulate
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
11
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
12
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
13 def get_read_filename(infiles):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
14 ''' Infiles must be a list with 1 or 2 input files.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
15 Removes path from given string and removes extensions:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
16 .fq .fastq .gz and .trim
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
17 extract the common sample name i 2 files are given.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
18 '''
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
19 # Remove common fastq extensions
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
20 seq_path = infiles[0]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
21 seq_file = os.path.basename(seq_path)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
22 seq_file = seq_file.replace(".fq", "")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
23 seq_file = seq_file.replace(".fastq", "")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
24 seq_file = seq_file.replace(".gz", "")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
25 seq_file = seq_file.replace(".trim", "")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
26 if len(infiles) == 1:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
27 return seq_file.rstrip()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
28
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
29 # If two files are given get the common sample name
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
30 sample_name = ""
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
31 seq_file_2 = os.path.basename(infiles[1])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
32 for i in range(len(seq_file)):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
33 if seq_file_2[i] == seq_file[i]:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
34 sample_name += seq_file[i]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
35 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
36 break
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
37 if sample_name == "":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
38 sys.error("Input error: sample names of input files, {} and {}, \
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
39 does not share a common sample name. If these files \
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
40 are paired end reads from the same sample, please rename \
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
41 them with a common sample name (e.g. 's22_R1.fq', 's22_R2.fq') \
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
42 or input them seperately.".format(infiles[0], infiles[1]))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
43
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
44 return sample_name.rstrip("-").rstrip("_")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
45
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
46 def is_gzipped(file_path):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
47 ''' Returns True if file is gzipped and False otherwise.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
48 The result is inferred from the first two bits in the file read
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
49 from the input path.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
50 On unix systems this should be: 1f 8b
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
51 Theoretically there could be exceptions to this test but it is
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
52 unlikely and impossible if the input files are otherwise expected
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
53 to be encoded in utf-8.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
54 '''
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
55 with open(file_path, mode='rb') as fh:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
56 bit_start = fh.read(2)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
57 if(bit_start == b'\x1f\x8b'):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
58 return True
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
59 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
60 return False
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
61
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
62 def get_file_format(input_files):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
63 """
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
64 Takes all input files and checks their first character to assess
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
65 the file format. Returns one of the following strings; fasta, fastq,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
66 other or mixed. fasta and fastq indicates that all input files are
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
67 of the same format, either fasta or fastq. other indiates that all
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
68 files are not fasta nor fastq files. mixed indicates that the inputfiles
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
69 are a mix of different file formats.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
70 """
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
71
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
72 # Open all input files and get the first character
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
73 file_format = []
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
74 invalid_files = []
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
75 for infile in input_files:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
76 if is_gzipped(infile):#[-3:] == ".gz":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
77 f = gzip.open(infile, "rb")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
78 fst_char = f.read(1);
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
79 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
80 f = open(infile, "rb")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
81 fst_char = f.read(1);
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
82 f.close()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
83 # Assess the first character
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
84 if fst_char == b"@":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
85 file_format.append("fastq")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
86 elif fst_char == b">":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
87 file_format.append("fasta")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
88 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
89 invalid_files.append("other")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
90 if len(set(file_format)) != 1:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
91 return "mixed"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
92 return ",".join(set(file_format))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
93
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
94 def import_profile(database, scheme, loci_list):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
95 """Import all possible allele profiles with corresponding st's
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
96 for the scheme into a dict. The profiles are stored in a dict
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
97 of dicts, to easily look up what st types are accosiated with
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
98 a specific allele number of each loci.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
99 """
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
100 # Open allele profile file from databaseloci
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
101 profile_file = open("{0}/{1}.txt.clean".format(database, scheme), "r")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
102 profile_header = profile_file.readline().strip().split("\t")[1:len(loci_list)+1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
103
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
104 # Create dict for looking up st-types with locus/allele combinations
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
105 st_profiles = {}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
106 # For each locus initate make an inner dict to store allele and st's
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
107 for locus in loci_list:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
108 st_profiles[locus] = {}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
109
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
110 # Fill inner dict with allele no as key and st-types seen with the allele as value
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
111 for line in profile_file:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
112 profile = line.strip().split("\t")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
113 st_name = profile[0]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
114 allele_list = profile[1:len(loci_list)+1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
115
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
116 # Go through all allele profiles. Save locus-allele combination with the st-type
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
117 for i in range(len(allele_list)):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
118 allele = allele_list[i]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
119 locus = profile_header[i]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
120 if allele in st_profiles[locus]:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
121 st_profiles[locus][allele] += [st_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
122 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
123 st_profiles[locus][allele] = [st_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
124 profile_file.close()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
125
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
126 return st_profiles
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
127
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
128 def st_typing(st_profiles, allele_matches, loci_list):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
129 """
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
130 Takes the path to a dictionary, the inp list of the allele
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
131 number that each loci has been assigned, and an output file string
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
132 where the found st type and similaity is written into it.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
133 """
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
134
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
135 # Find best ST type for all allele profiles
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
136 st_output = ""
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
137 note = ""
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
138
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
139 # First line contains matrix column headers, which are the specific loci
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
140 st_hits = []
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
141 st_marks = []
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
142 note = ""
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
143
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
144 # Check the quality of the alle hits
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
145 for locus in allele_matches:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
146 allele = allele_matches[locus]["allele"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
147
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
148 # Check if allele is marked as a non-perfect match. Save mark and write note.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
149 if "?*" in allele:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
150 note += "?* {}: Imperfect hit, ST can not be trusted!\n".format(locus)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
151 st_marks = ["?","*"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
152 elif "?" in allele:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
153 note += "? {}: Uncertain hit, ST can not be trusted.\n".format(locus)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
154 st_marks.append("?")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
155 elif "*" in allele:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
156 note += "* {}: Novel allele, ST may indicate nearest ST.\n".format(locus)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
157 st_marks.append("*")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
158
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
159 # Remove mark from allele so it can be used to look up nearest st types
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
160 allele = allele.rstrip("*?!")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
161
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
162 # Get all st's that have the alleles in it's allele profile
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
163 st_hits += st_profiles[locus].get(allele, ["None"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
164 if "alternative_hit" in allele_matches[locus] and allele_matches[locus]["alternative_hit"] != {}:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
165 note += "! {}: Multiple perfect hits found\n".format(locus)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
166 st_marks.append("!")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
167 for allele_name, hit_info in allele_matches[locus]["alternative_hit"].items():
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
168 allele = hit_info["allele"].rstrip("!")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
169 st_hits += st_profiles[locus].get(allele, ["None"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
170
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
171 # Save allele marks to be transfered to the ST
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
172 st_mark = "".join(set(st_marks))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
173 notes = st_mark
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
174 # Add marks information to notes
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
175 if "!" in st_mark:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
176 notes += " alleles with multiple perfect hits found, multiple STs might be found\n"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
177 if "*" in st_mark and "?" in st_mark:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
178 notes += " alleles with less than 100% identity and 100% coverages found\n"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
179 elif st_mark == "*":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
180 notes = st_mark + " alleles with less than 100% identity found\n"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
181 elif st_mark == "?":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
182 notes = st_mark + " alleles with less than 100% coverage found\n"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
183 notes += note
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
184
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
185 # Find most frequent st in st_hits
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
186 st_hits_counter = {}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
187 max_count = 0
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
188 best_hit = ""
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
189 for hit in st_hits:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
190 if hit is not "None":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
191 if hit in st_hits_counter:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
192 st_hits_counter[hit] += 1
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
193 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
194 st_hits_counter[hit] = 1
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
195 if max_count < st_hits_counter[hit]:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
196 max_count = st_hits_counter[hit]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
197 best_hit = hit
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
198
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
199 # Check if allele profile match found st 100 %
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
200 similarity = round(float(max_count)/len(loci_list)*100, 2)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
201
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
202 if similarity != 100:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
203 st = "Unknown"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
204 nearest_sts = []
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
205 # If st is not perfect find nearest st's
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
206 for st_hit, allele_score in sorted(st_hits_counter.items(), key=lambda x: x[1], reverse=True):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
207 if allele_score < max_count:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
208 break
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
209 nearest_sts.append(st_hit)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
210 nearest_sts = ",".join(nearest_sts) #+ st_mark
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
211 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
212 # allele profile has a perfect ST hit but the st marks given to the alleles might indicate imperfect hits
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
213 sts = [st for st, no in st_hits_counter.items() if no == max_count]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
214 #if len(sts) > 1:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
215 st = "{},".format(st_mark).join(sts) + st_mark
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
216 #st = best_hit + st_mark
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
217 nearest_sts = ""
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
218
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
219 return st, notes, nearest_sts
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
220
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
221 def make_aln(scheme, file_handle, allele_matches, query_aligns, homol_aligns, sbjct_aligns):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
222 for locus, locus_info in allele_matches.items():
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
223 allele_name = locus_info["allele_name"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
224 if allele_name == "No hit found":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
225 continue
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
226 hit_name = locus_info["hit_name"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
227
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
228 seqs = ["","",""]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
229 seqs[0] = sbjct_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
230 seqs[1] = homol_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
231 seqs[2] = query_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
232
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
233 write_align(seqs, allele_name, file_handle)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
234
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
235
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
236 # write alternative seq
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
237 if "alternative_hit" in locus_info:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
238 for allele_name in locus_info["alternative_hit"]:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
239 hit_name = locus_info["alternative_hit"][allele_name]["hit_name"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
240 seqs = ["","",""]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
241 seqs[0] = sbjct_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
242 seqs[1] = homol_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
243 seqs[2] = query_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
244
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
245 write_align(seqs, allele_name, file_handle)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
246
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
247 def write_align(seq, seq_name, file_handle):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
248 file_handle.write("# {}".format(seq_name) + "\n")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
249 sbjct_seq = seq[0]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
250 homol_seq = seq[1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
251 query_seq = seq[2]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
252 for i in range(0,len(sbjct_seq),60):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
253 file_handle.write("%-10s\t%s\n"%("template:", sbjct_seq[i:i+60]))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
254 file_handle.write("%-10s\t%s\n"%("", homol_seq[i:i+60]))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
255 file_handle.write("%-10s\t%s\n\n"%("query:", query_seq[i:i+60]))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
256
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
257 def text_table(headers, rows, empty_replace='-'):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
258 ''' Create text table
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
259
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
260 USAGE:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
261 >>> from tabulate import tabulate
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
262 >>> headers = ['A','B']
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
263 >>> rows = [[1,2],[3,4]]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
264 >>> print(text_table(headers, rows))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
265 **********
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
266 A B
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
267 **********
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
268 1 2
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
269 3 4
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
270 ==========
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
271 '''
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
272 # Replace empty cells with placeholder
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
273 rows = map(lambda row: map(lambda x: x if x else empty_replace, row), rows)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
274 # Create table
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
275 table = tabulate(rows, headers, tablefmt='simple').split('\n')
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
276 # Prepare title injection
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
277 width = len(table[0])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
278 # Switch horisontal line
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
279 table[1] = '*'*(width+2)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
280 # Update table with title
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
281 table = ("%s\n"*3)%('*'*(width+2), '\n'.join(table), '='*(width+2))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
282 return table
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
283
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
284
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
285 parser = argparse.ArgumentParser(description="")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
286 # Arguments
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
287 parser.add_argument("-i", "--infile",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
288 help="FASTA or FASTQ files to do pMLST on.",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
289 nargs="+", required=True)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
290 parser.add_argument("-o", "--outdir",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
291 help="Output directory.",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
292 default=".")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
293 parser.add_argument("-s", "--scheme",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
294 help="scheme database used for pMLST prediction", required=True)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
295 parser.add_argument("-p", "--database",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
296 help="Directory containing the databases.", default="/database")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
297 parser.add_argument("-t", "--tmp_dir",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
298 help="Temporary directory for storage of the results\
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
299 from the external software.",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
300 default="tmp_pMLST")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
301 parser.add_argument("-mp", "--method_path",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
302 help="Path to the method to use (kma or blastn)\
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
303 if assembled contigs are inputted the path to executable blastn should be given,\
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
304 if fastq files are given path to executable kma should be given")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
305 parser.add_argument("-x", "--extented_output",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
306 help="Give extented output with allignment files, template and query hits in fasta and\
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
307 a tab seperated file with allele profile results", action="store_true")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
308 parser.add_argument("-q", "--quiet", action="store_true")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
309
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
310
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
311 #parser.add_argument("-c", "--coverage",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
312 # help="Minimum template coverage required", default = 0.6)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
313 #parser.add_argument("-i", "--identity",
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
314 # help="Minimum template identity required", default = 0.9)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
315 args = parser.parse_args()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
316
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
317 if args.quiet:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
318 f = open(os.devnull, 'w')
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
319 sys.stdout = f
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
320
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
321
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
322 #TODO what are the clonal complex data used for??
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
323
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
324 # TODO error handling
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
325 infile = args.infile
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
326 # Check that outdir is an existing dir...
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
327 outdir = os.path.abspath(args.outdir)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
328 scheme = args.scheme
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
329 database = os.path.abspath(args.database)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
330 tmp_dir = os.path.abspath(args.tmp_dir)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
331 # Check if method path is executable
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
332 method_path = args.method_path
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
333 extented_output = args.extented_output
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
334
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
335 min_cov = 0.6 # args.coverage
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
336 threshold = 0.95 # args.identity
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
337
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
338 # Check file format (fasta, fastq or other format)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
339 file_format = get_file_format(infile)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
340
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
341 db_path = "{}/".format(database, scheme)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
342
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
343 config_file = open(database + "/config","r")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
344
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
345 # Get profile_name from config file
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
346 scheme_list = []
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
347 for line in config_file:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
348 if line.startswith("#"):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
349 continue
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
350 line = line.split("\t")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
351 scheme_list.append(line[0])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
352 if line[0] == scheme:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
353 profile_name = line[1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
354
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
355 config_file.close()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
356
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
357 if scheme not in scheme_list:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
358 sys.exit("{}, is not a valid scheme. \n\nPlease choose a scheme available in the database:\n{}".format(scheme, ", ".join(scheme_list)))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
359
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
360 # Get loci list from allele profile file
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
361 with open("{0}/{1}.txt.clean".format(database, scheme), "r") as st_file:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
362 file_header = st_file.readline().strip().split("\t")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
363 loci_list = file_header[1:]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
364
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
365 # Call appropriate method (kma or blastn) based on file format
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
366 if file_format == "fastq":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
367 if not method_path:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
368 method_path = "kma"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
369 if shutil.which(method_path) == None:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
370 sys.exit("No valid path to a kma program was provided. Use the -mp flag to provide the path.")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
371 # Check the number of files
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
372 if len(infile) == 1:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
373 infile_1 = infile[0]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
374 infile_2 = None
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
375 elif len(infile) == 2:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
376 infile_1 = infile[0]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
377 infile_2 = infile[1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
378 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
379 sys.exit("Only 2 input file accepted for raw read data,\
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
380 if data from more runs is avaliable for the same\
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
381 sample, please concatinate the reads into two files")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
382
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
383 sample_name = get_read_filename(infile)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
384 method = "kma"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
385
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
386 # Call KMA
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
387 method_obj = CGEFinder.kma(infile_1, outdir, [scheme], db_path, min_cov=min_cov,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
388 threshold=threshold, kma_path=method_path, sample_name=sample_name,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
389 inputfile_2=infile_2, kma_mrs=0.75, kma_gapopen=-5,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
390 kma_gapextend=-1, kma_penalty=-3, kma_reward=1)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
391
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
392 elif file_format == "fasta":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
393 if not method_path:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
394 method_path = "blastn"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
395 if shutil.which(method_path) == None:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
396 sys.exit("No valid path to a blastn program was provided. Use the -mp flag to provide the path.")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
397 # Assert that only one fasta file is inputted
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
398 assert len(infile) == 1, "Only one input file accepted for assembled data."
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
399 infile = infile[0]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
400 method = "blast"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
401
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
402 # Call BLASTn
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
403 method_obj = Blaster(infile, [scheme], db_path, tmp_dir,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
404 min_cov, threshold, method_path, cut_off=False)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
405 #allewed_overlap=50)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
406 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
407 sys.exit("Input file must be fastq or fasta format, not "+ file_format)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
408
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
409 results = method_obj.results
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
410 query_aligns = method_obj.gene_align_query
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
411 homol_aligns = method_obj.gene_align_homo
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
412 sbjct_aligns = method_obj.gene_align_sbjct
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
413
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
414 # Check that the results dict is not empty
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
415 warning = ""
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
416 if results[scheme] == "No hit found":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
417 results[scheme] = {}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
418 warning = ("No MLST loci was found in the input data, "
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
419 "make sure that the correct pMLST scheme was chosen.")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
420
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
421
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
422 allele_matches = {}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
423
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
424 # Get the found allele profile contained in the results dict
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
425 for hit, locus_hit in results[scheme].items():
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
426
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
427 # Get allele number for locus
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
428 allele_name = locus_hit["sbjct_header"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
429 allele_obj = re.search("(\w+)[_|-](\w+$)", allele_name)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
430
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
431 # Get variable to later storage in the results dict
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
432 locus = allele_obj.group(1)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
433 allele = allele_obj.group(2)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
434 coverage = float(locus_hit["perc_coverage"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
435 identity = float(locus_hit["perc_ident"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
436 score = float(locus_hit["cal_score"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
437 gaps = int(locus_hit["gaps"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
438 align_len = locus_hit["HSP_length"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
439 sbj_len = int(locus_hit["sbjct_length"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
440 sbjct_seq = locus_hit["sbjct_string"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
441 query_seq = locus_hit["query_string"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
442 homol_seq = locus_hit["homo_string"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
443 cigar = extended_cigar(sbjct_aligns[scheme][hit], query_aligns[scheme][hit])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
444
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
445 # Check for perfect hits
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
446 if coverage == 100 and identity == 100:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
447 # If a perfect hit was already found the list more_perfect hits will exist this new hit is appended to this list
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
448 try:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
449 allele_matches[locus]["alternative_hit"][allele_name] = {"allele":allele+"!", "align_len":align_len, "sbj_len":sbj_len,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
450 "coverage":coverage, "identity":identity, "hit_name":hit}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
451 if allele_matches[locus]["allele"][-1] != "!":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
452 allele_matches[locus]["allele"] += "!"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
453 except KeyError:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
454 # Overwrite alleles already saved, save the perfect match and break to go to next locus
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
455 allele_matches[locus] = {"score":score, "allele":allele, "coverage":coverage,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
456 "identity":identity, "match_priority": 1, "align_len":align_len,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
457 "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
458 "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
459 "hit_name":hit, "cigar":cigar, "alternative_hit":{}}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
460 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
461 # If no hit has yet been stored initialize dict variables that are looked up below
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
462 if locus not in allele_matches:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
463 allele_matches[locus] = {"score":0, "match_priority": 4}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
464
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
465 # We weight full coverage higher than perfect identity match
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
466 if coverage == 100 and identity != 100:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
467 # Check that better (higher prioritized) 100% coverage hit has not been stored yet
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
468 if allele_matches[locus]["match_priority"] > 2 or (allele_matches[locus]["match_priority"] == 2 and score > allele_matches[locus]["score"]):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
469 allele_matches[locus] = {"score":score, "allele":allele+"*", "coverage":coverage,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
470 "identity":identity, "match_priority": 2, "align_len":align_len,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
471 "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
472 "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
473 "hit_name":hit, "cigar":cigar}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
474 elif coverage != 100 and identity == 100:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
475 # Check that higher prioritized hit was not already stored
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
476 if allele_matches[locus]["match_priority"] > 3 or (allele_matches[locus]["match_priority"] == 3 and score > allele_matches[locus]["score"]):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
477 allele_matches[locus] = {"score":score, "allele":allele + "?", "coverage":coverage,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
478 "identity":identity, "match_priority": 3, "align_len":align_len,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
479 "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
480 "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
481 "hit_name":hit, "cigar":cigar}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
482 else: # coverage != 100 and identity != 100:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
483 if allele_matches[locus]["match_priority"] == 4 and score > allele_matches[locus]["score"]:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
484 allele_matches[locus] = {"score":score, "allele":allele + "?*", "coverage":coverage,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
485 "identity":identity, "match_priority": 4, "align_len":align_len,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
486 "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
487 "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
488 "hit_name":hit, "cigar":cigar}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
489 for locus in loci_list:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
490 if locus not in allele_matches:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
491 allele_matches[locus] = {"identity":"", "coverage":"", "allele":"", "allele_name":"No hit found", "align_len":"", "gaps":"", "sbj_len":""}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
492
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
493 # Import all possible st profiles into dict
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
494 st_profiles = import_profile(database, scheme,loci_list)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
495
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
496 # Find st or neatest sts
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
497 st, note, nearest_sts = st_typing(st_profiles, allele_matches, loci_list)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
498
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
499 # Give warning of mlst schene if no loci were found
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
500 if note == "" and warning != "":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
501 note = warning
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
502
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
503 # Set ST for incF
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
504 if scheme.lower() == "incf":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
505 st = ["F","A", "B"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
506 if "FII" in allele_matches and allele_matches["FII"]["identity"] == 100.0:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
507 st[0] += allele_matches["FII"]["allele_name"].split("_")[-1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
508 elif "FIC" in allele_matches and allele_matches["FIC"]["identity"] == 100.0:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
509 st[0] = "C" + allele_matches["FIC"]["allele_name"].split("_")[-1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
510 elif "FIIK" in allele_matches and allele_matches["FIIK"]["identity"] == 100.0:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
511 st[0] = "K" + allele_matches["FIIK"]["allele_name"].split("_")[-1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
512 elif "FIIS" in allele_matches and allele_matches["FIIS"]["identity"] == 100.0:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
513 st[0] = "S" + allele_matches["FIIS"]["allele_name"].split("_")[-1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
514 elif "FIIY" in allele_matches and allele_matches["FIIY"]["identity"] == 100.0:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
515 st[0] = "Y" + allele_matches["FIIY"]["allele_name"].split("_")[-1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
516 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
517 st[0] += "-"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
518
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
519 if "FIA" in allele_matches and allele_matches["FIA"]["identity"] == 100.0:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
520 st[1] += allele_matches["FIA"]["allele_name"].split("_")[-1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
521 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
522 st[1] += "-"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
523
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
524 if "FIB" in allele_matches and allele_matches["FIB"]["identity"] == 100.0:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
525 st[2] += allele_matches["FIB"]["allele_name"].split("_")[-1]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
526 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
527 st[2] += "-"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
528
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
529 st = "["+":".join(st)+"]"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
530
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
531
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
532 # Check if ST is associated with a clonal complex.
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
533 clpx = ""
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
534 if st != "Unknown" or nearest_sts != "":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
535 with open("{0}/{1}.clpx".format(database,scheme), "r") as clpx_file:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
536 for line in clpx_file:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
537 line = line.split("\t")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
538 if st[0] == line[0] or nearest_sts == line[0]:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
539 clpx = line[1].strip()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
540
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
541
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
542 # Get run info for JSON file
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
543 service = os.path.basename(__file__).replace(".py", "")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
544 date = time.strftime("%d.%m.%Y")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
545 time = time.strftime("%H:%M:%S")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
546
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
547 # TODO find a system to show the database and service version using git
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
548
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
549 # Make JSON output file
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
550 data = {service:{}}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
551 allele_results = {}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
552 for locus, locus_info in allele_matches.items():
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
553 allele_results[locus] = {"identity":0, "coverage":0, "allele":[], "allele_name":[], "align_len":[], "gaps":0, "sbj_len":[]}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
554 for (key, value) in locus_info.items():
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
555 if key in allele_results[locus] or (key == "alternative_hit" and value != {}):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
556 allele_results[locus][key] = value
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
557
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
558 userinput = {"filename":args.infile, "scheme":args.scheme, "profile":profile_name,"file_format":file_format}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
559 run_info = {"date":date, "time":time}#, "database":{"remote_db":remote_db, "last_commit_hash":head_hash}}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
560 server_results = {"sequence_type":st, "allele_profile": allele_results,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
561 "nearest_sts":nearest_sts,"clonal_complex":clpx, "notes":note}
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
562
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
563 data[service]["user_input"] = userinput
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
564 data[service]["run_info"] = run_info
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
565 data[service]["results"] = server_results
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
566
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
567 pprint.pprint(data)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
568
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
569 # Save json output
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
570 result_file = "{}/data.json".format(outdir)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
571 with open(result_file, "w") as outfile:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
572 json.dump(data, outfile)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
573
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
574 if extented_output:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
575 # Define extented output
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
576 table_filename = "{}/results_tab.tsv".format(outdir)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
577 query_filename = "{}/Hit_in_genome_seq.fsa".format(outdir)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
578 sbjct_filename = "{}/pMLST_allele_seq.fsa".format(outdir)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
579 result_filename = "{}/results.txt".format(outdir)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
580 table_file = open(table_filename, "w")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
581 query_file = open(query_filename, "w")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
582 sbjct_file = open(sbjct_filename, "w")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
583 result_file = open(result_filename, "w")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
584
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
585 # Make results file
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
586 result_file.write("{0} Results\n\n".format(service))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
587 result_file.write("pMLST profile: {}\n\nSequence Type: {}\n".format(profile_name, st))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
588 # If ST is unknown report nearest ST
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
589 if st == "Unknown" and nearest_sts != "":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
590 if len(nearest_sts.split(",")) == 1:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
591 result_file.write("Nearest ST: {}\n".format(nearest_sts))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
592 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
593 result_file.write("Nearest STs: {}\n".format(nearest_sts))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
594
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
595 # Report clonal complex if one was associated with ST:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
596 if clpx != "":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
597 result_file.write("Clonal complex: {}\n".format(clpx))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
598
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
599 # Write tsv table header
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
600 table_header = ["Locus", "Identity", "Coverage", "Alignment Length", "Allele Length", "Gaps", "Allele"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
601 table_file.write("\t".join(table_header) + "\n")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
602 rows = []
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
603 for locus, allele_info in allele_matches.items():
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
604
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
605 identity = str(allele_info["identity"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
606 coverage = str(allele_info["coverage"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
607 allele = allele_info["allele"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
608 allele_name = allele_info["allele_name"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
609 align_len = str(allele_info["align_len"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
610 sbj_len = str(allele_info["sbj_len"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
611 gaps = str(allele_info["gaps"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
612
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
613 # Write alleles names with indications of imperfect hits
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
614 if allele_name != "No hit found":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
615 allele_name_w_mark = locus + "_" + allele
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
616 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
617 allele_name_w_mark = allele_name
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
618
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
619 # Write allele results to tsv table
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
620 row = [locus, identity, coverage, align_len, sbj_len, gaps, allele_name_w_mark]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
621 rows.append(row)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
622 if "alternative_hit" in allele_info:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
623 for allele_name, dic in allele_info["alternative_hit"].items():
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
624 row = [locus, identity, coverage, str(dic["align_len"]), str(dic["sbj_len"]), "0", allele_name + "!"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
625 rows.append(row)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
626 #
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
627
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
628 if allele_name == "No hit found":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
629 continue
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
630
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
631 # Write query fasta output
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
632 hit_name = allele_info["hit_name"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
633 query_seq = query_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
634 sbjct_seq = sbjct_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
635 homol_seq = homol_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
636
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
637 if allele_info["match_priority"] == 1:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
638 match = "PERFECT MATCH"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
639 else:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
640 match = "WARNING"
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
641 header = ">{}:{} ID:{}% COV:{}% Best_match:{}\n".format(locus, match, allele_info["identity"],
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
642 allele_info["coverage"], allele_info["allele_name"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
643 query_file.write(header)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
644 for i in range(0,len(query_seq),60):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
645 query_file.write(query_seq[i:i+60] + "\n")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
646
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
647 # Write template fasta output
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
648 header = ">{}\n".format(allele_info["allele_name"])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
649 sbjct_file.write(header)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
650 for i in range(0,len(sbjct_seq),60):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
651 sbjct_file.write(sbjct_seq[i:i+60] + "\n")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
652
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
653 if "alternative_hit" in allele_info:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
654 for allele_name in allele_info["alternative_hit"]:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
655 header = ">{}:{} ID:{}% COV:{}% Best_match:{}\n".format(locus, "PERFECT MATCH", 100,
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
656 100, allele_name)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
657 hit_name = allele_info["alternative_hit"][allele_name]["hit_name"]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
658 query_seq = query_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
659 sbjct_seq = sbjct_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
660 homol_seq = homol_aligns[scheme][hit_name]
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
661 query_file.write(header)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
662 for i in range(0,len(query_seq),60):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
663 query_file.write(query_seq[i:i+60] + "\n")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
664
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
665 # Write template fasta output
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
666 header = ">{}\n".format(allele_name)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
667 sbjct_file.write(header)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
668 for i in range(0,len(sbjct_seq),60):
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
669 sbjct_file.write(sbjct_seq[i:i+60] + "\n")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
670
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
671 # Write Allele profile results tables in results file and table file
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
672 rows.sort(key=lambda x: x[0])
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
673 result_file.write(text_table(table_header, rows))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
674 for row in rows:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
675 table_file.write("\t".join(row) + "\n")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
676 # Write any notes
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
677 if note != "":
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
678 result_file.write("\nNotes: {}\n\n".format(note))
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
679
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
680 # Write allignment output
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
681 result_file.write("\n\nExtended Output:\n\n")
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
682 make_aln(scheme, result_file, allele_matches, query_aligns, homol_aligns, sbjct_aligns)
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
683
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
684 # Close all files
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
685 query_file.close()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
686 sbjct_file.close()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
687 table_file.close()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
688 result_file.close()
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
689
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
690 if args.quiet:
140d4f9e1f20 Uploaded
dcouvin
parents:
diff changeset
691 f.close()