comparison lrn_risk.py @ 2:8dc6d4aa17ec draft

Uploaded
author greg
date Fri, 28 Apr 2023 17:03:46 +0000
parents f98c92618a6c
children 5a27ac020c9e
comparison
equal deleted inserted replaced
1:f98c92618a6c 2:8dc6d4aa17ec
13 with open(f, 'r') as fh: 13 with open(f, 'r') as fh:
14 for i, line in enumerate(fh): 14 for i, line in enumerate(fh):
15 if i == 0: 15 if i == 0:
16 # Skip header. 16 # Skip header.
17 continue 17 continue
18 items = line.split('\t') 18 try:
19 tax = items[1].strip() 19 items = line.split('\t')
20 tax = tax.split(';')[-1].strip() 20 tax = items[1]
21 # split on GTDB species tag 21 tax = tax.split(';')[-1]
22 tax = tax.split('s__')[1].strip() 22 # split on GTDB species tag
23 tax = tax.split('s__')[1]
24 except Exception:
25 return '(Unknown Species)'
23 if len(tax) == 0: 26 if len(tax) == 0:
24 tax = '(Unknown Species)' 27 return '(Unknown Species)'
25 return tax 28 return tax
26 29
27 30
28 def get_blast_genes(f): 31 def get_blast_genes(f):
29 # reads genes detected via BLAST 32 # reads genes detected via BLAST
30 # BLAST header is as follows: 33 # BLAST header is as follows:
31 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen 34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen
32 d = {} 35 d = {}
33 with open(f, 'r') as fh: 36 with open(f, 'r') as fh:
34 for line in fh: 37 for line in fh:
35 items = line.split('\t') 38 try:
36 gene = items[0] 39 items = line.split('\t')
37 # contig = items[1] 40 gene = items[0]
38 # pid = items[2] 41 # contig = items[1]
39 alen = items[3] 42 # pid = items[2]
40 # e = items[-4] 43 alen = items[3]
41 qlen = items[-1] 44 # e = items[-4]
42 # calculate query coverage by dividing alignment length by query length 45 qlen = items[-1]
43 qcov = round(float(alen) / float(qlen) * 100.0, 2) 46 # calculate query coverage by dividing alignment length by query length
44 if gene not in d.keys(): 47 qcov = round(float(alen) / float(qlen) * 100.0, 2)
45 d[gene] = [] 48 if gene not in d.keys():
46 d[gene].append('%s\t%s' % (line, str(qcov))) 49 d[gene] = []
50 d[gene].append('%s\t%s' % (line, str(qcov)))
51 except Exception:
52 return d
47 return d 53 return d
48 54
49 55
50 def get_blacklist(v, b): 56 def get_blacklist(v, b):
51 # identify high-risk isolates based on blacklisted genes 57 # identify high-risk isolates based on blacklisted genes
52 # blacklisted genes file contains two columns: 58 # blacklisted genes file contains two columns:
53 # column 0=the gene name as it appears in the gene database 59 # column 0=the gene name as it appears in the gene database
54 # column 1=the reason why the gene was blacklisted, which will be reported 60 # column 1=the reason why the gene was blacklisted, which will be reported
55 # e.g., 'ANTHRAX TOXIN' 61 # e.g., 'ANTHRAX TOXIN'
56 bdict = {} 62 bdict = {}
63 blacklist_present = {}
57 with open(b, 'r') as fh: 64 with open(b, 'r') as fh:
58 for line in fh: 65 for line in fh:
59 items = line.split('\t') 66 try:
60 gene = items[0].strip() 67 items = line.split('\t')
61 val = items[1].strip() 68 gene = items[0]
62 bdict[gene] = val 69 val = items[1]
63 blacklist_present = {} 70 bdict[gene] = val
71 except Exception:
72 return blacklist_present
64 for key in v.keys(): 73 for key in v.keys():
65 if key in bdict.keys(): 74 if key in bdict.keys():
66 val = bdict[key] 75 val = bdict[key]
67 blacklist_present[key] = val 76 blacklist_present[key] = val
68 return blacklist_present 77 return blacklist_present
80 # GTDB species (gtdb) 89 # GTDB species (gtdb)
81 # create dictionaries based on gene distribution 90 # create dictionaries based on gene distribution
82 d = {} 91 d = {}
83 annd = {} 92 annd = {}
84 gtdbd = {} 93 gtdbd = {}
94 finallines = []
85 with open(f, 'r') as fh: 95 with open(f, 'r') as fh:
86 for line in fh: 96 for line in fh:
87 items = line.split('\t') 97 try:
88 tax = items[0].strip() 98 items = line.split('\t')
89 tax = tax.split('s__')[1].strip() 99 tax = items[0]
90 if len(tax) == 0: 100 tax = tax.split('s__')[1]
91 tax = '(Unknown Species)' 101 if len(tax) == 0:
92 gene = items[1].strip() 102 tax = '(Unknown Species)'
93 ann = items[-1].strip() 103 gene = items[1]
94 denom = items[3].strip() 104 ann = items[-1]
95 d['%s___%s' % (tax, gene)] = line 105 denom = items[3]
96 annd[gene] = ann 106 d['%s___%s' % (tax, gene)] = line
97 gtdbd[tax] = denom 107 annd[gene] = ann
108 gtdbd[tax] = denom
109 except Exception:
110 return finallines
98 # parse BLAST results 111 # parse BLAST results
99 finallines = []
100 for key in blast.keys(): 112 for key in blast.keys():
101 blastval = blast[key] 113 blastval = blast[key]
102 for bv in blastval: 114 for bv in blastval:
103 testkey = '%s___%s' % (gtdb, key) 115 testkey = '%s___%s' % (gtdb, key)
104 if testkey in d.keys() and gtdb != '(Unknown Species)': 116 if testkey in d.keys() and gtdb != '(Unknown Species)':
105 taxval = d[testkey] 117 taxval = d[testkey]
106 items = taxval.split('\t') 118 items = taxval.split('\t')
107 tax = items[0].strip() 119 tax = items[0]
108 tax = tax.split('s__')[1].strip() 120 tax = tax.split('s__')[1]
109 if len(tax) == 0: 121 if len(tax) == 0:
110 tax = '(Unknown Species)' 122 tax = '(Unknown Species)'
111 gene = items[1].strip() 123 gene = items[1]
112 pres = items[2].strip() 124 pres = items[2]
113 denom = items[3].strip() 125 denom = items[3]
114 perc = items[4].strip() 126 perc = items[4]
115 perc = str(round(float(perc), 2)) 127 perc = str(round(float(perc), 2))
116 ann = items[-1].strip() 128 ann = items[-1]
117 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) 129 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom)
118 elif gtdb != '(Unknown Species)': 130 elif gtdb != '(Unknown Species)':
119 ann = annd[key] 131 ann = annd[key]
120 denom = gtdbd[gtdb] 132 denom = gtdbd[gtdb]
121 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) 133 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom)
155 # print table of VFs if VFs detected 167 # print table of VFs if VFs detected
156 for vline in vfdist: 168 for vline in vfdist:
157 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] 169 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length']
158 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] 170 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
159 items = vline.split('\t') 171 items = vline.split('\t')
160 vgene = items[0].strip() 172 vgene = items[0]
161 vcontig = items[1].strip() 173 vcontig = items[1]
162 vid = items[2].strip() 174 vid = items[2]
163 vcov = items[-3].strip() 175 vcov = items[-3]
164 veval = items[-7].strip() 176 veval = items[-7]
165 vann = items[-2].strip() 177 vann = items[-2]
166 vnotes = items[-1].strip() 178 vnotes = items[-1]
167 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] 179 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes]
168 vfinal = '\t'.join(vfinal).strip() 180 vfinal = '\t'.join(vfinal)
169 fh.write('%s\n' % vfinal) 181 fh.write('%s\n' % vfinal)
170 182
171 183
172 def output_amr(amrdist, amr_output_file): 184 def output_amr(amrdist, amr_output_file):
173 # takes distribution of AMR genes as input (amrdist) 185 # takes distribution of AMR genes as input (amrdist)
181 # print this if AMR genes detected 193 # print this if AMR genes detected
182 for aline in amrdist: 194 for aline in amrdist:
183 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] 195 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length']
184 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] 196 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
185 items = aline.split('\t') 197 items = aline.split('\t')
186 agene = items[0].strip() 198 agene = items[0]
187 acontig = items[1].strip() 199 acontig = items[1]
188 aid = items[2].strip() 200 aid = items[2]
189 acov = items[-3].strip() 201 acov = items[-3]
190 aeval = items[-7].strip() 202 aeval = items[-7]
191 aann = items[-2].strip() 203 aann = items[-2]
192 anotes = items[-1].strip() 204 anotes = items[-1]
193 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] 205 afinal = [agene, acontig, aid, acov, aeval, aann, anotes]
194 afinal = '\t'.join(afinal).strip() 206 afinal = '\t'.join(afinal)
195 fh.write('%s\n' % afinal) 207 fh.write('%s\n' % afinal)
196 208
197 209
198 # lrnrisk_prototype arguments 210 # lrnrisk_prototype arguments
199 parser = argparse.ArgumentParser() 211 parser = argparse.ArgumentParser()