Mercurial > repos > greg > lrn_risk
comparison lrn_risk.py @ 2:8dc6d4aa17ec draft
Uploaded
author | greg |
---|---|
date | Fri, 28 Apr 2023 17:03:46 +0000 |
parents | f98c92618a6c |
children | 5a27ac020c9e |
comparison
equal
deleted
inserted
replaced
1:f98c92618a6c | 2:8dc6d4aa17ec |
---|---|
13 with open(f, 'r') as fh: | 13 with open(f, 'r') as fh: |
14 for i, line in enumerate(fh): | 14 for i, line in enumerate(fh): |
15 if i == 0: | 15 if i == 0: |
16 # Skip header. | 16 # Skip header. |
17 continue | 17 continue |
18 items = line.split('\t') | 18 try: |
19 tax = items[1].strip() | 19 items = line.split('\t') |
20 tax = tax.split(';')[-1].strip() | 20 tax = items[1] |
21 # split on GTDB species tag | 21 tax = tax.split(';')[-1] |
22 tax = tax.split('s__')[1].strip() | 22 # split on GTDB species tag |
23 tax = tax.split('s__')[1] | |
24 except Exception: | |
25 return '(Unknown Species)' | |
23 if len(tax) == 0: | 26 if len(tax) == 0: |
24 tax = '(Unknown Species)' | 27 return '(Unknown Species)' |
25 return tax | 28 return tax |
26 | 29 |
27 | 30 |
28 def get_blast_genes(f): | 31 def get_blast_genes(f): |
29 # reads genes detected via BLAST | 32 # reads genes detected via BLAST |
30 # BLAST header is as follows: | 33 # BLAST header is as follows: |
31 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen | 34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen |
32 d = {} | 35 d = {} |
33 with open(f, 'r') as fh: | 36 with open(f, 'r') as fh: |
34 for line in fh: | 37 for line in fh: |
35 items = line.split('\t') | 38 try: |
36 gene = items[0] | 39 items = line.split('\t') |
37 # contig = items[1] | 40 gene = items[0] |
38 # pid = items[2] | 41 # contig = items[1] |
39 alen = items[3] | 42 # pid = items[2] |
40 # e = items[-4] | 43 alen = items[3] |
41 qlen = items[-1] | 44 # e = items[-4] |
42 # calculate query coverage by dividing alignment length by query length | 45 qlen = items[-1] |
43 qcov = round(float(alen) / float(qlen) * 100.0, 2) | 46 # calculate query coverage by dividing alignment length by query length |
44 if gene not in d.keys(): | 47 qcov = round(float(alen) / float(qlen) * 100.0, 2) |
45 d[gene] = [] | 48 if gene not in d.keys(): |
46 d[gene].append('%s\t%s' % (line, str(qcov))) | 49 d[gene] = [] |
50 d[gene].append('%s\t%s' % (line, str(qcov))) | |
51 except Exception: | |
52 return d | |
47 return d | 53 return d |
48 | 54 |
49 | 55 |
50 def get_blacklist(v, b): | 56 def get_blacklist(v, b): |
51 # identify high-risk isolates based on blacklisted genes | 57 # identify high-risk isolates based on blacklisted genes |
52 # blacklisted genes file contains two columns: | 58 # blacklisted genes file contains two columns: |
53 # column 0=the gene name as it appears in the gene database | 59 # column 0=the gene name as it appears in the gene database |
54 # column 1=the reason why the gene was blacklisted, which will be reported | 60 # column 1=the reason why the gene was blacklisted, which will be reported |
55 # e.g., 'ANTHRAX TOXIN' | 61 # e.g., 'ANTHRAX TOXIN' |
56 bdict = {} | 62 bdict = {} |
63 blacklist_present = {} | |
57 with open(b, 'r') as fh: | 64 with open(b, 'r') as fh: |
58 for line in fh: | 65 for line in fh: |
59 items = line.split('\t') | 66 try: |
60 gene = items[0].strip() | 67 items = line.split('\t') |
61 val = items[1].strip() | 68 gene = items[0] |
62 bdict[gene] = val | 69 val = items[1] |
63 blacklist_present = {} | 70 bdict[gene] = val |
71 except Exception: | |
72 return blacklist_present | |
64 for key in v.keys(): | 73 for key in v.keys(): |
65 if key in bdict.keys(): | 74 if key in bdict.keys(): |
66 val = bdict[key] | 75 val = bdict[key] |
67 blacklist_present[key] = val | 76 blacklist_present[key] = val |
68 return blacklist_present | 77 return blacklist_present |
80 # GTDB species (gtdb) | 89 # GTDB species (gtdb) |
81 # create dictionaries based on gene distribution | 90 # create dictionaries based on gene distribution |
82 d = {} | 91 d = {} |
83 annd = {} | 92 annd = {} |
84 gtdbd = {} | 93 gtdbd = {} |
94 finallines = [] | |
85 with open(f, 'r') as fh: | 95 with open(f, 'r') as fh: |
86 for line in fh: | 96 for line in fh: |
87 items = line.split('\t') | 97 try: |
88 tax = items[0].strip() | 98 items = line.split('\t') |
89 tax = tax.split('s__')[1].strip() | 99 tax = items[0] |
90 if len(tax) == 0: | 100 tax = tax.split('s__')[1] |
91 tax = '(Unknown Species)' | 101 if len(tax) == 0: |
92 gene = items[1].strip() | 102 tax = '(Unknown Species)' |
93 ann = items[-1].strip() | 103 gene = items[1] |
94 denom = items[3].strip() | 104 ann = items[-1] |
95 d['%s___%s' % (tax, gene)] = line | 105 denom = items[3] |
96 annd[gene] = ann | 106 d['%s___%s' % (tax, gene)] = line |
97 gtdbd[tax] = denom | 107 annd[gene] = ann |
108 gtdbd[tax] = denom | |
109 except Exception: | |
110 return finallines | |
98 # parse BLAST results | 111 # parse BLAST results |
99 finallines = [] | |
100 for key in blast.keys(): | 112 for key in blast.keys(): |
101 blastval = blast[key] | 113 blastval = blast[key] |
102 for bv in blastval: | 114 for bv in blastval: |
103 testkey = '%s___%s' % (gtdb, key) | 115 testkey = '%s___%s' % (gtdb, key) |
104 if testkey in d.keys() and gtdb != '(Unknown Species)': | 116 if testkey in d.keys() and gtdb != '(Unknown Species)': |
105 taxval = d[testkey] | 117 taxval = d[testkey] |
106 items = taxval.split('\t') | 118 items = taxval.split('\t') |
107 tax = items[0].strip() | 119 tax = items[0] |
108 tax = tax.split('s__')[1].strip() | 120 tax = tax.split('s__')[1] |
109 if len(tax) == 0: | 121 if len(tax) == 0: |
110 tax = '(Unknown Species)' | 122 tax = '(Unknown Species)' |
111 gene = items[1].strip() | 123 gene = items[1] |
112 pres = items[2].strip() | 124 pres = items[2] |
113 denom = items[3].strip() | 125 denom = items[3] |
114 perc = items[4].strip() | 126 perc = items[4] |
115 perc = str(round(float(perc), 2)) | 127 perc = str(round(float(perc), 2)) |
116 ann = items[-1].strip() | 128 ann = items[-1] |
117 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) | 129 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) |
118 elif gtdb != '(Unknown Species)': | 130 elif gtdb != '(Unknown Species)': |
119 ann = annd[key] | 131 ann = annd[key] |
120 denom = gtdbd[gtdb] | 132 denom = gtdbd[gtdb] |
121 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) | 133 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) |
155 # print table of VFs if VFs detected | 167 # print table of VFs if VFs detected |
156 for vline in vfdist: | 168 for vline in vfdist: |
157 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] | 169 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] |
158 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] | 170 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] |
159 items = vline.split('\t') | 171 items = vline.split('\t') |
160 vgene = items[0].strip() | 172 vgene = items[0] |
161 vcontig = items[1].strip() | 173 vcontig = items[1] |
162 vid = items[2].strip() | 174 vid = items[2] |
163 vcov = items[-3].strip() | 175 vcov = items[-3] |
164 veval = items[-7].strip() | 176 veval = items[-7] |
165 vann = items[-2].strip() | 177 vann = items[-2] |
166 vnotes = items[-1].strip() | 178 vnotes = items[-1] |
167 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] | 179 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] |
168 vfinal = '\t'.join(vfinal).strip() | 180 vfinal = '\t'.join(vfinal) |
169 fh.write('%s\n' % vfinal) | 181 fh.write('%s\n' % vfinal) |
170 | 182 |
171 | 183 |
172 def output_amr(amrdist, amr_output_file): | 184 def output_amr(amrdist, amr_output_file): |
173 # takes distribution of AMR genes as input (amrdist) | 185 # takes distribution of AMR genes as input (amrdist) |
181 # print this if AMR genes detected | 193 # print this if AMR genes detected |
182 for aline in amrdist: | 194 for aline in amrdist: |
183 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] | 195 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] |
184 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] | 196 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] |
185 items = aline.split('\t') | 197 items = aline.split('\t') |
186 agene = items[0].strip() | 198 agene = items[0] |
187 acontig = items[1].strip() | 199 acontig = items[1] |
188 aid = items[2].strip() | 200 aid = items[2] |
189 acov = items[-3].strip() | 201 acov = items[-3] |
190 aeval = items[-7].strip() | 202 aeval = items[-7] |
191 aann = items[-2].strip() | 203 aann = items[-2] |
192 anotes = items[-1].strip() | 204 anotes = items[-1] |
193 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] | 205 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] |
194 afinal = '\t'.join(afinal).strip() | 206 afinal = '\t'.join(afinal) |
195 fh.write('%s\n' % afinal) | 207 fh.write('%s\n' % afinal) |
196 | 208 |
197 | 209 |
198 # lrnrisk_prototype arguments | 210 # lrnrisk_prototype arguments |
199 parser = argparse.ArgumentParser() | 211 parser = argparse.ArgumentParser() |