Mercurial > repos > greg > lrn_risk
comparison lrn_risk.py @ 3:5a27ac020c9e draft
Uploaded
| author | greg | 
|---|---|
| date | Tue, 27 Jun 2023 14:02:05 +0000 | 
| parents | 8dc6d4aa17ec | 
| children | 8fe04662585c | 
   comparison
  equal
  deleted
  inserted
  replaced
| 2:8dc6d4aa17ec | 3:5a27ac020c9e | 
|---|---|
| 2 | 2 | 
| 3 import argparse | 3 import argparse | 
| 4 | 4 | 
| 5 | 5 | 
| 6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category'] | 6 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category'] | 
| 7 VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes'] | 7 VFDB_HEADER = ['Gene', 'Contig', '%ID', '%COV', 'E', 'Annotation', 'Distribution'] | 
| 8 | 8 | 
| 9 | 9 | 
| 10 def get_species_from_gtdb(f): | 10 def get_species_from_gtdb(f): | 
| 11 # get GTDB species | 11 # get GTDB species | 
| 12 # assumes there is one genome in the GTDB-Tk output file | 12 # assumes there is one genome in the GTDB-Tk output file | 
| 34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen | 34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen | 
| 35 d = {} | 35 d = {} | 
| 36 with open(f, 'r') as fh: | 36 with open(f, 'r') as fh: | 
| 37 for line in fh: | 37 for line in fh: | 
| 38 try: | 38 try: | 
| 39 line = line.strip() | |
| 39 items = line.split('\t') | 40 items = line.split('\t') | 
| 40 gene = items[0] | 41 gene = items[0] | 
| 41 # contig = items[1] | 42 # contig = items[1] | 
| 42 # pid = items[2] | 43 # pid = items[2] | 
| 43 alen = items[3] | 44 alen = items[3] | 
| 62 bdict = {} | 63 bdict = {} | 
| 63 blacklist_present = {} | 64 blacklist_present = {} | 
| 64 with open(b, 'r') as fh: | 65 with open(b, 'r') as fh: | 
| 65 for line in fh: | 66 for line in fh: | 
| 66 try: | 67 try: | 
| 68 line = line.strip() | |
| 67 items = line.split('\t') | 69 items = line.split('\t') | 
| 68 gene = items[0] | 70 gene = items[0] | 
| 69 val = items[1] | 71 val = items[1] | 
| 70 bdict[gene] = val | 72 bdict[gene] = val | 
| 71 except Exception: | 73 except Exception: | 
| 90 # create dictionaries based on gene distribution | 92 # create dictionaries based on gene distribution | 
| 91 d = {} | 93 d = {} | 
| 92 annd = {} | 94 annd = {} | 
| 93 gtdbd = {} | 95 gtdbd = {} | 
| 94 finallines = [] | 96 finallines = [] | 
| 97 warnings = [] | |
| 95 with open(f, 'r') as fh: | 98 with open(f, 'r') as fh: | 
| 96 for line in fh: | 99 for line in fh: | 
| 97 try: | 100 try: | 
| 101 line = line.strip() | |
| 98 items = line.split('\t') | 102 items = line.split('\t') | 
| 99 tax = items[0] | 103 tax = items[0] | 
| 100 tax = tax.split('s__')[1] | 104 tax = tax.split('s__')[1] | 
| 101 if len(tax) == 0: | 105 if len(tax) == 0: | 
| 102 tax = '(Unknown Species)' | 106 tax = '(Unknown Species)' | 
| 124 pres = items[2] | 128 pres = items[2] | 
| 125 denom = items[3] | 129 denom = items[3] | 
| 126 perc = items[4] | 130 perc = items[4] | 
| 127 perc = str(round(float(perc), 2)) | 131 perc = str(round(float(perc), 2)) | 
| 128 ann = items[-1] | 132 ann = items[-1] | 
| 129 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) | 133 freetext = '{0}/{1} ({2}%)'.format(pres, denom, perc) | 
| 130 elif gtdb != '(Unknown Species)': | 134 elif gtdb != '(Unknown Species)': | 
| 131 ann = annd[key] | 135 ann = annd[key] | 
| 132 denom = gtdbd[gtdb] | 136 denom = gtdbd[gtdb] | 
| 133 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) | 137 freetext = "*WARNING" | 
| 138 warnings.append("*WARNING: This gene has never been detected in this species! Interpret with caution!") | |
| 134 else: | 139 else: | 
| 135 ann = annd[key] | 140 ann = annd[key] | 
| 136 freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!' | 141 freetext = "**WARNING" | 
| 137 finalline = '%s\t%s\t%s' % (bv, ann, freetext) | 142 warnings.append("**WARNING: This genome belongs to an undescribed species. Interpret with caution!") | 
| 138 finallines.append(finalline) | 143 finallines.append('%s\t%s\t%s' % (bv, ann, freetext)) | 
| 139 return finallines | 144 return [finallines, warnings] | 
| 140 | 145 | 
| 141 | 146 | 
| 142 def output_blacklist(blacklist, blacklist_output_file): | 147 def output_blacklist(blacklist, blacklist_output_file): | 
| 143 # takes detected blacklisted genes as input (blacklist) | 148 # takes detected blacklisted genes as input (blacklist) | 
| 144 # blacklist results | 149 # blacklist results | 
| 153 for key in blacklist.keys(): | 158 for key in blacklist.keys(): | 
| 154 val = blacklist[key] | 159 val = blacklist[key] | 
| 155 fh.write('%s\t%s\tHIGH RISK\n' % (key, val)) | 160 fh.write('%s\t%s\tHIGH RISK\n' % (key, val)) | 
| 156 | 161 | 
| 157 | 162 | 
| 158 def output_vfdb(vfdist, vfdb_output_file): | 163 def output_vfdb(vfdist, vfdb_output_file, vf_warnings): | 
| 159 # takes distribution of virulence factors as input (vfdist) | 164 # takes distribution of virulence factors as input (vfdist) | 
| 160 # VFDB results | 165 # VFDB results | 
| 161 with open(vfdb_output_file, 'w') as fh: | 166 with open(vfdb_output_file, 'w') as fh: | 
| 162 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | 167 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | 
| 163 if len(vfdist) == 0: | 168 if len(vfdist) == 0: | 
| 175 vcov = items[-3] | 180 vcov = items[-3] | 
| 176 veval = items[-7] | 181 veval = items[-7] | 
| 177 vann = items[-2] | 182 vann = items[-2] | 
| 178 vnotes = items[-1] | 183 vnotes = items[-1] | 
| 179 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] | 184 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] | 
| 180 vfinal = '\t'.join(vfinal) | 185 fh.write('%s\n' % '\t'.join(vfinal)) | 
| 181 fh.write('%s\n' % vfinal) | 186 for vfw in sorted(vf_warnings, key=lambda x: x.count('*')): | 
| 182 | 187 fh.write('%s\n' % vfw) | 
| 183 | 188 | 
| 184 def output_amr(amrdist, amr_output_file): | 189 | 
| 190 def output_amr(amrdist, amr_output_file, amr_warnings): | |
| 185 # takes distribution of AMR genes as input (amrdist) | 191 # takes distribution of AMR genes as input (amrdist) | 
| 186 # AMR results | 192 # AMR results | 
| 187 with open(amr_output_file, 'w') as fh: | 193 with open(amr_output_file, 'w') as fh: | 
| 188 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | 194 fh.write('%s\n' % '\t'.join(VFDB_HEADER)) | 
| 189 if len(amrdist) == 0: | 195 if len(amrdist) == 0: | 
| 201 acov = items[-3] | 207 acov = items[-3] | 
| 202 aeval = items[-7] | 208 aeval = items[-7] | 
| 203 aann = items[-2] | 209 aann = items[-2] | 
| 204 anotes = items[-1] | 210 anotes = items[-1] | 
| 205 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] | 211 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] | 
| 206 afinal = '\t'.join(afinal) | 212 fh.write('%s\n' % '\t'.join(afinal)) | 
| 207 fh.write('%s\n' % afinal) | 213 for amrw in sorted(amr_warnings, key=lambda x: x.count('*')): | 
| 214 fh.write('%s\n' % amrw) | |
| 208 | 215 | 
| 209 | 216 | 
| 210 # lrnrisk_prototype arguments | 217 # lrnrisk_prototype arguments | 
| 211 parser = argparse.ArgumentParser() | 218 parser = argparse.ArgumentParser() | 
| 212 | 219 | 
| 229 | 236 | 
| 230 blacklist = get_blacklist(virulence_genes, args.blacklist_file) | 237 blacklist = get_blacklist(virulence_genes, args.blacklist_file) | 
| 231 output_blacklist(blacklist, args.blacklist_output_file) | 238 output_blacklist(blacklist, args.blacklist_output_file) | 
| 232 | 239 | 
| 233 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species) | 240 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species) | 
| 234 output_vfdb(vf_distribution, args.vfdb_output_file) | 241 vf_warnings = vf_distribution[1] | 
| 242 vf_distribution = vf_distribution[0] | |
| 243 output_vfdb(vf_distribution, args.vfdb_output_file, vf_warnings) | |
| 235 | 244 | 
| 236 amr_genes = get_blast_genes(args.amr_determinants_file) | 245 amr_genes = get_blast_genes(args.amr_determinants_file) | 
| 237 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species) | 246 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species) | 
| 238 output_amr(amr_distribution, args.amr_output_file) | 247 amr_warnings = amr_distribution[1] | 
| 248 amr_distribution = amr_distribution[0] | |
| 249 output_amr(amr_distribution, args.amr_output_file, amr_warnings) | 
