Mercurial > repos > greg > lrn_risk

--- a/lrn_risk.py	Fri Apr 28 17:03:46 2023 +0000
+++ b/lrn_risk.py	Tue Jun 27 14:02:05 2023 +0000
@@ -4,7 +4,7 @@


 BLACKLIST_HEADER = ['Blacklisted Gene', 'Reason', 'Risk Category']
-VFDB_HEADER = ['Gene', 'Contig', '% Identity', '% Coverage', 'E-Value', 'Annotation', 'Comparison to Publicly Available Genomes']
+VFDB_HEADER = ['Gene', 'Contig', '%ID', '%COV', 'E', 'Annotation', 'Distribution']


 def get_species_from_gtdb(f):
@@ -36,6 +36,7 @@
     with open(f, 'r') as fh:
         for line in fh:
             try:
+                line = line.strip()
                 items = line.split('\t')
                 gene = items[0]
                 # contig = items[1]
@@ -64,6 +65,7 @@
     with open(b, 'r') as fh:
         for line in fh:
             try:
+                line = line.strip()
                 items = line.split('\t')
                 gene = items[0]
                 val = items[1]
@@ -92,9 +94,11 @@
     annd = {}
     gtdbd = {}
     finallines = []
+    warnings = []
     with open(f, 'r') as fh:
         for line in fh:
             try:
+                line = line.strip()
                 items = line.split('\t')
                 tax = items[0]
                 tax = tax.split('s__')[1]
@@ -126,17 +130,18 @@
                 perc = items[4]
                 perc = str(round(float(perc), 2))
                 ann = items[-1]
-                freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom)
+                freetext = '{0}/{1} ({2}%)'.format(pres, denom, perc)
             elif gtdb != '(Unknown Species)':
                 ann = annd[key]
                 denom = gtdbd[gtdb]
-                freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom)
+                freetext = "*WARNING"
+                warnings.append("*WARNING: This gene has never been detected in this species! Interpret with caution!")
             else:
                 ann = annd[key]
-                freetext = 'WARNING: Genome belongs to an undescribed species. Interpret with caution!'
-            finalline = '%s\t%s\t%s' % (bv, ann, freetext)
-            finallines.append(finalline)
-    return finallines
+                freetext = "**WARNING"
+                warnings.append("**WARNING: This genome belongs to an undescribed species. Interpret with caution!")
+            finallines.append('%s\t%s\t%s' % (bv, ann, freetext))
+    return [finallines, warnings]


 def output_blacklist(blacklist, blacklist_output_file):
@@ -155,7 +160,7 @@
                 fh.write('%s\t%s\tHIGH RISK\n' % (key, val))


-def output_vfdb(vfdist, vfdb_output_file):
+def output_vfdb(vfdist, vfdb_output_file, vf_warnings):
     # takes distribution of virulence factors as input (vfdist)
     # VFDB results
     with open(vfdb_output_file, 'w') as fh:
@@ -177,11 +182,12 @@
                 vann = items[-2]
                 vnotes = items[-1]
                 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes]
-                vfinal = '\t'.join(vfinal)
-                fh.write('%s\n' % vfinal)
+                fh.write('%s\n' % '\t'.join(vfinal))
+            for vfw in sorted(vf_warnings, key=lambda x: x.count('*')):
+                fh.write('%s\n' % vfw)


-def output_amr(amrdist, amr_output_file):
+def output_amr(amrdist, amr_output_file, amr_warnings):
     # takes distribution of AMR genes as input (amrdist)
     # AMR results
     with open(amr_output_file, 'w') as fh:
@@ -203,8 +209,9 @@
                 aann = items[-2]
                 anotes = items[-1]
                 afinal = [agene, acontig, aid, acov, aeval, aann, anotes]
-                afinal = '\t'.join(afinal)
-                fh.write('%s\n' % afinal)
+                fh.write('%s\n' % '\t'.join(afinal))
+            for amrw in sorted(amr_warnings, key=lambda x: x.count('*')):
+                fh.write('%s\n' % amrw)


 # lrnrisk_prototype arguments
@@ -231,8 +238,12 @@
 output_blacklist(blacklist, args.blacklist_output_file)

 vf_distribution = gene_dist(args.vf_distribution_file, virulence_genes, species)
-output_vfdb(vf_distribution, args.vfdb_output_file)
+vf_warnings = vf_distribution[1]
+vf_distribution = vf_distribution[0]
+output_vfdb(vf_distribution, args.vfdb_output_file, vf_warnings)

 amr_genes = get_blast_genes(args.amr_determinants_file)
 amr_distribution = gene_dist(args.amr_distribution_file, amr_genes, species)
-output_amr(amr_distribution, args.amr_output_file)
+amr_warnings = amr_distribution[1]
+amr_distribution = amr_distribution[0]
+output_amr(amr_distribution, args.amr_output_file, amr_warnings)