lrn_risk: lrn_risk.py comparison

comparison lrn_risk.py @ 2:8dc6d4aa17ec draft

Uploaded

author	greg
date	Fri, 28 Apr 2023 17:03:46 +0000
parents	f98c92618a6c
children	5a27ac020c9e

comparison

Legend: equal | deleted | inserted | replaced

- : f98c92618a6c (parent revision)
+ : 8dc6d4aa17ec (this changeset)
 with open(f, 'r') as fh:
 for i, line in enumerate(fh):
 if i == 0:
 # Skip header.
 continue
-items = line.split('\t')
+try:
-tax = items[1].strip()
+items = line.split('\t')
-tax = tax.split(';')[-1].strip()
+tax = items[1]
-# split on GTDB species tag
+tax = tax.split(';')[-1]
-tax = tax.split('s__')[1].strip()
+# split on GTDB species tag
+tax = tax.split('s__')[1]
+except Exception:
+return '(Unknown Species)'
 if len(tax) == 0:
-tax = '(Unknown Species)'
+return '(Unknown Species)'
 return tax
 def get_blast_genes(f):
 # reads genes detected via BLAST
 # BLAST header is as follows:
 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen
 d = {}
 with open(f, 'r') as fh:
 for line in fh:
-items = line.split('\t')
+try:
-gene = items[0]
+items = line.split('\t')
-# contig = items[1]
+gene = items[0]
-# pid = items[2]
+# contig = items[1]
-alen = items[3]
+# pid = items[2]
-# e = items[-4]
+alen = items[3]
-qlen = items[-1]
+# e = items[-4]
-# calculate query coverage by dividing alignment length by query length
+qlen = items[-1]
-qcov = round(float(alen) / float(qlen) * 100.0, 2)
+# calculate query coverage by dividing alignment length by query length
-if gene not in d.keys():
+qcov = round(float(alen) / float(qlen) * 100.0, 2)
-d[gene] = []
+if gene not in d.keys():
-d[gene].append('%s\t%s' % (line, str(qcov)))
+d[gene] = []
+d[gene].append('%s\t%s' % (line, str(qcov)))
+except Exception:
+return d
 return d
 def get_blacklist(v, b):
 # identify high-risk isolates based on blacklisted genes
 # blacklisted genes file contains two columns:
 # column 0=the gene name as it appears in the gene database
 # column 1=the reason why the gene was blacklisted, which will be reported
 # e.g., 'ANTHRAX TOXIN'
 bdict = {}
+blacklist_present = {}
 with open(b, 'r') as fh:
 for line in fh:
-items = line.split('\t')
+try:
-gene = items[0].strip()
+items = line.split('\t')
-val = items[1].strip()
+gene = items[0]
-bdict[gene] = val
+val = items[1]
-blacklist_present = {}
+bdict[gene] = val
+except Exception:
+return blacklist_present
 for key in v.keys():
 if key in bdict.keys():
 val = bdict[key]
 blacklist_present[key] = val
 return blacklist_present
 # GTDB species (gtdb)
 # create dictionaries based on gene distribution
 d = {}
 annd = {}
 gtdbd = {}
+finallines = []
 with open(f, 'r') as fh:
 for line in fh:
-items = line.split('\t')
+try:
-tax = items[0].strip()
+items = line.split('\t')
-tax = tax.split('s__')[1].strip()
+tax = items[0]
-if len(tax) == 0:
+tax = tax.split('s__')[1]
-tax = '(Unknown Species)'
+if len(tax) == 0:
-gene = items[1].strip()
+tax = '(Unknown Species)'
-ann = items[-1].strip()
+gene = items[1]
-denom = items[3].strip()
+ann = items[-1]
-d['%s___%s' % (tax, gene)] = line
+denom = items[3]
-annd[gene] = ann
+d['%s___%s' % (tax, gene)] = line
-gtdbd[tax] = denom
+annd[gene] = ann
+gtdbd[tax] = denom
+except Exception:
+return finallines
 # parse BLAST results
-finallines = []
 for key in blast.keys():
 blastval = blast[key]
 for bv in blastval:
 testkey = '%s___%s' % (gtdb, key)
 if testkey in d.keys() and gtdb != '(Unknown Species)':
 taxval = d[testkey]
 items = taxval.split('\t')
-tax = items[0].strip()
+tax = items[0]
-tax = tax.split('s__')[1].strip()
+tax = tax.split('s__')[1]
 if len(tax) == 0:
 tax = '(Unknown Species)'
-gene = items[1].strip()
+gene = items[1]
-pres = items[2].strip()
+pres = items[2]
-denom = items[3].strip()
+denom = items[3]
-perc = items[4].strip()
+perc = items[4]
 perc = str(round(float(perc), 2))
-ann = items[-1].strip()
+ann = items[-1]
 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom)
 elif gtdb != '(Unknown Species)':
 ann = annd[key]
 denom = gtdbd[gtdb]
 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom)
 # print table of VFs if VFs detected
 for vline in vfdist:
 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score',  'Identical Matches', 'Query Length']
 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
 items = vline.split('\t')
-vgene = items[0].strip()
+vgene = items[0]
-vcontig = items[1].strip()
+vcontig = items[1]
-vid = items[2].strip()
+vid = items[2]
-vcov = items[-3].strip()
+vcov = items[-3]
-veval = items[-7].strip()
+veval = items[-7]
-vann = items[-2].strip()
+vann = items[-2]
-vnotes = items[-1].strip()
+vnotes = items[-1]
 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes]
-vfinal = '\t'.join(vfinal).strip()
+vfinal = '\t'.join(vfinal)
 fh.write('%s\n' % vfinal)
 def output_amr(amrdist, amr_output_file):
 # takes distribution of AMR genes as input (amrdist)
 # print this if AMR genes detected
 for aline in amrdist:
 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score',  'Identical Matches', 'Query Length']
 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes']
 items = aline.split('\t')
-agene = items[0].strip()
+agene = items[0]
-acontig = items[1].strip()
+acontig = items[1]
-aid = items[2].strip()
+aid = items[2]
-acov = items[-3].strip()
+acov = items[-3]
-aeval = items[-7].strip()
+aeval = items[-7]
-aann = items[-2].strip()
+aann = items[-2]
-anotes = items[-1].strip()
+anotes = items[-1]
 afinal = [agene, acontig, aid, acov, aeval, aann, anotes]
-afinal = '\t'.join(afinal).strip()
+afinal = '\t'.join(afinal)
 fh.write('%s\n' % afinal)
 # lrnrisk_prototype arguments
 parser = argparse.ArgumentParser()

Mercurial > repos > greg > lrn_risk

comparison lrn_risk.py @ 2:8dc6d4aa17ec draft