Mercurial > repos > greg > lrn_risk
comparison lrn_risk.py @ 2:8dc6d4aa17ec draft
Uploaded
| author | greg |
|---|---|
| date | Fri, 28 Apr 2023 17:03:46 +0000 |
| parents | f98c92618a6c |
| children | 5a27ac020c9e |
comparison
equal
deleted
inserted
replaced
| 1:f98c92618a6c | 2:8dc6d4aa17ec |
|---|---|
| 13 with open(f, 'r') as fh: | 13 with open(f, 'r') as fh: |
| 14 for i, line in enumerate(fh): | 14 for i, line in enumerate(fh): |
| 15 if i == 0: | 15 if i == 0: |
| 16 # Skip header. | 16 # Skip header. |
| 17 continue | 17 continue |
| 18 items = line.split('\t') | 18 try: |
| 19 tax = items[1].strip() | 19 items = line.split('\t') |
| 20 tax = tax.split(';')[-1].strip() | 20 tax = items[1] |
| 21 # split on GTDB species tag | 21 tax = tax.split(';')[-1] |
| 22 tax = tax.split('s__')[1].strip() | 22 # split on GTDB species tag |
| 23 tax = tax.split('s__')[1] | |
| 24 except Exception: | |
| 25 return '(Unknown Species)' | |
| 23 if len(tax) == 0: | 26 if len(tax) == 0: |
| 24 tax = '(Unknown Species)' | 27 return '(Unknown Species)' |
| 25 return tax | 28 return tax |
| 26 | 29 |
| 27 | 30 |
| 28 def get_blast_genes(f): | 31 def get_blast_genes(f): |
| 29 # reads genes detected via BLAST | 32 # reads genes detected via BLAST |
| 30 # BLAST header is as follows: | 33 # BLAST header is as follows: |
| 31 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen | 34 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qlen |
| 32 d = {} | 35 d = {} |
| 33 with open(f, 'r') as fh: | 36 with open(f, 'r') as fh: |
| 34 for line in fh: | 37 for line in fh: |
| 35 items = line.split('\t') | 38 try: |
| 36 gene = items[0] | 39 items = line.split('\t') |
| 37 # contig = items[1] | 40 gene = items[0] |
| 38 # pid = items[2] | 41 # contig = items[1] |
| 39 alen = items[3] | 42 # pid = items[2] |
| 40 # e = items[-4] | 43 alen = items[3] |
| 41 qlen = items[-1] | 44 # e = items[-4] |
| 42 # calculate query coverage by dividing alignment length by query length | 45 qlen = items[-1] |
| 43 qcov = round(float(alen) / float(qlen) * 100.0, 2) | 46 # calculate query coverage by dividing alignment length by query length |
| 44 if gene not in d.keys(): | 47 qcov = round(float(alen) / float(qlen) * 100.0, 2) |
| 45 d[gene] = [] | 48 if gene not in d.keys(): |
| 46 d[gene].append('%s\t%s' % (line, str(qcov))) | 49 d[gene] = [] |
| 50 d[gene].append('%s\t%s' % (line, str(qcov))) | |
| 51 except Exception: | |
| 52 return d | |
| 47 return d | 53 return d |
| 48 | 54 |
| 49 | 55 |
| 50 def get_blacklist(v, b): | 56 def get_blacklist(v, b): |
| 51 # identify high-risk isolates based on blacklisted genes | 57 # identify high-risk isolates based on blacklisted genes |
| 52 # blacklisted genes file contains two columns: | 58 # blacklisted genes file contains two columns: |
| 53 # column 0=the gene name as it appears in the gene database | 59 # column 0=the gene name as it appears in the gene database |
| 54 # column 1=the reason why the gene was blacklisted, which will be reported | 60 # column 1=the reason why the gene was blacklisted, which will be reported |
| 55 # e.g., 'ANTHRAX TOXIN' | 61 # e.g., 'ANTHRAX TOXIN' |
| 56 bdict = {} | 62 bdict = {} |
| 63 blacklist_present = {} | |
| 57 with open(b, 'r') as fh: | 64 with open(b, 'r') as fh: |
| 58 for line in fh: | 65 for line in fh: |
| 59 items = line.split('\t') | 66 try: |
| 60 gene = items[0].strip() | 67 items = line.split('\t') |
| 61 val = items[1].strip() | 68 gene = items[0] |
| 62 bdict[gene] = val | 69 val = items[1] |
| 63 blacklist_present = {} | 70 bdict[gene] = val |
| 71 except Exception: | |
| 72 return blacklist_present | |
| 64 for key in v.keys(): | 73 for key in v.keys(): |
| 65 if key in bdict.keys(): | 74 if key in bdict.keys(): |
| 66 val = bdict[key] | 75 val = bdict[key] |
| 67 blacklist_present[key] = val | 76 blacklist_present[key] = val |
| 68 return blacklist_present | 77 return blacklist_present |
| 80 # GTDB species (gtdb) | 89 # GTDB species (gtdb) |
| 81 # create dictionaries based on gene distribution | 90 # create dictionaries based on gene distribution |
| 82 d = {} | 91 d = {} |
| 83 annd = {} | 92 annd = {} |
| 84 gtdbd = {} | 93 gtdbd = {} |
| 94 finallines = [] | |
| 85 with open(f, 'r') as fh: | 95 with open(f, 'r') as fh: |
| 86 for line in fh: | 96 for line in fh: |
| 87 items = line.split('\t') | 97 try: |
| 88 tax = items[0].strip() | 98 items = line.split('\t') |
| 89 tax = tax.split('s__')[1].strip() | 99 tax = items[0] |
| 90 if len(tax) == 0: | 100 tax = tax.split('s__')[1] |
| 91 tax = '(Unknown Species)' | 101 if len(tax) == 0: |
| 92 gene = items[1].strip() | 102 tax = '(Unknown Species)' |
| 93 ann = items[-1].strip() | 103 gene = items[1] |
| 94 denom = items[3].strip() | 104 ann = items[-1] |
| 95 d['%s___%s' % (tax, gene)] = line | 105 denom = items[3] |
| 96 annd[gene] = ann | 106 d['%s___%s' % (tax, gene)] = line |
| 97 gtdbd[tax] = denom | 107 annd[gene] = ann |
| 108 gtdbd[tax] = denom | |
| 109 except Exception: | |
| 110 return finallines | |
| 98 # parse BLAST results | 111 # parse BLAST results |
| 99 finallines = [] | |
| 100 for key in blast.keys(): | 112 for key in blast.keys(): |
| 101 blastval = blast[key] | 113 blastval = blast[key] |
| 102 for bv in blastval: | 114 for bv in blastval: |
| 103 testkey = '%s___%s' % (gtdb, key) | 115 testkey = '%s___%s' % (gtdb, key) |
| 104 if testkey in d.keys() and gtdb != '(Unknown Species)': | 116 if testkey in d.keys() and gtdb != '(Unknown Species)': |
| 105 taxval = d[testkey] | 117 taxval = d[testkey] |
| 106 items = taxval.split('\t') | 118 items = taxval.split('\t') |
| 107 tax = items[0].strip() | 119 tax = items[0] |
| 108 tax = tax.split('s__')[1].strip() | 120 tax = tax.split('s__')[1] |
| 109 if len(tax) == 0: | 121 if len(tax) == 0: |
| 110 tax = '(Unknown Species)' | 122 tax = '(Unknown Species)' |
| 111 gene = items[1].strip() | 123 gene = items[1] |
| 112 pres = items[2].strip() | 124 pres = items[2] |
| 113 denom = items[3].strip() | 125 denom = items[3] |
| 114 perc = items[4].strip() | 126 perc = items[4] |
| 115 perc = str(round(float(perc), 2)) | 127 perc = str(round(float(perc), 2)) |
| 116 ann = items[-1].strip() | 128 ann = items[-1] |
| 117 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) | 129 freetext = 'Gene {0} has been detected in {1}% of {2} genomes ({3} of {4} genomes queried)'.format(gene, perc, tax, pres, denom) |
| 118 elif gtdb != '(Unknown Species)': | 130 elif gtdb != '(Unknown Species)': |
| 119 ann = annd[key] | 131 ann = annd[key] |
| 120 denom = gtdbd[gtdb] | 132 denom = gtdbd[gtdb] |
| 121 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) | 133 freetext = 'WARNING: Gene {0} ({1}) has never been detected in species {2} (n={3} genomes queried)! Interpret with caution!'.format(key, ann, gtdb, denom) |
| 155 # print table of VFs if VFs detected | 167 # print table of VFs if VFs detected |
| 156 for vline in vfdist: | 168 for vline in vfdist: |
| 157 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] | 169 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] |
| 158 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] | 170 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] |
| 159 items = vline.split('\t') | 171 items = vline.split('\t') |
| 160 vgene = items[0].strip() | 172 vgene = items[0] |
| 161 vcontig = items[1].strip() | 173 vcontig = items[1] |
| 162 vid = items[2].strip() | 174 vid = items[2] |
| 163 vcov = items[-3].strip() | 175 vcov = items[-3] |
| 164 veval = items[-7].strip() | 176 veval = items[-7] |
| 165 vann = items[-2].strip() | 177 vann = items[-2] |
| 166 vnotes = items[-1].strip() | 178 vnotes = items[-1] |
| 167 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] | 179 vfinal = [vgene, vcontig, vid, vcov, veval, vann, vnotes] |
| 168 vfinal = '\t'.join(vfinal).strip() | 180 vfinal = '\t'.join(vfinal) |
| 169 fh.write('%s\n' % vfinal) | 181 fh.write('%s\n' % vfinal) |
| 170 | 182 |
| 171 | 183 |
| 172 def output_amr(amrdist, amr_output_file): | 184 def output_amr(amrdist, amr_output_file): |
| 173 # takes distribution of AMR genes as input (amrdist) | 185 # takes distribution of AMR genes as input (amrdist) |
| 181 # print this if AMR genes detected | 193 # print this if AMR genes detected |
| 182 for aline in amrdist: | 194 for aline in amrdist: |
| 183 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] | 195 # blast_header=['Gene', 'Contig', 'Percent (%) Nucleotide Identity', 'Alignment Length', 'Mismatches', 'Gaps', 'Query Start', 'Query End', 'Subject Start', 'Subject End', 'E-Value', 'Bit Score', 'Identical Matches', 'Query Length'] |
| 184 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] | 196 # lc_header=['Query Coverage', 'Annotation', 'Comparison to Publicly Available Genomes'] |
| 185 items = aline.split('\t') | 197 items = aline.split('\t') |
| 186 agene = items[0].strip() | 198 agene = items[0] |
| 187 acontig = items[1].strip() | 199 acontig = items[1] |
| 188 aid = items[2].strip() | 200 aid = items[2] |
| 189 acov = items[-3].strip() | 201 acov = items[-3] |
| 190 aeval = items[-7].strip() | 202 aeval = items[-7] |
| 191 aann = items[-2].strip() | 203 aann = items[-2] |
| 192 anotes = items[-1].strip() | 204 anotes = items[-1] |
| 193 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] | 205 afinal = [agene, acontig, aid, acov, aeval, aann, anotes] |
| 194 afinal = '\t'.join(afinal).strip() | 206 afinal = '\t'.join(afinal) |
| 195 fh.write('%s\n' % afinal) | 207 fh.write('%s\n' % afinal) |
| 196 | 208 |
| 197 | 209 |
| 198 # lrnrisk_prototype arguments | 210 # lrnrisk_prototype arguments |
| 199 parser = argparse.ArgumentParser() | 211 parser = argparse.ArgumentParser() |
