Mercurial > repos > cpt > cpt_related_genomes_prot
comparison cpt_related_genome_prot/relatedness_prot.py @ 0:ebcc87a27f9c draft default tip
Uploaded
| author | cpt |
|---|---|
| date | Fri, 10 Jun 2022 08:46:28 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ebcc87a27f9c |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import argparse | |
| 4 import json | |
| 5 import logging | |
| 6 from Bio.Blast import NCBIXML | |
| 7 | |
| 8 logging.basicConfig(level=logging.DEBUG) | |
| 9 log = logging.getLogger() | |
| 10 | |
| 11 def parse_blast(blast, isXML = False): | |
| 12 res = [] | |
| 13 finalRes = [] | |
| 14 if isXML: | |
| 15 for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1): | |
| 16 for alignment in blast_record.alignments: | |
| 17 tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:] | |
| 18 tempID = tempID[:tempID.find("|")] | |
| 19 tempDesc = alignment.title | |
| 20 while tempDesc.find("|") >= 0: | |
| 21 tempDesc = tempDesc[tempDesc.find("|") + 1:] | |
| 22 tempDesc = tempDesc.strip() | |
| 23 tempID = tempID.strip() | |
| 24 #for hsp in alignment.hsps: | |
| 25 line = [str(blast_record.query)[:str(blast_record.query).find("[")].strip()] | |
| 26 line.append(alignment.hit_id) | |
| 27 line.append(tempDesc) | |
| 28 line.append(alignment.accession) | |
| 29 res.append(line) | |
| 30 blast.seek(0) | |
| 31 resInd = -1 | |
| 32 taxLine = blast.readline() | |
| 33 while taxLine: | |
| 34 if "<Hit>" in taxLine: | |
| 35 resInd += 1 | |
| 36 taxSlice = "" | |
| 37 elif "<taxid>" in taxLine: | |
| 38 taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")] | |
| 39 finalRes.append(res[resInd]) | |
| 40 finalRes[-1].append(taxSlice) | |
| 41 taxLine = blast.readline() | |
| 42 return finalRes | |
| 43 else: | |
| 44 for line in blast: | |
| 45 finalRes.append(line.strip("\n").split("\t")) | |
| 46 return finalRes | |
| 47 | |
| 48 def with_dice(blast): | |
| 49 for data in blast: | |
| 50 dice = 2 * int(data[14]) / (float(data[22]) + float(data[23])) | |
| 51 yield data + [dice] | |
| 52 | |
| 53 | |
| 54 def filter_dice(blast, threshold=0.5): | |
| 55 for data in blast: | |
| 56 if data[-1] > threshold: | |
| 57 yield data | |
| 58 | |
| 59 | |
| 60 def split_identifiers_nucl(_, ident): | |
| 61 if "<>" in ident: | |
| 62 idents = ident.split("<>") | |
| 63 else: | |
| 64 idents = [ident] | |
| 65 return idents | |
| 66 | |
| 67 | |
| 68 def split_identifiers_prot(_, ident): | |
| 69 if "<>" in ident: | |
| 70 idents = ident.split("<>") | |
| 71 else: | |
| 72 idents = [ident] | |
| 73 return [ | |
| 74 x[x.index("[") + 1 : x.rindex("]")] | |
| 75 for x in idents | |
| 76 # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638] | |
| 77 if "[" in x and "]" in x | |
| 78 ] | |
| 79 | |
| 80 | |
| 81 def split_identifiers_phage(par, ident): | |
| 82 par = par.replace("lcl|", "") | |
| 83 par = par[0 : par.index("_prot_")] | |
| 84 return [par] | |
| 85 | |
| 86 | |
| 87 def important_only(blast, split_identifiers): | |
| 88 for data in blast: | |
| 89 yield [ | |
| 90 data[0], # 01 Query Seq-id (ID of your sequence) | |
| 91 data[1], # 13 All subject Seq-id(s), separated by a ';' | |
| 92 split_identifiers( | |
| 93 data[1], data[2] | |
| 94 ), # 25 All subject title(s), separated by a '<>' | |
| 95 data[3].split(";"), # Extra: All Subject Accessions | |
| 96 data[4].split(";"), # Extra: All TaxIDs | |
| 97 ] | |
| 98 | |
| 99 | |
| 100 def deform_scores(blast): | |
| 101 for data in blast: | |
| 102 for org in data[2]: | |
| 103 yield [data[0], data[1], org, data[3], data[4]] | |
| 104 | |
| 105 | |
| 106 def expand_fields(blast): | |
| 107 for data in blast: | |
| 108 for x in range(0, len(data[4])): | |
| 109 yield [data[0], data[1], data[2][x], data[3], int(data[4][x])] | |
| 110 | |
| 111 def expand_taxIDs(blast, taxFilter): | |
| 112 for data in blast: | |
| 113 # if(len(data[4]) > 0): | |
| 114 # print(data[0]) | |
| 115 for ID in data[4]: | |
| 116 if ID != "N/A": | |
| 117 filterOut = False | |
| 118 for tax in taxFilter: | |
| 119 if str(ID).strip() == tax: | |
| 120 filterOut = True | |
| 121 if not filterOut: | |
| 122 yield [data[0], data[1], data[2], data[3], int(ID)] | |
| 123 | |
| 124 | |
| 125 def expand_titles(blast): | |
| 126 for data in blast: | |
| 127 for title in data[2]: | |
| 128 yield [data[0], data[1], title, data[3], data[4]] | |
| 129 | |
| 130 | |
| 131 def filter_phage(blast, phageTaxLookup, phageSciNames): | |
| 132 for data in blast: | |
| 133 for x in range(0, len(phageTaxLookup)): | |
| 134 if (data[4]) == phageTaxLookup[x]: | |
| 135 yield [data[0], data[1], phageSciNames[x], data[3], data[4]] | |
| 136 break | |
| 137 | |
| 138 | |
| 139 def remove_dupes(data): | |
| 140 has_seen = {} | |
| 141 for row in data: | |
| 142 # qseqid, sseqid | |
| 143 key = (row[0], row[4]) | |
| 144 # If we've seen the key before, we can exit | |
| 145 if key in has_seen: | |
| 146 continue | |
| 147 | |
| 148 # Otherwise, continue on | |
| 149 has_seen[key] = True | |
| 150 # Pretty simple | |
| 151 yield row | |
| 152 | |
| 153 def scoreMap(blast): | |
| 154 c = {} | |
| 155 m = {} | |
| 156 for (qseq, subID, subTitle, access, ID) in blast: | |
| 157 if (str(subTitle), ID) not in c: | |
| 158 m[(str(subTitle), ID)] = access | |
| 159 c[(str(subTitle), ID)] = 0 | |
| 160 | |
| 161 c[(str(subTitle), ID)] += 1 | |
| 162 return c, m | |
| 163 | |
| 164 | |
| 165 if __name__ == "__main__": | |
| 166 parser = argparse.ArgumentParser(description="Top related genomes") | |
| 167 parser.add_argument( | |
| 168 "blast", type=argparse.FileType("r"), help="Blast 25 Column Results" | |
| 169 ) | |
| 170 parser.add_argument("phagedb", type=argparse.FileType("r")) | |
| 171 parser.add_argument("--access", action="store_true") | |
| 172 parser.add_argument("--protein", action="store_true") | |
| 173 parser.add_argument("--canonical", action="store_true") | |
| 174 parser.add_argument("--noFilter", action="store_true") | |
| 175 #parser.add_argument("--title", action="store_true") # Add when ready to update XML after semester | |
| 176 parser.add_argument("--hits", type=int, default=5) | |
| 177 parser.add_argument("--xmlMode", action="store_true") | |
| 178 parser.add_argument("--taxFilter", type=str) | |
| 179 | |
| 180 args = parser.parse_args() | |
| 181 | |
| 182 phageDb = args.phagedb | |
| 183 phageTaxLookup = [] | |
| 184 sciName = [] | |
| 185 line = phageDb.readline() | |
| 186 | |
| 187 taxList = [] | |
| 188 if args.taxFilter and args.taxFilter != "" : | |
| 189 args.taxFilter = args.taxFilter.split(" ") | |
| 190 for ind in args.taxFilter: | |
| 191 taxList.append(ind.strip()) | |
| 192 | |
| 193 while line: | |
| 194 line = line.split("\t") | |
| 195 phageTaxLookup.append(int(line[0])) | |
| 196 line[1] = line[1].strip() | |
| 197 if (line[1] == ""): | |
| 198 line[1] = "Novel Genome" | |
| 199 sciName.append(line[1]) | |
| 200 line = phageDb.readline() | |
| 201 | |
| 202 if args.protein: | |
| 203 splitId = split_identifiers_prot | |
| 204 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb} | |
| 205 elif args.canonical: | |
| 206 splitId = split_identifiers_phage | |
| 207 # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb} | |
| 208 else: | |
| 209 splitId = split_identifiers_nucl | |
| 210 # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb} | |
| 211 | |
| 212 data = parse_blast(args.blast, args.xmlMode) | |
| 213 # data = with_dice(data) | |
| 214 # data = filter_dice(data, threshold=0.0) | |
| 215 data = important_only(data, splitId) | |
| 216 | |
| 217 data = expand_taxIDs(data, taxList) | |
| 218 data = remove_dupes(data) | |
| 219 if not args.noFilter: | |
| 220 data = filter_phage(data, phageTaxLookup, sciName) | |
| 221 listify = [] | |
| 222 for x in data: | |
| 223 listify.append(x) | |
| 224 #listify = greatest_taxID(listify) | |
| 225 | |
| 226 count_label = "Similar Unique Proteins" | |
| 227 | |
| 228 counts, accessions = scoreMap(listify) | |
| 229 | |
| 230 nameRec = listify[0][0] | |
| 231 sys.stdout.write( | |
| 232 "Top %d matches for BLASTp results of %s\n" | |
| 233 % (args.hits, nameRec) | |
| 234 ) | |
| 235 header = "# TaxID\t" | |
| 236 #if args.title: | |
| 237 header += "Name\t" | |
| 238 if args.access: | |
| 239 header += "Accessions\t" | |
| 240 header += "Similar Unique Proteins\n" | |
| 241 sys.stdout.write(header) | |
| 242 | |
| 243 for idx, ((name, ID), num) in enumerate( | |
| 244 sorted(counts.items(), key=lambda item: -item[1]) | |
| 245 ): | |
| 246 if idx > args.hits - 1: | |
| 247 break | |
| 248 line = str(ID) + "\t" | |
| 249 #if args.title: | |
| 250 line += str(name) + "\t" | |
| 251 if args.access: | |
| 252 line += str(accessions[(name, ID)][0]) + "\t" | |
| 253 line += str(num) + "\n" | |
| 254 sys.stdout.write(line) |
