Mercurial > repos > cpt > cpt_prophage_relatedness
comparison prophage_relatedness.py @ 0:7a23dda2e932 draft
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
| author | cpt |
|---|---|
| date | Thu, 08 Aug 2024 03:09:32 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7a23dda2e932 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import argparse | |
| 3 from math import floor | |
| 4 from Bio.Blast import NCBIXML | |
| 5 import logging | |
| 6 | |
| 7 logging.basicConfig(level=logging.DEBUG) | |
| 8 log = logging.getLogger() | |
| 9 | |
| 10 | |
| 11 def parseXML(blastxml, outFile): # Modified from intron_detection | |
| 12 blast = [] | |
| 13 for iter_num, blast_record in enumerate(NCBIXML.parse(blastxml), 1): | |
| 14 align_num = 0 | |
| 15 outFile.write("Query ID\tQuery Length\tTotal Number of Hits\n") | |
| 16 outFile.write( | |
| 17 "%s\t%d\t%d\n\n" | |
| 18 % ( | |
| 19 blast_record.query_id, | |
| 20 blast_record.query_length, | |
| 21 len(blast_record.alignments), | |
| 22 ) | |
| 23 ) | |
| 24 | |
| 25 for alignment in blast_record.alignments: | |
| 26 | |
| 27 align_num += 1 | |
| 28 gi_nos = str(alignment.accession) | |
| 29 blast_gene = [] | |
| 30 for hsp in alignment.hsps: | |
| 31 | |
| 32 x = float(hsp.identities - 1) / ((hsp.query_end) - hsp.query_start) | |
| 33 nice_name = blast_record.query | |
| 34 if " " in nice_name: | |
| 35 nice_name = nice_name[0 : nice_name.index(" ")] | |
| 36 blast_gene.append( | |
| 37 { | |
| 38 "gi_nos": gi_nos, | |
| 39 "sbjct_length": alignment.length, | |
| 40 "query_length": blast_record.query_length, | |
| 41 "sbjct_range": (hsp.sbjct_start, hsp.sbjct_end), | |
| 42 "query_range": (hsp.query_start, hsp.query_end), | |
| 43 "name": nice_name, | |
| 44 "evalue": hsp.expect, | |
| 45 "identity": hsp.identities, | |
| 46 "identity_percent": x, | |
| 47 "hit_num": align_num, | |
| 48 "iter_num": iter_num, | |
| 49 "match_id": alignment.title.partition(">")[0], | |
| 50 "align_len": hsp.align_length, | |
| 51 } | |
| 52 ) | |
| 53 blast.append(blast_gene) | |
| 54 | |
| 55 return blast | |
| 56 | |
| 57 | |
| 58 def openTSV(blasttsv, outFile): # Modified from intron_detection | |
| 59 blast = [] | |
| 60 activeAlign = "" | |
| 61 numAlignments = 0 | |
| 62 qLen = 0 | |
| 63 for line in blasttsv: | |
| 64 line = line.strip("\n") | |
| 65 data = line.split("\t") | |
| 66 for x in range(0, len(data)): | |
| 67 data[x] = data[x].strip() | |
| 68 qLen = data[22] | |
| 69 if activeAlign == "": | |
| 70 numAlignments += 1 | |
| 71 blast_gene = [] | |
| 72 hsp_num = 1 | |
| 73 elif activeAlign != data[1]: | |
| 74 numAlignments += 1 | |
| 75 blast.append(blast_gene) | |
| 76 blast_gene = [] | |
| 77 hsp_num = 1 | |
| 78 else: | |
| 79 hsp_num += 1 | |
| 80 gi_nos = data[12] | |
| 81 activeAlign = data[1] | |
| 82 x = float(float(data[14]) - 1) / (float(data[7]) - float(data[6])) | |
| 83 nice_name = data[1] | |
| 84 if " " in nice_name: | |
| 85 nice_name = nice_name[0 : nice_name.index(" ")] | |
| 86 blast_gene.append( | |
| 87 { | |
| 88 "gi_nos": gi_nos, | |
| 89 "sbjct_length": int(data[23]), | |
| 90 "query_length": int(data[22]), | |
| 91 "sbjct_range": (int(data[8]), int(data[9])), | |
| 92 "query_range": (int(data[6]), int(data[7])), | |
| 93 "name": nice_name, | |
| 94 "evalue": float(data[10]), | |
| 95 "identity": int(data[14]), | |
| 96 "identity_percent": x, | |
| 97 "hit_num": numAlignments, | |
| 98 "iter_num": hsp_num, | |
| 99 "match_id": data[24].partition(">")[0], | |
| 100 "align_len": int(data[3]), | |
| 101 } | |
| 102 ) | |
| 103 | |
| 104 blast.append(blast_gene) | |
| 105 outFile.write("Query ID\tQuery Length\tTotal Number of Hits\n") | |
| 106 outFile.write("%s\t%d\t%d\n\n" % (data[0], int(data[22]), numAlignments)) | |
| 107 | |
| 108 return blast | |
| 109 | |
| 110 | |
| 111 def test_true(feature, **kwargs): | |
| 112 return True | |
| 113 | |
| 114 | |
| 115 def superSets(inSets): | |
| 116 inSets.sort(key=len, reverse=True) | |
| 117 nextInd = 0 | |
| 118 res = [] | |
| 119 for i in range(0, len(inSets)): | |
| 120 if i == 0: | |
| 121 res.append(inSets[i]) | |
| 122 continue | |
| 123 for par in res: | |
| 124 complete = True | |
| 125 for x in inSets[i]: | |
| 126 if not (x in par): | |
| 127 complete = False | |
| 128 if complete: | |
| 129 break # Subset of at least one member | |
| 130 if not complete: | |
| 131 res.append(inSets[i]) | |
| 132 return res | |
| 133 | |
| 134 | |
| 135 def disjointSets(inSets): | |
| 136 inSets.sort(key=lambda x: x[0]["sbjct_range"][0]) | |
| 137 res = [inSets[0]] | |
| 138 for i in range(1, len(inSets)): | |
| 139 disjoint = True | |
| 140 for elem in inSets[i]: | |
| 141 for cand in res: | |
| 142 if elem in cand: | |
| 143 disjoint = False | |
| 144 break | |
| 145 if not disjoint: | |
| 146 break | |
| 147 if disjoint: | |
| 148 res.append(inSets[i]) | |
| 149 return res | |
| 150 | |
| 151 | |
| 152 def compPhage(inRec, outFile, padding=1.2, cutoff=0.3, numReturn=20, isTSV=False): | |
| 153 | |
| 154 if isTSV: | |
| 155 inRec = openTSV(inRec, outFile) | |
| 156 else: | |
| 157 inRec = parseXML(inRec, outFile) | |
| 158 res = [] | |
| 159 for group in inRec: | |
| 160 window = floor(padding * float(group[0]["query_length"])) | |
| 161 group = sorted(group, key=lambda x: x["sbjct_range"][0]) | |
| 162 hspGroups = [] | |
| 163 lastInd = len(res) | |
| 164 | |
| 165 for x in range(0, len(group)): | |
| 166 hspGroups.append([group[x]]) | |
| 167 startBound = group[x]["sbjct_range"][0] | |
| 168 endBound = startBound + window | |
| 169 for hsp in group[x + 1 :]: | |
| 170 if ( | |
| 171 hsp["sbjct_range"][0] >= startBound | |
| 172 and hsp["sbjct_range"][1] <= endBound | |
| 173 ): | |
| 174 hspGroups[-1].append(hsp) | |
| 175 | |
| 176 for x in disjointSets(superSets(hspGroups)): | |
| 177 res.append(x) | |
| 178 | |
| 179 maxID = 0.0 | |
| 180 for x in res[lastInd:]: | |
| 181 sumID = 0.0 | |
| 182 totAlign = 0 | |
| 183 for y in x: | |
| 184 totAlign += y["align_len"] | |
| 185 sumID += float(y["identity"]) | |
| 186 x.append(totAlign) | |
| 187 x.append(sumID / float(x[0]["query_length"])) | |
| 188 maxID = max(maxID, x[-1]) | |
| 189 | |
| 190 res = sorted(res, key=lambda x: x[-1], reverse=True) | |
| 191 | |
| 192 outList = [] | |
| 193 outNum = 0 | |
| 194 for x in res: | |
| 195 if outNum == numReturn or x[-1] < cutoff: | |
| 196 break | |
| 197 outNum += 1 | |
| 198 outList.append(x) | |
| 199 | |
| 200 # Original request was that low scoring clusters would make it to the final results IF | |
| 201 # they were part of an Accession cluster that did have at least one high scoring member. | |
| 202 | |
| 203 outFile.write( | |
| 204 "Accession Number\tCluster Start Location\tEnd Location\tSubject Cluster Length\t# HSPs in Cluster\tTotal Aligned Length\t% of Query Aligned\tOverall % Query Identity\tOverall % Subject Identity\tComplete Accession Info\n" | |
| 205 ) | |
| 206 for x in outList: | |
| 207 minStart = min(x[0]["sbjct_range"][0], x[0]["sbjct_range"][1]) | |
| 208 maxEnd = max(x[0]["sbjct_range"][0], x[0]["sbjct_range"][1]) | |
| 209 if "|gb|" in x[0]["match_id"]: | |
| 210 startSlice = x[0]["match_id"].index("gb|") + 3 | |
| 211 endSlice = (x[0]["match_id"][startSlice:]).index("|") | |
| 212 accOut = x[0]["match_id"][startSlice : startSlice + endSlice] | |
| 213 else: | |
| 214 accOut = x[0]["gi_nos"] | |
| 215 for y in x[0:-2]: | |
| 216 # ("\t%.3f\t" % (x[-1])) | |
| 217 minStart = min(minStart, y["sbjct_range"][0]) | |
| 218 maxEnd = max(maxEnd, y["sbjct_range"][1]) | |
| 219 outFile.write( | |
| 220 accOut | |
| 221 + "\t" | |
| 222 + str(minStart) | |
| 223 + "\t" | |
| 224 + str(maxEnd) | |
| 225 + "\t" | |
| 226 + str(maxEnd - minStart + 1) | |
| 227 + "\t" | |
| 228 + str(len(x) - 1) | |
| 229 + "\t" | |
| 230 + str(x[-2]) | |
| 231 + ("\t%.3f" % (float(x[-2]) / float(x[0]["query_length"]) * 100.00)) | |
| 232 + ("\t%.3f" % (x[-1] * 100.00)) | |
| 233 + ("\t%.3f" % (float(x[-2]) / float(maxEnd - minStart + 1) * 100.00)) | |
| 234 + "\t" | |
| 235 + x[0]["match_id"] | |
| 236 + "\n" | |
| 237 ) | |
| 238 | |
| 239 # accession start end number | |
| 240 | |
| 241 | |
| 242 if __name__ == "__main__": | |
| 243 parser = argparse.ArgumentParser(description="Intron detection") | |
| 244 parser.add_argument( | |
| 245 "inRec", type=argparse.FileType("r"), help="blast XML protein results" | |
| 246 ) | |
| 247 parser.add_argument( | |
| 248 "--outFile", | |
| 249 type=argparse.FileType("w"), | |
| 250 help="Output Error Log", | |
| 251 default="./compPro.tsv", | |
| 252 ) | |
| 253 parser.add_argument( | |
| 254 "--padding", | |
| 255 help="Gap minimum (Default -1, set to a negative number to allow overlap)", | |
| 256 default=1.2, | |
| 257 type=float, | |
| 258 ) | |
| 259 parser.add_argument( | |
| 260 "--cutoff", | |
| 261 help="Gap minimum (Default -1, set to a negative number to allow overlap)", | |
| 262 default=0.3, | |
| 263 type=float, | |
| 264 ) | |
| 265 parser.add_argument( | |
| 266 "--numReturn", | |
| 267 help="Gap maximum in genome (Default 10000)", | |
| 268 default=20, | |
| 269 type=int, | |
| 270 ) | |
| 271 parser.add_argument( | |
| 272 "--isTSV", | |
| 273 help="Opening Blast TSV result", | |
| 274 action="store_true", | |
| 275 ) | |
| 276 args = parser.parse_args() | |
| 277 | |
| 278 compPhage(**vars(args)) |
