diff cpt_related_genome_nuc/relatedness.py @ 0:5a5fe0a6f78d draft default tip

Uploaded
author cpt
date Fri, 10 Jun 2022 08:45:13 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_related_genome_nuc/relatedness.py	Fri Jun 10 08:45:13 2022 +0000
@@ -0,0 +1,364 @@
+#!/usr/bin/env python
+import sys
+import argparse
+import json
+import logging
+from Bio.Blast import NCBIXML
+
+
+logging.basicConfig(level=logging.DEBUG)
+log = logging.getLogger()
+
+
+def parse_blast(blast, isXML = False):
+    res = []
+    finalRes = []
+    if isXML:
+      for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1):
+        for alignment in blast_record.alignments:
+            tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:]
+            tempID = tempID[:tempID.find("|")]
+            tempDesc = alignment.title
+            while tempDesc.find("|") >= 0:
+              tempDesc = tempDesc[tempDesc.find("|") + 1:]
+            tempDesc = tempDesc.strip()
+            tempID = tempID.strip()
+            for hsp in alignment.hsps:
+              line = [str(blast_record.query)]
+              line.append(str(hsp.align_length))
+              line.append(str(hsp.identities))
+              line.append(str(blast_record.query_length)) 
+              line.append(str(alignment.length))
+              line.append(tempDesc)
+              line.append(tempID)
+              #line.append("0000000")
+              #print(line)
+              res.append(line)
+      blast.seek(0)
+      resInd = -1
+      taxLine = blast.readline()
+      while taxLine: 
+        if "<Hit>" in taxLine:
+          resInd += 1
+          taxSlice = ""
+        elif "<taxid>" in taxLine:
+          taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")]
+          finalRes.append(res[resInd])
+          finalRes[-1].append(taxSlice)
+          #print(finalRes[-1])
+        taxLine = blast.readline()
+      return finalRes
+    else:
+      for line in blast:
+        taxSplit = []
+        preTaxSplit = line.strip("\n").split("\t")
+        for tax in preTaxSplit[-1].split(";"):
+            shallowCopy = []
+            for x in range(len(preTaxSplit)):
+                shallowCopy.append(preTaxSplit[x])
+            shallowCopy[-1] = tax
+            res.append(shallowCopy)
+      for line in res:
+        for access in line[6].split(";"):
+            shallowCopy = []
+            for x in range(len(line)):
+                shallowCopy.append(line[x])
+            shallowCopy[6] = access
+            finalRes.append(shallowCopy)
+    # for x in finalRes:
+    #  print(x)
+    # exit()
+    return finalRes
+
+
+def add_dice(blast):
+    res = []
+    for data in blast:
+        dice = (2 * float(data[2])) / (float(data[3]) + float(data[4]))
+        res.append(data + [dice])
+    return res
+
+
+def make_num(blast):
+    res = []
+    for data in blast:
+        res.append(
+            [
+                data[0],
+                int(data[1]),
+                int(data[2]),
+                int(data[3]),
+                int(data[4]),
+                data[5],
+                data[6],
+                (data[7]),
+            ]
+        )
+    return res
+
+
+def bundle_dice(blast):
+    res = []
+    ind = 0
+    seen = {}
+
+    for x in blast:
+        if (x[6] + "_" + (x[7])) in seen.keys():
+            res[seen[(x[6] + "_" + (x[7]))]][1] += x[1]
+            res[seen[(x[6] + "_" + (x[7]))]][2] += x[2]
+            res[seen[(x[6] + "_" + (x[7]))]][8] += x[8]
+            res[seen[(x[6] + "_" + (x[7]))]][9] += 1  # Num HSPs
+        else:
+            seen[(x[6] + "_" + (x[7]))] = ind
+            shallowCopy = []
+            for i in range(len(x)):
+                shallowCopy.append(x[i])
+            shallowCopy.append(1)
+            # print(shallowCopy)
+            res.append(shallowCopy)
+            ind += 1
+    # print(seen)
+    # stop = 0
+    # for i in res:
+    #  print(i)
+    #  stop += 1
+    #  if stop > 7:
+    #    exit()
+    return res
+
+
+"""def bundle_dice(blast):
+    res = []
+    ind = 0
+    seen = {}
+    for x in blast:
+      if ((x[0] + x[5])) in seen.keys():
+        res[seen[(x[0] + x[5])]][1] += x[1]
+        res[seen[(x[0] + x[5])]][2] += x[2]
+        res[seen[(x[0] + x[5])]][9] += 1 # Num HSPs
+        res[seen[(x[0] + x[5])]][8] = ((res[seen[(x[0] + x[5])]][8] * (res[seen[(x[0] + x[5])]][9] - 1)) + x[8]) / res[seen[(x[0] + x[5])]][9] 
+      else:
+        seen[(x[0] + x[5])] = ind
+        res.append(x + [1])
+        ind += 1
+    return res """
+
+
+def filter_dice(blast, threshold=0.5):
+    for data in blast:
+        if data[-1] > threshold:
+            yield data
+
+
+def split_identifiers_nucl(_, ident):
+    if "<>" in ident:
+        idents = ident.split("<>")
+    else:
+        idents = [ident]
+    return idents
+
+
+def split_identifiers_prot(_, ident):
+    if "<>" in ident:
+        idents = ident.split("<>")
+    else:
+        idents = [ident]
+    return [
+        x[x.index("[") + 1 : x.rindex("]")]
+        for x in idents
+        # MULTISPECIES: recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638]
+        if "[" in x and "]" in x
+    ]
+
+
+def split_identifiers_phage(par, ident):
+    par = par.replace("lcl|", "")
+    par = par[0 : par.index("_prot_")]
+    return [par]
+
+
+def important_only(blast, split_identifiers):
+    for data in blast:
+        yield [
+            data[0],  # 01 Query Seq-id (ID of your sequence)
+            data[1],  # 13 All subject Seq-id(s), separated by a ';'
+            split_identifiers(
+                data[1], data[2]
+            ),  # 25 All subject title(s), separated by a '<>'
+            data[3].split(";"),  # Extra: All Subject Accessions
+            data[4].split(";"),  # Extra: All TaxIDs
+        ]
+
+
+def deform_scores(blast):
+    for data in blast:
+        for org in data[2]:
+            yield [data[0], data[1], org, data[3], data[4]]
+
+
+def expand_taxIDs(blast):
+    for data in blast:
+        # if(len(data[4]) > 0):
+        #  print(data[0])
+        for ID in data[7]:
+            yield [
+                data[0],
+                data[1],
+                data[2],
+                data[3],
+                data[4],
+                data[5],
+                data[6],
+                int(ID),
+                data[8],
+            ]
+
+
+def expand_titles(blast):
+    for data in blast:
+        for title in data[5]:
+            yield [
+                data[0],
+                data[1],
+                data[2],
+                data[3],
+                data[4],
+                title,
+                data[6],
+                data[7],
+                data[8],
+            ]
+
+
+def filter_phage(blast, phageTaxLookup):
+    res = []
+    for data in blast:
+        if int(data[7]) in phageTaxLookup:
+            res.append(data)
+    return res
+
+
+def remove_dupes(data):
+    has_seen = {}
+    res = []
+    for row in data:
+        # qseqid, sseqid
+        key = (row[0], row[6], row[7])
+        # If we've seen the key before, we can exit
+        if key in has_seen:
+            continue
+
+        # Otherwise, continue on
+        has_seen[key] = True
+        # Pretty simple
+        res.append(row)
+    return res
+
+
+def scoreMap(blast):
+    c = {}
+    m = {}
+    for (qseq, subID, subTitle, access, ID) in blast:
+        if (subTitle, ID) not in c:
+            m[(subTitle, ID)] = access
+            c[(subTitle, ID)] = 0
+
+        c[(subTitle, ID)] += 1
+    return c, m
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Top related genomes")
+    parser.add_argument(
+        "blast", type=argparse.FileType("r"), help="Blast 25 Column Results"
+    )
+    parser.add_argument("phagedb", type=argparse.FileType("r"))
+    parser.add_argument("--access", action="store_true")
+    parser.add_argument("--protein", action="store_true")
+    parser.add_argument("--canonical", action="store_true")
+    parser.add_argument("--hits", type=int, default=5)
+    parser.add_argument("--noFilter", action="store_true")
+    parser.add_argument("--xmlMode", action="store_true")
+
+    args = parser.parse_args()
+
+    phageDb = args.phagedb
+    phageTaxLookup = []
+    line = phageDb.readline()
+    while line:
+        line = line.split("\t")
+        phageTaxLookup.append(int(line[0]))
+        line = phageDb.readline()
+
+    if args.protein:
+        splitId = split_identifiers_prot
+        # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
+    elif args.canonical:
+        splitId = split_identifiers_phage
+        # phageNameLookup = {k['source'].rstrip('.'): k['id'] for k in phageDb}
+    else:
+        splitId = split_identifiers_nucl
+        # phageNameLookup = {k['desc'].rstrip('.'): k['id'] for k in phageDb}
+
+    data = []  # Reformatting to list rather than generator
+
+    data = parse_blast(args.blast, args.xmlMode)
+    nameRec = data[0][0] 
+    data = make_num(data)
+    data = add_dice(data)
+    data = bundle_dice(data)
+     
+    # data = filter_dice(data, threshold=0.0)
+    # data = important_only(data, splitId)
+
+    # data = expand_taxIDs(data)
+    # data = deform_scores(data)
+    if not args.noFilter:
+      data = filter_phage(data, phageTaxLookup)
+    # data = expand_titles(data)
+
+    if args.protein or args.canonical:
+        data = remove_dupes(data)  # Probably obsolete, bundle dice should do this
+        count_label = "Similar Unique Proteins"
+    else:
+        count_label = "Nucleotide Hits"
+    # data = with_dice(data)
+    data.sort(key=lambda data: -data[8])
+    # counts, accessions = scoreMap(data)
+
+    if args.access:
+        sys.stdout.write(
+            "Top %d matches for BLASTn results of %s\t\t\t\t\t\t\n"
+            % (args.hits, nameRec)
+        )
+        sys.stdout.write(
+            "TaxID\tName\tAccessions\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n"
+        )
+        ind = 0
+        for out in data:
+            if ind >= args.hits:
+                break
+            ind += 1
+            sys.stdout.write(
+                "%s\t%s\t%s\t%s\t%s\t%s\t%.4f\n"
+                % (out[7], out[5], out[6], out[4], out[9], out[2], out[8])
+            )
+            
+    else:
+        sys.stdout.write(
+            "Top %d matches for BLASTn results of %s\t\t\t\t\t\n"
+            % (args.hits, data[0][0])
+        )
+        sys.stdout.write(
+            "TaxID\tName\tSubject Length\tNumber of HSPs\tTotal Aligned Length\tDice Score\n"
+        )
+        ind = 0
+        for out in data:
+            if ind >= args.hits:
+                break
+            ind += 1
+            sys.stdout.write(
+                "%s\t%s\t%s\t%s\t%s\t%.4f\n"
+                % (out[7], out[5], out[4], out[9], out[1], out[8])
+            )
+