# HG changeset patch
# User cpt
# Date 1723086572 0
# Node ID 7a23dda2e932de24032cdda1adce2dc8ee6492f1
planemo upload commit 7ebbd0df0aea9e58c4df58b61d6da385ee0ebb49
diff -r 000000000000 -r 7a23dda2e932 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Thu Aug 08 03:09:32 2024 +0000
@@ -0,0 +1,170 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package">python</requirement>
+            <requirement type="package">biopython</requirement>
+            <requirement type="package">cpt_gffparser</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@WRAPPER_VERSION@">2.4.0</token>
+
+    <citation type="doi">10.1093/bioinformatics/btm039</citation>
+
+
+    <token name="@XMFA_INPUT@">'$xmfa'</token>
+
+
+
+
+
+    <token name="@SEQUENCE_INPUT@">'$sequences'</token>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    <xml name="citation/mijalisrasche">
+        <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+        <citation type="bibtex">@unpublished{galaxyTools,
+            author = {E. Mijalis, H. Rasche},
+            title = {CPT Galaxy Tools},
+            year = {2013-2017},
+            note = {https://github.com/tamu-cpt/galaxy-tools/}
+        }</citation>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                    author = {E. Mijalis, H. Rasche},
+                    title = {CPT Galaxy Tools},
+                    year = {2013-2017},
+                    note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-crr">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                    author = {C. Ross},
+                    title = {CPT Galaxy Tools},
+                    year = {2020-},
+                    note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                    author = {E. Mijalis, H. Rasche},
+                    title = {CPT Galaxy Tools},
+                    year = {2013-2017},
+                    note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                    author = {A. Criscione},
+                    title = {CPT Galaxy Tools},
+                    year = {2019-2021},
+                    note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-2020-AJC-solo">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                    author = {A. Criscione},
+                    title = {CPT Galaxy Tools},
+                    year = {2019-2021},
+                    note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="citations-clm">
+        <citations>
+            <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+            <citation type="bibtex">
+                @unpublished{galaxyTools,
+                    author = {C. Maughmer},
+                    title = {CPT Galaxy Tools},
+                    year = {2017-2020},
+                    note = {https://github.com/tamu-cpt/galaxy-tools/}
+                }
+            </citation>
+            <yield/>
+        </citations>
+    </xml>
+    <xml name="sl-citations-clm">
+        <citation type="bibtex">
+            @unpublished{galaxyTools,
+                author = {C. Maughmer},
+                title = {CPT Galaxy Tools},
+                year = {2017-2020},
+                note = {https://github.com/tamu-cpt/galaxy-tools/}
+            }
+        </citation>
+    </xml>
+    <token name="@INPUT_GFF@">'$gff3_data'</token>
+    <xml name="gff3_input">
+        <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+    </xml>
+    <token name="@GENOME_SELECTOR_SINGLE@">
+        #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+        #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+        #end if
+    </token>
+    <token name="@GENOME_SELECTOR_PRE@">
+        #if $reference_genome.reference_genome_source == 'history':
+            ln -s '$reference_genome.genome_fasta' genomeref.fa;
+        #end if
+    </token>
+    <token name="@GENOME_SELECTOR@">
+        #if str($reference_genome.reference_genome_source) == 'cached':
+            '${reference_genome.fasta_indexes.fields.path}'
+        #else if str($reference_genome.reference_genome_source) == 'history':
+            genomeref.fa
+        #end if
+    </token>
+</macros>
diff -r 000000000000 -r 7a23dda2e932 prophage_relatedness.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/prophage_relatedness.py Thu Aug 08 03:09:32 2024 +0000
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+import argparse
+from math import floor
+from Bio.Blast import NCBIXML
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+log = logging.getLogger()
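+# Example invocations (hypothetical file names):
+#   python prophage_relatedness.py blast_results.xml --outFile related.tsv
+#   python prophage_relatedness.py blast_results.tsv --isTSV --numReturn 10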
+
+
+def parseXML(blastxml, outFile): # Modified from intron_detection
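+    """Parse BLAST XML into one list of HSP dicts per subject alignment,
+    writing a per-query summary (ID, length, hit count) to outFile."""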
+ blast = []
+ for iter_num, blast_record in enumerate(NCBIXML.parse(blastxml), 1):
+ align_num = 0
+ outFile.write("Query ID\tQuery Length\tTotal Number of Hits\n")
+ outFile.write(
+ "%s\t%d\t%d\n\n"
+ % (
+ blast_record.query_id,
+ blast_record.query_length,
+ len(blast_record.alignments),
+ )
+ )
+
+ for alignment in blast_record.alignments:
+
+ align_num += 1
+ gi_nos = str(alignment.accession)
+ blast_gene = []
+ for hsp in alignment.hsps:
+
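+                # Identity fraction over the HSP's query span: query_end minus
+                # query_start is (span - 1), so the -1 in the numerator makes a
+                # perfect match come out to exactly 1.0.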
+ x = float(hsp.identities - 1) / ((hsp.query_end) - hsp.query_start)
+ nice_name = blast_record.query
+ if " " in nice_name:
+ nice_name = nice_name[0 : nice_name.index(" ")]
+ blast_gene.append(
+ {
+ "gi_nos": gi_nos,
+ "sbjct_length": alignment.length,
+ "query_length": blast_record.query_length,
+ "sbjct_range": (hsp.sbjct_start, hsp.sbjct_end),
+ "query_range": (hsp.query_start, hsp.query_end),
+ "name": nice_name,
+ "evalue": hsp.expect,
+ "identity": hsp.identities,
+ "identity_percent": x,
+ "hit_num": align_num,
+ "iter_num": iter_num,
+ "match_id": alignment.title.partition(">")[0],
+ "align_len": hsp.align_length,
+ }
+ )
+ blast.append(blast_gene)
+
+ return blast
+
+
+def openTSV(blasttsv, outFile):  # Modified from intron_detection
+    """Parse tabular BLAST output into one list of HSP dicts per subject."""
+    blast = []
+    blast_gene = []  # pre-initialized so an empty input cannot crash the final append
+    activeAlign = ""
+    numAlignments = 0
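+    # The column layout assumed here is Galaxy's 25-column extended BLAST
+    # tabular format (qseqid sseqid pident length ... qseq sseq qlen slen
+    # salltitles): data[6:8] query start/end, data[8:10] subject start/end,
+    # data[10] evalue, data[12] sallseqid, data[14] nident, data[22] qlen,
+    # data[23] slen, data[24] salltitles.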
+ for line in blasttsv:
+ line = line.strip("\n")
+ data = line.split("\t")
+        data = [field.strip() for field in data]
+ if activeAlign == "":
+ numAlignments += 1
+ blast_gene = []
+ hsp_num = 1
+ elif activeAlign != data[1]:
+ numAlignments += 1
+ blast.append(blast_gene)
+ blast_gene = []
+ hsp_num = 1
+ else:
+ hsp_num += 1
+ gi_nos = data[12]
+ activeAlign = data[1]
+ x = float(float(data[14]) - 1) / (float(data[7]) - float(data[6]))
+ nice_name = data[1]
+ if " " in nice_name:
+ nice_name = nice_name[0 : nice_name.index(" ")]
+ blast_gene.append(
+ {
+ "gi_nos": gi_nos,
+ "sbjct_length": int(data[23]),
+ "query_length": int(data[22]),
+ "sbjct_range": (int(data[8]), int(data[9])),
+ "query_range": (int(data[6]), int(data[7])),
+ "name": nice_name,
+ "evalue": float(data[10]),
+ "identity": int(data[14]),
+ "identity_percent": x,
+ "hit_num": numAlignments,
+ "iter_num": hsp_num,
+ "match_id": data[24].partition(">")[0],
+ "align_len": int(data[3]),
+ }
+ )
+
+    if numAlignments:
+        blast.append(blast_gene)
+        outFile.write("Query ID\tQuery Length\tTotal Number of Hits\n")
+        outFile.write("%s\t%d\t%d\n\n" % (data[0], int(data[22]), numAlignments))
+
+ return blast
+
+
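+# Unused stub retained from the intron_detection tool this script was adapted from.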
+def test_true(feature, **kwargs):
+ return True
+
+
+def superSets(inSets):
+    """Drop any group whose members all appear in an already-kept, larger group."""
+    inSets.sort(key=len, reverse=True)  # largest first, so subsets sort after their supersets
+    res = []
+ for i in range(0, len(inSets)):
+ if i == 0:
+ res.append(inSets[i])
+ continue
+ for par in res:
+ complete = True
+ for x in inSets[i]:
+ if not (x in par):
+ complete = False
+ if complete:
+ break # Subset of at least one member
+ if not complete:
+ res.append(inSets[i])
+ return res
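+# e.g. superSets([[a], [a, b], [c]]) -> [[a, b], [c]]; [a] is absorbed by [a, b].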
+
+
+def disjointSets(inSets):
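+    """Keep groups in order of leftmost subject start, dropping any group that
+    shares an HSP with one already kept."""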
+ inSets.sort(key=lambda x: x[0]["sbjct_range"][0])
+ res = [inSets[0]]
+ for i in range(1, len(inSets)):
+ disjoint = True
+ for elem in inSets[i]:
+ for cand in res:
+ if elem in cand:
+ disjoint = False
+ break
+ if not disjoint:
+ break
+ if disjoint:
+ res.append(inSets[i])
+ return res
+
+
+def compPhage(inRec, outFile, padding=1.2, cutoff=0.3, numReturn=20, isTSV=False):
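+    """Group BLAST HSPs into subject-coordinate clusters and report the
+    highest-identity clusters as a TSV table."""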
+
+ if isTSV:
+ inRec = openTSV(inRec, outFile)
+ else:
+ inRec = parseXML(inRec, outFile)
+ res = []
+ for group in inRec:
+ window = floor(padding * float(group[0]["query_length"]))
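+        # Cluster window: an HSP joins the cluster seeded at subject position S
+        # only if it lies entirely within [S, S + padding * query_length].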
+ group = sorted(group, key=lambda x: x["sbjct_range"][0])
+ hspGroups = []
+ lastInd = len(res)
+
+ for x in range(0, len(group)):
+ hspGroups.append([group[x]])
+ startBound = group[x]["sbjct_range"][0]
+ endBound = startBound + window
+ for hsp in group[x + 1 :]:
+ if (
+ hsp["sbjct_range"][0] >= startBound
+ and hsp["sbjct_range"][1] <= endBound
+ ):
+ hspGroups[-1].append(hsp)
+
+ for x in disjointSets(superSets(hspGroups)):
+ res.append(x)
+
+        for x in res[lastInd:]:
+            sumID = 0.0
+            totAlign = 0
+            for y in x:
+                totAlign += y["align_len"]
+                sumID += float(y["identity"])
+            x.append(totAlign)  # x[-2]: total aligned length in the cluster
+            x.append(sumID / float(x[0]["query_length"]))  # x[-1]: query identity fraction
+
+ res = sorted(res, key=lambda x: x[-1], reverse=True)
+
+ outList = []
+ outNum = 0
+ for x in res:
+ if outNum == numReturn or x[-1] < cutoff:
+ break
+ outNum += 1
+ outList.append(x)
+
+ # Original request was that low scoring clusters would make it to the final results IF
+ # they were part of an Accession cluster that did have at least one high scoring member.
+
+ outFile.write(
+ "Accession Number\tCluster Start Location\tEnd Location\tSubject Cluster Length\t# HSPs in Cluster\tTotal Aligned Length\t% of Query Aligned\tOverall % Query Identity\tOverall % Subject Identity\tComplete Accession Info\n"
+ )
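+    # Per cluster: "% of Query Aligned" = total aligned length / query length,
+    # "Overall % Query Identity" = summed identities / query length, and
+    # "Overall % Subject Identity" = total aligned length / subject cluster span.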
+ for x in outList:
+ minStart = min(x[0]["sbjct_range"][0], x[0]["sbjct_range"][1])
+ maxEnd = max(x[0]["sbjct_range"][0], x[0]["sbjct_range"][1])
+ if "|gb|" in x[0]["match_id"]:
+ startSlice = x[0]["match_id"].index("gb|") + 3
+ endSlice = (x[0]["match_id"][startSlice:]).index("|")
+ accOut = x[0]["match_id"][startSlice : startSlice + endSlice]
+ else:
+ accOut = x[0]["gi_nos"]
+ for y in x[0:-2]:
+ # ("\t%.3f\t" % (x[-1]))
+ minStart = min(minStart, y["sbjct_range"][0])
+ maxEnd = max(maxEnd, y["sbjct_range"][1])
+ outFile.write(
+ accOut
+ + "\t"
+ + str(minStart)
+ + "\t"
+ + str(maxEnd)
+ + "\t"
+ + str(maxEnd - minStart + 1)
+ + "\t"
+ + str(len(x) - 1)
+ + "\t"
+ + str(x[-2])
+ + ("\t%.3f" % (float(x[-2]) / float(x[0]["query_length"]) * 100.00))
+ + ("\t%.3f" % (x[-1] * 100.00))
+ + ("\t%.3f" % (float(x[-2]) / float(maxEnd - minStart + 1) * 100.00))
+ + "\t"
+ + x[0]["match_id"]
+ + "\n"
+ )
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Cluster BLAST hits by subject accession to report related (pro)phages"
+    )
+    parser.add_argument(
+        "inRec", type=argparse.FileType("r"), help="BLAST XML results (or tabular results with --isTSV)"
+    )
+    parser.add_argument(
+        "--outFile",
+        type=argparse.FileType("w"),
+        help="Output TSV report",
+        default="./compPro.tsv",
+    )
+    parser.add_argument(
+        "--padding",
+        help="Cluster window size as a multiple of the query length (default 1.2)",
+        default=1.2,
+        type=float,
+    )
+    parser.add_argument(
+        "--cutoff",
+        help="Minimum overall query identity fraction for a reported cluster (default 0.3)",
+        default=0.3,
+        type=float,
+    )
+    parser.add_argument(
+        "--numReturn",
+        help="Maximum number of clusters to report (default 20)",
+        default=20,
+        type=int,
+    )
+    parser.add_argument(
+        "--isTSV",
+        help="Input is 25-column tabular BLAST output rather than XML",
+        action="store_true",
+    )
+ args = parser.parse_args()
+
+ compPhage(**vars(args))
diff -r 000000000000 -r 7a23dda2e932 prophage_relatedness.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/prophage_relatedness.xml Thu Aug 08 03:09:32 2024 +0000
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package">python</requirement>
+        <requirement type="package">biopython</requirement>
+    </requirements>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+