# HG changeset patch
# User cpt
# Date 1655471235 0
# Node ID 10d13d0c37d36c3786d30c479d2e11f50fb33ca4
Uploaded
diff -r 000000000000 -r 10d13d0c37d3 cpt_putative_isp/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/cpt-macros.xml Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r 10d13d0c37d3 cpt_putative_isp/cpt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/cpt.py Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,341 @@
+#!/usr/bin/env python
+from Bio.Seq import Seq, reverse_complement, translate
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+from Bio.Data import CodonTable
+import regex as re  # third-party "regex" module; the finditer(..., overlapped=True) calls below need it (stdlib re has no overlapped matching)
+import logging
+
+logging.basicConfig()
+log = logging.getLogger()
+
+PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
+BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
+STARTS_WITH_PHAGE = re.compile(
+    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
+)
+NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")
+
+
+def phage_name_parser(name):
+ host = None
+ phage = None
+ name = name.replace(", complete genome.", "")
+ name = name.replace(", complete genome", "")
+
+ m = BACTERIOPHAGE_IN_MIDDLE.match(name)
+ if m:
+ host = m.group("host")
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = PHAGE_IN_MIDDLE.match(name)
+ if m:
+ host = m.group("host")
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = STARTS_WITH_PHAGE.match(name)
+ if m:
+ phage = m.group("phage")
+ return (host, phage)
+
+ m = NEW_STYLE_NAMES.match(name)
+ if m:
+ phage = m.group("phage")
+ return (host, phage)
+
+ return (host, phage)
+
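+# Illustrative examples (assumed inputs, not from the original source):
+#   phage_name_parser("Escherichia phage T7, complete genome")
+#     -> ("Escherichia ", "T7")   (the greedy host group keeps the trailing space)
+#   phage_name_parser("Bacteriophage P22") -> (None, "P22")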
+
+class OrfFinder(object):
+ def __init__(self, table, ftype, ends, min_len, strand):
+ self.table = table
+ self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+ self.ends = ends
+ self.ftype = ftype
+ self.min_len = min_len
+ self.starts = sorted(self.table_obj.start_codons)
+ self.stops = sorted(self.table_obj.stop_codons)
+ self.re_starts = re.compile("|".join(self.starts))
+ self.re_stops = re.compile("|".join(self.stops))
+ self.strand = strand
+
+ def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+ seq_format = "fasta"
+ log.debug("Genetic code table %i" % self.table)
+ log.debug("Minimum length %i aa" % self.min_len)
+
+ out_count = 0
+
+ out_gff3.write("##gff-version 3\n")
+
+ for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+ for i, (f_start, f_end, f_strand, n, t) in enumerate(
+ self.get_all_peptides(str(record.seq).upper())
+ ):
+ out_count += 1
+
+ descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+ len(t),
+ len(n),
+ f_start,
+ f_end,
+ f_strand,
+ record.description,
+ )
+ fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+ r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+ t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+ SeqIO.write(r, out_nuc, "fasta")
+ SeqIO.write(t, out_prot, "fasta")
+
+ nice_strand = "+" if f_strand == +1 else "-"
+
+ out_bed.write(
+ "\t".join(
+ map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+ )
+ + "\n"
+ )
+
+ out_gff3.write(
+ "\t".join(
+ map(
+ str,
+ [
+ record.id,
+ "getOrfsOrCds",
+ "CDS",
+ f_start + 1,
+ f_end,
+ ".",
+ nice_strand,
+ 0,
+ "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+ ],
+ )
+ )
+ + "\n"
+ )
+ log.info("Found %i %ss", out_count, self.ftype)
+
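+    # Example rows written by locate() (values illustrative): the BED line is
+    #   GENOME  100  463  GENOME|ORF1  0  +
+    # and the GFF3 line is the 1-based equivalent with ID=ORF.0.1 in column 9.
+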
+ def start_chop_and_trans(self, s, strict=True):
+ """Returns offset, trimmed nuc, protein."""
+ if strict:
+ assert s[-3:] in self.stops, s
+ assert len(s) % 3 == 0
+ for match in self.re_starts.finditer(s, overlapped=True):
+ # Must check the start is in frame
+ start = match.start()
+ if start % 3 == 0:
+ n = s[start:]
+ assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+ if strict:
+ t = translate(n, self.table)
+ else:
+                        # Used when the sequence is missing a stop codon
+                        t = "M" + translate(n[3:], self.table, to_stop=True)
+ yield start, n, t # Edited by CPT to be a generator
+
+ def break_up_frame(self, s):
+ """Returns offset, nuc, protein."""
+ start = 0
+ for match in self.re_stops.finditer(s, overlapped=True):
+ index = match.start() + 3
+ if index % 3 != 0:
+ continue
+ n = s[start:index]
+ for (offset, n, t) in self.start_chop_and_trans(n):
+ if n and len(t) >= self.min_len:
+ yield start + offset, n, t
+ start = index
+
+ def putative_genes_in_sequence(self, nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+ Co-ordinates are Python style zero-based.
+ """
+ nuc_seq = nuc_seq.upper()
+ # TODO - Refactor to use a generator function (in start order)
+ # rather than making a list and sorting?
+ answer = []
+ full_len = len(nuc_seq)
+
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ answer.append((start, start + len(n), +1, n, t))
+
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ answer.append((start, start - len(n), -1, n, t))
+ answer.sort()
+ return answer
+
+ def get_all_peptides(self, nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+
+ Co-ordinates are Python style zero-based.
+ """
+ # Refactored into generator by CPT
+ full_len = len(nuc_seq)
+ if self.strand != "reverse":
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ yield (start, start + len(n), +1, n, t)
+ if self.strand != "forward":
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ yield (start - len(n), start, -1, n, t)
+
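+# Coordinate convention, illustrated with an assumed toy input (not from the
+# original source): for nuc_seq = "ATGAAATAA" and a permissively small
+# min_len, OrfFinder.get_all_peptides yields (0, 9, +1, "ATGAAATAA", "MK*"),
+# i.e. zero-based start, exclusive end, and +1 for the forward strand.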
+
+class MGAFinder(object):
+ def __init__(self, table, ftype, ends, min_len):
+ self.table = table
+ self.table_obj = CodonTable.ambiguous_generic_by_id[table]
+ self.ends = ends
+ self.ftype = ftype
+ self.min_len = min_len
+ self.starts = sorted(self.table_obj.start_codons)
+ self.stops = sorted(self.table_obj.stop_codons)
+ self.re_starts = re.compile("|".join(self.starts))
+ self.re_stops = re.compile("|".join(self.stops))
+
+ def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3):
+ seq_format = "fasta"
+ log.debug("Genetic code table %i" % self.table)
+ log.debug("Minimum length %i aa" % self.min_len)
+
+ out_count = 0
+
+ out_gff3.write("##gff-version 3\n")
+
+ for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)):
+ for i, (f_start, f_end, f_strand, n, t) in enumerate(
+ self.get_all_peptides(str(record.seq).upper())
+ ):
+ out_count += 1
+
+ descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % (
+ len(t),
+ len(n),
+ f_start,
+ f_end,
+ f_strand,
+ record.description,
+ )
+ fid = record.id + "|%s%i" % (self.ftype, i + 1)
+
+ r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+ t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+
+ SeqIO.write(r, out_nuc, "fasta")
+ SeqIO.write(t, out_prot, "fasta")
+
+ nice_strand = "+" if f_strand == +1 else "-"
+
+ out_bed.write(
+ "\t".join(
+ map(str, [record.id, f_start, f_end, fid, 0, nice_strand])
+ )
+ + "\n"
+ )
+
+ out_gff3.write(
+ "\t".join(
+ map(
+ str,
+ [
+ record.id,
+ "getOrfsOrCds",
+ "CDS",
+ f_start + 1,
+ f_end,
+ ".",
+ nice_strand,
+ 0,
+ "ID=%s.%s.%s" % (self.ftype, idx, i + 1),
+ ],
+ )
+ )
+ + "\n"
+ )
+ log.info("Found %i %ss", out_count, self.ftype)
+
+ def start_chop_and_trans(self, s, strict=True):
+ """Returns offset, trimmed nuc, protein."""
+ if strict:
+ assert s[-3:] in self.stops, s
+ assert len(s) % 3 == 0
+ for match in self.re_starts.finditer(s, overlapped=True):
+ # Must check the start is in frame
+ start = match.start()
+ if start % 3 == 0:
+ n = s[start:]
+ assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+ if strict:
+ t = translate(n, self.table)
+ else:
+                        # Used when the sequence is missing a stop codon
+                        t = "M" + translate(n[3:], self.table, to_stop=True)
+ yield start, n, t
+
+ def break_up_frame(self, s):
+ """Returns offset, nuc, protein."""
+ start = 0
+ for match in self.re_stops.finditer(s, overlapped=True):
+ index = match.start() + 3
+ if index % 3 != 0:
+ continue
+ n = s[start:index]
+ for (offset, n, t) in self.start_chop_and_trans(n):
+ if n and len(t) >= self.min_len:
+ yield start + offset, n, t
+ start = index
+
+ def putative_genes_in_sequence(self, nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+ Co-ordinates are Python style zero-based.
+ """
+ nuc_seq = nuc_seq.upper()
+ # TODO - Refactor to use a generator function (in start order)
+ # rather than making a list and sorting?
+ answer = []
+ full_len = len(nuc_seq)
+
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ answer.append((start, start + len(n), +1, n, t))
+
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ answer.append((start, start - len(n), -1, n, t))
+ answer.sort()
+ return answer
+
+ def get_all_peptides(self, nuc_seq):
+ """Returns start, end, strand, nucleotides, protein.
+
+ Co-ordinates are Python style zero-based.
+ """
+ # Refactored into generator by CPT
+
+ full_len = len(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(nuc_seq[frame:]):
+ start = frame + offset # zero based
+ yield (start, start + len(n), +1, n, t)
+ rc = reverse_complement(nuc_seq)
+ for frame in range(0, 3):
+ for offset, n, t in self.break_up_frame(rc[frame:]):
+ start = full_len - frame - offset # zero based
+ yield (start - len(n), start, -1, n, t)
diff -r 000000000000 -r 10d13d0c37d3 cpt_putative_isp/generate-putative-isp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/generate-putative-isp.py Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,317 @@
+#!/usr/bin/env python
+
+##### findSpanin.pl --> findSpanin.py
+######### Incorporated from the findSpanin.pl script, but better and more snakey.
+
+import argparse
+from cpt import OrfFinder
+from Bio import SeqIO
+from Bio import Seq
+import re
+from spaninFuncs import tuple_fasta, find_tmd, parse_a_range, lineWrapper, prep_a_gff3, write_gff3
+import os
+
+# if __name__ == '__main__':
+# pass
+###############################################################################
+
+if __name__ == "__main__":
+
+ # Common parameters for both ISP / OSP portion of script
+
+ parser = argparse.ArgumentParser(
+ description="Get putative protein candidates for spanins"
+ )
+ parser.add_argument(
+ "fasta_file", type=argparse.FileType("r"), help="Fasta file"
+ ) # the "input" argument
+
+ parser.add_argument(
+ "-f",
+ "--format",
+ dest="seq_format",
+ default="fasta",
+ help="Sequence format (e.g. fasta, fastq, sff)",
+ ) # optional formats for input, currently just going to do ntFASTA
+
+ parser.add_argument(
+ "--strand",
+ dest="strand",
+ choices=("both", "forward", "reverse"),
+ default="both",
+ help="select strand",
+ ) # Selection of +, -, or both strands
+
+ parser.add_argument(
+ "--table", dest="table", default=11, help="NCBI Translation table", type=int
+ ) # Uses "default" NCBI codon table. This should always (afaik) be what we want...
+
+ parser.add_argument(
+ "-t",
+ "--ftype",
+ dest="ftype",
+ choices=("CDS", "ORF"),
+ default="ORF",
+ help="Find ORF or CDSs",
+ ) # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF
+
+ parser.add_argument(
+ "-e",
+ "--ends",
+ dest="ends",
+ choices=("open", "closed"),
+ default="closed",
+ help="Open or closed. Closed ensures start/stop codons are present",
+ ) # includes the start and stop codon
+
+ parser.add_argument(
+ "-m",
+ "--mode",
+ dest="mode",
+ choices=("all", "top", "one"),
+ default="all", # I think we want this to JUST be all...nearly always
+ help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length",
+ )
+
+ parser.add_argument(
+ "--switch",
+ dest="switch",
+ default="all",
+ help="switch between ALL putative osps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321",
+ )
+ # isp parameters
+ parser.add_argument(
+ "--isp_min_len",
+ dest="isp_min_len",
+ default=60,
+ help="Minimum ORF length, measured in codons",
+ type=int,
+ )
+ parser.add_argument(
+ "--isp_on",
+ dest="out_isp_nuc",
+ type=argparse.FileType("w"),
+ default="_out_isp.fna",
+ help="Output nucleotide sequences, FASTA",
+ )
+ parser.add_argument(
+ "--isp_op",
+ dest="out_isp_prot",
+ type=argparse.FileType("w"),
+ default="_out_isp.fa",
+ help="Output protein sequences, FASTA",
+ )
+ parser.add_argument(
+ "--isp_ob",
+ dest="out_isp_bed",
+ type=argparse.FileType("w"),
+ default="_out_isp.bed",
+ help="Output BED file",
+ )
+ parser.add_argument(
+ "--isp_og",
+ dest="out_isp_gff3",
+ type=argparse.FileType("w"),
+ default="_out_isp.gff3",
+ help="Output GFF3 file",
+ )
+ parser.add_argument(
+ "--isp_min_dist",
+ dest="isp_min_dist",
+ default=10,
+ help="Minimal distance to first AA of TMD, measured in AA",
+ type=int,
+ )
+ parser.add_argument(
+ "--isp_max_dist",
+ dest="isp_max_dist",
+ default=30,
+ help="Maximum distance to first AA of TMD, measured in AA",
+ type=int,
+ )
+ parser.add_argument(
+ "--putative_isp",
+ dest="putative_isp_fa",
+ type=argparse.FileType("w"),
+ default="_putative_isp.fa",
+ help="Output of putative FASTA file",
+ )
+ parser.add_argument(
+ "--min_tmd_size",
+ dest="min_tmd_size",
+ default=10,
+ help="Minimal size of the TMD domain",
+ type=int,
+ )
+ parser.add_argument(
+ "--max_tmd_size",
+ dest="max_tmd_size",
+ default=20,
+ help="Maximum size of the TMD domain",
+ type=int,
+ )
+ parser.add_argument(
+ "--summary_isp_txt",
+ dest="summary_isp_txt",
+ type=argparse.FileType("w"),
+ default="_summary_isp.txt",
+ help="Summary statistics on putative i-spanins",
+ )
+ parser.add_argument(
+ "--putative_isp_gff",
+ dest="putative_isp_gff",
+ type=argparse.FileType("w"),
+ default="_putative_isp.gff3",
+ help="gff3 output for putative i-spanins",
+ )
+
+ parser.add_argument(
+ "--max_isp",
+ dest="max_isp",
+ default=230,
+ help="Maximum size of the ISP",
+ type=int,
+ )
+
+    parser.add_argument(
+        "--isp_mode",
+        action="store_true",
+        default=True,  # note: with default=True this flag is effectively always on
+    )
+
+ parser.add_argument(
+ "--peri_min",
+ type=int,
+ default=18,
+ help="amount of residues after TMD is found min"
+ )
+
+ parser.add_argument(
+ "--peri_max",
+ type=int,
+ default=206,
+ help="amount of residues after TMD is found max"
+ )
+ # parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated?
+ args = parser.parse_args()
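+    # A typical invocation might look like this (illustrative, not from the original source):
+    #   python generate-putative-isp.py phage.fasta --strand both --isp_min_len 60 \
+    #       --putative_isp _putative_isp.fa --summary_isp_txt _summary_isp.txt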
+
+ ### isp output, naive ORF finding:
+ isps = OrfFinder(args.table, args.ftype, args.ends, args.isp_min_len, args.strand)
+ isps.locate(
+ args.fasta_file,
+ args.out_isp_nuc,
+ args.out_isp_prot,
+ args.out_isp_bed,
+ args.out_isp_gff3,
+ )
+ """
+ >T7_EIS MLEFLRKLIPWVLVGMLFGLGWHLGSDSMDAKWKQEVHNEYVKRVEAAKSTQRAIGAVSAKYQEDLAALEGSTDRIISDLRSDNKRLRVRVKTTGISDGQCGFEPDGRAELDDRDAKRILAVTQKGDAWIRALQDTIRELQRK
+ >lambda_EIS MSRVTAIISALVICIIVCLSWAVNHYRDNAITYKAQRDKNARELKLANAAITDMQMRQRDVAALDAKYTKELADAKAENDALRDDVAAGRRRLHIKAVCQSVREATTASGVDNAASPRLADTAERDYFTLRERLITMQKQLEGTQKYINEQCR
+ """
+ args.fasta_file.close()
+ args.fasta_file = open(args.fasta_file.name, "r")
+ args.out_isp_prot.close()
+ args.out_isp_prot = open(args.out_isp_prot.name, "r")
+
+ pairs = tuple_fasta(fasta_file=args.out_isp_prot)
+
+ # print(pairs)
+
+ have_tmd = [] # empty candidates list to be passed through the user input criteria
+
+    # Grab transmembrane domains via spaninFuncs (queries for lysine snorkels
+    # and a range of hydrophobic regions that could be TMDs)
+    for each_pair in pairs:
+ if len(each_pair[1]) <= args.max_isp:
+ try:
+ have_tmd += find_tmd(
+ pair=each_pair,
+ minimum=args.isp_min_dist,
+ maximum=args.isp_max_dist,
+ TMDmin=args.min_tmd_size,
+ TMDmax=args.max_tmd_size,
+ isp_mode=args.isp_mode,
+ peri_min=args.peri_min,
+ peri_max=args.peri_max,
+ )
+ except TypeError:
+ continue
+
+ if args.switch == "all":
+ pass
+ else:
+ # for each_pair in have_lipo:
+ range_of = args.switch
+ range_of = re.search(("[\d]+:[\d]+"), range_of).group(0)
+ start = int(range_of.split(":")[0])
+ end = int(range_of.split(":")[1])
+ have_tmd = parse_a_range(pair=have_tmd, start=start, end=end)
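+    # Illustrative: --switch 2000:3000 keeps only candidates whose ORF
+    # coordinates fall within bases 2000..3000 of the input genome
+    # (see parse_a_range in spaninFuncs.py).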
+
+ total_isp = len(have_tmd)
+
+ # ORF = [] # mightttttttttttt use eventually
+ length = [] # grabbing length of the sequences
+ candidate_dict = {k: v for k, v in have_tmd}
+ with args.putative_isp_fa as f:
+ for desc, s in candidate_dict.items(): # description / sequence
+ f.write(">" + str(desc))
+ f.write("\n" + lineWrapper(str(s).replace("*","")) + "\n")
+ length.append(len(s))
+ # ORF.append(desc)
+ if not length:
+ raise Exception("Parameters yielded no candidates.")
+    bot_size = min(length)
+    top_size = max(length)
+    avg = sum(length) / total_isp
+    length.sort()  # the median needs the lengths in sorted order
+    n = len(length)
+    if n % 2 == 1:
+        med = length[n // 2]
+    else:
+        i = n // 2
+        med = (length[i - 1] + length[i]) / 2
+
+ #### Extra statistics
+ args.out_isp_prot.close()
+ all_orfs = open(args.out_isp_prot.name, "r")
+ all_isps = open(args.putative_isp_fa.name, "r")
+ #record = SeqIO.read(all_orfs, "fasta")
+ #print(len(record))
+ n = 0
+ for line in all_orfs:
+ if line.startswith(">"):
+ n += 1
+ all_orfs_counts = n
+
+ c = 0
+ for line in all_isps:
+ if line.startswith(">"):
+ c += 1
+ all_isps_counts = c
+
+ #print(f"{n} -> {c}")
+ #count = 0
+ #for feature in record.features:
+ # count += 1
+ #print(count)
+
+
+    with args.summary_isp_txt as f:
+        f.write("total potential i-spanins: " + str(total_isp) + "\n")
+        f.write("average length (AA): " + str(avg) + "\n")
+        f.write("median length (AA): " + str(med) + "\n")
+        f.write("maximum ORF size (AA): " + str(top_size) + "\n")
+        f.write("minimum ORF size (AA): " + str(bot_size) + "\n")
+        f.write("ratio of ISPs found from naive ORFs: " + str(c) + "/" + str(n))
+
+ # Output the putative list in gff3 format
+ args.putative_isp_fa = open(args.putative_isp_fa.name, "r")
+ gff_data = prep_a_gff3(fa=args.putative_isp_fa, spanin_type="isp",org=args.fasta_file)
+ write_gff3(data=gff_data, output=args.putative_isp_gff)
+
+ """https://docs.python.org/3.4/library/subprocess.html"""
+ """https://github.tamu.edu/CPT/Galaxy-Tools/blob/f0bf4a4b8e5124d4f3082d21b738dfaa8e1a3cf6/tools/phage/transmembrane.py"""
diff -r 000000000000 -r 10d13d0c37d3 cpt_putative_isp/generate-putative-isp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/generate-putative-isp.xml Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,82 @@
+
+
+ constructs a putative list of potential i-spanins from an input genomic FASTA
+
+ macros.xml
+ cpt-macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Genomic FASTA
+*NOTE: This tool only takes a SINGLE genomic fasta. It does not work with multiFASTAs.*
+
+**OUTPUT** --> putative_isp.fa (FASTA) file, putative_isp.gff3, and basic summary statistics as summary_isp.txt.
+
+Protein sequences which passed the above filters are returned as the candidate ISPs.
+
+]]>
+
+
diff -r 000000000000 -r 10d13d0c37d3 cpt_putative_isp/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/macros.xml Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,63 @@
+
+
+
+
+ regex
+ python
+ biopython
+ cpt_gffparser
+
+
+
+
+ "$blast_tsv"
+
+
+
+
+
+
+ "$blast_xml"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ "$gff3_data"
+
+
+ genomeref.fa
+
+
+ ln -s $genome_fasta genomeref.fa;
+
+
+ genomeref.fa
+
+
+
+
+
+
+ "$sequences"
+
+
+
+
+
diff -r 000000000000 -r 10d13d0c37d3 cpt_putative_isp/spaninFuncs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_putative_isp/spaninFuncs.py Fri Jun 17 13:07:15 2022 +0000
@@ -0,0 +1,469 @@
+"""
+PREMISE
+### Functions/Classes that are used in both generate-putative-osp.py and generate-putative-isp.py
+###### Main premise here is to make the above scripts a little more DRY, as well as easily readable for execution.
+###### Documentation will ATTEMPT to be thourough here
+"""
+
+import re
+from Bio import SeqIO
+from Bio import Seq
+from collections import OrderedDict
+
+# Not written in OOP style, to keep the complexity down in case adjustments are needed by someone else.
+# Much of the manipulation is string based, so it should be straightforward as well as moderately quick.
+################## Global Variables
+Lys = "K"
+
+
+def check_back_end_snorkels(seq, tmsize):
+ """
+ Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match.
+ --> seq : should be the sequence fed from the "search_region" portion of the sequence
+ --> tmsize : size of the potential TMD being investigated
+ """
+ found = []
+ if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]):
+ found = "match"
+ return found
+ elif seq[tmsize - 3] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 4]):
+ found = "match"
+ return found
+ elif seq[tmsize - 2] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 3]):
+ found = "match"
+ return found
+ elif seq[tmsize - 1] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 2]):
+ found = "match"
+ return found
+ else:
+ found = "NOTmatch"
+ return found
+
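+# Illustrative example (assumed, not from the original source): with
+# tmsize = 10 and seq = "AAAAAAAVKAAA...", seq[8] is a lysine preceded by the
+# hydrophobic V at seq[7], so check_back_end_snorkels returns "match".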
+
+def prep_a_gff3(fa, spanin_type, org):
+ """
+    Parses an input detailed 'fa' file and returns data ready to be written to a 'gff3' file
+    ---> fa = input .fa file
+    ---> org = organism FASTA; its header supplies the accession for column 1
+    ---> spanin_type = 'isp' or 'osp'
+    ---> returns a list of rows, easily portable to a gff3 next
+ """
+ with org as f:
+ header = f.readline()
+ orgacc = header.split(" ")
+ orgacc = orgacc[0].split(">")[1].strip()
+ fa_zip = tuple_fasta(fa)
+ data = []
+ for a_pair in fa_zip:
+ # print(a_pair)
+ if re.search(("(\[1\])"), a_pair[0]):
+ strand = "+"
+ elif re.search(("(\[-1\])"), a_pair[0]):
+ strand = "-" # column 7
+ start = re.search(("[\d]+\.\."), a_pair[0]).group(0).split("..")[0] # column 4
+ end = re.search(("\.\.[\d]+"), a_pair[0]).group(0).split("..")[1] # column 5
+ orfid = re.search(("(ORF)[\d]+"), a_pair[0]).group(0) # column 1
+ if spanin_type == "isp":
+ methodtype = "CDS" # column 3
+ spanin = "isp"
+ elif spanin_type == "osp":
+ methodtype = "CDS" # column 3
+ spanin = "osp"
+ elif spanin_type == "usp":
+ methodtype = "CDS"
+ spanin = "usp"
+ else:
+ raise "need to input spanin type"
+ source = "cpt.py|putative-*.py" # column 2
+ score = "." # column 6
+ phase = "." # column 8
+ attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1] # column 9
+ sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]]
+ data += sequence
+ return data
+
+
+def write_gff3(data, output="results.gff3"):
+ """
+ Parses results from prep_a_gff3 into a gff3 file
+ ---> input : list from prep_a_gff3
+ ---> output : gff3 file
+ """
+    with output as f:
+        f.write("##gff-version 3\n")
+        for value in data:
+            f.write(
+                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+                    value[0],
+                    value[1],
+                    value[2],
+                    value[3],
+                    value[4],
+                    value[5],
+                    value[6],
+                    value[7],
+                    value[8],
+                )
+            )
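+# A written row looks like (tab-separated; values illustrative):
+#   ACCESSION  cpt.py|putative-*.py  CDS  100  463  .  +  .  ID=ACCESSION|ORF1;ALIAS=isp;SEQ=...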
+
+
+def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206):
+ """
+ Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD
+    ---> pair : input tuple of description and AA sequence (str)
+    ---> minimum : how close to the start codon a TMD may begin
+    ---> maximum : how far from the start codon a TMD may begin
+    ---> TMDmin : the minimum size a transmembrane domain can be (default = 10)
+    ---> TMDmax : the maximum size a transmembrane domain can be (default = 20)
+    ---> isp_mode / peri_min / peri_max : when isp_mode is set, only candidates whose periplasmic tail length falls within [peri_min, peri_max] are kept
+ """
+ # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S']
+ tmd = []
+ s = str(pair[1]) # sequence being analyzed
+ # print(s) # for trouble shooting
+ if maximum > len(s):
+ maximum = len(s)
+ search_region = s[minimum - 1 : maximum + 1]
+ #print(f"this is the search region: {search_region}")
+ # print(search_region) # for trouble shooting
+
+ for tmsize in range(TMDmin, TMDmax+1, 1):
+ #print(f"this is the current tmsize we're trying: {tmsize}")
+ # print('==============='+str(tmsize)+'================') # print for troubleshooting
+ pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}" # searches for these hydrophobic residues tmsize total times
+ #print(pattern)
+ #print(f"sending to regex: {search_region}")
+        # search_region starts one residue early, so query one residue ahead here
+        store_search = re.search("[K]", search_region[1:8])
+        if store_search:
+            where_we_are = store_search.start()  # index of the lysine hit
+ if re.search(
+ ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]
+ ) and re.search(
+ ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1]
+ ): # hydrophobic neighbor
+ #try:
+ g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group()
+ backend = check_back_end_snorkels(search_region, tmsize)
+ if backend == "match":
+ if isp_mode:
+ g = re.search((pattern), search_region).group()
+ end_of_tmd = re.search((g), s).end()+1
+ amt_peri = len(s) - end_of_tmd
+ if peri_min <= amt_peri <= peri_max:
+ pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
+ new_pair = (pair_desc,pair[1])
+ tmd.append(new_pair)
+ else:
+ tmd.append(pair)
+ else:
+ continue
+ #else:
+ #print("I'm continuing out of snorkel loop")
+ #print(f"{search_region}")
+ #continue
+ if re.search((pattern), search_region):
+ #print(f"found match: {}")
+ #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE")
+ #try:
+ if isp_mode:
+ g = re.search((pattern), search_region).group()
+ end_of_tmd = re.search((g), s).end()+1
+ amt_peri = len(s) - end_of_tmd
+ if peri_min <= amt_peri <= peri_max:
+ pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
+ new_pair = (pair_desc,pair[1])
+ tmd.append(new_pair)
+ else:
+ tmd.append(pair)
+ else:
+ continue
+
+ return tmd
+
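+# Illustrative peri_count arithmetic (assumed numbers): for a 100 aa candidate
+# whose hydrophobic stretch ends at residue 40 (match.end() == 40),
+# end_of_tmd = 41 and amt_peri = 100 - 41 = 59, which passes the default
+# peri_min=18 / peri_max=206 window, so the pair is kept with
+# ", peri_count~=59" appended to its description.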
+
+def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False):
+ """
+    Function that takes an input tuple and returns description/sequence pairs that contain a Lipobox
+ ---> minimum - min distance from start codon to first AA of lipobox
+ ---> maximum - max distance from start codon to first AA of lipobox
+ ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy
+
+ """
+ if regex == 1:
+ pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl
+ elif regex == 2:
+ pattern = "[ACGSILMFTV][^REKD][GAS]C" # regex for Lipobox from LipoRy
+
+ candidates = []
+ s = str(pair[1])
+ # print(s) # trouble shooting
+    search_region = s[minimum - 1 : maximum + 5]  # pad the slice so a Lipobox at the maximum position is not cut off
+ # print(search_region) # trouble shooting
+ patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"]
+ for pattern in patterns:
+ #print(pattern) # trouble shooting
+        match = re.search(pattern, search_region)  # the Lipobox must be WITHIN the range
+        if match:
+            g = match.group()  # the exact matched motif
+            amt_peri = len(s) - re.search(g, s).end() + 1
+ if min_after <= amt_peri <= max_after: # find the lipobox end region
+ if osp_mode:
+ pair_desc = pair[0] + ", peri_count~="+str(amt_peri)
+ new_pair = (pair_desc,pair[1])
+ candidates.append(new_pair)
+ else:
+ candidates.append(pair)
+
+ return candidates
+
+
+def tuple_fasta(fasta_file):
+ """
+    #### INPUT: FASTA file
+    #### OUTPUT: zip of (description, sequence) pairs; ORFs whose translation starts with "I" are skipped
+ """
+ fasta = SeqIO.parse(fasta_file, "fasta")
+ descriptions = []
+ sequences = []
+ for r in fasta: # iterates and stores each description and sequence
+ description = r.description
+ sequence = str(r.seq)
+ if (
+ sequence[0] != "I"
+ ): # the translation table currently has I as a potential start codon ==> this will remove all ORFs that start with I
+ descriptions.append(description)
+ sequences.append(sequence)
+ else:
+ continue
+
+ return zip(descriptions, sequences)
+
+
+def lineWrapper(text, charactersize=60):
+
+ if len(text) <= charactersize:
+ return text
+ else:
+ return (
+ text[:charactersize]
+ + "\n"
+ + lineWrapper(text[charactersize:], charactersize)
+ )
+
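+# e.g. lineWrapper("A" * 130) returns three lines of 60, 60, and 10 characters.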
+
+def getDescriptions(fasta):
+ """
+    Takes an output FASTA file and retrieves the description headers. These headers contain information needed
+    for finding the locations of a potential i-spanin and o-spanin in proximity to one another.
+ """
+ desc = []
+ with fasta as f:
+ for line in f:
+ if line.startswith(">"):
+ desc.append(line)
+ return desc
+
+
+def splitStrands(text, strand="+"):
+ # positive_strands = []
+ # negative_strands = []
+ if strand == "+":
+ if re.search(("(\[1\])"), text):
+ return text
+ elif strand == "-":
+ if re.search(("(\[-1\])"), text):
+ return text
+ # return positive_strands, negative_strands
+
+
+def parse_a_range(pair, start, end):
+ """
+ Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range
+ ---> data : fasta tuple data
+ ---> start : start range to keep
+ ---> end : end range to keep (will need to + 1)
+ """
+ matches = []
+ for each_pair in pair:
+
+ s = re.search(("[\d]+\.\."), each_pair[0]).group(0) # Start of the sequence
+ s = int(s.split("..")[0])
+ e = re.search(("\.\.[\d]+"), each_pair[0]).group(0)
+ e = int(e.split("..")[1])
+ if start - 1 <= s and e <= end + 1:
+ matches.append(each_pair)
+ else:
+ continue
+ # else:
+ # continue
+ # if matches != []:
+ return matches
+ # else:
+ # print('no candidates within selected range')
+
+
+def grabLocs(text):
+ """
+ Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module
+ from cpt.py
+ """
+ start = re.search(("[\d]+\.\."), text).group(0) # Start of the sequence ; looks for [numbers]..
+ end = re.search(("\.\.[\d]+"), text).group(0) # End of the sequence ; Looks for ..[numbers]
+ orf = re.search(("(ORF)[\d]+"), text).group(0) # Looks for ORF and the numbers that are after it
+ if re.search(("(\[1\])"), text): # stores strand
+ strand = "+"
+ elif re.search(("(\[-1\])"), text): # stores strand
+ strand = "-"
+ start = int(start.split("..")[0])
+ end = int(end.split("..")[1])
+ vals = [start, end, orf, strand]
+
+ return vals
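+# e.g. for a header like "GENOME|ORF12 length 120 aa, 363 bp, from 100..463[1] of ..."
+# (the description format produced by cpt.py), grabLocs returns [100, 463, "ORF12", "+"].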
+
+
+def spaninProximity(isp, osp, max_dist=30):
+ """
+ _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_
+ Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site
+ to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary ( * 3)
+    Modified on 07.30.2020 to bypass picking the + or - strand.
+    INPUT: lists of OSP and ISP candidates
+    OUTPUT: candidate dictionaries for the embedded, overlapping, and separate cases
+ """
+
+ embedded = {}
+ overlap = {}
+ separate = {}
+ for iseq in isp:
+ embedded[iseq[2]] = []
+ overlap[iseq[2]] = []
+ separate[iseq[2]] = []
+ for oseq in osp:
+ if iseq[3] == "+":
+ if oseq[3] == "+":
+ if iseq[0] < oseq[0] < iseq[1] and oseq[1] < iseq[1]:
+ ### EMBEDDED ###
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ] # ordering a return for dic
+ embedded[iseq[2]] += [combo]
+ elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]:
+ ### OVERLAP / SEPARATE ###
+ if (iseq[1] - oseq[0]) < 6:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+ separate[iseq[2]] += [combo]
+ else:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+ overlap[iseq[2]] += [combo]
+ elif iseq[1] <= oseq[0] <= iseq[1] + max_dist:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+ separate[iseq[2]] += [combo]
+ else:
+ continue
+ if iseq[3] == "-":
+ if oseq[3] == "-":
+ if iseq[0] <= oseq[1] <= iseq[1] and oseq[0] > iseq[0]:
+ ### EMBEDDED ###
+ combo = [
+ iseq[0],
+ iseq[1],
+ oseq[2],
+ oseq[0],
+ oseq[1],
+ iseq[3],
+ ] # ordering a return for dict
+ embedded[iseq[2]] += [combo]
+ elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]:
+ if (oseq[1] - iseq[0]) < 6:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+ separate[iseq[2]] += [combo]
+ else:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+ overlap[iseq[2]] += [combo]
+ elif iseq[0] - 10 < oseq[1] < iseq[0]:
+ combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]]
+ separate[iseq[2]] += [combo]
+ else:
+ continue
+
+ embedded = {k: embedded[k] for k in embedded if embedded[k]}
+ overlap = {k: overlap[k] for k in overlap if overlap[k]}
+ separate = {k: separate[k] for k in separate if separate[k]}
+
+ return embedded, overlap, separate
+
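+# Classification sketch for the forward strand (coordinates illustrative):
+#   ISP 100..400 / OSP 200..350 -> embedded (OSP entirely inside the ISP)
+#   ISP 100..400 / OSP 380..500 -> overlap  (shared span of 6 nt or more)
+#   ISP 100..400 / OSP 410..500 -> separate (OSP starts within max_dist of the ISP end)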
+
+def check_for_usp():
+    pass  # placeholder: USP handling is not implemented yet
+
+############################################### TEST RANGE #########################################################################
+####################################################################################################################################
+if __name__ == "__main__":
+
+ #### TMD TEST
+ test_desc = ["one", "two", "three", "four", "five"]
+ test_seq = [
+ "XXXXXXXXXXXXXXXFMCFMCFMCFMCFMCXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+ "XXXXXXX",
+ "XXXXXXXXXXXKXXXXXXXXXX",
+ "XXXXXXXXXXAKXXXXXXXXXXAKXXXXXXXX",
+ ]
+ # for l in
+ # combo = zip(test_desc,test_seq)
+ pairs = zip(test_desc, test_seq)
+ tmd = []
+ for each_pair in pairs:
+ # print(each_pair)
+ try:
+ tmd += find_tmd(pair=each_pair)
+ except (IndexError, TypeError):
+ continue
+ # try:s = each_pair[1]
+ # tmd += find_tmd(seq=s, tmsize=15)
+ # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+ # print(tmd)
+ # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+
+ #### tuple-fasta TEST
+ # fasta_file = 'out_isp.fa'
+ # ret = tuple_fasta(fasta_file)
+ # print('=============')
+ # for i in ret:
+ # print(i[1])
+
+ #### LipoBox TEST
+ test_desc = ["one", "two", "three", "four", "five", "six", "seven"]
+ test_seq = [
+ "XXXXXXXXXTGGCXXXXXXXXXXXXXXXX",
+ "XXXXXXXXAAKKKKKKKKKKKKKKKXXXXXXXXXXXXX",
+ "XXXXXXX",
+ "AGGCXXXXXXXXXXXXXXXXXXXXTT",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXTGGC",
+ "MSTLRELRLRRALKEQSMRYLLSIKKTLPRWKGALIGLFLICVATISGCASESKLPEPPMVSVDSSLMVEPNLTTEMLNVFSQ*",
+ ]
+ pairs = zip(test_desc, test_seq)
+ lipo = []
+ for each_pair in pairs:
+ #print(each_pair)
+ # try:
+ try:
+ lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8)
+        except TypeError:  # catches anything that doesn't meet the min/max requirements (something is too small)
+            continue
+ # except:
+ # continue
+ # print('\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
+ #############################3
+ # g = prep_a_gff3(fa='putative_isp.fa', spanin_type='isp')
+ # print(g)
+ # write_gff3(data=g)