cpt_putative_isp: generate-putative-isp.py comparison

comparison generate-putative-isp.py @ 1:4e02e6e9e77d draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c

author	cpt
date	Mon, 05 Jun 2023 02:51:35 +0000
parents
children

comparison

equal deleted inserted replaced

-:10d13d0c37d3
+:4e02e6e9e77d
+#!/usr/bin/env python
+##### findSpanin.pl --> findSpanin.py
+######### Incooperated from the findSpanin.pl script, but better and more snakey.
+import argparse
+from cpt import OrfFinder
+from Bio import SeqIO
+from Bio import Seq
+import re
+from spaninFuncs import *
+import os
+# if __name__ == '__main__':
+# pass
+###############################################################################
+if __name__ == "__main__":
+# Common parameters for both ISP / OSP portion of script
+parser = argparse.ArgumentParser(
+description="Get putative protein candidates for spanins"
+)
+parser.add_argument(
+"fasta_file", type=argparse.FileType("r"), help="Fasta file"
+)  # the "input" argument
+parser.add_argument(
+"-f",
+"--format",
+dest="seq_format",
+default="fasta",
+help="Sequence format (e.g. fasta, fastq, sff)",
+)  # optional formats for input, currently just going to do ntFASTA
+parser.add_argument(
+"--strand",
+dest="strand",
+choices=("both", "forward", "reverse"),
+default="both",
+help="select strand",
+)  # Selection of +, -, or both strands
+parser.add_argument(
+"--table", dest="table", default=11, help="NCBI Translation table", type=int
+)  # Uses "default" NCBI codon table. This should always (afaik) be what we want...
+parser.add_argument(
+"-t",
+"--ftype",
+dest="ftype",
+choices=("CDS", "ORF"),
+default="ORF",
+help="Find ORF or CDSs",
+)  # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF
+parser.add_argument(
+"-e",
+"--ends",
+dest="ends",
+choices=("open", "closed"),
+default="closed",
+help="Open or closed. Closed ensures start/stop codons are present",
+)  # includes the start and stop codon
+parser.add_argument(
+"-m",
+"--mode",
+dest="mode",
+choices=("all", "top", "one"),
+default="all",  # I think we want this to JUST be all...nearly always
+help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length",
+)
+parser.add_argument(
+"--switch",
+dest="switch",
+default="all",
+help="switch between ALL putative osps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321",
+)
+# isp parameters
+parser.add_argument(
+"--isp_min_len",
+dest="isp_min_len",
+default=60,
+help="Minimum ORF length, measured in codons",
+type=int,
+)
+parser.add_argument(
+"--isp_on",
+dest="out_isp_nuc",
+type=argparse.FileType("w"),
+default="_out_isp.fna",
+help="Output nucleotide sequences, FASTA",
+)
+parser.add_argument(
+"--isp_op",
+dest="out_isp_prot",
+type=argparse.FileType("w"),
+default="_out_isp.fa",
+help="Output protein sequences, FASTA",
+)
+parser.add_argument(
+"--isp_ob",
+dest="out_isp_bed",
+type=argparse.FileType("w"),
+default="_out_isp.bed",
+help="Output BED file",
+)
+parser.add_argument(
+"--isp_og",
+dest="out_isp_gff3",
+type=argparse.FileType("w"),
+default="_out_isp.gff3",
+help="Output GFF3 file",
+)
+parser.add_argument(
+"--isp_min_dist",
+dest="isp_min_dist",
+default=10,
+help="Minimal distance to first AA of TMD, measured in AA",
+type=int,
+)
+parser.add_argument(
+"--isp_max_dist",
+dest="isp_max_dist",
+default=30,
+help="Maximum distance to first AA of TMD, measured in AA",
+type=int,
+)
+parser.add_argument(
+"--putative_isp",
+dest="putative_isp_fa",
+type=argparse.FileType("w"),
+default="_putative_isp.fa",
+help="Output of putative FASTA file",
+)
+parser.add_argument(
+"--min_tmd_size",
+dest="min_tmd_size",
+default=10,
+help="Minimal size of the TMD domain",
+type=int,
+)
+parser.add_argument(
+"--max_tmd_size",
+dest="max_tmd_size",
+default=20,
+help="Maximum size of the TMD domain",
+type=int,
+)
+parser.add_argument(
+"--summary_isp_txt",
+dest="summary_isp_txt",
+type=argparse.FileType("w"),
+default="_summary_isp.txt",
+help="Summary statistics on putative i-spanins",
+)
+parser.add_argument(
+"--putative_isp_gff",
+dest="putative_isp_gff",
+type=argparse.FileType("w"),
+default="_putative_isp.gff3",
+help="gff3 output for putative i-spanins",
+)
+parser.add_argument(
+"--max_isp",
+dest="max_isp",
+default=230,
+help="Maximum size of the ISP",
+type=int,
+)
+parser.add_argument("--isp_mode", action="store_true", default=True)
+parser.add_argument(
+"--peri_min",
+type=int,
+default=18,
+help="amount of residues after TMD is found min",
+)
+parser.add_argument(
+"--peri_max",
+type=int,
+default=206,
+help="amount of residues after TMD is found max",
+)
+# parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated?
+args = parser.parse_args()
+the_args = vars(parser.parse_args())
+### isp output, naive ORF finding:
+isps = OrfFinder(args.table, args.ftype, args.ends, args.isp_min_len, args.strand)
+isps.locate(
+args.fasta_file,
+args.out_isp_nuc,
+args.out_isp_prot,
+args.out_isp_bed,
+args.out_isp_gff3,
+)
+"""
+>T7_EIS MLEFLRKLIPWVLVGMLFGLGWHLGSDSMDAKWKQEVHNEYVKRVEAAKSTQRAIGAVSAKYQEDLAALEGSTDRIISDLRSDNKRLRVRVKTTGISDGQCGFEPDGRAELDDRDAKRILAVTQKGDAWIRALQDTIRELQRK
+>lambda_EIS MSRVTAIISALVICIIVCLSWAVNHYRDNAITYKAQRDKNARELKLANAAITDMQMRQRDVAALDAKYTKELADAKAENDALRDDVAAGRRRLHIKAVCQSVREATTASGVDNAASPRLADTAERDYFTLRERLITMQKQLEGTQKYINEQCR
+"""
+args.fasta_file.close()
+args.fasta_file = open(args.fasta_file.name, "r")
+args.out_isp_prot.close()
+args.out_isp_prot = open(args.out_isp_prot.name, "r")
+pairs = tuple_fasta(fasta_file=args.out_isp_prot)
+# print(pairs)
+have_tmd = []  # empty candidates list to be passed through the user input criteria
+for (
+each_pair
+) in (
+pairs
+):  # grab transmembrane domains from spaninFuncts (queries for lysin snorkels # and a range of hydrophobic regions that could be TMDs)
+if len(each_pair[1]) <= args.max_isp:
+try:
+have_tmd += find_tmd(
+pair=each_pair,
+minimum=args.isp_min_dist,
+maximum=args.isp_max_dist,
+TMDmin=args.min_tmd_size,
+TMDmax=args.max_tmd_size,
+isp_mode=args.isp_mode,
+peri_min=args.peri_min,
+peri_max=args.peri_max,
+)
+except TypeError:
+continue
+if args.switch == "all":
+pass
+else:
+# for each_pair in have_lipo:
+range_of = args.switch
+range_of = re.search(("[\d]+:[\d]+"), range_of).group(0)
+start = int(range_of.split(":")[0])
+end = int(range_of.split(":")[1])
+have_tmd = parse_a_range(pair=have_tmd, start=start, end=end)
+total_isp = len(have_tmd)
+# ORF = [] # mightttttttttttt use eventually
+length = []  # grabbing length of the sequences
+candidate_dict = {k: v for k, v in have_tmd}
+with args.putative_isp_fa as f:
+for desc, s in candidate_dict.items():  # description / sequence
+f.write(">" + str(desc))
+f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n")
+length.append(len(s))
+# ORF.append(desc)
+if not length:
+raise Exception("Parameters yielded no candidates.")
+bot_size = min(length)
+top_size = max(length)
+avg = (sum(length)) / total_isp
+n = len(length)
+if n == 0:
+raise Exception("no median for empty data")
+if n % 2 == 1:
+med = length[n // 2]
+else:
+i = n // 2
+med = (length[i - 1] + length[i]) / 2
+#### Extra statistics
+args.out_isp_prot.close()
+all_orfs = open(args.out_isp_prot.name, "r")
+all_isps = open(args.putative_isp_fa.name, "r")
+# record = SeqIO.read(all_orfs, "fasta")
+# print(len(record))
+n = 0
+for line in all_orfs:
+if line.startswith(">"):
+n += 1
+all_orfs_counts = n
+c = 0
+for line in all_isps:
+if line.startswith(">"):
+c += 1
+all_isps_counts = c
+# print(f"{n} -> {c}")
+# count = 0
+# for feature in record.features:
+#    count += 1
+# print(count)
+with args.summary_isp_txt as f:
+f.write("total potential o-spanins: " + str(total_isp) + "\n")
+f.write("average length (AA): " + str(avg) + "\n")
+f.write("median length (AA): " + str(med) + "\n")
+f.write("maximum orf in size (AA): " + str(top_size) + "\n")
+f.write("minimum orf in size (AA): " + str(bot_size) + "\n")
+f.write("ratio of isps found from naive orfs: " + str(c) + "/" + str(n))
+# Output the putative list in gff3 format
+args.putative_isp_fa = open(args.putative_isp_fa.name, "r")
+gff_data = prep_a_gff3(
+fa=args.putative_isp_fa, spanin_type="isp", org=args.fasta_file
+)
+write_gff3(data=gff_data, output=args.putative_isp_gff)
+"""https://docs.python.org/3.4/library/subprocess.html"""
+"""https://github.tamu.edu/CPT/Galaxy-Tools/blob/f0bf4a4b8e5124d4f3082d21b738dfaa8e1a3cf6/tools/phage/transmembrane.py"""

Mercurial > repos > cpt > cpt_putative_isp

comparison generate-putative-isp.py @ 1:4e02e6e9e77d draft