Mercurial > repos > cpt > cpt_putative_isp
comparison generate-putative-isp.py @ 1:4e02e6e9e77d draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:51:35 +0000 |
| parents | |
| children | 08499fbf8697 |
comparison
equal
deleted
inserted
replaced
| 0:10d13d0c37d3 | 1:4e02e6e9e77d |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 ##### findSpanin.pl --> findSpanin.py | |
| 4 ######### Incooperated from the findSpanin.pl script, but better and more snakey. | |
| 5 | |
| 6 import argparse | |
| 7 from cpt import OrfFinder | |
| 8 from Bio import SeqIO | |
| 9 from Bio import Seq | |
| 10 import re | |
| 11 from spaninFuncs import * | |
| 12 import os | |
| 13 | |
| 14 # if __name__ == '__main__': | |
| 15 # pass | |
| 16 ############################################################################### | |
| 17 | |
| 18 if __name__ == "__main__": | |
| 19 | |
| 20 # Common parameters for both ISP / OSP portion of script | |
| 21 | |
| 22 parser = argparse.ArgumentParser( | |
| 23 description="Get putative protein candidates for spanins" | |
| 24 ) | |
| 25 parser.add_argument( | |
| 26 "fasta_file", type=argparse.FileType("r"), help="Fasta file" | |
| 27 ) # the "input" argument | |
| 28 | |
| 29 parser.add_argument( | |
| 30 "-f", | |
| 31 "--format", | |
| 32 dest="seq_format", | |
| 33 default="fasta", | |
| 34 help="Sequence format (e.g. fasta, fastq, sff)", | |
| 35 ) # optional formats for input, currently just going to do ntFASTA | |
| 36 | |
| 37 parser.add_argument( | |
| 38 "--strand", | |
| 39 dest="strand", | |
| 40 choices=("both", "forward", "reverse"), | |
| 41 default="both", | |
| 42 help="select strand", | |
| 43 ) # Selection of +, -, or both strands | |
| 44 | |
| 45 parser.add_argument( | |
| 46 "--table", dest="table", default=11, help="NCBI Translation table", type=int | |
| 47 ) # Uses "default" NCBI codon table. This should always (afaik) be what we want... | |
| 48 | |
| 49 parser.add_argument( | |
| 50 "-t", | |
| 51 "--ftype", | |
| 52 dest="ftype", | |
| 53 choices=("CDS", "ORF"), | |
| 54 default="ORF", | |
| 55 help="Find ORF or CDSs", | |
| 56 ) # "functional type(?)" --> Finds ORF or CDS, for this we want just the ORF | |
| 57 | |
| 58 parser.add_argument( | |
| 59 "-e", | |
| 60 "--ends", | |
| 61 dest="ends", | |
| 62 choices=("open", "closed"), | |
| 63 default="closed", | |
| 64 help="Open or closed. Closed ensures start/stop codons are present", | |
| 65 ) # includes the start and stop codon | |
| 66 | |
| 67 parser.add_argument( | |
| 68 "-m", | |
| 69 "--mode", | |
| 70 dest="mode", | |
| 71 choices=("all", "top", "one"), | |
| 72 default="all", # I think we want this to JUST be all...nearly always | |
| 73 help="Output all ORFs/CDSs from sequence, all ORFs/CDSs with max length, or first with maximum length", | |
| 74 ) | |
| 75 | |
| 76 parser.add_argument( | |
| 77 "--switch", | |
| 78 dest="switch", | |
| 79 default="all", | |
| 80 help="switch between ALL putative osps, or a range. If not all, insert a range of two integers separated by a colon (:). Eg: 1234:4321", | |
| 81 ) | |
| 82 # isp parameters | |
| 83 parser.add_argument( | |
| 84 "--isp_min_len", | |
| 85 dest="isp_min_len", | |
| 86 default=60, | |
| 87 help="Minimum ORF length, measured in codons", | |
| 88 type=int, | |
| 89 ) | |
| 90 parser.add_argument( | |
| 91 "--isp_on", | |
| 92 dest="out_isp_nuc", | |
| 93 type=argparse.FileType("w"), | |
| 94 default="_out_isp.fna", | |
| 95 help="Output nucleotide sequences, FASTA", | |
| 96 ) | |
| 97 parser.add_argument( | |
| 98 "--isp_op", | |
| 99 dest="out_isp_prot", | |
| 100 type=argparse.FileType("w"), | |
| 101 default="_out_isp.fa", | |
| 102 help="Output protein sequences, FASTA", | |
| 103 ) | |
| 104 parser.add_argument( | |
| 105 "--isp_ob", | |
| 106 dest="out_isp_bed", | |
| 107 type=argparse.FileType("w"), | |
| 108 default="_out_isp.bed", | |
| 109 help="Output BED file", | |
| 110 ) | |
| 111 parser.add_argument( | |
| 112 "--isp_og", | |
| 113 dest="out_isp_gff3", | |
| 114 type=argparse.FileType("w"), | |
| 115 default="_out_isp.gff3", | |
| 116 help="Output GFF3 file", | |
| 117 ) | |
| 118 parser.add_argument( | |
| 119 "--isp_min_dist", | |
| 120 dest="isp_min_dist", | |
| 121 default=10, | |
| 122 help="Minimal distance to first AA of TMD, measured in AA", | |
| 123 type=int, | |
| 124 ) | |
| 125 parser.add_argument( | |
| 126 "--isp_max_dist", | |
| 127 dest="isp_max_dist", | |
| 128 default=30, | |
| 129 help="Maximum distance to first AA of TMD, measured in AA", | |
| 130 type=int, | |
| 131 ) | |
| 132 parser.add_argument( | |
| 133 "--putative_isp", | |
| 134 dest="putative_isp_fa", | |
| 135 type=argparse.FileType("w"), | |
| 136 default="_putative_isp.fa", | |
| 137 help="Output of putative FASTA file", | |
| 138 ) | |
| 139 parser.add_argument( | |
| 140 "--min_tmd_size", | |
| 141 dest="min_tmd_size", | |
| 142 default=10, | |
| 143 help="Minimal size of the TMD domain", | |
| 144 type=int, | |
| 145 ) | |
| 146 parser.add_argument( | |
| 147 "--max_tmd_size", | |
| 148 dest="max_tmd_size", | |
| 149 default=20, | |
| 150 help="Maximum size of the TMD domain", | |
| 151 type=int, | |
| 152 ) | |
| 153 parser.add_argument( | |
| 154 "--summary_isp_txt", | |
| 155 dest="summary_isp_txt", | |
| 156 type=argparse.FileType("w"), | |
| 157 default="_summary_isp.txt", | |
| 158 help="Summary statistics on putative i-spanins", | |
| 159 ) | |
| 160 parser.add_argument( | |
| 161 "--putative_isp_gff", | |
| 162 dest="putative_isp_gff", | |
| 163 type=argparse.FileType("w"), | |
| 164 default="_putative_isp.gff3", | |
| 165 help="gff3 output for putative i-spanins", | |
| 166 ) | |
| 167 | |
| 168 parser.add_argument( | |
| 169 "--max_isp", | |
| 170 dest="max_isp", | |
| 171 default=230, | |
| 172 help="Maximum size of the ISP", | |
| 173 type=int, | |
| 174 ) | |
| 175 | |
| 176 parser.add_argument("--isp_mode", action="store_true", default=True) | |
| 177 | |
| 178 parser.add_argument( | |
| 179 "--peri_min", | |
| 180 type=int, | |
| 181 default=18, | |
| 182 help="amount of residues after TMD is found min", | |
| 183 ) | |
| 184 | |
| 185 parser.add_argument( | |
| 186 "--peri_max", | |
| 187 type=int, | |
| 188 default=206, | |
| 189 help="amount of residues after TMD is found max", | |
| 190 ) | |
| 191 # parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated? | |
| 192 args = parser.parse_args() | |
| 193 the_args = vars(parser.parse_args()) | |
| 194 | |
| 195 ### isp output, naive ORF finding: | |
| 196 isps = OrfFinder(args.table, args.ftype, args.ends, args.isp_min_len, args.strand) | |
| 197 isps.locate( | |
| 198 args.fasta_file, | |
| 199 args.out_isp_nuc, | |
| 200 args.out_isp_prot, | |
| 201 args.out_isp_bed, | |
| 202 args.out_isp_gff3, | |
| 203 ) | |
| 204 """ | |
| 205 >T7_EIS MLEFLRKLIPWVLVGMLFGLGWHLGSDSMDAKWKQEVHNEYVKRVEAAKSTQRAIGAVSAKYQEDLAALEGSTDRIISDLRSDNKRLRVRVKTTGISDGQCGFEPDGRAELDDRDAKRILAVTQKGDAWIRALQDTIRELQRK | |
| 206 >lambda_EIS MSRVTAIISALVICIIVCLSWAVNHYRDNAITYKAQRDKNARELKLANAAITDMQMRQRDVAALDAKYTKELADAKAENDALRDDVAAGRRRLHIKAVCQSVREATTASGVDNAASPRLADTAERDYFTLRERLITMQKQLEGTQKYINEQCR | |
| 207 """ | |
| 208 args.fasta_file.close() | |
| 209 args.fasta_file = open(args.fasta_file.name, "r") | |
| 210 args.out_isp_prot.close() | |
| 211 args.out_isp_prot = open(args.out_isp_prot.name, "r") | |
| 212 | |
| 213 pairs = tuple_fasta(fasta_file=args.out_isp_prot) | |
| 214 | |
| 215 # print(pairs) | |
| 216 | |
| 217 have_tmd = [] # empty candidates list to be passed through the user input criteria | |
| 218 | |
| 219 for ( | |
| 220 each_pair | |
| 221 ) in ( | |
| 222 pairs | |
| 223 ): # grab transmembrane domains from spaninFuncts (queries for lysin snorkels # and a range of hydrophobic regions that could be TMDs) | |
| 224 if len(each_pair[1]) <= args.max_isp: | |
| 225 try: | |
| 226 have_tmd += find_tmd( | |
| 227 pair=each_pair, | |
| 228 minimum=args.isp_min_dist, | |
| 229 maximum=args.isp_max_dist, | |
| 230 TMDmin=args.min_tmd_size, | |
| 231 TMDmax=args.max_tmd_size, | |
| 232 isp_mode=args.isp_mode, | |
| 233 peri_min=args.peri_min, | |
| 234 peri_max=args.peri_max, | |
| 235 ) | |
| 236 except TypeError: | |
| 237 continue | |
| 238 | |
| 239 if args.switch == "all": | |
| 240 pass | |
| 241 else: | |
| 242 # for each_pair in have_lipo: | |
| 243 range_of = args.switch | |
| 244 range_of = re.search(("[\d]+:[\d]+"), range_of).group(0) | |
| 245 start = int(range_of.split(":")[0]) | |
| 246 end = int(range_of.split(":")[1]) | |
| 247 have_tmd = parse_a_range(pair=have_tmd, start=start, end=end) | |
| 248 | |
| 249 total_isp = len(have_tmd) | |
| 250 | |
| 251 # ORF = [] # mightttttttttttt use eventually | |
| 252 length = [] # grabbing length of the sequences | |
| 253 candidate_dict = {k: v for k, v in have_tmd} | |
| 254 with args.putative_isp_fa as f: | |
| 255 for desc, s in candidate_dict.items(): # description / sequence | |
| 256 f.write(">" + str(desc)) | |
| 257 f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n") | |
| 258 length.append(len(s)) | |
| 259 # ORF.append(desc) | |
| 260 if not length: | |
| 261 raise Exception("Parameters yielded no candidates.") | |
| 262 bot_size = min(length) | |
| 263 top_size = max(length) | |
| 264 avg = (sum(length)) / total_isp | |
| 265 n = len(length) | |
| 266 if n == 0: | |
| 267 raise Exception("no median for empty data") | |
| 268 if n % 2 == 1: | |
| 269 med = length[n // 2] | |
| 270 else: | |
| 271 i = n // 2 | |
| 272 med = (length[i - 1] + length[i]) / 2 | |
| 273 | |
| 274 #### Extra statistics | |
| 275 args.out_isp_prot.close() | |
| 276 all_orfs = open(args.out_isp_prot.name, "r") | |
| 277 all_isps = open(args.putative_isp_fa.name, "r") | |
| 278 # record = SeqIO.read(all_orfs, "fasta") | |
| 279 # print(len(record)) | |
| 280 n = 0 | |
| 281 for line in all_orfs: | |
| 282 if line.startswith(">"): | |
| 283 n += 1 | |
| 284 all_orfs_counts = n | |
| 285 | |
| 286 c = 0 | |
| 287 for line in all_isps: | |
| 288 if line.startswith(">"): | |
| 289 c += 1 | |
| 290 all_isps_counts = c | |
| 291 | |
| 292 # print(f"{n} -> {c}") | |
| 293 # count = 0 | |
| 294 # for feature in record.features: | |
| 295 # count += 1 | |
| 296 # print(count) | |
| 297 | |
| 298 with args.summary_isp_txt as f: | |
| 299 f.write("total potential o-spanins: " + str(total_isp) + "\n") | |
| 300 f.write("average length (AA): " + str(avg) + "\n") | |
| 301 f.write("median length (AA): " + str(med) + "\n") | |
| 302 f.write("maximum orf in size (AA): " + str(top_size) + "\n") | |
| 303 f.write("minimum orf in size (AA): " + str(bot_size) + "\n") | |
| 304 f.write("ratio of isps found from naive orfs: " + str(c) + "/" + str(n)) | |
| 305 | |
| 306 # Output the putative list in gff3 format | |
| 307 args.putative_isp_fa = open(args.putative_isp_fa.name, "r") | |
| 308 gff_data = prep_a_gff3( | |
| 309 fa=args.putative_isp_fa, spanin_type="isp", org=args.fasta_file | |
| 310 ) | |
| 311 write_gff3(data=gff_data, output=args.putative_isp_gff) | |
| 312 | |
| 313 """https://docs.python.org/3.4/library/subprocess.html""" | |
| 314 """https://github.tamu.edu/CPT/Galaxy-Tools/blob/f0bf4a4b8e5124d4f3082d21b738dfaa8e1a3cf6/tools/phage/transmembrane.py""" |
