Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/SkewIT/src/skewi.py @ 1:032f6b3806a3 draft
Uploaded
| author | dereeper |
|---|---|
| date | Thu, 30 May 2024 11:16:08 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:3cbb01081cde | 1:032f6b3806a3 |
|---|---|
| 1 #! /usr/bin/env python | |
| 2 ########################################################################## | |
| 3 #skewi.py calculates a single SkewI value for a given chromosome | |
| 4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu | |
| 5 # | |
| 6 #This file is part of SkewIT | |
| 7 # | |
| 8 #SkewIT is free software; you can redistribute it and/or modify | |
| 9 #it under the terms of the GNU General Public License as published by | |
| 10 #the Free Software Foundation; either version 3 of the license, or | |
| 11 #(at your option) any later version. | |
| 12 | |
| 13 #This program is distributed in the hope that it will be useful, | |
| 14 #but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 #GNU General Public License for more details. | |
| 17 | |
| 18 #You should have received a copy of the GNU General Public License | |
| 19 #along with this program; if not, see <http://www.gnu.org/licenses/>. | |
| 20 | |
| 21 ##################################################################### | |
| 22 #Jennifer Lu, jennifer.lu717@gmail.com | |
| 23 #02/17/2020 | |
| 24 # | |
| 25 #This program calculates a Skew Index (SkewI) | |
| 26 #for a given genome within the range 0 to 1. Higher SkewI values | |
| 27 #indicate a strong GC Skew signal while lower SkewI values | |
| 28 #indicate a potentially low GC Skew signal | |
| 29 # | |
| 30 #Given a multi-fasta file, the program will output one SkewI per sequence | |
| 31 ###################################################################### | |
| 32 import sys, os, argparse | |
| 33 from time import gmtime | |
| 34 from time import strftime | |
| 35 from Bio import SeqIO | |
| 36 import numpy as np | |
| 37 ##################################################################### | |
| 38 def usage(): | |
| 39 sys.stderr.write("\n #########################################################################################\n") | |
| 40 sys.stderr.write(" ################################## USAGE: SKEWI.PY ######################################\n") | |
| 41 sys.stderr.write(" ## > python skewi.py -i SEQ.FASTA -o SKEW.TXT\t\t\t\t\t\t##\n") | |
| 42 sys.stderr.write(" ## \t -i SEQ.FASTA............fasta file (multi-fasta permitted)\t\t\t##\n") | |
| 43 sys.stderr.write(" ## Optional Parameters:\t\t\t\t\t\t\t\t##\n") | |
| 44 sys.stderr.write(" ## \t -o SKEW.TXT.............output file (see below)\t\t\t\t##\n") | |
| 45 sys.stderr.write(" ## \t -k WINDOW_SIZE..........length of subsequences for which to calculate gc skew\t##\n") | |
| 46 sys.stderr.write(" ## \t .....................[default:20000]\t\t\t\t\t##\n" ) | |
| 47 sys.stderr.write(" ## \t -f FREQUENCY............number of bases between the start of each window\t##\n") | |
| 48 sys.stderr.write(" ## \t .....................[default: k == f]\t\t\t\t\t##\n") | |
| 49 sys.stderr.write(" ## \t --min-seq-len LEN.......set a minimum sequence length\t\t\t\t##\n") | |
| 50 sys.stderr.write(" ## \t .....................[default: 500,000bp]\t\t\t\t\t##\n") | |
| 51 sys.stderr.write(" ## \t --complete/--all........only analyze sequences with 'complete' in header\t##\n") | |
| 52 sys.stderr.write(" ## \t .....................[default: --complete]\t\t\t\t\t##\n") | |
| 53 sys.stderr.write(" ## \t --plasmid/--no-plasmid..include/exclude plasmid sequences from analysis\t##\n") | |
| 54 sys.stderr.write(" ## \t .....................[default: --no-plasmid]\t\t\t\t##\n") | |
| 55 sys.stderr.write(" ## Output file: If no output file is provided, SkewI will be printed to standard out\t##\n") | |
| 56 sys.stderr.write(" ## \t Otherwise, a tab-delimited file will be generated:\t\t\t##\n") | |
| 57 sys.stderr.write(" ## \t with two columns: 1) sequence ID 2) skewI value\t\t\t\t##\n") | |
| 58 sys.stderr.write(" #########################################################################################\n") | |
| 59 sys.stderr.write(" #########################################################################################\n\n") | |
| 60 exit(0) | |
| 61 ##################################################################### | |
| 62 def main(): | |
| 63 parser = argparse.ArgumentParser() | |
| 64 #Required parameter: | |
| 65 parser.add_argument("-i","--input","-s","--seq", | |
| 66 dest="in_file", required=False, default="", | |
| 67 help="Sequence file for which to calculate gc_skew") | |
| 68 #Defaults =20K/Same as window size | |
| 69 parser.add_argument("-k","-w","--window-len", | |
| 70 dest="window_size", required=False, default=20000, type=int, | |
| 71 help="Window size for which to calculate each sign(g-c) [default: 20kb]") | |
| 72 parser.add_argument("-f","--freq", dest="freq", type=int, default=-1, | |
| 73 help="Length between the start indices of each window. \ | |
| 74 [default: same as window_len, must be less than window size]") | |
| 75 #No output provided = standard out | |
| 76 parser.add_argument('-o','--output', required=False, | |
| 77 dest="out_file",default="", | |
| 78 help="Output text file to save skewi (skew index values)") | |
| 79 #Complete Genomes/Plasmid Sequence options | |
| 80 parser.add_argument('--complete', required=False, dest='complete_tf', | |
| 81 action='store_true', default=True, help='Only analyze complete genomes') | |
| 82 parser.add_argument('--all', required=False, dest='complete_tf', | |
| 83 action='store_false', default=True, help='Analyze complete and draft genomes') | |
| 84 parser.add_argument('--plasmid', required=False, dest='plasmid_tf', | |
| 85 action='store_true', default=False, help='Analyze plasmid sequences') | |
| 86 parser.add_argument('--no-plasmid', required=False, dest='plasmid_tf', | |
| 87 action='store_false', default=False, help='Exclude plasmid sequences') | |
| 88 #Sequence length minimum | |
| 89 parser.add_argument('--min-len', required=False, dest='min_seq_len', | |
| 90 type=int, default=500000, | |
| 91 help='Minimum sequence length to analyze [default: 500kb]') | |
| 92 #Usage option | |
| 93 parser.add_argument('--usage', required=False, dest='usage', | |
| 94 action='store_true', default=False, | |
| 95 help='Prints usage information for this program') | |
| 96 args=parser.parse_args() | |
| 97 | |
| 98 ######################## | |
| 99 #Test parameters | |
| 100 if args.usage: | |
| 101 usage() | |
| 102 if (args.in_file == ""): | |
| 103 sys.stderr.write(" >> Please provide an input sequence file\n") | |
| 104 usage() | |
| 105 exit(1) | |
| 106 if (not os.path.isfile(args.in_file)): | |
| 107 sys.stderr.write(" >> ERROR: %s is not a valid file\n" % args.in_file) | |
| 108 exit(1) | |
| 109 #Lengths | |
| 110 if (args.freq == -1): | |
| 111 args.freq = args.window_size | |
| 112 if (args.window_size < 0): | |
| 113 sys.stderr.write(" >> ERROR: --window-size must be greater than 0\n") | |
| 114 exit(1) | |
| 115 if (args.freq < 0): | |
| 116 sys.stderr.write(" >> ERROR: --freq must be greater than 0\n") | |
| 117 exit(1) | |
| 118 if (args.freq > args.window_size): | |
| 119 sys.stderr.write(" >> ERROR: window size must be >= frequency\n") | |
| 120 exit(1) | |
| 121 ######################## | |
| 122 #Start program | |
| 123 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
| 124 sys.stderr.write(" >> PROGRAM START TIME: " + time + '\n') | |
| 125 sys.stderr.write(" Input file: %s\n" % args.in_file) | |
| 126 if args.out_file != "": | |
| 127 sys.stderr.write(" Output: %s\n" % args.out_file) | |
| 128 else: | |
| 129 sys.stderr.write(" Output: system standard out\n") | |
| 130 sys.stderr.write(" Window size (bp): %i\n" % args.window_size) | |
| 131 sys.stderr.write(" Frequency (bp): %i\n" % args.freq) | |
| 132 sys.stderr.write(" Minimum sequence length (bp): %i\n" % args.min_seq_len) | |
| 133 sys.stderr.write(" Complete sequences only: %s\n" % args.complete_tf) | |
| 134 sys.stderr.write(" Exclude plasmids: %s\n" % (not args.plasmid_tf)) | |
| 135 sys.stderr.flush() | |
| 136 ######################## | |
| 137 count_seqs = 0 | |
| 138 argmax = [] | |
| 139 seq2skewi = {} | |
| 140 sys.stderr.write("\n") | |
| 141 sys.stderr.write(" >> Processing file: %s\n" % args.in_file) | |
| 142 sys.stderr.write("\r\t%i seqs evaluated: " % (count_seqs)) | |
| 143 for record in SeqIO.parse(args.in_file,'fasta'): | |
| 144 my_description = str(record.description) | |
| 145 if ("complete" not in my_description) and args.complete_tf: | |
| 146 continue | |
| 147 if ("plasmid" in my_description) and not args.plasmid_tf: | |
| 148 continue | |
| 149 my_seq = str(record.seq) | |
| 150 if len(my_seq) < args.min_seq_len: | |
| 151 continue | |
| 152 count_seqs += 1 | |
| 153 #Get sequence and description | |
| 154 #Print Update | |
| 155 count = 0 | |
| 156 tot = len(my_seq)/args.window_size | |
| 157 #Calculate skew | |
| 158 skew = [] | |
| 159 for i in range(0,len(my_seq),args.window_size): | |
| 160 g = my_seq[i:i+args.window_size].count("G") | |
| 161 c = my_seq[i:i+args.window_size].count("C") | |
| 162 if (g-c) > 0: | |
| 163 skew.append(1) | |
| 164 elif (g-c) < 0: | |
| 165 skew.append(-1) | |
| 166 else: | |
| 167 skew.append(0) | |
| 168 #Final print | |
| 169 curr_size = len(skew)/2 | |
| 170 sys.stdout.flush() | |
| 171 skew += skew[:curr_size] | |
| 172 #Calculate abs(T2-T1) where T is sum of values | |
| 173 curr_diffs = [] | |
| 174 x = sum(skew[0:curr_size]) | |
| 175 y = sum(skew[curr_size:2*curr_size]) | |
| 176 for i in range(0, curr_size-1): | |
| 177 curr_diffs.append(abs(x-y)) | |
| 178 x = x - skew[i] + skew[i+curr_size] | |
| 179 y = y - skew[i+curr_size] + skew[i+2*curr_size] | |
| 180 curr_diffs.append(abs(x-y)) | |
| 181 if len(curr_diffs) > 0: | |
| 182 max_cd = float(max(curr_diffs)) | |
| 183 seq2skewi[my_description] = max_cd/float(len(my_seq))*float(args.window_size) | |
| 184 sys.stderr.write("\r\t%i sequences evaluated" % (count_seqs)) | |
| 185 sys.stderr.flush() | |
| 186 | |
| 187 sys.stderr.write("\r\t%i total sequences evaluated (all finished)\n" % (count_seqs)) | |
| 188 sys.stderr.flush() | |
| 189 | |
| 190 if args.out_file == "": | |
| 191 sys.stdout.write("Sequence\tSkewI\n") | |
| 192 for seq in seq2skewi: | |
| 193 sys.stdout.write("%s\t%0.10f\n" % (seq,seq2skewi[seq])) | |
| 194 else: | |
| 195 sys.stdout.write(" >> Printing SkewI values to file: %s\n" % args.out_file) | |
| 196 o_file = open(args.out_file,'w') | |
| 197 o_file.write("Sequence\tSkewI\n") | |
| 198 for seq in seq2skewi: | |
| 199 o_file.write("%s\t%0.10f\n" % (seq,seq2skewi[seq])) | |
| 200 o_file.close() | |
| 201 | |
| 202 #End program | |
| 203 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
| 204 sys.stderr.write(" >> PROGRAM FINISH TIME: " + time + '\n') | |
| 205 if __name__== "__main__": | |
| 206 main() |
