Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/SkewIT/src/gcskew.py @ 1:032f6b3806a3 draft
Uploaded
| author | dereeper |
|---|---|
| date | Thu, 30 May 2024 11:16:08 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:3cbb01081cde | 1:032f6b3806a3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 ########################################################################## | |
| 3 #gcskew.py calculates gcskew values for a given chromosome | |
| 4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu | |
| 5 # | |
| 6 #This file is part of SkewIT | |
| 7 # | |
| 8 #SkewIT is free software; you can redistribute it and/or modify | |
| 9 #it under the terms of the GNU General Public License as published by | |
| 10 #the Free Software Foundation; either version 3 of the license, or | |
| 11 #(at your option) any later version. | |
| 12 | |
| 13 #This program is distributed in the hope that it will be useful, | |
| 14 #but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 #GNU General Public License for more details. | |
| 17 | |
| 18 #You should have received a copy of the GNU General Public License | |
| 19 #along with this program; if not, see <http://www.gnu.org/licenses/>. | |
| 20 ############################################################################## | |
| 21 #Jennifer Lu, jennifer.lu717@gmail.com | |
| 22 #02/26/2020 | |
| 23 # | |
| 24 #This program calculate gc_skew for a genome provided to this program. | |
| 25 #By default, the program will calculate gc-skew for 20kb windows every 20kb | |
| 26 #(adjacent/non-overlapping windows of 20kb) | |
| 27 # | |
| 28 #Users can specify window size and frequency of the calculation. If a specified | |
| 29 #frequency is less than the window size, windows will be overlapping | |
| 30 ###################################################################### | |
| 31 import sys, os, argparse | |
| 32 from time import gmtime | |
| 33 from time import strftime | |
| 34 from Bio import SeqIO | |
| 35 import numpy as np | |
| 36 ##################################################################### | |
| 37 def usage(): | |
| 38 sys.stderr.write("\n #########################################################################################\n") | |
| 39 sys.stderr.write(" ################################## USAGE: SKEWI.PY ######################################\n") | |
| 40 sys.stderr.write(" ## > python gcskew.py -i SEQ.FASTA -o GCSKEW.TXT\t\t\t\t\t##\n") | |
| 41 sys.stderr.write(" ## \t -i SEQ.FASTA............fasta file (multi-fasta permitted)\t\t\t##\n") | |
| 42 sys.stderr.write(" ## Optional Parameters:\t\t\t\t\t\t\t\t##\n") | |
| 43 sys.stderr.write(" ## \t -o SKEW.TXT.............output file (see below)\t\t\t\t##\n") | |
| 44 sys.stderr.write(" ## \t -k WINDOW_SIZE..........length of subsequences for which to calculate gc skew\t##\n") | |
| 45 sys.stderr.write(" ## \t .....................[default:20000]\t\t\t\t\t##\n" ) | |
| 46 sys.stderr.write(" ## \t -f FREQUENCY............number of bases between the start of each window\t##\n") | |
| 47 sys.stderr.write(" ## \t .....................[default: k == f]\t\t\t\t\t##\n") | |
| 48 sys.stderr.write(" ## Output file: If no output file is provided, SkewI will be printed to gcskew.txt\t##\n") | |
| 49 sys.stderr.write(" ## \t The output file is a 3 column, tab-delimited file listing:\t\t##\n") | |
| 50 sys.stderr.write(" ## \t (1) sequence ID (2) starting index of window (3) gc skew value\t\t##\n") | |
| 51 sys.stderr.write(" #########################################################################################\n") | |
| 52 sys.stderr.write(" #########################################################################################\n\n") | |
| 53 exit(0) | |
| 54 ##################################################################### | |
| 55 def main(): | |
| 56 parser = argparse.ArgumentParser() | |
| 57 parser.add_argument("-i","--input","-s","--seq", | |
| 58 dest="in_file", required=False, default="", | |
| 59 help="FASTA/Multi-FASTA sequence file for which to calculate gc_skew") | |
| 60 parser.add_argument("-k","-w","--window-size", default=20000, | |
| 61 dest="window_size", required=False, type=int, | |
| 62 help="Window size for which to calculate each g-c/g+c [default: 20Kbp]") | |
| 63 parser.add_argument("-f","--freq",required=False,type=int, | |
| 64 dest="freq", default=-1, | |
| 65 help="Frequency at which to calculate GC skew [default: same as window size]") | |
| 66 parser.add_argument("-o","--output", required=False, default="gcskew.txt", | |
| 67 dest="out_file", help="Output text file to save gc skew calculations [default: gcskew.txt]") | |
| 68 parser.add_argument('--usage', required=False, dest='usage', | |
| 69 action='store_true', default=False, | |
| 70 help='Prints usage information for this program') | |
| 71 args=parser.parse_args() | |
| 72 | |
| 73 ############################################## | |
| 74 #Test Parameters | |
| 75 if args.usage: | |
| 76 usage() | |
| 77 if (args.in_file == ""): | |
| 78 sys.stderr.write(" >> Please provide an input sequence file\n") | |
| 79 usage() | |
| 80 exit(1) | |
| 81 if (not os.path.isfile(args.in_file)): | |
| 82 sys.stderr.write(" >> ERROR: %s is not a valid file\n" % args.in_file) | |
| 83 exit(1) | |
| 84 if args.freq == -1: | |
| 85 freq = args.window_size | |
| 86 else: | |
| 87 freq = args.freq | |
| 88 if (args.window_size < 0): | |
| 89 sys.stderr.write(" >> ERROR: window size -w must be greater than 0\n") | |
| 90 exit(1) | |
| 91 if (freq < 0): | |
| 92 sys.stderr.write(" >> ERROR: -f/--freq must be greater than 0\n") | |
| 93 exit(1) | |
| 94 if (args.freq > args.window_size): | |
| 95 sys.stderr.write(" >> ERROR: window size must be >= frequency\n") | |
| 96 exit(1) | |
| 97 ############################################## | |
| 98 #Start program | |
| 99 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
| 100 sys.stdout.write("PROGRAM START TIME: " + time + '\n') | |
| 101 sys.stdout.write(" Input file: %s\n" % args.in_file) | |
| 102 sys.stdout.write(" Output: %s\n" % args.out_file) | |
| 103 sys.stdout.write(" Window size (bp): %i\n" % args.window_size) | |
| 104 sys.stdout.write(" Frequency (bp): %i\n" % freq) | |
| 105 sys.stdout.flush() | |
| 106 ############################################################################ | |
| 107 #Process sequence file | |
| 108 id2string = {} | |
| 109 id2name = {} | |
| 110 count_seqs = 0 | |
| 111 sys.stdout.write(">> Reading sequence file %s\n" % args.in_file) | |
| 112 sys.stdout.write("\t%i seqs found" % (count_seqs)) | |
| 113 sys.stdout.flush() | |
| 114 for record in SeqIO.parse(args.in_file,'fasta'): | |
| 115 count_seqs += 1 | |
| 116 sys.stdout.write("\r\t%i seqs found" % (count_seqs)) | |
| 117 sys.stdout.flush() | |
| 118 #Save string | |
| 119 id2name[count_seqs] = record.id | |
| 120 id2string[count_seqs] = str(record.seq) | |
| 121 sys.stdout.write("\r\t%i seqs found\n" % (count_seqs)) | |
| 122 sys.stdout.flush() | |
| 123 | |
| 124 #Calculate and plot gc skew | |
| 125 tot = count_seqs | |
| 126 count_seqs = 0 | |
| 127 sys.stdout.write(">> Calculating/Saving GC Skew for %i seqs\n" % tot) | |
| 128 sys.stdout.write("\tworking on sequence %i/%i" % (count_seqs,tot)) | |
| 129 sys.stdout.flush() | |
| 130 #Prep output file | |
| 131 o_file = open(args.out_file,'w') | |
| 132 o_file.write("Sequence\tIndex\tGC Skew (%ikb)\n" % (args.window_size/1000)) | |
| 133 #For each sequence, calculate gc skew and print | |
| 134 for i in range(1,tot+1): | |
| 135 #Update | |
| 136 count_seqs += 1 | |
| 137 sys.stdout.write("\r\tworking on sequence %i/%i" % (count_seqs,tot)) | |
| 138 sys.stdout.flush() | |
| 139 #Calculate gc skew | |
| 140 my_seq = id2string[i].upper() | |
| 141 my_description = id2name[i] | |
| 142 count = 0 | |
| 143 #Calculate skew | |
| 144 g = 0.0 | |
| 145 c = 0.0 | |
| 146 curr_seq = "" | |
| 147 for i in range(0, len(my_seq), freq): | |
| 148 if (i+args.window_size) >= len(my_seq): | |
| 149 break | |
| 150 curr_seq = my_seq[i:i+args.window_size] | |
| 151 g = float(curr_seq.count("G")) | |
| 152 c = float(curr_seq.count("C")) | |
| 153 if (g+c) > 0: | |
| 154 new_calc = (g-c)/(g+c) | |
| 155 else: | |
| 156 new_calc = 0.0 | |
| 157 #Print to file | |
| 158 o_file.write("%s\t%i\t%0.8f\n" % (my_description, i, new_calc)) | |
| 159 sys.stdout.write("\r\tFinished calculating gc skew for %i sequences\n" % tot) | |
| 160 sys.stdout.flush() | |
| 161 o_file.close() | |
| 162 #End program | |
| 163 time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | |
| 164 sys.stdout.write("PROGRAM FINISH TIME: " + time + '\n') | |
| 165 | |
| 166 if __name__== "__main__": | |
| 167 main() |
