| 3 | 1 #! /usr/bin/env python | 
|  | 2 ########################################################################## | 
|  | 3 #plot_gcskew.py plots gcskew values for a given chromosome | 
|  | 4 #Copyright (C) 2020 Jennifer Lu, jlu26@jhmi.edu | 
|  | 5 # | 
|  | 6 #This file is part of SkewIT | 
|  | 7 # | 
|  | 8 #SkewIT is free software; you can redistribute it and/or modify | 
|  | 9 #it under the terms of the GNU General Public License as published by | 
|  | 10 #the Free Software Foundation; either version 3 of the license, or | 
|  | 11 #(at your option) any later version. | 
|  | 12 | 
|  | 13 #This program is distributed in the hope that it will be useful, | 
|  | 14 #but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 15 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 
|  | 16 #GNU General Public License for more details. | 
|  | 17 | 
|  | 18 #You should have received a copy of the GNU General Public License | 
|  | 19 #along with this program; if not, see <http://www.gnu.org/licenses/>. | 
|  | 20 ############################################################################## | 
|  | 21 #Jennifer Lu, jlu26@jhmi.edu | 
|  | 22 #2019/01/15 | 
|  | 23 # | 
|  | 24 #This program plots gc_skew for a genome provided to this program | 
|  | 25 #By default, the program will calculate gc-skew for 20kb windows every 20kb | 
|  | 26 #(adjacent/non-overlapping windows of 20kb) | 
|  | 27 # | 
|  | 28 #Users can specify window size and frequency of the calculation. If a specified | 
|  | 29 #frequency is less than the window size, windows will be overlapping | 
|  | 30 ###################################################################### | 
|  | 31 import sys, os, argparse | 
|  | 32 from time import gmtime | 
|  | 33 from time import strftime | 
|  | 34 from Bio import SeqIO | 
|  | 35 import numpy as np | 
|  | 36 | 
|  | 37 import matplotlib | 
|  | 38 import matplotlib.pyplot as plt | 
|  | 39 import matplotlib.ticker as tick | 
|  | 40 | 
|  | 41 def reformat_bp(tick_val, pos): | 
|  | 42     val = round(tick_val/1000000,1) | 
|  | 43     new_tick = '{:} Mb'.format(val) | 
|  | 44     new_tick = str(new_tick) | 
|  | 45     return new_tick | 
|  | 46 | 
|  | 47 def main(): | 
|  | 48     parser = argparse.ArgumentParser() | 
|  | 49     parser.add_argument("-i","--input","-s","--seq", | 
|  | 50         dest="in_file", required=True, | 
|  | 51         help="Sequence for which to calculate gc_skew") | 
|  | 52     parser.add_argument("-k","-l","--window_len", default=20000, | 
|  | 53         dest="window_size", required=False, type=int, | 
|  | 54         help="Window size for which to calculate each g-c/g+c [default: 20Kbp]") | 
|  | 55     parser.add_argument("--freq",required=False,type=int, | 
|  | 56         dest="freq", default=1000, | 
|  | 57         help="Frequency at which to calculate GC skew [default: 1Kbp]") | 
|  | 58     parser.add_argument("-o","--output", required=False, default="curr.png", | 
|  | 59         dest="out_file", help="Name of GC Skew Plot to Save to (default: curr.png)") | 
|  | 60     args=parser.parse_args() | 
|  | 61 | 
|  | 62     #Start program | 
|  | 63     time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | 
|  | 64     sys.stdout.write("   PROGRAM START TIME: " + time + '\n') | 
|  | 65     if args.freq == 0: | 
|  | 66         freq = args.window_size | 
|  | 67     else: | 
|  | 68         freq = args.freq | 
|  | 69     #Process sequence file | 
|  | 70     id2string = {} | 
|  | 71     id2name = {} | 
|  | 72     count_seqs = 0 | 
|  | 73     sys.stdout.write("\t>> Processing sequence file\n") | 
|  | 74     sys.stdout.write("\t\t%i seqs found" % (count_seqs)) | 
|  | 75     sys.stdout.flush() | 
|  | 76     for record in SeqIO.parse(args.in_file,'fasta'): | 
|  | 77         count_seqs += 1 | 
|  | 78         if count_seqs % 100 == 0: | 
|  | 79             sys.stdout.write("\r\t\t%i seqs found" % (count_seqs)) | 
|  | 80             sys.stdout.flush() | 
|  | 81         #Save string | 
|  | 82         id2name[count_seqs] = record.id | 
|  | 83         id2string[count_seqs] = str(record.seq) | 
|  | 84     sys.stdout.write("\r\t\t%i seqs found\n" % (count_seqs)) | 
|  | 85     sys.stdout.flush() | 
|  | 86 | 
|  | 87     #Calculate and plot gc skew | 
|  | 88     tot = count_seqs | 
|  | 89     count_seqs = 0 | 
|  | 90     sys.stdout.write("\t>> Plotting GC Skew for %i seqs\n" % tot) | 
|  | 91     sys.stdout.write("\t\t%i sequences processed" % (count_seqs)) | 
|  | 92     sys.stdout.flush() | 
|  | 93     plot_filename = args.out_file | 
|  | 94     fig,ax = plt.subplots(nrows=tot, ncols=1,figsize=(10,3*tot),squeeze=False) | 
|  | 95     for i in range(1,tot+1): | 
|  | 96         #Calculate/plot skew | 
|  | 97         my_seq = id2string[i] | 
|  | 98         my_description = id2name[i] | 
|  | 99         count = 0 | 
|  | 100         #Calculate skew | 
|  | 101         g = 0.0 | 
|  | 102         c = 0.0 | 
|  | 103         curr_seq = "" | 
|  | 104         indices = [] | 
|  | 105         skews = [] | 
|  | 106         for j in range(0, len(my_seq), freq): | 
|  | 107             if (j+args.window_size) >= len(my_seq): | 
|  | 108                 break | 
|  | 109             curr_seq = my_seq[j:j+args.window_size] | 
|  | 110             g = float(curr_seq.count("G")) | 
|  | 111             c = float(curr_seq.count("C")) | 
|  | 112             if (g+c) > 0: | 
|  | 113                 new_calc = (g-c)/(g+c) | 
|  | 114             else: | 
|  | 115                 new_calc = 0.0 | 
|  | 116             #Save values | 
|  | 117             indices.append(j) | 
|  | 118             skews.append(new_calc) | 
|  | 119         #Final print | 
|  | 120         count_seqs += 1 | 
|  | 121         sys.stdout.write("\r\t\t%i sequences processed" % (count_seqs)) | 
|  | 122         sys.stdout.flush() | 
|  | 123         #Split to pos/neg | 
|  | 124         s = np.array(skews) | 
|  | 125         pos = np.ma.masked_where(s <= 0, s) | 
|  | 126         neg = np.ma.masked_where(s >= 0, s) | 
|  | 127         mid = np.ma.masked_where(s != 0, s) | 
|  | 128         #Print to file | 
|  | 129         lines = ax[i-1,0].plot(indices,pos, 'deepskyblue', indices,mid, 'g', indices,neg,'k') | 
|  | 130         ax[i-1,0].set(xlabel='Genome Position', ylabel='GC Skew', | 
|  | 131             title=my_description) | 
|  | 132         ax[i-1,0].xaxis.set_major_formatter(tick.FuncFormatter(reformat_bp)) | 
|  | 133         ax[i-1,0].grid() | 
|  | 134         #Font Sizes | 
|  | 135         plt.setp(lines,linewidth=0.3) | 
|  | 136         plt.xticks(fontsize=8) | 
|  | 137         plt.yticks(fontsize=8) | 
|  | 138         plt.tight_layout() | 
|  | 139     fig.savefig(plot_filename) | 
|  | 140     sys.stdout.write("\r\t\t%i sequences processed (all finished)\n" % (count_seqs)) | 
|  | 141     sys.stdout.flush() | 
|  | 142 | 
|  | 143     #End program | 
|  | 144     time = strftime("%m-%d-%Y %H:%M:%S", gmtime()) | 
|  | 145     sys.stdout.write("   PROGRAM FINISH TIME: " + time + '\n') | 
|  | 146 | 
|  | 147 if __name__== "__main__": | 
|  | 148     main() |