0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 '''
|
|
4 Author: Silvia Chang
|
|
5 Metagenomic Final Project
|
|
6 Fall 2014
|
|
7
|
|
8 Script to create fragment recruitment plots, after read-reference alignment using Bowtie (SAM output) and FR-HIT (fr-hit output) programs
|
|
9 Simplified/revised version of the fragrecruit project from https://github.com/mstrosaker/fragrecruit
|
|
10
|
|
11 Usage: ./frp_tool.py [options] --input --output
|
|
12
|
|
13 '''
|
|
14 import matplotlib.pyplot as plt
|
|
15 import os, sys, getopt, imp, errno
|
|
16 # import shutil
|
|
17 from refseq_parser import SequenceFile
|
|
18 from SAM_parser import SAMFile
|
|
19 from FRHIT_parser import FRHITFile
|
|
20
|
|
21 class FragmentPlotter:
|
|
22 '''Contains all functions to create fragment recruitment plots'''
|
|
23
|
|
24 # Color list. Can only handle 5 samples. Add more colors for more samples.
|
|
25 colors = [
|
|
26 (0.78, 0.59, 0.17, 1.0), #dark gold
|
|
27 (0.99, 0.60, 0.17, 1.0), #light orange
|
|
28 (0.60, 0.82, 0.47, 1.0), #medium/light green
|
|
29 (0.95, 0.40, 0.40, 1.0), #dark pink/fuschia
|
|
30 (0.17, 0.50, 0.69, 1.0), #medium teal/blue
|
|
31 (0.85, 0.64, 0.60, 1.0), #light plum
|
|
32 ]
|
|
33
|
|
34 def __init__(self, microbiome, pt_color, ymin=50):
|
|
35 self.microbiome = microbiome
|
|
36 self.decorated = False
|
|
37 self.pt_color = pt_color
|
|
38 self.ymin = ymin
|
|
39
|
|
40 def new_plot(self, ref_name, contigs, begin, length):
|
|
41 ''' Clears previous data and start a blank plot'''
|
|
42 # refgenome - string, organism name of the refgenome
|
|
43 # contigs - a list containing a [name,length] pair of the refgenome
|
|
44 # begin, length - integers indicate beginning/length of plot
|
|
45
|
|
46 plt.close()
|
|
47 # Initialize variables that will contain all the fragments information
|
|
48 self.refgenome = ref_name
|
|
49 self.contigs = contigs
|
|
50 self.begin = begin
|
|
51 self.samples = []
|
|
52 self.n_points = 0
|
|
53 self.x_points = []
|
|
54 self.y_points = []
|
|
55 self.clr_points = []
|
|
56 self.decorated = False
|
|
57
|
|
58 self.full_length = 0
|
|
59
|
|
60 # Builds the full length of the reference genome, or partial based on begin and length variables
|
|
61 for contig in self.contigs:
|
|
62 self.full_length += contig[1]
|
|
63
|
|
64 # Sets the range of the x axis
|
|
65 if length != 0:
|
|
66 self.xextent = begin + length
|
|
67 else:
|
|
68 self.xextent = self.full_length
|
|
69
|
|
70 def decorate(self):
|
|
71 ''' Decorates the plot by adding axes, axes labels, title, tickmarks, etc.'''
|
|
72 if not self.decorated:
|
|
73 self.decorated = True
|
|
74
|
|
75 # Sets the rc parameters, add x-axis tickmarks and labels them
|
|
76 plt.rc('xtick', labelsize=8)
|
|
77 ticks = [self.begin] # beginning tick
|
|
78 ticks.append(self.xextent/2) # mid tick
|
|
79 ticks.append(self.xextent) # last tick
|
|
80
|
|
81 labels = [int(self.begin)] # beginning nucleotide
|
|
82 labels.append(str(self.xextent/2)) # mid nucleotide
|
|
83 labels.append(str(self.xextent)) # last nucleotide
|
|
84
|
|
85 plt.xticks(ticks, labels)
|
|
86
|
|
87 # Sets x and y axes boundaries and labels
|
|
88 plt.xlim(self.begin, self.xextent)
|
|
89 plt.ylim(self.ymin, 100)
|
|
90 plt.ylabel("% Identity")
|
|
91
|
|
92 # Set plot titles
|
|
93 plt.title(self.refgenome)
|
|
94 plt.suptitle(self.microbiome)
|
|
95
|
|
96 def draw(self, filename):
|
|
97 '''Creates a scatter plot of fragments after they're added in add_points() and save the plots in the specified file name'''
|
|
98
|
|
99 # Sets points size and shape based on number of points recruited
|
|
100 if self.n_points > 10000:
|
|
101 pt_shape = 'o'
|
|
102 pt_size = 1
|
|
103 else:
|
|
104 pt_shape ='s'
|
|
105 pt_size = 3
|
|
106
|
|
107 # Generate the scatter plot
|
|
108 # plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none')
|
|
109
|
|
110 # Adds the legend on the plot. Currently, can only add one. Uncomment the plt.legend method.
|
|
111 plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none', label=self.samples[0])
|
|
112 plt.legend(loc='lower left', ncol=3, scatterpoints=1, markerscale=8, prop={'size':10})
|
|
113
|
|
114 # Adds titles, axes, etc
|
|
115 self.decorate()
|
|
116
|
|
117 # Save the images
|
|
118 plt.savefig(filename, dpi=150, bbox_inches=0)
|
|
119
|
|
120 def legend(self, filename, width=4):
|
|
121 '''Create a legend for the plot'''
|
|
122 plt.figure(figsize=(width, 0.45*len(self.samples)))
|
|
123 plt.axis('off')
|
|
124 plt.ylim(-1,0.1)
|
|
125
|
|
126 i = 0
|
|
127 ystep = 0
|
|
128 for s in self.samples:
|
|
129 plt.text(0.00, -1*ystep-0.1, 'text',
|
|
130 backgroundcolor=self.colors[i], color=self.colors[i])
|
|
131 plt.text(0.27, -1*ystep-0.1, s)
|
|
132 i += 1
|
|
133 ystep += 0.2
|
|
134
|
|
135 plt.title("Samples")
|
|
136 plt.savefig(filename, bbox_inches=0)
|
|
137
|
|
138 def show(self):
|
|
139 self.decorate()
|
|
140 plt.show()
|
|
141
|
|
142 def add_sample(self, name):
|
|
143 self.samples.append(name)
|
|
144
|
|
145 def add_points(self, sample, ref_contig, coordinate, identity):
|
|
146 x= 0
|
|
147
|
|
148 # Builds the reference length
|
|
149 for c in self.contigs:
|
|
150 if (ref_contig == c[0]):
|
|
151 break
|
|
152 x += c[1]
|
|
153
|
|
154 # Applies new color to different samples
|
|
155 clr = self.pt_color
|
|
156 for s in self.samples:
|
|
157 if(sample == s):
|
|
158 break
|
|
159 clr += 1
|
|
160
|
|
161 if x + coordinate >= self.begin and x + coordinate <= self.xextent:
|
|
162 self.n_points += 1
|
|
163 self.x_points.append(x + coordinate)
|
|
164 self.y_points.append(identity)
|
|
165 self.clr_points.append(self.colors[clr])
|
|
166
|
|
167 def usage():
|
|
168 print ('''Usage:
|
|
169 %s [options] <aligned_input> <ref_genome>
|
|
170 Options
|
|
171 -h: display this help message
|
|
172 -i <int>: minimum identity to include in plot (y-axis extent)
|
|
173 -b <int>: begin the plot at the nth nucleotide in the ref genome
|
|
174 -l <int>: include n nucleotides in the x-axis of the plot
|
|
175 the default behavior is to plot the entirety of the genome
|
|
176 -t <str>: title of the plot (e.g. name of reference genome). Put in quotes.
|
|
177 -m <str>: Name of the microbiome. Put in quotes.
|
|
178 -c <int>: number indicating color for plot points.
|
|
179 ''' % sys.argv[0])
|
|
180
|
|
181 if __name__ == '__main__':
|
|
182 # Parse all command-line arguments
|
|
183 # variable opts contains a list of the key-value pair of (option, value); variable args contains a list of the configuration file
|
|
184 try:
|
|
185 opts, args = getopt.getopt(sys.argv[1:], 'hi:b:l:t:m:c:', ['help', 'identity=', 'begin=', 'length=', 'title=', 'microbiome=', 'color='])
|
|
186 except getopt.GetoptError as err:
|
|
187 print(str(err))
|
|
188 usage()
|
|
189 sys.exit(2)
|
|
190
|
|
191 # Set some default values
|
|
192 identity = 50.0
|
|
193 begin = 0
|
|
194 length = 0
|
|
195 title = ''
|
|
196 microbiome=''
|
|
197 color = 0
|
|
198
|
|
199 # Parse options and set user-defined values
|
|
200 for opt, value in opts:
|
|
201 if opt in ('-h', '--help'):
|
|
202 usage()
|
|
203 sys.exit(0)
|
|
204 elif opt in ('-i', '--identity'):
|
|
205 identity = float(value)
|
|
206 elif opt in ('-b', '--begin'):
|
|
207 begin = int(value)
|
|
208 elif opt in ('-l', '--length'):
|
|
209 length = int(length)
|
|
210 elif opt in ('-t', '--title'):
|
|
211 title = str(value)
|
|
212 elif opt in ('-m', '--microbiome'):
|
|
213 microbiome = str(value)
|
|
214 elif opt in ('-c', '--color'):
|
|
215 color = int(value)
|
|
216 else:
|
|
217 assert False, "invalid option"
|
|
218
|
|
219 # Verify there are input files: input and ref_genome
|
|
220 if len(args) == 0:
|
|
221 print ('Need input files.')
|
|
222 usage()
|
|
223 sys.exit
|
|
224
|
|
225 # Load and read the configuration file. Treats the file as module object
|
|
226 # config = imp.load_source('*', args[0])
|
|
227
|
|
228 aligned_input= args[0]
|
|
229 ref_genome = args[1]
|
|
230
|
|
231 # Instantiate plotting object
|
|
232 plotter = FragmentPlotter(microbiome, color, ymin=identity)
|
|
233
|
|
234
|
|
235 # Begin the plotting commands, starting each with the reference genome
|
|
236 # for refgenome in ref_genome:
|
|
237 '''refgenome = [(name, taxonID, file_path), ..., ]'''
|
|
238 contigs = []
|
|
239
|
|
240 # Parse the reference genome's FASTA file.
|
|
241 fasta = open(ref_genome, 'r')
|
|
242 for refseq in SequenceFile(fasta):
|
|
243 contigs.append([refseq.name.split('|')[3], len(refseq.seq)])
|
|
244
|
|
245 fasta.close()
|
|
246
|
|
247 # Sets the ref genome's accession # as the title if title is not set (i.e. empty)
|
|
248 if title == '':
|
|
249 title = contigs[0][0]
|
|
250
|
|
251 # Initialize the new plot
|
|
252 plotter.new_plot(title, contigs, begin, length)
|
|
253
|
|
254 count_samp = 0
|
|
255 # for sample in aligned_input:
|
|
256 '''sample = [(name, {taxonID:path}), ...,]'''
|
|
257 # plotter.add_sample("test")
|
|
258 count_samp += 1
|
|
259
|
|
260 # Call the appropriate parser based on sample file extension
|
|
261 # file = '%s%s' % (config.basepath, sample[1][refgenome[1]])
|
|
262
|
|
263 # if aligned_input.endswith('.sam'):
|
|
264 # parser_func = SAMFile
|
|
265 # elif aligned_input.endswith('.fr-hit'):
|
|
266 # parser_func = FRHITFile
|
|
267 # else:
|
|
268 # print('Invalid file type')
|
|
269
|
|
270 parser_func = SAMFile
|
|
271
|
|
272 sample_name =""
|
|
273 for frag in parser_func(aligned_input):
|
|
274 if frag.identity >= identity:
|
|
275 sample_name = frag.name.split('.')[0]
|
|
276 plotter.add_points(frag.name, frag.refseq, frag.location, frag.identity)
|
|
277
|
|
278 plotter.add_sample(sample_name)
|
|
279 # Put this inside the sample for-loop to create individual plot per sample
|
|
280 plotter.draw('%s.png' % ("frp_image"))
|
|
281
|
|
282
|
|
283 # plotter.show()
|
|
284
|
|
285 # Draw the points and save file with taxonID as base name. Put this outside the sample for-loop to create an all-samples plot.
|
|
286 # Put this inside the sample for-loop to create individual plot per sample
|
|
287 # plotter.draw('%s/%s.png' % (config.shortname, refgenome[1]))
|
|
288
|
|
289 # plotter.legend('%s/legend.png' % (config.shortname))
|
|
290
|
|
291 # Make a tarball of the resulting PNGs
|
|
292 # os.system('tar -cvf %s.tar %s >/dev/null' %
|
|
293 # ( config.shortname, config.shortname))
|
|
294 # os.system('gzip %s.tar' % (config.shortname))
|
|
295 # shutil.rmtree(config.shortname)
|
|
296
|
|
297
|
|
298
|
|
299
|
|
300 |