comparison frp_tool.py @ 0:e223b827a13e draft

Uploaded
author schang
date Tue, 02 Dec 2014 19:38:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e223b827a13e
1 #!/usr/bin/env python
2
3 '''
4 Author: Silvia Chang
5 Metagenomic Final Project
6 Fall 2014
7
8 Script to create fragment recruitment plots, after read-reference alignment using Bowtie (SAM output) and FR-HIT (fr-hit output) programs
9 Simplified/revised version of the fragrecruit project from https://github.com/mstrosaker/fragrecruit
10
11 Usage: ./frp_tool.py [options] --input --output
12
13 '''
14 import matplotlib.pyplot as plt
15 import os, sys, getopt, imp, errno
16 # import shutil
17 from refseq_parser import SequenceFile
18 from SAM_parser import SAMFile
19 from FRHIT_parser import FRHITFile
20
21 class FragmentPlotter:
22 '''Contains all functions to create fragment recruitment plots'''
23
24 # Color list. Can only handle 5 samples. Add more colors for more samples.
25 colors = [
26 (0.78, 0.59, 0.17, 1.0), #dark gold
27 (0.99, 0.60, 0.17, 1.0), #light orange
28 (0.60, 0.82, 0.47, 1.0), #medium/light green
29 (0.95, 0.40, 0.40, 1.0), #dark pink/fuschia
30 (0.17, 0.50, 0.69, 1.0), #medium teal/blue
31 (0.85, 0.64, 0.60, 1.0), #light plum
32 ]
33
34 def __init__(self, microbiome, pt_color, ymin=50):
35 self.microbiome = microbiome
36 self.decorated = False
37 self.pt_color = pt_color
38 self.ymin = ymin
39
40 def new_plot(self, ref_name, contigs, begin, length):
41 ''' Clears previous data and start a blank plot'''
42 # refgenome - string, organism name of the refgenome
43 # contigs - a list containing a [name,length] pair of the refgenome
44 # begin, length - integers indicate beginning/length of plot
45
46 plt.close()
47 # Initialize variables that will contain all the fragments information
48 self.refgenome = ref_name
49 self.contigs = contigs
50 self.begin = begin
51 self.samples = []
52 self.n_points = 0
53 self.x_points = []
54 self.y_points = []
55 self.clr_points = []
56 self.decorated = False
57
58 self.full_length = 0
59
60 # Builds the full length of the reference genome, or partial based on begin and length variables
61 for contig in self.contigs:
62 self.full_length += contig[1]
63
64 # Sets the range of the x axis
65 if length != 0:
66 self.xextent = begin + length
67 else:
68 self.xextent = self.full_length
69
70 def decorate(self):
71 ''' Decorates the plot by adding axes, axes labels, title, tickmarks, etc.'''
72 if not self.decorated:
73 self.decorated = True
74
75 # Sets the rc parameters, add x-axis tickmarks and labels them
76 plt.rc('xtick', labelsize=8)
77 ticks = [self.begin] # beginning tick
78 ticks.append(self.xextent/2) # mid tick
79 ticks.append(self.xextent) # last tick
80
81 labels = [int(self.begin)] # beginning nucleotide
82 labels.append(str(self.xextent/2)) # mid nucleotide
83 labels.append(str(self.xextent)) # last nucleotide
84
85 plt.xticks(ticks, labels)
86
87 # Sets x and y axes boundaries and labels
88 plt.xlim(self.begin, self.xextent)
89 plt.ylim(self.ymin, 100)
90 plt.ylabel("% Identity")
91
92 # Set plot titles
93 plt.title(self.refgenome)
94 plt.suptitle(self.microbiome)
95
96 def draw(self, filename):
97 '''Creates a scatter plot of fragments after they're added in add_points() and save the plots in the specified file name'''
98
99 # Sets points size and shape based on number of points recruited
100 if self.n_points > 10000:
101 pt_shape = 'o'
102 pt_size = 1
103 else:
104 pt_shape ='s'
105 pt_size = 3
106
107 # Generate the scatter plot
108 # plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none')
109
110 # Adds the legend on the plot. Currently, can only add one. Uncomment the plt.legend method.
111 plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none', label=self.samples[0])
112 plt.legend(loc='lower left', ncol=3, scatterpoints=1, markerscale=8, prop={'size':10})
113
114 # Adds titles, axes, etc
115 self.decorate()
116
117 # Save the images
118 plt.savefig(filename, dpi=150, bbox_inches=0)
119
120 def legend(self, filename, width=4):
121 '''Create a legend for the plot'''
122 plt.figure(figsize=(width, 0.45*len(self.samples)))
123 plt.axis('off')
124 plt.ylim(-1,0.1)
125
126 i = 0
127 ystep = 0
128 for s in self.samples:
129 plt.text(0.00, -1*ystep-0.1, 'text',
130 backgroundcolor=self.colors[i], color=self.colors[i])
131 plt.text(0.27, -1*ystep-0.1, s)
132 i += 1
133 ystep += 0.2
134
135 plt.title("Samples")
136 plt.savefig(filename, bbox_inches=0)
137
138 def show(self):
139 self.decorate()
140 plt.show()
141
142 def add_sample(self, name):
143 self.samples.append(name)
144
145 def add_points(self, sample, ref_contig, coordinate, identity):
146 x= 0
147
148 # Builds the reference length
149 for c in self.contigs:
150 if (ref_contig == c[0]):
151 break
152 x += c[1]
153
154 # Applies new color to different samples
155 clr = self.pt_color
156 for s in self.samples:
157 if(sample == s):
158 break
159 clr += 1
160
161 if x + coordinate >= self.begin and x + coordinate <= self.xextent:
162 self.n_points += 1
163 self.x_points.append(x + coordinate)
164 self.y_points.append(identity)
165 self.clr_points.append(self.colors[clr])
166
167 def usage():
168 print ('''Usage:
169 %s [options] <aligned_input> <ref_genome>
170 Options
171 -h: display this help message
172 -i <int>: minimum identity to include in plot (y-axis extent)
173 -b <int>: begin the plot at the nth nucleotide in the ref genome
174 -l <int>: include n nucleotides in the x-axis of the plot
175 the default behavior is to plot the entirety of the genome
176 -t <str>: title of the plot (e.g. name of reference genome). Put in quotes.
177 -m <str>: Name of the microbiome. Put in quotes.
178 -c <int>: number indicating color for plot points.
179 ''' % sys.argv[0])
180
181 if __name__ == '__main__':
182 # Parse all command-line arguments
183 # variable opts contains a list of the key-value pair of (option, value); variable args contains a list of the configuration file
184 try:
185 opts, args = getopt.getopt(sys.argv[1:], 'hi:b:l:t:m:c:', ['help', 'identity=', 'begin=', 'length=', 'title=', 'microbiome=', 'color='])
186 except getopt.GetoptError as err:
187 print(str(err))
188 usage()
189 sys.exit(2)
190
191 # Set some default values
192 identity = 50.0
193 begin = 0
194 length = 0
195 title = ''
196 microbiome=''
197 color = 0
198
199 # Parse options and set user-defined values
200 for opt, value in opts:
201 if opt in ('-h', '--help'):
202 usage()
203 sys.exit(0)
204 elif opt in ('-i', '--identity'):
205 identity = float(value)
206 elif opt in ('-b', '--begin'):
207 begin = int(value)
208 elif opt in ('-l', '--length'):
209 length = int(length)
210 elif opt in ('-t', '--title'):
211 title = str(value)
212 elif opt in ('-m', '--microbiome'):
213 microbiome = str(value)
214 elif opt in ('-c', '--color'):
215 color = int(value)
216 else:
217 assert False, "invalid option"
218
219 # Verify there are input files: input and ref_genome
220 if len(args) == 0:
221 print ('Need input files.')
222 usage()
223 sys.exit
224
225 # Load and read the configuration file. Treats the file as module object
226 # config = imp.load_source('*', args[0])
227
228 aligned_input= args[0]
229 ref_genome = args[1]
230
231 # Instantiate plotting object
232 plotter = FragmentPlotter(microbiome, color, ymin=identity)
233
234
235 # Begin the plotting commands, starting each with the reference genome
236 # for refgenome in ref_genome:
237 '''refgenome = [(name, taxonID, file_path), ..., ]'''
238 contigs = []
239
240 # Parse the reference genome's FASTA file.
241 fasta = open(ref_genome, 'r')
242 for refseq in SequenceFile(fasta):
243 contigs.append([refseq.name.split('|')[3], len(refseq.seq)])
244
245 fasta.close()
246
247 # Sets the ref genome's accession # as the title if title is not set (i.e. empty)
248 if title == '':
249 title = contigs[0][0]
250
251 # Initialize the new plot
252 plotter.new_plot(title, contigs, begin, length)
253
254 count_samp = 0
255 # for sample in aligned_input:
256 '''sample = [(name, {taxonID:path}), ...,]'''
257 # plotter.add_sample("test")
258 count_samp += 1
259
260 # Call the appropriate parser based on sample file extension
261 # file = '%s%s' % (config.basepath, sample[1][refgenome[1]])
262
263 # if aligned_input.endswith('.sam'):
264 # parser_func = SAMFile
265 # elif aligned_input.endswith('.fr-hit'):
266 # parser_func = FRHITFile
267 # else:
268 # print('Invalid file type')
269
270 parser_func = SAMFile
271
272 sample_name =""
273 for frag in parser_func(aligned_input):
274 if frag.identity >= identity:
275 sample_name = frag.name.split('.')[0]
276 plotter.add_points(frag.name, frag.refseq, frag.location, frag.identity)
277
278 plotter.add_sample(sample_name)
279 # Put this inside the sample for-loop to create individual plot per sample
280 plotter.draw('%s.png' % ("frp_image"))
281
282
283 # plotter.show()
284
285 # Draw the points and save file with taxonID as base name. Put this outside the sample for-loop to create an all-samples plot.
286 # Put this inside the sample for-loop to create individual plot per sample
287 # plotter.draw('%s/%s.png' % (config.shortname, refgenome[1]))
288
289 # plotter.legend('%s/legend.png' % (config.shortname))
290
291 # Make a tarball of the resulting PNGs
292 # os.system('tar -cvf %s.tar %s >/dev/null' %
293 # ( config.shortname, config.shortname))
294 # os.system('gzip %s.tar' % (config.shortname))
295 # shutil.rmtree(config.shortname)
296
297
298
299
300