changeset 0:e223b827a13e draft

Uploaded
author schang
date Tue, 02 Dec 2014 19:38:14 -0500
parents
children cbdcc9621730
files frp_tool.py
diffstat 1 files changed, 300 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/frp_tool.py	Tue Dec 02 19:38:14 2014 -0500
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+
+'''
+Author: Silvia Chang
+Metagenomic Final Project
+Fall 2014
+
+Script to create fragment recruitment plots, after read-reference alignment using Bowtie (SAM output) and FR-HIT (fr-hit output) programs
+Simplified/revised version of the fragrecruit project from https://github.com/mstrosaker/fragrecruit
+
+Usage: ./frp_tool.py [options] --input --output
+
+'''
+import matplotlib.pyplot as plt
+import os, sys, getopt, imp, errno
+# import shutil
+from refseq_parser import SequenceFile
+from SAM_parser import SAMFile
+from FRHIT_parser import FRHITFile
+
+class FragmentPlotter:
+    '''Contains all functions to create fragment recruitment plots'''
+    
+    # Color list. Can only handle 5 samples. Add more colors for more samples. 
+    colors = [              
+        (0.78, 0.59, 0.17, 1.0),   #dark gold
+        (0.99, 0.60, 0.17, 1.0),   #light orange   
+        (0.60, 0.82, 0.47, 1.0),   #medium/light green
+        (0.95, 0.40, 0.40, 1.0),   #dark pink/fuschia
+        (0.17, 0.50, 0.69, 1.0),   #medium teal/blue
+        (0.85, 0.64, 0.60, 1.0),   #light plum
+    ]
+
+    def __init__(self, microbiome, pt_color, ymin=50):
+        self.microbiome = microbiome
+        self.decorated = False
+        self.pt_color = pt_color
+        self.ymin = ymin
+              
+    def new_plot(self, ref_name, contigs, begin, length):
+        ''' Clears previous data and start a blank plot'''
+        # refgenome - string, organism name of the refgenome
+        # contigs - a list containing a [name,length] pair of the refgenome
+        # begin, length - integers indicate beginning/length of plot
+        
+        plt.close()
+        # Initialize variables that will contain all the fragments information
+        self.refgenome = ref_name
+        self.contigs = contigs
+        self.begin = begin
+        self.samples = []
+        self.n_points = 0
+        self.x_points = []
+        self.y_points = []
+        self.clr_points = []
+        self.decorated = False
+        
+        self.full_length = 0
+        
+        # Builds the full length of the reference genome, or partial based on begin and length variables
+        for contig in self.contigs:
+            self.full_length += contig[1]
+        
+        # Sets the range of the x axis
+        if length != 0:
+            self.xextent = begin + length
+        else:
+            self.xextent = self.full_length
+    
+    def decorate(self):
+        ''' Decorates the plot by adding axes, axes labels, title, tickmarks, etc.'''
+        if not self.decorated:
+            self.decorated = True
+            
+            # Sets the rc parameters, add x-axis tickmarks and labels them
+            plt.rc('xtick', labelsize=8)
+            ticks = [self.begin] # beginning tick
+            ticks.append(self.xextent/2) # mid tick
+            ticks.append(self.xextent) # last tick
+            
+            labels = [int(self.begin)] # beginning nucleotide
+            labels.append(str(self.xextent/2)) # mid nucleotide
+            labels.append(str(self.xextent)) # last nucleotide
+            
+            plt.xticks(ticks, labels)
+            
+            # Sets x and y axes boundaries and labels
+            plt.xlim(self.begin, self.xextent)
+            plt.ylim(self.ymin, 100)
+            plt.ylabel("% Identity")
+            
+            # Set plot titles
+            plt.title(self.refgenome)
+            plt.suptitle(self.microbiome)
+            
+    def draw(self, filename):
+        '''Creates a scatter plot of fragments after they're added in add_points() and save the plots in the specified file name'''
+        
+        # Sets points size and shape based on number of points recruited
+        if self.n_points > 10000:
+            pt_shape = 'o'
+            pt_size = 1
+        else:
+            pt_shape ='s'
+            pt_size = 3
+ 
+        # Generate the scatter plot
+#         plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none')
+         
+        # Adds the legend on the plot. Currently, can only add one. Uncomment the plt.legend method.
+        plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none', label=self.samples[0])
+        plt.legend(loc='lower left', ncol=3, scatterpoints=1, markerscale=8, prop={'size':10})
+        
+        # Adds titles, axes, etc
+        self.decorate()
+        
+        # Save the images
+        plt.savefig(filename, dpi=150, bbox_inches=0)
+
+    def legend(self, filename, width=4):
+        '''Create a legend for the plot'''
+        plt.figure(figsize=(width, 0.45*len(self.samples)))
+        plt.axis('off')
+        plt.ylim(-1,0.1)
+  
+        i = 0
+        ystep = 0
+        for s in self.samples:
+            plt.text(0.00, -1*ystep-0.1, 'text',
+                       backgroundcolor=self.colors[i], color=self.colors[i])
+            plt.text(0.27, -1*ystep-0.1, s)
+            i += 1
+            ystep += 0.2
+
+        plt.title("Samples")
+        plt.savefig(filename, bbox_inches=0)   
+                
+    def show(self):
+        self.decorate()
+        plt.show()
+    
+    def add_sample(self, name):
+        self.samples.append(name)
+    
+    def add_points(self, sample, ref_contig, coordinate, identity):
+        x= 0
+        
+        # Builds the reference length
+        for c  in self.contigs:
+            if (ref_contig == c[0]):
+                break
+            x += c[1]   
+            
+        # Applies new color to different samples
+        clr = self.pt_color
+        for s in self.samples:
+            if(sample == s):
+                break
+            clr += 1
+            
+        if x + coordinate >= self.begin and x + coordinate <= self.xextent:
+            self.n_points += 1
+            self.x_points.append(x + coordinate)
+            self.y_points.append(identity)
+            self.clr_points.append(self.colors[clr])               
+            
+def usage():
+    print ('''Usage:
+    %s [options] <aligned_input> <ref_genome>
+      Options
+        -h: display this help message
+        -i <int>: minimum identity to include in plot (y-axis extent)
+        -b <int>: begin the plot at the nth nucleotide in the ref genome
+        -l <int>: include n nucleotides in the x-axis of the plot
+           the default behavior is to plot the entirety of the genome
+        -t <str>: title of the plot (e.g. name of reference genome). Put in quotes. 
+        -m <str>: Name of the microbiome. Put in quotes.
+        -c <int>: number indicating color for plot points.
+           ''' % sys.argv[0])
+            
+if __name__ == '__main__':
+    # Parse all command-line arguments
+    # variable opts contains a list of the key-value pair of (option, value); variable args contains a list of the configuration file
+    try: 
+        opts, args = getopt.getopt(sys.argv[1:], 'hi:b:l:t:m:c:', ['help', 'identity=', 'begin=', 'length=', 'title=', 'microbiome=', 'color='])
+    except getopt.GetoptError as err:
+        print(str(err))
+        usage()
+        sys.exit(2)
+    
+    # Set some default values
+    identity = 50.0
+    begin = 0
+    length = 0
+    title = ''
+    microbiome=''
+    color = 0
+    
+    # Parse options and set user-defined values
+    for opt, value in opts:
+        if opt in ('-h', '--help'):
+            usage()
+            sys.exit(0)
+        elif opt in ('-i', '--identity'):
+            identity = float(value)
+        elif opt in ('-b', '--begin'):
+            begin = int(value)
+        elif opt in ('-l', '--length'):
+            length = int(length)
+        elif opt in ('-t', '--title'):
+            title = str(value)
+        elif opt in ('-m', '--microbiome'):
+            microbiome = str(value)
+        elif opt in ('-c', '--color'):
+            color = int(value)
+        else:
+            assert False, "invalid option" 
+    
+    # Verify there are input files: input and ref_genome
+    if len(args) == 0:
+        print ('Need input files.')
+        usage()
+        sys.exit
+    
+    # Load and read the configuration file. Treats the file as module object
+#     config = imp.load_source('*', args[0])
+    
+    aligned_input= args[0]
+    ref_genome = args[1]
+    
+    # Instantiate plotting object
+    plotter = FragmentPlotter(microbiome, color, ymin=identity)
+    
+           
+    # Begin the plotting commands, starting each with the reference genome
+#     for refgenome in ref_genome:
+    '''refgenome = [(name, taxonID, file_path), ..., ]'''
+    contigs = []
+    
+    # Parse the reference genome's FASTA file.
+    fasta = open(ref_genome, 'r')
+    for refseq in SequenceFile(fasta):
+        contigs.append([refseq.name.split('|')[3], len(refseq.seq)])
+    
+    fasta.close()
+    
+    # Sets the ref genome's accession # as the title if title is not set (i.e. empty)
+    if title == '':
+        title = contigs[0][0]
+    
+    # Initialize the new plot
+    plotter.new_plot(title, contigs, begin, length)
+    
+    count_samp = 0
+#     for sample in aligned_input:
+    '''sample = [(name, {taxonID:path}), ...,]'''
+#     plotter.add_sample("test")
+    count_samp += 1
+    
+    # Call the appropriate parser based on sample file extension
+#             file = '%s%s' % (config.basepath, sample[1][refgenome[1]])
+
+#     if aligned_input.endswith('.sam'):
+#         parser_func = SAMFile
+#     elif aligned_input.endswith('.fr-hit'):
+#         parser_func = FRHITFile
+#     else:
+#         print('Invalid file type')
+    
+    parser_func = SAMFile
+        
+    sample_name =""
+    for frag in parser_func(aligned_input):
+        if frag.identity >= identity:
+            sample_name = frag.name.split('.')[0]
+            plotter.add_points(frag.name, frag.refseq, frag.location, frag.identity)
+    
+    plotter.add_sample(sample_name)    
+        # Put this inside the sample for-loop to create individual plot per sample
+    plotter.draw('%s.png' % ("frp_image"))
+
+    
+#     plotter.show()
+            
+        # Draw the points and save file with taxonID as base name. Put this outside the sample for-loop to create an all-samples plot. 
+        # Put this inside the sample for-loop to create individual plot per sample
+#         plotter.draw('%s/%s.png' % (config.shortname, refgenome[1]))
+    
+#     plotter.legend('%s/legend.png' % (config.shortname))
+    
+    # Make a tarball of the resulting PNGs
+#     os.system('tar -cvf %s.tar %s >/dev/null' %
+#               ( config.shortname, config.shortname))
+#     os.system('gzip %s.tar' % (config.shortname))
+#     shutil.rmtree(config.shortname)
+            
+        
+                                                                                                                                                                                  
+        
+        
\ No newline at end of file