view frp_tool.py @ 6:d4ac3899145b draft

Uploaded
author schang
date Tue, 02 Dec 2014 19:54:26 -0500
parents e223b827a13e
children
line wrap: on
line source

#!/usr/bin/env python

'''
Author: Silvia Chang
Metagenomic Final Project
Fall 2014

Script to create fragment recruitment plots, after read-reference alignment using Bowtie (SAM output) and FR-HIT (fr-hit output) programs
Simplified/revised version of the fragrecruit project from https://github.com/mstrosaker/fragrecruit

Usage: ./frp_tool.py [options] --input --output

'''
import matplotlib.pyplot as plt
import os, sys, getopt, imp, errno
# import shutil
from refseq_parser import SequenceFile
from SAM_parser import SAMFile
from FRHIT_parser import FRHITFile

class FragmentPlotter:
    '''Contains all functions to create fragment recruitment plots'''
    
    # Color list. Can only handle 5 samples. Add more colors for more samples. 
    colors = [              
        (0.78, 0.59, 0.17, 1.0),   #dark gold
        (0.99, 0.60, 0.17, 1.0),   #light orange   
        (0.60, 0.82, 0.47, 1.0),   #medium/light green
        (0.95, 0.40, 0.40, 1.0),   #dark pink/fuschia
        (0.17, 0.50, 0.69, 1.0),   #medium teal/blue
        (0.85, 0.64, 0.60, 1.0),   #light plum
    ]

    def __init__(self, microbiome, pt_color, ymin=50):
        self.microbiome = microbiome
        self.decorated = False
        self.pt_color = pt_color
        self.ymin = ymin
              
    def new_plot(self, ref_name, contigs, begin, length):
        ''' Clears previous data and start a blank plot'''
        # refgenome - string, organism name of the refgenome
        # contigs - a list containing a [name,length] pair of the refgenome
        # begin, length - integers indicate beginning/length of plot
        
        plt.close()
        # Initialize variables that will contain all the fragments information
        self.refgenome = ref_name
        self.contigs = contigs
        self.begin = begin
        self.samples = []
        self.n_points = 0
        self.x_points = []
        self.y_points = []
        self.clr_points = []
        self.decorated = False
        
        self.full_length = 0
        
        # Builds the full length of the reference genome, or partial based on begin and length variables
        for contig in self.contigs:
            self.full_length += contig[1]
        
        # Sets the range of the x axis
        if length != 0:
            self.xextent = begin + length
        else:
            self.xextent = self.full_length
    
    def decorate(self):
        ''' Decorates the plot by adding axes, axes labels, title, tickmarks, etc.'''
        if not self.decorated:
            self.decorated = True
            
            # Sets the rc parameters, add x-axis tickmarks and labels them
            plt.rc('xtick', labelsize=8)
            ticks = [self.begin] # beginning tick
            ticks.append(self.xextent/2) # mid tick
            ticks.append(self.xextent) # last tick
            
            labels = [int(self.begin)] # beginning nucleotide
            labels.append(str(self.xextent/2)) # mid nucleotide
            labels.append(str(self.xextent)) # last nucleotide
            
            plt.xticks(ticks, labels)
            
            # Sets x and y axes boundaries and labels
            plt.xlim(self.begin, self.xextent)
            plt.ylim(self.ymin, 100)
            plt.ylabel("% Identity")
            
            # Set plot titles
            plt.title(self.refgenome)
            plt.suptitle(self.microbiome)
            
    def draw(self, filename):
        '''Creates a scatter plot of fragments after they're added in add_points() and save the plots in the specified file name'''
        
        # Sets points size and shape based on number of points recruited
        if self.n_points > 10000:
            pt_shape = 'o'
            pt_size = 1
        else:
            pt_shape ='s'
            pt_size = 3
 
        # Generate the scatter plot
#         plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none')
         
        # Adds the legend on the plot. Currently, can only add one. Uncomment the plt.legend method.
        plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none', label=self.samples[0])
        plt.legend(loc='lower left', ncol=3, scatterpoints=1, markerscale=8, prop={'size':10})
        
        # Adds titles, axes, etc
        self.decorate()
        
        # Save the images
        plt.savefig(filename, dpi=150, bbox_inches=0)

    def legend(self, filename, width=4):
        '''Create a legend for the plot'''
        plt.figure(figsize=(width, 0.45*len(self.samples)))
        plt.axis('off')
        plt.ylim(-1,0.1)
  
        i = 0
        ystep = 0
        for s in self.samples:
            plt.text(0.00, -1*ystep-0.1, 'text',
                       backgroundcolor=self.colors[i], color=self.colors[i])
            plt.text(0.27, -1*ystep-0.1, s)
            i += 1
            ystep += 0.2

        plt.title("Samples")
        plt.savefig(filename, bbox_inches=0)   
                
    def show(self):
        self.decorate()
        plt.show()
    
    def add_sample(self, name):
        self.samples.append(name)
    
    def add_points(self, sample, ref_contig, coordinate, identity):
        x= 0
        
        # Builds the reference length
        for c  in self.contigs:
            if (ref_contig == c[0]):
                break
            x += c[1]   
            
        # Applies new color to different samples
        clr = self.pt_color
        for s in self.samples:
            if(sample == s):
                break
            clr += 1
            
        if x + coordinate >= self.begin and x + coordinate <= self.xextent:
            self.n_points += 1
            self.x_points.append(x + coordinate)
            self.y_points.append(identity)
            self.clr_points.append(self.colors[clr])               
            
def usage():
    print ('''Usage:
    %s [options] <aligned_input> <ref_genome>
      Options
        -h: display this help message
        -i <int>: minimum identity to include in plot (y-axis extent)
        -b <int>: begin the plot at the nth nucleotide in the ref genome
        -l <int>: include n nucleotides in the x-axis of the plot
           the default behavior is to plot the entirety of the genome
        -t <str>: title of the plot (e.g. name of reference genome). Put in quotes. 
        -m <str>: Name of the microbiome. Put in quotes.
        -c <int>: number indicating color for plot points.
           ''' % sys.argv[0])
            
if __name__ == '__main__':
    # Parse all command-line arguments
    # variable opts contains a list of the key-value pair of (option, value); variable args contains a list of the configuration file
    try: 
        opts, args = getopt.getopt(sys.argv[1:], 'hi:b:l:t:m:c:', ['help', 'identity=', 'begin=', 'length=', 'title=', 'microbiome=', 'color='])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)
    
    # Set some default values
    identity = 50.0
    begin = 0
    length = 0
    title = ''
    microbiome=''
    color = 0
    
    # Parse options and set user-defined values
    for opt, value in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit(0)
        elif opt in ('-i', '--identity'):
            identity = float(value)
        elif opt in ('-b', '--begin'):
            begin = int(value)
        elif opt in ('-l', '--length'):
            length = int(length)
        elif opt in ('-t', '--title'):
            title = str(value)
        elif opt in ('-m', '--microbiome'):
            microbiome = str(value)
        elif opt in ('-c', '--color'):
            color = int(value)
        else:
            assert False, "invalid option" 
    
    # Verify there are input files: input and ref_genome
    if len(args) == 0:
        print ('Need input files.')
        usage()
        sys.exit
    
    # Load and read the configuration file. Treats the file as module object
#     config = imp.load_source('*', args[0])
    
    aligned_input= args[0]
    ref_genome = args[1]
    
    # Instantiate plotting object
    plotter = FragmentPlotter(microbiome, color, ymin=identity)
    
           
    # Begin the plotting commands, starting each with the reference genome
#     for refgenome in ref_genome:
    '''refgenome = [(name, taxonID, file_path), ..., ]'''
    contigs = []
    
    # Parse the reference genome's FASTA file.
    fasta = open(ref_genome, 'r')
    for refseq in SequenceFile(fasta):
        contigs.append([refseq.name.split('|')[3], len(refseq.seq)])
    
    fasta.close()
    
    # Sets the ref genome's accession # as the title if title is not set (i.e. empty)
    if title == '':
        title = contigs[0][0]
    
    # Initialize the new plot
    plotter.new_plot(title, contigs, begin, length)
    
    count_samp = 0
#     for sample in aligned_input:
    '''sample = [(name, {taxonID:path}), ...,]'''
#     plotter.add_sample("test")
    count_samp += 1
    
    # Call the appropriate parser based on sample file extension
#             file = '%s%s' % (config.basepath, sample[1][refgenome[1]])

#     if aligned_input.endswith('.sam'):
#         parser_func = SAMFile
#     elif aligned_input.endswith('.fr-hit'):
#         parser_func = FRHITFile
#     else:
#         print('Invalid file type')
    
    parser_func = SAMFile
        
    sample_name =""
    for frag in parser_func(aligned_input):
        if frag.identity >= identity:
            sample_name = frag.name.split('.')[0]
            plotter.add_points(frag.name, frag.refseq, frag.location, frag.identity)
    
    plotter.add_sample(sample_name)    
        # Put this inside the sample for-loop to create individual plot per sample
    plotter.draw('%s.png' % ("frp_image"))

    
#     plotter.show()
            
        # Draw the points and save file with taxonID as base name. Put this outside the sample for-loop to create an all-samples plot. 
        # Put this inside the sample for-loop to create individual plot per sample
#         plotter.draw('%s/%s.png' % (config.shortname, refgenome[1]))
    
#     plotter.legend('%s/legend.png' % (config.shortname))
    
    # Make a tarball of the resulting PNGs
#     os.system('tar -cvf %s.tar %s >/dev/null' %
#               ( config.shortname, config.shortname))
#     os.system('gzip %s.tar' % (config.shortname))
#     shutil.rmtree(config.shortname)