| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 '''
 | 
| 
 | 
     4 Author: Silvia Chang
 | 
| 
 | 
     5 Metagenomic Final Project
 | 
| 
 | 
     6 Fall 2014
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 Script to create fragment recruitment plots, after read-reference alignment using Bowtie (SAM output) and FR-HIT (fr-hit output) programs
 | 
| 
 | 
     9 Simplified/revised version of the fragrecruit project from https://github.com/mstrosaker/fragrecruit
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 Usage: ./frp_tool.py [options] --input --output
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 '''
 | 
| 
 | 
    14 import matplotlib.pyplot as plt
 | 
| 
 | 
    15 import os, sys, getopt, imp, errno
 | 
| 
 | 
    16 # import shutil
 | 
| 
 | 
    17 from refseq_parser import SequenceFile
 | 
| 
 | 
    18 from SAM_parser import SAMFile
 | 
| 
 | 
    19 from FRHIT_parser import FRHITFile
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 class FragmentPlotter:
 | 
| 
 | 
    22     '''Contains all functions to create fragment recruitment plots'''
 | 
| 
 | 
    23     
 | 
| 
 | 
    24     # Color list. Can only handle 5 samples. Add more colors for more samples. 
 | 
| 
 | 
    25     colors = [              
 | 
| 
 | 
    26         (0.78, 0.59, 0.17, 1.0),   #dark gold
 | 
| 
 | 
    27         (0.99, 0.60, 0.17, 1.0),   #light orange   
 | 
| 
 | 
    28         (0.60, 0.82, 0.47, 1.0),   #medium/light green
 | 
| 
 | 
    29         (0.95, 0.40, 0.40, 1.0),   #dark pink/fuschia
 | 
| 
 | 
    30         (0.17, 0.50, 0.69, 1.0),   #medium teal/blue
 | 
| 
 | 
    31         (0.85, 0.64, 0.60, 1.0),   #light plum
 | 
| 
 | 
    32     ]
 | 
| 
 | 
    33 
 | 
| 
 | 
    34     def __init__(self, microbiome, pt_color, ymin=50):
 | 
| 
 | 
    35         self.microbiome = microbiome
 | 
| 
 | 
    36         self.decorated = False
 | 
| 
 | 
    37         self.pt_color = pt_color
 | 
| 
 | 
    38         self.ymin = ymin
 | 
| 
 | 
    39               
 | 
| 
 | 
    40     def new_plot(self, ref_name, contigs, begin, length):
 | 
| 
 | 
    41         ''' Clears previous data and start a blank plot'''
 | 
| 
 | 
    42         # refgenome - string, organism name of the refgenome
 | 
| 
 | 
    43         # contigs - a list containing a [name,length] pair of the refgenome
 | 
| 
 | 
    44         # begin, length - integers indicate beginning/length of plot
 | 
| 
 | 
    45         
 | 
| 
 | 
    46         plt.close()
 | 
| 
 | 
    47         # Initialize variables that will contain all the fragments information
 | 
| 
 | 
    48         self.refgenome = ref_name
 | 
| 
 | 
    49         self.contigs = contigs
 | 
| 
 | 
    50         self.begin = begin
 | 
| 
 | 
    51         self.samples = []
 | 
| 
 | 
    52         self.n_points = 0
 | 
| 
 | 
    53         self.x_points = []
 | 
| 
 | 
    54         self.y_points = []
 | 
| 
 | 
    55         self.clr_points = []
 | 
| 
 | 
    56         self.decorated = False
 | 
| 
 | 
    57         
 | 
| 
 | 
    58         self.full_length = 0
 | 
| 
 | 
    59         
 | 
| 
 | 
    60         # Builds the full length of the reference genome, or partial based on begin and length variables
 | 
| 
 | 
    61         for contig in self.contigs:
 | 
| 
 | 
    62             self.full_length += contig[1]
 | 
| 
 | 
    63         
 | 
| 
 | 
    64         # Sets the range of the x axis
 | 
| 
 | 
    65         if length != 0:
 | 
| 
 | 
    66             self.xextent = begin + length
 | 
| 
 | 
    67         else:
 | 
| 
 | 
    68             self.xextent = self.full_length
 | 
| 
 | 
    69     
 | 
| 
 | 
    70     def decorate(self):
 | 
| 
 | 
    71         ''' Decorates the plot by adding axes, axes labels, title, tickmarks, etc.'''
 | 
| 
 | 
    72         if not self.decorated:
 | 
| 
 | 
    73             self.decorated = True
 | 
| 
 | 
    74             
 | 
| 
 | 
    75             # Sets the rc parameters, add x-axis tickmarks and labels them
 | 
| 
 | 
    76             plt.rc('xtick', labelsize=8)
 | 
| 
 | 
    77             ticks = [self.begin] # beginning tick
 | 
| 
 | 
    78             ticks.append(self.xextent/2) # mid tick
 | 
| 
 | 
    79             ticks.append(self.xextent) # last tick
 | 
| 
 | 
    80             
 | 
| 
 | 
    81             labels = [int(self.begin)] # beginning nucleotide
 | 
| 
 | 
    82             labels.append(str(self.xextent/2)) # mid nucleotide
 | 
| 
 | 
    83             labels.append(str(self.xextent)) # last nucleotide
 | 
| 
 | 
    84             
 | 
| 
 | 
    85             plt.xticks(ticks, labels)
 | 
| 
 | 
    86             
 | 
| 
 | 
    87             # Sets x and y axes boundaries and labels
 | 
| 
 | 
    88             plt.xlim(self.begin, self.xextent)
 | 
| 
 | 
    89             plt.ylim(self.ymin, 100)
 | 
| 
 | 
    90             plt.ylabel("% Identity")
 | 
| 
 | 
    91             
 | 
| 
 | 
    92             # Set plot titles
 | 
| 
 | 
    93             plt.title(self.refgenome)
 | 
| 
 | 
    94             plt.suptitle(self.microbiome)
 | 
| 
 | 
    95             
 | 
| 
 | 
    96     def draw(self, filename):
 | 
| 
 | 
    97         '''Creates a scatter plot of fragments after they're added in add_points() and save the plots in the specified file name'''
 | 
| 
 | 
    98         
 | 
| 
 | 
    99         # Sets points size and shape based on number of points recruited
 | 
| 
 | 
   100         if self.n_points > 10000:
 | 
| 
 | 
   101             pt_shape = 'o'
 | 
| 
 | 
   102             pt_size = 1
 | 
| 
 | 
   103         else:
 | 
| 
 | 
   104             pt_shape ='s'
 | 
| 
 | 
   105             pt_size = 3
 | 
| 
 | 
   106  
 | 
| 
 | 
   107         # Generate the scatter plot
 | 
| 
 | 
   108 #         plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none')
 | 
| 
 | 
   109          
 | 
| 
 | 
   110         # Adds the legend on the plot. Currently, can only add one. Uncomment the plt.legend method.
 | 
| 
 | 
   111         plt.scatter(self.x_points, self.y_points, s=pt_size, c=self.clr_points, marker=pt_shape, alpha = 0.1, edgecolors='none', label=self.samples[0])
 | 
| 
 | 
   112         plt.legend(loc='lower left', ncol=3, scatterpoints=1, markerscale=8, prop={'size':10})
 | 
| 
 | 
   113         
 | 
| 
 | 
   114         # Adds titles, axes, etc
 | 
| 
 | 
   115         self.decorate()
 | 
| 
 | 
   116         
 | 
| 
 | 
   117         # Save the images
 | 
| 
 | 
   118         plt.savefig(filename, dpi=150, bbox_inches=0)
 | 
| 
 | 
   119 
 | 
| 
 | 
   120     def legend(self, filename, width=4):
 | 
| 
 | 
   121         '''Create a legend for the plot'''
 | 
| 
 | 
   122         plt.figure(figsize=(width, 0.45*len(self.samples)))
 | 
| 
 | 
   123         plt.axis('off')
 | 
| 
 | 
   124         plt.ylim(-1,0.1)
 | 
| 
 | 
   125   
 | 
| 
 | 
   126         i = 0
 | 
| 
 | 
   127         ystep = 0
 | 
| 
 | 
   128         for s in self.samples:
 | 
| 
 | 
   129             plt.text(0.00, -1*ystep-0.1, 'text',
 | 
| 
 | 
   130                        backgroundcolor=self.colors[i], color=self.colors[i])
 | 
| 
 | 
   131             plt.text(0.27, -1*ystep-0.1, s)
 | 
| 
 | 
   132             i += 1
 | 
| 
 | 
   133             ystep += 0.2
 | 
| 
 | 
   134 
 | 
| 
 | 
   135         plt.title("Samples")
 | 
| 
 | 
   136         plt.savefig(filename, bbox_inches=0)   
 | 
| 
 | 
   137                 
 | 
| 
 | 
   138     def show(self):
 | 
| 
 | 
   139         self.decorate()
 | 
| 
 | 
   140         plt.show()
 | 
| 
 | 
   141     
 | 
| 
 | 
   142     def add_sample(self, name):
 | 
| 
 | 
   143         self.samples.append(name)
 | 
| 
 | 
   144     
 | 
| 
 | 
   145     def add_points(self, sample, ref_contig, coordinate, identity):
 | 
| 
 | 
   146         x= 0
 | 
| 
 | 
   147         
 | 
| 
 | 
   148         # Builds the reference length
 | 
| 
 | 
   149         for c  in self.contigs:
 | 
| 
 | 
   150             if (ref_contig == c[0]):
 | 
| 
 | 
   151                 break
 | 
| 
 | 
   152             x += c[1]   
 | 
| 
 | 
   153             
 | 
| 
 | 
   154         # Applies new color to different samples
 | 
| 
 | 
   155         clr = self.pt_color
 | 
| 
 | 
   156         for s in self.samples:
 | 
| 
 | 
   157             if(sample == s):
 | 
| 
 | 
   158                 break
 | 
| 
 | 
   159             clr += 1
 | 
| 
 | 
   160             
 | 
| 
 | 
   161         if x + coordinate >= self.begin and x + coordinate <= self.xextent:
 | 
| 
 | 
   162             self.n_points += 1
 | 
| 
 | 
   163             self.x_points.append(x + coordinate)
 | 
| 
 | 
   164             self.y_points.append(identity)
 | 
| 
 | 
   165             self.clr_points.append(self.colors[clr])               
 | 
| 
 | 
   166             
 | 
| 
 | 
   167 def usage():
 | 
| 
 | 
   168     print ('''Usage:
 | 
| 
 | 
   169     %s [options] <aligned_input> <ref_genome>
 | 
| 
 | 
   170       Options
 | 
| 
 | 
   171         -h: display this help message
 | 
| 
 | 
   172         -i <int>: minimum identity to include in plot (y-axis extent)
 | 
| 
 | 
   173         -b <int>: begin the plot at the nth nucleotide in the ref genome
 | 
| 
 | 
   174         -l <int>: include n nucleotides in the x-axis of the plot
 | 
| 
 | 
   175            the default behavior is to plot the entirety of the genome
 | 
| 
 | 
   176         -t <str>: title of the plot (e.g. name of reference genome). Put in quotes. 
 | 
| 
 | 
   177         -m <str>: Name of the microbiome. Put in quotes.
 | 
| 
 | 
   178         -c <int>: number indicating color for plot points.
 | 
| 
 | 
   179            ''' % sys.argv[0])
 | 
| 
 | 
   180             
 | 
| 
 | 
   181 if __name__ == '__main__':
 | 
| 
 | 
   182     # Parse all command-line arguments
 | 
| 
 | 
   183     # variable opts contains a list of the key-value pair of (option, value); variable args contains a list of the configuration file
 | 
| 
 | 
   184     try: 
 | 
| 
 | 
   185         opts, args = getopt.getopt(sys.argv[1:], 'hi:b:l:t:m:c:', ['help', 'identity=', 'begin=', 'length=', 'title=', 'microbiome=', 'color='])
 | 
| 
 | 
   186     except getopt.GetoptError as err:
 | 
| 
 | 
   187         print(str(err))
 | 
| 
 | 
   188         usage()
 | 
| 
 | 
   189         sys.exit(2)
 | 
| 
 | 
   190     
 | 
| 
 | 
   191     # Set some default values
 | 
| 
 | 
   192     identity = 50.0
 | 
| 
 | 
   193     begin = 0
 | 
| 
 | 
   194     length = 0
 | 
| 
 | 
   195     title = ''
 | 
| 
 | 
   196     microbiome=''
 | 
| 
 | 
   197     color = 0
 | 
| 
 | 
   198     
 | 
| 
 | 
   199     # Parse options and set user-defined values
 | 
| 
 | 
   200     for opt, value in opts:
 | 
| 
 | 
   201         if opt in ('-h', '--help'):
 | 
| 
 | 
   202             usage()
 | 
| 
 | 
   203             sys.exit(0)
 | 
| 
 | 
   204         elif opt in ('-i', '--identity'):
 | 
| 
 | 
   205             identity = float(value)
 | 
| 
 | 
   206         elif opt in ('-b', '--begin'):
 | 
| 
 | 
   207             begin = int(value)
 | 
| 
 | 
   208         elif opt in ('-l', '--length'):
 | 
| 
 | 
   209             length = int(length)
 | 
| 
 | 
   210         elif opt in ('-t', '--title'):
 | 
| 
 | 
   211             title = str(value)
 | 
| 
 | 
   212         elif opt in ('-m', '--microbiome'):
 | 
| 
 | 
   213             microbiome = str(value)
 | 
| 
 | 
   214         elif opt in ('-c', '--color'):
 | 
| 
 | 
   215             color = int(value)
 | 
| 
 | 
   216         else:
 | 
| 
 | 
   217             assert False, "invalid option" 
 | 
| 
 | 
   218     
 | 
| 
 | 
   219     # Verify there are input files: input and ref_genome
 | 
| 
 | 
   220     if len(args) == 0:
 | 
| 
 | 
   221         print ('Need input files.')
 | 
| 
 | 
   222         usage()
 | 
| 
 | 
   223         sys.exit
 | 
| 
 | 
   224     
 | 
| 
 | 
   225     # Load and read the configuration file. Treats the file as module object
 | 
| 
 | 
   226 #     config = imp.load_source('*', args[0])
 | 
| 
 | 
   227     
 | 
| 
 | 
   228     aligned_input= args[0]
 | 
| 
 | 
   229     ref_genome = args[1]
 | 
| 
 | 
   230     
 | 
| 
 | 
   231     # Instantiate plotting object
 | 
| 
 | 
   232     plotter = FragmentPlotter(microbiome, color, ymin=identity)
 | 
| 
 | 
   233     
 | 
| 
 | 
   234            
 | 
| 
 | 
   235     # Begin the plotting commands, starting each with the reference genome
 | 
| 
 | 
   236 #     for refgenome in ref_genome:
 | 
| 
 | 
   237     '''refgenome = [(name, taxonID, file_path), ..., ]'''
 | 
| 
 | 
   238     contigs = []
 | 
| 
 | 
   239     
 | 
| 
 | 
   240     # Parse the reference genome's FASTA file.
 | 
| 
 | 
   241     fasta = open(ref_genome, 'r')
 | 
| 
 | 
   242     for refseq in SequenceFile(fasta):
 | 
| 
 | 
   243         contigs.append([refseq.name.split('|')[3], len(refseq.seq)])
 | 
| 
 | 
   244     
 | 
| 
 | 
   245     fasta.close()
 | 
| 
 | 
   246     
 | 
| 
 | 
   247     # Sets the ref genome's accession # as the title if title is not set (i.e. empty)
 | 
| 
 | 
   248     if title == '':
 | 
| 
 | 
   249         title = contigs[0][0]
 | 
| 
 | 
   250     
 | 
| 
 | 
   251     # Initialize the new plot
 | 
| 
 | 
   252     plotter.new_plot(title, contigs, begin, length)
 | 
| 
 | 
   253     
 | 
| 
 | 
   254     count_samp = 0
 | 
| 
 | 
   255 #     for sample in aligned_input:
 | 
| 
 | 
   256     '''sample = [(name, {taxonID:path}), ...,]'''
 | 
| 
 | 
   257 #     plotter.add_sample("test")
 | 
| 
 | 
   258     count_samp += 1
 | 
| 
 | 
   259     
 | 
| 
 | 
   260     # Call the appropriate parser based on sample file extension
 | 
| 
 | 
   261 #             file = '%s%s' % (config.basepath, sample[1][refgenome[1]])
 | 
| 
 | 
   262 
 | 
| 
 | 
   263 #     if aligned_input.endswith('.sam'):
 | 
| 
 | 
   264 #         parser_func = SAMFile
 | 
| 
 | 
   265 #     elif aligned_input.endswith('.fr-hit'):
 | 
| 
 | 
   266 #         parser_func = FRHITFile
 | 
| 
 | 
   267 #     else:
 | 
| 
 | 
   268 #         print('Invalid file type')
 | 
| 
 | 
   269     
 | 
| 
 | 
   270     parser_func = SAMFile
 | 
| 
 | 
   271         
 | 
| 
 | 
   272     sample_name =""
 | 
| 
 | 
   273     for frag in parser_func(aligned_input):
 | 
| 
 | 
   274         if frag.identity >= identity:
 | 
| 
 | 
   275             sample_name = frag.name.split('.')[0]
 | 
| 
 | 
   276             plotter.add_points(frag.name, frag.refseq, frag.location, frag.identity)
 | 
| 
 | 
   277     
 | 
| 
 | 
   278     plotter.add_sample(sample_name)    
 | 
| 
 | 
   279         # Put this inside the sample for-loop to create individual plot per sample
 | 
| 
 | 
   280     plotter.draw('%s.png' % ("frp_image"))
 | 
| 
 | 
   281 
 | 
| 
 | 
   282     
 | 
| 
 | 
   283 #     plotter.show()
 | 
| 
 | 
   284             
 | 
| 
 | 
   285         # Draw the points and save file with taxonID as base name. Put this outside the sample for-loop to create an all-samples plot. 
 | 
| 
 | 
   286         # Put this inside the sample for-loop to create individual plot per sample
 | 
| 
 | 
   287 #         plotter.draw('%s/%s.png' % (config.shortname, refgenome[1]))
 | 
| 
 | 
   288     
 | 
| 
 | 
   289 #     plotter.legend('%s/legend.png' % (config.shortname))
 | 
| 
 | 
   290     
 | 
| 
 | 
   291     # Make a tarball of the resulting PNGs
 | 
| 
 | 
   292 #     os.system('tar -cvf %s.tar %s >/dev/null' %
 | 
| 
 | 
   293 #               ( config.shortname, config.shortname))
 | 
| 
 | 
   294 #     os.system('gzip %s.tar' % (config.shortname))
 | 
| 
 | 
   295 #     shutil.rmtree(config.shortname)
 | 
| 
 | 
   296             
 | 
| 
 | 
   297         
 | 
| 
 | 
   298                                                                                                                                                                                   
 | 
| 
 | 
   299         
 | 
| 
 | 
   300          |