diff riboseqr/triplet.py @ 5:423ad61697c4

Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero). readingFrame function fails with subscript out of bounds. Bugfix 2: [triplet] Check if transcript name in SAM matches the name in FASTA. Produce an error message if it's not. This fixes the problem where an empty plot is produced (no bars). [ribosome_profile] - A proper error message is now produced if an invalid transcript name is provided. Updated test data
author Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com>
date Tue, 27 Oct 2015 12:21:50 +0000
parents c34c364ce75d
children
line wrap: on
line diff
--- a/riboseqr/triplet.py	Tue Jul 21 15:40:22 2015 +0100
+++ b/riboseqr/triplet.py	Tue Oct 27 12:21:50 2015 +0000
@@ -11,6 +11,10 @@
 R = robjects.r
 
 
+class TripletPeriodicityError(Exception):
+    pass
+
+
 def run_rscript(command=None):
     """Run R command, log it, append to rscript"""
     global rscript
@@ -24,7 +28,7 @@
 
 def find_periodicity(
         rdata_load='Prepare.rda', start_codons='ATG', stop_codons='TAG,TAA,TGA',
-        fasta_file=None, include_lengths='25:30', analyze_plot_lengths='26:30',
+        fasta_file=None, include_lengths='25:30', analyze_plot_lengths=None,
         text_legend='Frame 0, Frame 1, Frame 2', rdata_save='Periodicity.rda',
         html_file='Periodicity-report.html', output_path=os.getcwd()):
     """Plot triplet periodicity from prepared R data file. """
@@ -36,6 +40,27 @@
     cmd = 'load("{}")'.format(rdata_load)
     run_rscript(cmd)
 
+    # get the first seqname (populated from input file which was generated from
+    # SAM)
+    refseq = R('levels(riboDat@riboGR$`RiboSeq file 1`@seqnames)[1]')
+    refseq = tuple(refseq)[0]
+
+    fasta_valid = False
+    for line in open(fasta_file):
+        if line.startswith('>'):
+            seqname = line.strip().lstrip('>').split()
+            # get the FASTA header, split on space, and use the first column to
+            # compare. Multiple fields separated by space doesn't appear to be
+            # a problem for riboSeqR
+            if seqname[0] == refseq:
+                fasta_valid = True
+                break
+    if not fasta_valid:
+        raise TripletPeriodicityError(
+            'Transcript name in SAM alignment does not match with name in the transcriptome '
+            'FASTA header. Please also check if the reads were aligned to this transcriptome '
+            'and not the genome')
+
     # R("""options(showTailLines=Inf)""")
     starts, stops = (utils.process_args(start_codons, ret_mode='charvector'),
                      utils.process_args(stop_codons, ret_mode='charvector'))
@@ -49,8 +74,12 @@
     logging.debug('{}\n'.format(R['fastaCDS']))
 
     cmd = """fCs <- frameCounting(riboDat, fastaCDS, lengths={0})
-    fS <- readingFrame(rC=fCs, lengths={1}); fS""".\
-        format(include_lengths, analyze_plot_lengths)
+
+    """.format(include_lengths)
+    if analyze_plot_lengths:
+        cmd += 'fS <- readingFrame(rC=fCs, lengths={0}); fS'.format(analyze_plot_lengths)
+    else:
+        cmd += 'fS <- readingFrame(rC=fCs); fS'
     run_rscript(cmd)
 
     logging.debug('riboDat \n{}\n'.format(R['riboDat']))
@@ -97,10 +126,10 @@
 
 
 if __name__ == '__main__':
-    
+
     parser = argparse.ArgumentParser(
         description='Plot triplet periodicity for different read lengths.')
-    
+
     # required arguments
     flags = parser.add_argument_group('required arguments')
     flags.add_argument(
@@ -109,7 +138,7 @@
     flags.add_argument(
         '--fasta_file', required=True,
         help='FASTA file of the reference transcriptome')
-    
+
     # optional arguments
     parser.add_argument(
         '--start_codons',
@@ -123,8 +152,7 @@
              '(default: %(default)s)', default='25:30')
     parser.add_argument(
         '--analyze_plot_lengths',
-        help='Lenghts of reads to be analyzed for frame-shift and plotting '
-             '(default: %(default)s)', default='26:30')
+        help='Lenghts of reads to be analyzed for frame-shift and plotting')
     parser.add_argument(
         '--text_legend',
         help='Text for legend used in the plot (default: %(default)s)',
@@ -137,7 +165,7 @@
                         help='Files are saved in this directory')
     parser.add_argument(
         '--debug', help='Produce debug output', action='store_true')
-    
+
     args = parser.parse_args()
     if args.debug:
         logging.basicConfig(format='%(module)s: %(levelname)s - %(message)s',