Mercurial > repos > vimalkumarvelayudhan > riboseqr_wrapper
diff riboseqr/triplet.py @ 5:423ad61697c4
Bugfix 1: [triplet] Lengths (frameCounting) if given should be a range (not zero).
readingFrame function fails with subscript out of bounds.
Bugfix 2: [triplet] Check if transcript name in SAM matches the name in FASTA. Produce
an error message if it's not. This fixes the problem where an empty plot is
produced (no bars).
[ribosome_profile] - A proper error message is now produced if an invalid transcript name is
provided.
Updated test data
| author | Vimalkumar Velayudhan <vimalkumarvelayudhan@gmail.com> |
|---|---|
| date | Tue, 27 Oct 2015 12:21:50 +0000 |
| parents | c34c364ce75d |
| children |
line wrap: on
line diff
--- a/riboseqr/triplet.py Tue Jul 21 15:40:22 2015 +0100 +++ b/riboseqr/triplet.py Tue Oct 27 12:21:50 2015 +0000 @@ -11,6 +11,10 @@ R = robjects.r +class TripletPeriodicityError(Exception): + pass + + def run_rscript(command=None): """Run R command, log it, append to rscript""" global rscript @@ -24,7 +28,7 @@ def find_periodicity( rdata_load='Prepare.rda', start_codons='ATG', stop_codons='TAG,TAA,TGA', - fasta_file=None, include_lengths='25:30', analyze_plot_lengths='26:30', + fasta_file=None, include_lengths='25:30', analyze_plot_lengths=None, text_legend='Frame 0, Frame 1, Frame 2', rdata_save='Periodicity.rda', html_file='Periodicity-report.html', output_path=os.getcwd()): """Plot triplet periodicity from prepared R data file. """ @@ -36,6 +40,27 @@ cmd = 'load("{}")'.format(rdata_load) run_rscript(cmd) + # get the first seqname (populated from input file which was generated from + # SAM) + refseq = R('levels(riboDat@riboGR$`RiboSeq file 1`@seqnames)[1]') + refseq = tuple(refseq)[0] + + fasta_valid = False + for line in open(fasta_file): + if line.startswith('>'): + seqname = line.strip().lstrip('>').split() + # get the FASTA header, split on space, and use the first column to + # compare. Multiple fields separated by space doesn't appear to be + # a problem for riboSeqR + if seqname[0] == refseq: + fasta_valid = True + break + if not fasta_valid: + raise TripletPeriodicityError( + 'Transcript name in SAM alignment does not match with name in the transcriptome ' + 'FASTA header. Please also check if the reads were aligned to this transcriptome ' + 'and not the genome') + # R("""options(showTailLines=Inf)""") starts, stops = (utils.process_args(start_codons, ret_mode='charvector'), utils.process_args(stop_codons, ret_mode='charvector')) @@ -49,8 +74,12 @@ logging.debug('{}\n'.format(R['fastaCDS'])) cmd = """fCs <- frameCounting(riboDat, fastaCDS, lengths={0}) - fS <- readingFrame(rC=fCs, lengths={1}); fS""".\ - format(include_lengths, analyze_plot_lengths) + + """.format(include_lengths) + if analyze_plot_lengths: + cmd += 'fS <- readingFrame(rC=fCs, lengths={0}); fS'.format(analyze_plot_lengths) + else: + cmd += 'fS <- readingFrame(rC=fCs); fS' run_rscript(cmd) logging.debug('riboDat \n{}\n'.format(R['riboDat'])) @@ -97,10 +126,10 @@ if __name__ == '__main__': - + parser = argparse.ArgumentParser( description='Plot triplet periodicity for different read lengths.') - + # required arguments flags = parser.add_argument_group('required arguments') flags.add_argument( @@ -109,7 +138,7 @@ flags.add_argument( '--fasta_file', required=True, help='FASTA file of the reference transcriptome') - + # optional arguments parser.add_argument( '--start_codons', @@ -123,8 +152,7 @@ '(default: %(default)s)', default='25:30') parser.add_argument( '--analyze_plot_lengths', - help='Lenghts of reads to be analyzed for frame-shift and plotting ' - '(default: %(default)s)', default='26:30') + help='Lenghts of reads to be analyzed for frame-shift and plotting') parser.add_argument( '--text_legend', help='Text for legend used in the plot (default: %(default)s)', @@ -137,7 +165,7 @@ help='Files are saved in this directory') parser.add_argument( '--debug', help='Produce debug output', action='store_true') - + args = parser.parse_args() if args.debug: logging.basicConfig(format='%(module)s: %(levelname)s - %(message)s',
