diff riboseqr/prepare.py @ 0:e01de823e919 draft default tip

Uploaded
author biotechcoder
date Fri, 01 May 2015 05:41:51 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/riboseqr/prepare.py	Fri May 01 05:41:51 2015 -0400
@@ -0,0 +1,217 @@
+#!/usr/bin/env python
+import os
+import sys
+import argparse
+import logging
+import rpy2.robjects as robjects
+
+import utils
+
+rscript = ''
+R = robjects.r
+
+
+def run_rscript(command=None):
+    """Run R command, log it, append to rscript"""
+    global rscript
+    if not command:
+        return
+    logging.debug(command)
+    rscript += '{}\n'.format(command)
+    R(command)
+
+
+def prep_riboseqr_input(sam_file, output_file):
+    """Generate input file for riboSeqR from SAM format file."""
+    with open(output_file, 'w') as f:
+        for line in open(sam_file):
+            if line.startswith('@'):
+                continue
+            line = line.split()
+            flag = line[1]
+
+            if flag != '0':
+                continue
+            # make start 0-indexed, sam alignments are 1-indexed
+            start = int(line[3]) - 1
+            (name, sequence) = (line[2], line[9])
+            f.write('"+"\t"{0}"\t{1}\t"{2}"\n'.format(name, start, sequence))
+
+
+def batch_process(sam_files, seq_type, output_path):
+    """Batch process the conversion of SAM format files -> riboSeqR format
+    input files.
+
+    Files are saved with file names corresponding to their sequence type.
+
+    """
+    outputs = []
+    prefix = '{}'
+    if seq_type == 'riboseq':
+        prefix = 'RiboSeq file {}'
+    elif seq_type == 'rnaseq':
+        prefix = 'RNASeq file {}'
+
+    for count, fname in enumerate(sam_files):
+        count += 1
+        out_file = os.path.join(output_path, prefix.format(count))
+        logging.debug('Processing: {}'.format(fname))
+        logging.debug('Writing output to: {}'.format(out_file))
+        prep_riboseqr_input(fname, out_file)
+        outputs.append(out_file)
+    return outputs
+
+
+def generate_ribodata(ribo_files='', rna_files='', replicate_names='',
+                      seqnames='', rdata_save='Prepare.rda', sam_format=True,
+                      html_file='Prepare-report.html', output_path=os.getcwd()):
+    """Prepares Ribo and RNA seq data in the format required for riboSeqR. Calls
+    the readRibodata function of riboSeqR and saves the result objects in an
+    R data file which can be used as input for the next step.
+
+    """
+    input_ribo_files = utils.process_args(ribo_files, ret_mode='list')
+    logging.debug('Found {} Ribo-Seq files'.format(len(input_ribo_files)))
+    logging.debug(input_ribo_files)
+
+    input_rna_files = []
+    if rna_files:
+        input_rna_files = utils.process_args(rna_files, ret_mode='list')
+        logging.debug('Found {} RNA-Seq files'.format(len(input_rna_files)))
+        logging.debug(input_rna_files)
+
+    replicates = utils.process_args(replicate_names, ret_mode='charvector')
+    logging.debug('Replicates: {}\n'.format(replicates))
+
+    if sam_format:
+        ribo_seq_files = batch_process(input_ribo_files, 'riboseq', output_path)
+    else:
+        ribo_seq_files = input_ribo_files
+
+    html = '<h2>Prepare riboSeqR input - results</h2><hr>'
+    if len(ribo_seq_files):
+        html += '<h4>Generated riboSeqR format input files ' \
+                '<em>(RiboSeq)</em></h4><p>'
+        for fname in ribo_seq_files:
+            html += '<a href="{0}">{0}</a><br>'.format(
+                os.path.basename(fname))
+        html += '</p>'
+
+    rna_seq_files = []
+    if len(input_rna_files):
+        if sam_format:
+            rna_seq_files = batch_process(
+                input_rna_files, 'rnaseq', output_path)
+        else:
+            rna_seq_files = input_rna_files
+
+    if len(rna_seq_files):
+        html += ('<h4>Generated riboSeqR format input files '
+                 '<em>(RNASeq)</em></h4><p>')
+        for fname in rna_seq_files:
+            html += '<a href="{0}">{0}</a><br>'.format(
+                os.path.basename(fname))
+        html += '</p>'
+
+    input_seqnames = utils.process_args(seqnames, ret_mode='charvector')
+    options = {'ribo_seq_files': 'c({})'.format(str(ribo_seq_files)[1:-1]),
+               'rna_seq_files': 'c({})'.format(str(rna_seq_files)[1:-1]),
+               'input_replicates': replicates,
+               'input_seqnames': input_seqnames}
+
+    logging.debug('{}'.format(R('sessionInfo()')))
+
+    script = ''
+    cmd = 'library(riboSeqR)'
+    run_rscript(cmd)
+    script += '{}\n'.format(cmd)
+
+    if len(rna_seq_files):
+        cmd_args = ('riboFiles={ribo_seq_files}, '
+                    'rnaFiles={rna_seq_files}'.format(**options))
+    else:
+        cmd_args = 'riboFiles={ribo_seq_files}'.format(**options)
+
+    if input_seqnames:
+        cmd_args += ', seqnames={input_seqnames}'.format(**options)
+    if replicates:
+        cmd_args += ', replicates={input_replicates}'.format(**options)
+    else:
+        cmd_args += ', replicates=c("")'
+    cmd = 'riboDat <- readRibodata({0})'.format(cmd_args)
+    run_rscript(cmd)
+    script += '{}\n'.format(cmd)
+
+    ribo_data = R['riboDat']
+    logging.debug('riboDat \n{}\n'.format(ribo_data))
+    cmd = 'save("riboDat", file="{}", compress=FALSE)'.format(rdata_save)
+    run_rscript(cmd)
+    script += '{}\n'.format(cmd)
+
+    msg = '\n{:#^80}\n{}\n{:#^80}\n'.format(
+        ' R script for this session ', script, ' End R script ')
+    logging.debug(msg)
+
+    with open(os.path.join(output_path, 'prepare.R'), 'w') as r:
+        r.write(script)
+
+    html += ('<h4>R script for this session</h4>'
+             '<p><a href="prepare.R">prepare.R</a></p>'
+             '<p>Next step: <em>Triplet periodicity</em></p>')
+
+    with open(html_file, 'w') as f:
+        f.write(html)
+
+    return ribo_data
+
+
+if __name__ == '__main__':
+
+    description = (
+        'Prepare riboSeqR input file from SAM format RNA/Ribo-Seq alignment.')
+    parser = argparse.ArgumentParser(description=description)
+
+    # required arguments
+    flags = parser.add_argument_group('required arguments')
+    flags.add_argument('--ribo_files', required=True,
+                       help='List of Ribo-Seq files, comma-separated')
+    # optional arguments
+    parser.add_argument('--rna_files',
+                        help='List of RNA-Seq files. Comma-separated')
+    parser.add_argument('--replicate_names',
+                        help='Replicate names, comma-separated')
+    parser.add_argument('--seqnames',
+                        help='Transcript (seqname) names to be read')
+    parser.add_argument('--rdata_save',
+                        help='File to write RData to (default: %(default)s)',
+                        default='Prepare.rda')
+    parser.add_argument(
+        '--sam_format',
+        help='Flag. Input is in SAM format', action='store_true')
+    parser.add_argument('--html_file', help='Output file for results (HTML)')
+    parser.add_argument('--output_path',
+                        help='Files are saved in this directory')
+    parser.add_argument('--debug', help='Flag. Produce debug output',
+                        action='store_true')
+    args = parser.parse_args()
+
+    if args.debug:
+        level = logging.DEBUG
+    else:
+        level = logging.INFO
+
+    logging.basicConfig(format='%(module)s: %(levelname)s - %(message)s',
+                        level=level, stream=sys.stdout)
+    logging.debug('Supplied Arguments: {}'.format(vars(args)))
+
+    if not os.path.exists(args.output_path):
+        os.mkdir(args.output_path)
+
+    generate_ribodata(
+        ribo_files=args.ribo_files, rna_files=args.rna_files,
+        replicate_names=args.replicate_names, seqnames=args.seqnames,
+        rdata_save=args.rdata_save,
+        sam_format=args.sam_format, html_file=args.html_file,
+        output_path=args.output_path
+    )
+    logging.debug('Done')