0
|
1 #!/usr/bin/env python
|
|
2 import os
|
|
3 import sys
|
|
4 import argparse
|
|
5 import logging
|
|
6 import rpy2.robjects as robjects
|
|
7
|
|
8 import utils
|
|
9
|
|
10 rscript = ''
|
|
11 R = robjects.r
|
|
12
|
|
13
|
|
14 def run_rscript(command=None):
|
|
15 """Run R command, log it, append to rscript"""
|
|
16 global rscript
|
|
17 if not command:
|
|
18 return
|
|
19 logging.debug(command)
|
|
20 rscript += '{}\n'.format(command)
|
|
21 R(command)
|
|
22
|
|
23
|
|
24 def prep_riboseqr_input(sam_file, output_file):
|
|
25 """Generate input file for riboSeqR from SAM format file."""
|
|
26 with open(output_file, 'w') as f:
|
|
27 for line in open(sam_file):
|
|
28 if line.startswith('@'):
|
|
29 continue
|
|
30 line = line.split()
|
|
31 flag = line[1]
|
|
32
|
|
33 if flag != '0':
|
|
34 continue
|
|
35 # make start 0-indexed, sam alignments are 1-indexed
|
|
36 start = int(line[3]) - 1
|
|
37 (name, sequence) = (line[2], line[9])
|
|
38 f.write('"+"\t"{0}"\t{1}\t"{2}"\n'.format(name, start, sequence))
|
|
39
|
|
40
|
|
41 def batch_process(sam_files, seq_type, output_path):
|
|
42 """Batch process the conversion of SAM format files -> riboSeqR format
|
|
43 input files.
|
|
44
|
|
45 Files are saved with file names corresponding to their sequence type.
|
|
46
|
|
47 """
|
|
48 outputs = []
|
|
49 prefix = '{}'
|
|
50 if seq_type == 'riboseq':
|
|
51 prefix = 'RiboSeq file {}'
|
|
52 elif seq_type == 'rnaseq':
|
|
53 prefix = 'RNASeq file {}'
|
|
54
|
|
55 for count, fname in enumerate(sam_files):
|
|
56 count += 1
|
|
57 out_file = os.path.join(output_path, prefix.format(count))
|
|
58 logging.debug('Processing: {}'.format(fname))
|
|
59 logging.debug('Writing output to: {}'.format(out_file))
|
|
60 prep_riboseqr_input(fname, out_file)
|
|
61 outputs.append(out_file)
|
|
62 return outputs
|
|
63
|
|
64
|
|
65 def generate_ribodata(ribo_files='', rna_files='', replicate_names='',
|
|
66 seqnames='', rdata_save='Prepare.rda', sam_format=True,
|
|
67 html_file='Prepare-report.html', output_path=os.getcwd()):
|
|
68 """Prepares Ribo and RNA seq data in the format required for riboSeqR. Calls
|
|
69 the readRibodata function of riboSeqR and saves the result objects in an
|
|
70 R data file which can be used as input for the next step.
|
|
71
|
|
72 """
|
|
73 input_ribo_files = utils.process_args(ribo_files, ret_mode='list')
|
|
74 logging.debug('Found {} Ribo-Seq files'.format(len(input_ribo_files)))
|
|
75 logging.debug(input_ribo_files)
|
|
76
|
|
77 input_rna_files = []
|
|
78 if rna_files:
|
|
79 input_rna_files = utils.process_args(rna_files, ret_mode='list')
|
|
80 logging.debug('Found {} RNA-Seq files'.format(len(input_rna_files)))
|
|
81 logging.debug(input_rna_files)
|
|
82
|
|
83 replicates = utils.process_args(replicate_names, ret_mode='charvector')
|
|
84 logging.debug('Replicates: {}\n'.format(replicates))
|
|
85
|
|
86 if sam_format:
|
|
87 ribo_seq_files = batch_process(input_ribo_files, 'riboseq', output_path)
|
|
88 else:
|
|
89 ribo_seq_files = input_ribo_files
|
|
90
|
|
91 html = '<h2>Prepare riboSeqR input - results</h2><hr>'
|
|
92 if len(ribo_seq_files):
|
|
93 html += '<h4>Generated riboSeqR format input files ' \
|
|
94 '<em>(RiboSeq)</em></h4><p>'
|
|
95 for fname in ribo_seq_files:
|
|
96 html += '<a href="{0}">{0}</a><br>'.format(
|
|
97 os.path.basename(fname))
|
|
98 html += '</p>'
|
|
99
|
|
100 rna_seq_files = []
|
|
101 if len(input_rna_files):
|
|
102 if sam_format:
|
|
103 rna_seq_files = batch_process(
|
|
104 input_rna_files, 'rnaseq', output_path)
|
|
105 else:
|
|
106 rna_seq_files = input_rna_files
|
|
107
|
|
108 if len(rna_seq_files):
|
|
109 html += ('<h4>Generated riboSeqR format input files '
|
|
110 '<em>(RNASeq)</em></h4><p>')
|
|
111 for fname in rna_seq_files:
|
|
112 html += '<a href="{0}">{0}</a><br>'.format(
|
|
113 os.path.basename(fname))
|
|
114 html += '</p>'
|
|
115
|
|
116 input_seqnames = utils.process_args(seqnames, ret_mode='charvector')
|
|
117 options = {'ribo_seq_files': 'c({})'.format(str(ribo_seq_files)[1:-1]),
|
|
118 'rna_seq_files': 'c({})'.format(str(rna_seq_files)[1:-1]),
|
|
119 'input_replicates': replicates,
|
|
120 'input_seqnames': input_seqnames}
|
|
121
|
|
122 logging.debug('{}'.format(R('sessionInfo()')))
|
|
123
|
|
124 script = ''
|
|
125 cmd = 'library(riboSeqR)'
|
|
126 run_rscript(cmd)
|
|
127 script += '{}\n'.format(cmd)
|
|
128
|
|
129 if len(rna_seq_files):
|
|
130 cmd_args = ('riboFiles={ribo_seq_files}, '
|
|
131 'rnaFiles={rna_seq_files}'.format(**options))
|
|
132 else:
|
|
133 cmd_args = 'riboFiles={ribo_seq_files}'.format(**options)
|
|
134
|
|
135 if input_seqnames:
|
|
136 cmd_args += ', seqnames={input_seqnames}'.format(**options)
|
|
137 if replicates:
|
|
138 cmd_args += ', replicates={input_replicates}'.format(**options)
|
|
139 else:
|
|
140 cmd_args += ', replicates=c("")'
|
|
141 cmd = 'riboDat <- readRibodata({0})'.format(cmd_args)
|
|
142 run_rscript(cmd)
|
|
143 script += '{}\n'.format(cmd)
|
|
144
|
|
145 ribo_data = R['riboDat']
|
|
146 logging.debug('riboDat \n{}\n'.format(ribo_data))
|
|
147 cmd = 'save("riboDat", file="{}", compress=FALSE)'.format(rdata_save)
|
|
148 run_rscript(cmd)
|
|
149 script += '{}\n'.format(cmd)
|
|
150
|
|
151 msg = '\n{:#^80}\n{}\n{:#^80}\n'.format(
|
|
152 ' R script for this session ', script, ' End R script ')
|
|
153 logging.debug(msg)
|
|
154
|
|
155 with open(os.path.join(output_path, 'prepare.R'), 'w') as r:
|
|
156 r.write(script)
|
|
157
|
|
158 html += ('<h4>R script for this session</h4>'
|
|
159 '<p><a href="prepare.R">prepare.R</a></p>'
|
|
160 '<p>Next step: <em>Triplet periodicity</em></p>')
|
|
161
|
|
162 with open(html_file, 'w') as f:
|
|
163 f.write(html)
|
|
164
|
|
165 return ribo_data
|
|
166
|
|
167
|
|
168 if __name__ == '__main__':
|
|
169
|
|
170 description = (
|
|
171 'Prepare riboSeqR input file from SAM format RNA/Ribo-Seq alignment.')
|
|
172 parser = argparse.ArgumentParser(description=description)
|
|
173
|
|
174 # required arguments
|
|
175 flags = parser.add_argument_group('required arguments')
|
|
176 flags.add_argument('--ribo_files', required=True,
|
|
177 help='List of Ribo-Seq files, comma-separated')
|
|
178 # optional arguments
|
|
179 parser.add_argument('--rna_files',
|
|
180 help='List of RNA-Seq files. Comma-separated')
|
|
181 parser.add_argument('--replicate_names',
|
|
182 help='Replicate names, comma-separated')
|
|
183 parser.add_argument('--seqnames',
|
|
184 help='Transcript (seqname) names to be read')
|
|
185 parser.add_argument('--rdata_save',
|
|
186 help='File to write RData to (default: %(default)s)',
|
|
187 default='Prepare.rda')
|
|
188 parser.add_argument(
|
|
189 '--sam_format',
|
|
190 help='Flag. Input is in SAM format', action='store_true')
|
|
191 parser.add_argument('--html_file', help='Output file for results (HTML)')
|
|
192 parser.add_argument('--output_path',
|
|
193 help='Files are saved in this directory')
|
|
194 parser.add_argument('--debug', help='Flag. Produce debug output',
|
|
195 action='store_true')
|
|
196 args = parser.parse_args()
|
|
197
|
|
198 if args.debug:
|
|
199 level = logging.DEBUG
|
|
200 else:
|
|
201 level = logging.INFO
|
|
202
|
|
203 logging.basicConfig(format='%(module)s: %(levelname)s - %(message)s',
|
|
204 level=level, stream=sys.stdout)
|
|
205 logging.debug('Supplied Arguments: {}'.format(vars(args)))
|
|
206
|
|
207 if not os.path.exists(args.output_path):
|
|
208 os.mkdir(args.output_path)
|
|
209
|
|
210 generate_ribodata(
|
|
211 ribo_files=args.ribo_files, rna_files=args.rna_files,
|
|
212 replicate_names=args.replicate_names, seqnames=args.seqnames,
|
|
213 rdata_save=args.rdata_save,
|
|
214 sam_format=args.sam_format, html_file=args.html_file,
|
|
215 output_path=args.output_path
|
|
216 )
|
|
217 logging.debug('Done')
|