Mercurial > repos > pjbriggs > amplicon_analysis_pipeline
comparison amplicon_analysis_pipeline.py @ 0:47ec9c6f44b8 draft
planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit b63924933a03255872077beb4d0fde49d77afa92
author | pjbriggs |
---|---|
date | Thu, 09 Nov 2017 10:13:29 -0500 |
parents | |
children | 1c1902e12caf |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:47ec9c6f44b8 |
---|---|
1 #!/usr/bin/env python | |
2 # | |
3 # Wrapper script to run Amplicon_analysis_pipeline.sh | |
4 # from Galaxy tool | |
5 | |
6 import sys | |
7 import os | |
8 import argparse | |
9 import subprocess | |
10 import glob | |
11 | |
12 class PipelineCmd(object): | |
13 def __init__(self,cmd): | |
14 self.cmd = [str(cmd)] | |
15 def add_args(self,*args): | |
16 for arg in args: | |
17 self.cmd.append(str(arg)) | |
18 def __repr__(self): | |
19 return ' '.join([str(arg) for arg in self.cmd]) | |
20 | |
21 def ahref(target,name=None,type=None): | |
22 if name is None: | |
23 name = os.path.basename(target) | |
24 ahref = "<a href='%s'" % target | |
25 if type is not None: | |
26 ahref += " type='%s'" % type | |
27 ahref += ">%s</a>" % name | |
28 return ahref | |
29 | |
30 def check_errors(): | |
31 # Errors in Amplicon_analysis_pipeline.log | |
32 with open('Amplicon_analysis_pipeline.log','r') as pipeline_log: | |
33 log = pipeline_log.read() | |
34 if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in log: | |
35 print_error("""*** Sample IDs don't match dataset names *** | |
36 | |
37 The sample IDs (first column of the Metatable file) don't match the | |
38 supplied sample names for the input Fastq pairs. | |
39 """) | |
40 # Errors in pipeline output | |
41 with open('pipeline.log','r') as pipeline_log: | |
42 log = pipeline_log.read() | |
43 if "Errors and/or warnings detected in mapping file" in log: | |
44 with open("Metatable_log/Metatable.log","r") as metatable_log: | |
45 # Echo the Metatable log file to the tool log | |
46 print_error("""*** Error in Metatable mapping file *** | |
47 | |
48 %s""" % metatable_log.read()) | |
49 elif "No header line was found in mapping file" in log: | |
50 # Report error to the tool log | |
51 print_error("""*** No header in Metatable mapping file *** | |
52 | |
53 Check you've specified the correct file as the input Metatable""") | |
54 | |
55 def print_error(message): | |
56 width = max([len(line) for line in message.split('\n')]) + 4 | |
57 sys.stderr.write("\n%s\n" % ('*'*width)) | |
58 for line in message.split('\n'): | |
59 sys.stderr.write("* %s%s *\n" % (line,' '*(width-len(line)-4))) | |
60 sys.stderr.write("%s\n\n" % ('*'*width)) | |
61 | |
62 def clean_up_name(sample): | |
63 # Remove trailing "_L[0-9]+_001" from Fastq | |
64 # pair names | |
65 split_name = sample.split('_') | |
66 if split_name[-1] == "001": | |
67 split_name = split_name[:-1] | |
68 if split_name[-1].startswith('L'): | |
69 try: | |
70 int(split_name[-1][1:]) | |
71 split_name = split_name[:-1] | |
72 except ValueError: | |
73 pass | |
74 return '_'.join(split_name) | |
75 | |
76 def list_outputs(filen=None): | |
77 # List the output directory contents | |
78 # If filen is specified then will be the filename to | |
79 # write to, otherwise write to stdout | |
80 if filen is not None: | |
81 fp = open(filen,'w') | |
82 else: | |
83 fp = sys.stdout | |
84 results_dir = os.path.abspath("RESULTS") | |
85 fp.write("Listing contents of output dir %s:\n" % results_dir) | |
86 ix = 0 | |
87 for d,dirs,files in os.walk(results_dir): | |
88 ix += 1 | |
89 fp.write("-- %d: %s\n" % (ix, | |
90 os.path.relpath(d,results_dir))) | |
91 for f in files: | |
92 ix += 1 | |
93 fp.write("---- %d: %s\n" % (ix, | |
94 os.path.relpath(f,results_dir))) | |
95 # Close output file | |
96 if filen is not None: | |
97 fp.close() | |
98 | |
99 if __name__ == "__main__": | |
100 # Command line | |
101 print "Amplicon analysis: starting" | |
102 p = argparse.ArgumentParser() | |
103 p.add_argument("metatable", | |
104 metavar="METATABLE_FILE", | |
105 help="Metatable.txt file") | |
106 p.add_argument("fastq_pairs", | |
107 metavar="SAMPLE_NAME FQ_R1 FQ_R2", | |
108 nargs="+", | |
109 default=list(), | |
110 help="Triplets of SAMPLE_NAME followed by " | |
111 "a R1/R2 FASTQ file pair") | |
112 p.add_argument("-g",dest="forward_pcr_primer") | |
113 p.add_argument("-G",dest="reverse_pcr_primer") | |
114 p.add_argument("-q",dest="trimming_threshold") | |
115 p.add_argument("-O",dest="minimum_overlap") | |
116 p.add_argument("-L",dest="minimum_length") | |
117 p.add_argument("-l",dest="sliding_window_length") | |
118 p.add_argument("-P",dest="pipeline", | |
119 choices=["vsearch","uparse","qiime"], | |
120 type=str.lower, | |
121 default="vsearch") | |
122 p.add_argument("-S",dest="use_silva",action="store_true") | |
123 p.add_argument("-r",dest="reference_data_path") | |
124 p.add_argument("-c",dest="categories_file") | |
125 args = p.parse_args() | |
126 | |
127 # Build the environment for running the pipeline | |
128 print "Amplicon analysis: building the environment" | |
129 metatable_file = os.path.abspath(args.metatable) | |
130 os.symlink(metatable_file,"Metatable.txt") | |
131 print "-- made symlink to Metatable.txt" | |
132 | |
133 # Link to Categories.txt file (if provided) | |
134 if args.categories_file is not None: | |
135 categories_file = os.path.abspath(args.categories_file) | |
136 os.symlink(categories_file,"Categories.txt") | |
137 print "-- made symlink to Categories.txt" | |
138 | |
139 # Link to FASTQs and construct Final_name.txt file | |
140 sample_names = [] | |
141 with open("Final_name.txt",'w') as final_name: | |
142 fastqs = iter(args.fastq_pairs) | |
143 for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs): | |
144 sample_name = clean_up_name(sample_name) | |
145 r1 = "%s_R1_.fastq" % sample_name | |
146 r2 = "%s_R2_.fastq" % sample_name | |
147 os.symlink(fqr1,r1) | |
148 os.symlink(fqr2,r2) | |
149 final_name.write("%s\n" % '\t'.join((r1,sample_name))) | |
150 final_name.write("%s\n" % '\t'.join((r2,sample_name))) | |
151 sample_names.append(sample_name) | |
152 | |
153 # Construct the pipeline command | |
154 print "Amplicon analysis: constructing pipeline command" | |
155 pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh") | |
156 if args.forward_pcr_primer: | |
157 pipeline.add_args("-g",args.forward_pcr_primer) | |
158 if args.reverse_pcr_primer: | |
159 pipeline.add_args("-G",args.reverse_pcr_primer) | |
160 if args.trimming_threshold: | |
161 pipeline.add_args("-q",args.trimming_threshold) | |
162 if args.minimum_overlap: | |
163 pipeline.add_args("-O",args.minimum_overlap) | |
164 if args.minimum_length: | |
165 pipeline.add_args("-L",args.minimum_length) | |
166 if args.sliding_window_length: | |
167 pipeline.add_args("-l",args.sliding_window_length) | |
168 if args.reference_data_path: | |
169 pipeline.add_args("-r",args.reference_data_path) | |
170 pipeline.add_args("-P",args.pipeline) | |
171 if args.use_silva: | |
172 pipeline.add_args("-S") | |
173 | |
174 # Echo the pipeline command to stdout | |
175 print "Running %s" % pipeline | |
176 | |
177 # Run the pipeline | |
178 with open("pipeline.log","w") as pipeline_out: | |
179 try: | |
180 subprocess.check_call(pipeline.cmd, | |
181 stdout=pipeline_out, | |
182 stderr=subprocess.STDOUT) | |
183 exit_code = 0 | |
184 print "Pipeline completed ok" | |
185 except subprocess.CalledProcessError as ex: | |
186 # Non-zero exit status | |
187 sys.stderr.write("Pipeline failed: exit code %s\n" % | |
188 ex.returncode) | |
189 exit_code = ex.returncode | |
190 except Exception as ex: | |
191 # Some other problem | |
192 sys.stderr.write("Unexpected error: %s\n" % str(ex)) | |
193 | |
194 # Write out the list of outputs | |
195 outputs_file = "Pipeline_outputs.txt" | |
196 list_outputs(outputs_file) | |
197 | |
198 # Check for log file | |
199 log_file = "Amplicon_analysis_pipeline.log" | |
200 if os.path.exists(log_file): | |
201 print "Found log file: %s" % log_file | |
202 if exit_code == 0: | |
203 # Create an HTML file to link to log files etc | |
204 # NB the paths to the files should be correct once | |
205 # copied by Galaxy on job completion | |
206 with open("pipeline_outputs.html","w") as html_out: | |
207 html_out.write("""<html> | |
208 <head> | |
209 <title>Amplicon analysis pipeline: log files</title> | |
210 <head> | |
211 <body> | |
212 <h1>Amplicon analysis pipeline: log files</h1> | |
213 <ul> | |
214 """) | |
215 html_out.write( | |
216 "<li>%s</li>\n" % | |
217 ahref("Amplicon_analysis_pipeline.log", | |
218 type="text/plain")) | |
219 html_out.write( | |
220 "<li>%s</li>\n" % | |
221 ahref("pipeline.log",type="text/plain")) | |
222 html_out.write( | |
223 "<li>%s</li>\n" % | |
224 ahref("Pipeline_outputs.txt", | |
225 type="text/plain")) | |
226 html_out.write( | |
227 "<li>%s</li>\n" % | |
228 ahref("Metatable.html")) | |
229 html_out.write("""<ul> | |
230 </body> | |
231 </html> | |
232 """) | |
233 else: | |
234 # Check for known error messages | |
235 check_errors() | |
236 # Write pipeline stdout to tool stderr | |
237 sys.stderr.write("\nOutput from pipeline:\n") | |
238 with open("pipeline.log",'r') as log: | |
239 sys.stderr.write("%s" % log.read()) | |
240 # Write log file contents to tool log | |
241 print "\nAmplicon_analysis_pipeline.log:" | |
242 with open(log_file,'r') as log: | |
243 print "%s" % log.read() | |
244 else: | |
245 sys.stderr.write("ERROR missing log file \"%s\"\n" % | |
246 log_file) | |
247 | |
248 # Handle FastQC boxplots | |
249 print "Amplicon analysis: collating per base quality boxplots" | |
250 with open("fastqc_quality_boxplots.html","w") as quality_boxplots: | |
251 # PHRED value for trimming | |
252 phred_score = 20 | |
253 if args.trimming_threshold is not None: | |
254 phred_score = args.trimming_threshold | |
255 # Write header for HTML output file | |
256 quality_boxplots.write("""<html> | |
257 <head> | |
258 <title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title> | |
259 <head> | |
260 <body> | |
261 <h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1> | |
262 """) | |
263 # Look for raw and trimmed FastQC output for each sample | |
264 for sample_name in sample_names: | |
265 fastqc_dir = os.path.join(sample_name,"FastQC") | |
266 quality_boxplots.write("<h2>%s</h2>" % sample_name) | |
267 for d in ("Raw","cutdapt_sickle/Q%s" % phred_score): | |
268 quality_boxplots.write("<h3>%s</h3>" % d) | |
269 fastqc_html_files = glob.glob( | |
270 os.path.join(fastqc_dir,d,"*_fastqc.html")) | |
271 if not fastqc_html_files: | |
272 quality_boxplots.write("<p>No FastQC outputs found</p>") | |
273 continue | |
274 # Pull out the per-base quality boxplots | |
275 for f in fastqc_html_files: | |
276 boxplot = None | |
277 with open(f) as fp: | |
278 for line in fp.read().split(">"): | |
279 try: | |
280 line.index("alt=\"Per base quality graph\"") | |
281 boxplot = line + ">" | |
282 break | |
283 except ValueError: | |
284 pass | |
285 if boxplot is None: | |
286 boxplot = "Missing plot" | |
287 quality_boxplots.write("<h4>%s</h4><p>%s</p>" % | |
288 (os.path.basename(f), | |
289 boxplot)) | |
290 quality_boxplots.write("""</body> | |
291 </html> | |
292 """) | |
293 | |
294 # Handle additional output when categories file was supplied | |
295 if args.categories_file is not None: | |
296 # Alpha diversity boxplots | |
297 print "Amplicon analysis: indexing alpha diversity boxplots" | |
298 boxplots_dir = os.path.abspath( | |
299 os.path.join("RESULTS", | |
300 "%s_%s" % (args.pipeline.title(), | |
301 ("gg" if not args.use_silva | |
302 else "silva")), | |
303 "Alpha_diversity", | |
304 "Alpha_diversity_boxplot", | |
305 "Categories_shannon")) | |
306 print "Amplicon analysis: gathering PDFs from %s" % boxplots_dir | |
307 boxplot_pdfs = [os.path.basename(pdf) | |
308 for pdf in | |
309 sorted(glob.glob( | |
310 os.path.join(boxplots_dir,"*.pdf")))] | |
311 with open("alpha_diversity_boxplots.html","w") as boxplots_out: | |
312 boxplots_out.write("""<html> | |
313 <head> | |
314 <title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title> | |
315 <head> | |
316 <body> | |
317 <h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1> | |
318 """) | |
319 boxplots_out.write("<ul>\n") | |
320 for pdf in boxplot_pdfs: | |
321 boxplots_out.write("<li>%s</li>\n" % ahref(pdf)) | |
322 boxplots_out.write("<ul>\n") | |
323 boxplots_out.write("""</body> | |
324 </html> | |
325 """) | |
326 | |
327 # Finish | |
328 print "Amplicon analysis: finishing, exit code: %s" % exit_code | |
329 sys.exit(exit_code) |