Mercurial > repos > rnateam > dewseq
comparison dewseq_wrapper.py @ 0:e1cb2e012307 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/dewseq commit 71db0e65b3b306904ae2b17ce3de677244aea776"
author | rnateam |
---|---|
date | Thu, 20 Oct 2022 08:18:30 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e1cb2e012307 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import argparse | |
4 import os | |
5 import re | |
6 import shutil | |
7 import subprocess | |
8 | |
9 | |
10 """ | |
11 DEWSeq wrapper script dependencies: | |
12 conda install -c bioconda bioconductor-dewseq | |
13 conda install -c conda-forge r-rmarkdown | |
14 conda install -c bioconda bioconductor-biocstyle | |
15 conda install -c conda-forge r-tidyverse | |
16 conda install -c bioconda r-ggrepel | |
17 conda install -c bioconda bioconductor-ihw | |
18 | |
19 Wrapper for DEWSeq R markdown file: | |
20 https://github.com/EMBL-Hentze-group/DEWSeq_analysis_helpers/tree/master/Parametrized_Rmd | |
21 | |
22 | |
23 Test runs | |
24 ========= | |
25 | |
26 # This reports 150 significant regions. | |
27 python dewseq_wrapper.py --annot test-data/windows.exp.txt --matrix test-data/Rbp_count_matrix.exp.txt --info test-data/sample_info.exp.txt --out test_out --ds-pvc 0.5 | |
28 | |
29 # Wherease, with LRT, DEWSeq reports just one siginificant region. | |
30 python dewseq_wrapper.py --annot test-data/windows.exp.txt --matrix test-data/Rbp_count_matrix.exp.txt --info test-data/sample_info.exp.txt --out test2_out --ds-pvc 0.5 --ds-use-lrt | |
31 | |
32 """ | |
33 | |
34 ################################################################################ | |
35 | |
36 | |
37 def setup_argument_parser(): | |
38 """Setup argparse parser.""" | |
39 help_description = """ | |
40 Wrapping DEWSeq R markdown script, to call peak regions on the CLIP-seq data, | |
41 preprocessed by htseq-clip. | |
42 """ | |
43 # Define argument parser. | |
44 p = argparse.ArgumentParser(add_help=False, | |
45 prog="dewseq_wrapper.py", | |
46 description=help_description, | |
47 formatter_class=argparse.MetavarTypeHelpFormatter) | |
48 | |
49 # Required arguments. | |
50 p.add_argument("-h", "--help", | |
51 action="help", | |
52 help="Print help message") | |
53 p.add_argument("--annot", | |
54 dest="in_annot", | |
55 type=str, | |
56 metavar='str', | |
57 required=True, | |
58 help="DEWseq annotation file, i.e. windows mapped to IDs table file (output of htseq-clip mapToId)") | |
59 p.add_argument("--matrix", | |
60 dest="in_matrix", | |
61 type=str, | |
62 metavar='str', | |
63 required=True, | |
64 help="DEWseq count matrix file (output of htseq-clip createMatrix)") | |
65 p.add_argument("--info", | |
66 dest="in_info", | |
67 type=str, | |
68 metavar='str', | |
69 required=True, | |
70 help="DEWseq sample information file (output of htsc_create_count_table.py / htseq-clip Create count table Galaxy wrapper)") | |
71 p.add_argument("--out", | |
72 dest="out_folder", | |
73 type=str, | |
74 metavar='str', | |
75 required=True, | |
76 help="Results output folder") | |
77 p.add_argument("--ds-ms", | |
78 dest="ds_ms", | |
79 type=int, | |
80 metavar='int', | |
81 default=2, | |
82 help="DEWSeq min_sample parameter. Keep only the windows with at least min_sample number of samples with crosslink site count > min_count (default: 2)") | |
83 p.add_argument("--ds-mc", | |
84 dest="ds_mc", | |
85 type=int, | |
86 metavar='int', | |
87 default=2, | |
88 help="DEWSeq min_count parameter. Minimum crosslink site per window per sample (default: 2)") | |
89 p.add_argument("--ds-pvc", | |
90 dest="ds_pvc", | |
91 type=float, | |
92 metavar='float', | |
93 default=0.1, | |
94 help="DEWSeq p_value_cutoff parameter. p adjusted value threshold for significant windows (default: 0.1)") | |
95 p.add_argument("--ds-lfcc", | |
96 dest="ds_lfcc", | |
97 type=float, | |
98 metavar='float', | |
99 default=1, | |
100 help="DEWSeq lfc_cutoff parameter. Log2 fold change threshold for significant windows (default: 1)") | |
101 p.add_argument("--ds-oc", | |
102 dest="ds_use_oc", | |
103 default=False, | |
104 action="store_true", | |
105 help="DEWSeq overlap_correction parameter. By default FALSE, i.e., do not adjust p-value for overlapping windows. If TRUE use Bonferroni family wise error rate correction on overlapping sliding windows (default: false)") | |
106 p.add_argument("--ds--disable-ihw", | |
107 dest="ds_disable_ihw", | |
108 default=False, | |
109 action="store_true", | |
110 help="Disable DEWSeq IHW parameter. By default, use IHW for multiple tesing correction instead of default BH (Benjamini Hochberg). NOTE: We recommend using IHW instead of default BH for FDR correction") | |
111 p.add_argument("--ds--disable-df", | |
112 dest="ds_disable_df", | |
113 default=False, | |
114 action="store_true", | |
115 help="Disable DEWSeq decide_fit parameter. By default, decide on dispersion estimation fit type local or parametric. If disabled, Use parametric fit. NOTE: decide_fit=TRUE will fit data using both parametric and local fit types and will choose the best fit of the two (see DESeq2 vignette for details). Typically, this should give better results compared to using the default fit type parametric. But, keep in mind that this will also increase the total run time") | |
116 p.add_argument("--ds-use-lrt", | |
117 dest="ds_use_lrt", | |
118 default=False, | |
119 action="store_true", | |
120 help="DEWSeq LRT parameter. Use LRT if the given value is TRUE (see DESeq2 vignette for details). By default, DEWSeq uses Wald test. NOTE: In our experience, LRT is more accurate than Wald test. But, keep in mind that LRT is a stringent test in comparison to Wald. So if your protein of interest is a very active binder, run the analysis with LRT=TRUE, otherwise use it with caution as you may end up with no significant windows or regions in your final output") | |
121 p.add_argument("--ds-id", | |
122 dest="ds_id", | |
123 type=str, | |
124 default="RBP", | |
125 metavar='str', | |
126 help="DEWSeq dataset ID for output report (default: RBP)") | |
127 p.add_argument("--ds-markdown", | |
128 dest="ds_markdown", | |
129 type=str, | |
130 default="analyseStudy.Rmd", | |
131 metavar='str', | |
132 help="Provide path to DEWSeq R markdown file. By default assumed to be in working directory") | |
133 p.add_argument("--copy-md", | |
134 dest="copy_markdown", | |
135 default=False, | |
136 action="store_true", | |
137 help="Copy DEWSeq R markdown file to output directory and execute [sic!] it there (default: False)") | |
138 return p | |
139 | |
140 | |
141 ################################################################################ | |
142 | |
143 def count_file_rows(in_file, | |
144 nr_cols=False): | |
145 """ | |
146 Count number of file rows. If nr_cols set, demand certain (nr_cols) number | |
147 of columns (separated by tab), in order for row to be counted. | |
148 | |
149 """ | |
150 c = 0 | |
151 with open(in_file) as f: | |
152 for line in f: | |
153 cols = line.strip().split("\t") | |
154 if nr_cols: | |
155 if len(cols) == nr_cols: | |
156 c += 1 | |
157 else: | |
158 c += 1 | |
159 f.closed | |
160 return c | |
161 | |
162 | |
163 ################################################################################ | |
164 | |
165 def check_two_dic_same_keys(d1, d2): | |
166 """ | |
167 Check if two dictionaries have same keys. | |
168 | |
169 >>> d1 = {'k1': 1, 'k2': 2} | |
170 >>> d2 = {'k1': 3, 'k2': 4} | |
171 >>> check_two_dic_same_keys(d1, d2) | |
172 True | |
173 >>> d2 = {'k1': 3, 'k3': 4} | |
174 >>> check_two_dic_same_keys(d1, d2) | |
175 False | |
176 | |
177 """ | |
178 assert d1, "given dictionary d1 empty" | |
179 assert d2, "given dictionary d2 empty" | |
180 for k in d1: | |
181 if k not in d2: | |
182 return False | |
183 for k in d2: | |
184 if k not in d1: | |
185 return False | |
186 return True | |
187 | |
188 | |
189 ################################################################################ | |
190 | |
191 if __name__ == '__main__': | |
192 | |
193 parser = setup_argument_parser() | |
194 args = parser.parse_args() | |
195 | |
196 assert os.path.exists(args.in_annot), "--annot file \"%s\" not found" % (args.in_annot) | |
197 assert os.path.exists(args.in_matrix), "--matrix file \"%s\" not found" % (args.in_matrix) | |
198 assert os.path.exists(args.in_info), "--info file \"%s\" not found" % (args.in_info) | |
199 assert os.path.exists(args.ds_markdown), "--ds-markdown file \"%s\" not found" % (args.ds_markdown) | |
200 | |
201 # Input files. | |
202 annot_in = os.path.abspath(args.in_annot) | |
203 matrix_in = os.path.abspath(args.in_matrix) | |
204 info_in = os.path.abspath(args.in_info) | |
205 md_in = os.path.abspath(args.ds_markdown) | |
206 | |
207 # Sum Sanity Checks. | |
208 matrix_ids = {} | |
209 | |
210 with open(matrix_in) as f: | |
211 for line in f: | |
212 cols = line.strip().split("\t") | |
213 for c in cols[1:]: | |
214 matrix_ids[c] = 1 | |
215 break | |
216 f.closed | |
217 | |
218 assert matrix_ids, "no dataset columns found in count table file" | |
219 | |
220 info_ids = {} | |
221 | |
222 with open(info_in) as f: | |
223 for line in f: | |
224 if re.search("^Sample name", line): | |
225 continue | |
226 cols = line.strip().split("\t") | |
227 if cols[0] == "Sample name": | |
228 continue | |
229 else: | |
230 info_ids[cols[0]] = 1 | |
231 f.closed | |
232 | |
233 assert info_ids, "no dataset columns found in info table file" | |
234 assert len(matrix_ids) == len(info_ids), "differing numbers of dataset IDs in count table and info table file" | |
235 assert check_two_dic_same_keys(matrix_ids, info_ids), "dataset IDs in count table and info table file not identical" | |
236 assert args.ds_ms <= len(matrix_ids), "set DEWSeq min_sample > number of data samples in count / info table files (%i > %i)" % (args.ds_ms, len(matrix_ids)) | |
237 print("Dataset IDs are valid ... ") | |
238 | |
239 # Output folder. | |
240 if not os.path.exists(args.out_folder): | |
241 os.makedirs(args.out_folder) | |
242 | |
243 # Output files. | |
244 abs_path_out = os.path.abspath(args.out_folder) | |
245 | |
246 # Copy markdown file to results folder. | |
247 if args.copy_markdown: | |
248 md_source = md_in | |
249 md_in = abs_path_out + "/analyseStudy.Rmd" | |
250 if not os.path.exists(md_in): | |
251 print("Copying markdown file to output folder ... ") | |
252 shutil.copyfile(md_source, md_in) | |
253 | |
254 html_out = abs_path_out + "/report.html" | |
255 win_csv_out = abs_path_out + "/windows.csv" | |
256 sig_reg_csv_out = abs_path_out + "/significant_regions.csv" | |
257 sig_win_reg_bed_out = abs_path_out + "/significant_windows_and_regions.bed" | |
258 sig_reg_bed_out = abs_path_out + "/significant_regions.bed" | |
259 | |
260 # Delete existing files (as if no peaks found old files would be reported). | |
261 if os.path.exists(html_out): | |
262 os.remove(html_out) | |
263 if os.path.exists(win_csv_out): | |
264 os.remove(win_csv_out) | |
265 if os.path.exists(sig_reg_csv_out): | |
266 os.remove(sig_reg_csv_out) | |
267 if os.path.exists(sig_win_reg_bed_out): | |
268 os.remove(sig_win_reg_bed_out) | |
269 if os.path.exists(sig_reg_bed_out): | |
270 os.remove(sig_reg_bed_out) | |
271 | |
272 """ | |
273 Run DEWSeq R markdown file. | |
274 | |
275 """ | |
276 | |
277 md_ihw = "TRUE" | |
278 md_df = "TRUE" | |
279 md_lrt = "FALSE" | |
280 md_olc = "FALSE" | |
281 if args.ds_disable_ihw: | |
282 md_ihw = "FALSE" | |
283 if args.ds_disable_df: | |
284 md_df = "FALSE" | |
285 if args.ds_use_lrt: | |
286 md_lrt = "TRUE" | |
287 if args.ds_use_oc: | |
288 md_olc = "TRUE" | |
289 | |
290 md_cmd = "Rscript -e 'rmarkdown::render(" | |
291 md_cmd += 'input = "%s", ' % (md_in) | |
292 md_cmd += 'output_file = "%s", ' % (html_out) | |
293 md_cmd += 'params = list(protein = "%s", ' % (args.ds_id) | |
294 md_cmd += 'sampleinfo_file = "%s", ' % (info_in) | |
295 md_cmd += 'countmatrix_file = "%s", ' % (matrix_in) | |
296 md_cmd += 'annotation_file = "%s", ' % (annot_in) | |
297 md_cmd += 'output_windows_file = "%s", ' % (win_csv_out) | |
298 md_cmd += 'output_regions_file = "%s", ' % (sig_reg_csv_out) | |
299 md_cmd += 'output_bed_file = "%s", ' % (sig_win_reg_bed_out) | |
300 md_cmd += 'min_count = %i, ' % (args.ds_mc) | |
301 md_cmd += 'min_sample = %i, ' % (args.ds_ms) | |
302 md_cmd += 'p_value_cutoff = %s, ' % (str(args.ds_pvc)) | |
303 md_cmd += 'lfc_cutoff = %s, ' % (str(args.ds_lfcc)) | |
304 md_cmd += 'overlap_correction = %s, ' % (md_olc) | |
305 md_cmd += 'IHW = %s, ' % (md_ihw) | |
306 md_cmd += 'decide_fit = %s, ' % (md_df) | |
307 md_cmd += 'LRT = %s))' % (md_lrt) | |
308 md_cmd += "'" | |
309 | |
310 print("Running DEWSeq R markdown file ... ") | |
311 print(md_cmd) | |
312 output = subprocess.getoutput(md_cmd) | |
313 print(output) | |
314 | |
315 print("") | |
316 print("Checking for output files ... ") | |
317 if os.path.exists(html_out): | |
318 print("FOUND HTML report file \"%s\" ... " % (html_out)) | |
319 else: | |
320 print("MISSING HTML report file \"%s\" ... " % (html_out)) | |
321 if os.path.exists(win_csv_out): | |
322 print("FOUND windows CSV file \"%s\" ... " % (win_csv_out)) | |
323 else: | |
324 print("MISSING windows CSV file \"%s\" ... " % (win_csv_out)) | |
325 if os.path.exists(sig_reg_csv_out): | |
326 print("FOUND significant regions CSV file \"%s\" ... " % (sig_reg_csv_out)) | |
327 else: | |
328 print("MISSING significant regions CSV file \"%s\" ... " % (sig_reg_csv_out)) | |
329 | |
330 assert os.path.exists(html_out) and os.path.exists(win_csv_out), "DEWSeq terminated / did no produce any output files. This could be due to too strict filter settings (e.g., min_sample, min_count ... ). Please try again with more relaxed settings" | |
331 assert os.path.exists(html_out), "output file \"%s\" not found" % (html_out) | |
332 assert os.path.exists(win_csv_out), "output file \"%s\" not found" % (win_csv_out) | |
333 | |
334 print("") | |
335 if not os.path.exists(sig_reg_csv_out): | |
336 print("WARNING: no significant regions found! (missing \"%s\" file)" % (sig_reg_csv_out)) | |
337 else: | |
338 assert os.path.exists(sig_win_reg_bed_out), "output file \"%s\" not found" % (sig_win_reg_bed_out) | |
339 c_sig_reg = count_file_rows(sig_reg_csv_out) - 1 | |
340 print("# significant regions: %i" % (c_sig_reg)) | |
341 # Save contiguous BED regions in separate file. | |
342 OUTBED = open(sig_reg_bed_out, "w") | |
343 with open(sig_reg_csv_out) as f: | |
344 for line in f: | |
345 row = line.strip() | |
346 cols = line.strip().split("\t") | |
347 if cols[1] == "region_begin": | |
348 continue | |
349 chr_id = cols[0] | |
350 reg_s = cols[1] | |
351 reg_e = cols[2] | |
352 pol = cols[3] | |
353 win_in_reg = int(cols[4]) | |
354 padj_mean = cols[7] | |
355 logfc_mean = cols[10] | |
356 reg_id = cols[12] | |
357 new_reg_id = reg_id | |
358 # For regions consisting of > 1 window. | |
359 if win_in_reg > 1: | |
360 new_reg_id = reg_id + "@region" | |
361 # Print out BED with additional columns (padj, logfc). | |
362 OUTBED.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\n" % (chr_id, reg_s, reg_e, new_reg_id, pol, padj_mean, logfc_mean)) | |
363 | |
364 f.closed | |
365 OUTBED.close() | |
366 | |
367 """ | |
368 Report. | |
369 | |
370 """ | |
371 | |
372 print("") | |
373 print("OUTPUT FILES") | |
374 print("============") | |
375 print("HTML report:\n%s" % (html_out)) | |
376 print("Windows CSV:\n%s" % (win_csv_out)) | |
377 if os.path.exists(sig_reg_csv_out): | |
378 print("Significant regions CSV:\n%s" % (sig_reg_csv_out)) | |
379 if os.path.exists(sig_win_reg_bed_out): | |
380 print("Significant windows + regions BED:\n%s" % (sig_win_reg_bed_out)) | |
381 if os.path.exists(sig_reg_bed_out): | |
382 print("Significant regions BED:\n%s" % (sig_reg_bed_out)) | |
383 print("") |