comparison dewseq_wrapper.py @ 0:e1cb2e012307 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/dewseq commit 71db0e65b3b306904ae2b17ce3de677244aea776"
author rnateam
date Thu, 20 Oct 2022 08:18:30 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e1cb2e012307
1 #!/usr/bin/env python3
2
3 import argparse
4 import os
5 import re
6 import shutil
7 import subprocess
8
9
10 """
11 DEWSeq wrapper script dependencies:
12 conda install -c bioconda bioconductor-dewseq
13 conda install -c conda-forge r-rmarkdown
14 conda install -c bioconda bioconductor-biocstyle
15 conda install -c conda-forge r-tidyverse
16 conda install -c bioconda r-ggrepel
17 conda install -c bioconda bioconductor-ihw
18
19 Wrapper for DEWSeq R markdown file:
20 https://github.com/EMBL-Hentze-group/DEWSeq_analysis_helpers/tree/master/Parametrized_Rmd
21
22
23 Test runs
24 =========
25
26 # This reports 150 significant regions.
27 python dewseq_wrapper.py --annot test-data/windows.exp.txt --matrix test-data/Rbp_count_matrix.exp.txt --info test-data/sample_info.exp.txt --out test_out --ds-pvc 0.5
28
29 # Whereas, with LRT, DEWSeq reports just one significant region.
30 python dewseq_wrapper.py --annot test-data/windows.exp.txt --matrix test-data/Rbp_count_matrix.exp.txt --info test-data/sample_info.exp.txt --out test2_out --ds-pvc 0.5 --ds-use-lrt
31
32 """
33
34 ################################################################################
35
36
def setup_argument_parser():
    """Set up and return the argparse argument parser.

    Defines all CLI options of the DEWSeq wrapper. Required inputs are the
    htseq-clip outputs (--annot, --matrix, --info) plus the results folder
    (--out); all remaining options tune the DEWSeq R markdown parameters.
    """
    help_description = """
Wrapping DEWSeq R markdown script, to call peak regions on the CLIP-seq data,
preprocessed by htseq-clip.
"""
    # Define argument parser.
    p = argparse.ArgumentParser(add_help=False,
                                prog="dewseq_wrapper.py",
                                description=help_description,
                                formatter_class=argparse.MetavarTypeHelpFormatter)

    # Required arguments.
    p.add_argument("-h", "--help",
                   action="help",
                   help="Print help message")
    p.add_argument("--annot",
                   dest="in_annot",
                   type=str,
                   metavar='str',
                   required=True,
                   help="DEWseq annotation file, i.e. windows mapped to IDs table file (output of htseq-clip mapToId)")
    p.add_argument("--matrix",
                   dest="in_matrix",
                   type=str,
                   metavar='str',
                   required=True,
                   help="DEWseq count matrix file (output of htseq-clip createMatrix)")
    p.add_argument("--info",
                   dest="in_info",
                   type=str,
                   metavar='str',
                   required=True,
                   help="DEWseq sample information file (output of htsc_create_count_table.py / htseq-clip Create count table Galaxy wrapper)")
    p.add_argument("--out",
                   dest="out_folder",
                   type=str,
                   metavar='str',
                   required=True,
                   help="Results output folder")
    p.add_argument("--ds-ms",
                   dest="ds_ms",
                   type=int,
                   metavar='int',
                   default=2,
                   help="DEWSeq min_sample parameter. Keep only the windows with at least min_sample number of samples with crosslink site count > min_count (default: 2)")
    p.add_argument("--ds-mc",
                   dest="ds_mc",
                   type=int,
                   metavar='int',
                   default=2,
                   help="DEWSeq min_count parameter. Minimum crosslink site per window per sample (default: 2)")
    p.add_argument("--ds-pvc",
                   dest="ds_pvc",
                   type=float,
                   metavar='float',
                   default=0.1,
                   help="DEWSeq p_value_cutoff parameter. p adjusted value threshold for significant windows (default: 0.1)")
    p.add_argument("--ds-lfcc",
                   dest="ds_lfcc",
                   type=float,
                   metavar='float',
                   default=1,
                   help="DEWSeq lfc_cutoff parameter. Log2 fold change threshold for significant windows (default: 1)")
    p.add_argument("--ds-oc",
                   dest="ds_use_oc",
                   default=False,
                   action="store_true",
                   help="DEWSeq overlap_correction parameter. By default FALSE, i.e., do not adjust p-value for overlapping windows. If TRUE use Bonferroni family wise error rate correction on overlapping sliding windows (default: false)")
    # NOTE: the original option strings below contain a double-dash typo
    # ("--ds--disable-..."). They are kept first for backward compatibility
    # (existing callers, e.g. the Galaxy wrapper, still work); the correctly
    # spelled "--ds-disable-..." aliases are added as a fix.
    p.add_argument("--ds--disable-ihw", "--ds-disable-ihw",
                   dest="ds_disable_ihw",
                   default=False,
                   action="store_true",
                   help="Disable DEWSeq IHW parameter. By default, use IHW for multiple testing correction instead of default BH (Benjamini Hochberg). NOTE: We recommend using IHW instead of default BH for FDR correction")
    p.add_argument("--ds--disable-df", "--ds-disable-df",
                   dest="ds_disable_df",
                   default=False,
                   action="store_true",
                   help="Disable DEWSeq decide_fit parameter. By default, decide on dispersion estimation fit type local or parametric. If disabled, Use parametric fit. NOTE: decide_fit=TRUE will fit data using both parametric and local fit types and will choose the best fit of the two (see DESeq2 vignette for details). Typically, this should give better results compared to using the default fit type parametric. But, keep in mind that this will also increase the total run time")
    p.add_argument("--ds-use-lrt",
                   dest="ds_use_lrt",
                   default=False,
                   action="store_true",
                   help="DEWSeq LRT parameter. Use LRT if the given value is TRUE (see DESeq2 vignette for details). By default, DEWSeq uses Wald test. NOTE: In our experience, LRT is more accurate than Wald test. But, keep in mind that LRT is a stringent test in comparison to Wald. So if your protein of interest is a very active binder, run the analysis with LRT=TRUE, otherwise use it with caution as you may end up with no significant windows or regions in your final output")
    p.add_argument("--ds-id",
                   dest="ds_id",
                   type=str,
                   default="RBP",
                   metavar='str',
                   help="DEWSeq dataset ID for output report (default: RBP)")
    p.add_argument("--ds-markdown",
                   dest="ds_markdown",
                   type=str,
                   default="analyseStudy.Rmd",
                   metavar='str',
                   help="Provide path to DEWSeq R markdown file. By default assumed to be in working directory")
    p.add_argument("--copy-md",
                   dest="copy_markdown",
                   default=False,
                   action="store_true",
                   help="Copy DEWSeq R markdown file to output directory and execute [sic!] it there (default: False)")
    return p
139
140
141 ################################################################################
142
def count_file_rows(in_file,
                    nr_cols=False):
    """
    Count number of file rows. If nr_cols set, demand certain (nr_cols) number
    of columns (separated by tab), in order for row to be counted.

    """
    c = 0
    with open(in_file) as f:
        for line in f:
            if nr_cols:
                # Count only rows with exactly nr_cols tab-separated columns.
                if len(line.strip().split("\t")) == nr_cols:
                    c += 1
            else:
                c += 1
    # NOTE: the original dangling "f.closed" statement was a no-op (attribute
    # access, not a close() call); the with block already closes the file.
    return c
161
162
163 ################################################################################
164
def check_two_dic_same_keys(d1, d2):
    """
    Check if two dictionaries have same keys.

    Both dictionaries must be non-empty.

    >>> d1 = {'k1': 1, 'k2': 2}
    >>> d2 = {'k1': 3, 'k2': 4}
    >>> check_two_dic_same_keys(d1, d2)
    True
    >>> d2 = {'k1': 3, 'k3': 4}
    >>> check_two_dic_same_keys(d1, d2)
    False

    """
    assert d1, "given dictionary d1 empty"
    assert d2, "given dictionary d2 empty"
    # Dict key views support set-like equality: True iff same key sets.
    return d1.keys() == d2.keys()
187
188
189 ################################################################################
190
if __name__ == '__main__':

    parser = setup_argument_parser()
    args = parser.parse_args()

    # Check that all input files (including the R markdown file) exist.
    assert os.path.exists(args.in_annot), "--annot file \"%s\" not found" % (args.in_annot)
    assert os.path.exists(args.in_matrix), "--matrix file \"%s\" not found" % (args.in_matrix)
    assert os.path.exists(args.in_info), "--info file \"%s\" not found" % (args.in_info)
    assert os.path.exists(args.ds_markdown), "--ds-markdown file \"%s\" not found" % (args.ds_markdown)

    # Input files as absolute paths (rmarkdown::render runs elsewhere).
    annot_in = os.path.abspath(args.in_annot)
    matrix_in = os.path.abspath(args.in_matrix)
    info_in = os.path.abspath(args.in_info)
    md_in = os.path.abspath(args.ds_markdown)

    # Some sanity checks: dataset IDs in the count matrix header must match
    # the sample IDs in the sample info file.
    matrix_ids = {}

    with open(matrix_in) as f:
        # Only the header row is needed; first column is the window ID,
        # remaining columns are dataset IDs.
        for line in f:
            cols = line.strip().split("\t")
            for c in cols[1:]:
                matrix_ids[c] = 1
            break

    assert matrix_ids, "no dataset columns found in count table file"

    info_ids = {}

    with open(info_in) as f:
        for line in f:
            # Skip the header row; first column holds the sample ID.
            if re.search("^Sample name", line):
                continue
            cols = line.strip().split("\t")
            if cols[0] == "Sample name":
                continue
            else:
                info_ids[cols[0]] = 1

    assert info_ids, "no dataset columns found in info table file"
    assert len(matrix_ids) == len(info_ids), "differing numbers of dataset IDs in count table and info table file"
    assert check_two_dic_same_keys(matrix_ids, info_ids), "dataset IDs in count table and info table file not identical"
    assert args.ds_ms <= len(matrix_ids), "set DEWSeq min_sample > number of data samples in count / info table files (%i > %i)" % (args.ds_ms, len(matrix_ids))
    print("Dataset IDs are valid ... ")

    # Output folder.
    if not os.path.exists(args.out_folder):
        os.makedirs(args.out_folder)

    # Output files.
    abs_path_out = os.path.abspath(args.out_folder)

    # Copy markdown file to results folder (and render it from there).
    if args.copy_markdown:
        md_source = md_in
        md_in = abs_path_out + "/analyseStudy.Rmd"
        if not os.path.exists(md_in):
            print("Copying markdown file to output folder ... ")
            shutil.copyfile(md_source, md_in)

    html_out = abs_path_out + "/report.html"
    win_csv_out = abs_path_out + "/windows.csv"
    sig_reg_csv_out = abs_path_out + "/significant_regions.csv"
    sig_win_reg_bed_out = abs_path_out + "/significant_windows_and_regions.bed"
    sig_reg_bed_out = abs_path_out + "/significant_regions.bed"

    # Delete existing files (as if no peaks found old files would be reported).
    if os.path.exists(html_out):
        os.remove(html_out)
    if os.path.exists(win_csv_out):
        os.remove(win_csv_out)
    if os.path.exists(sig_reg_csv_out):
        os.remove(sig_reg_csv_out)
    if os.path.exists(sig_win_reg_bed_out):
        os.remove(sig_win_reg_bed_out)
    if os.path.exists(sig_reg_bed_out):
        os.remove(sig_reg_bed_out)

    """
    Run DEWSeq R markdown file.

    """

    # Translate boolean CLI flags into R TRUE/FALSE literals.
    md_ihw = "TRUE"
    md_df = "TRUE"
    md_lrt = "FALSE"
    md_olc = "FALSE"
    if args.ds_disable_ihw:
        md_ihw = "FALSE"
    if args.ds_disable_df:
        md_df = "FALSE"
    if args.ds_use_lrt:
        md_lrt = "TRUE"
    if args.ds_use_oc:
        md_olc = "TRUE"

    # Assemble the Rscript rmarkdown::render() call. NOTE(review): the command
    # is run through a shell, so paths containing quotes/spaces could break or
    # inject into it — inputs here come from the Galaxy wrapper, but confirm.
    md_cmd = "Rscript -e 'rmarkdown::render("
    md_cmd += 'input = "%s", ' % (md_in)
    md_cmd += 'output_file = "%s", ' % (html_out)
    md_cmd += 'params = list(protein = "%s", ' % (args.ds_id)
    md_cmd += 'sampleinfo_file = "%s", ' % (info_in)
    md_cmd += 'countmatrix_file = "%s", ' % (matrix_in)
    md_cmd += 'annotation_file = "%s", ' % (annot_in)
    md_cmd += 'output_windows_file = "%s", ' % (win_csv_out)
    md_cmd += 'output_regions_file = "%s", ' % (sig_reg_csv_out)
    md_cmd += 'output_bed_file = "%s", ' % (sig_win_reg_bed_out)
    md_cmd += 'min_count = %i, ' % (args.ds_mc)
    md_cmd += 'min_sample = %i, ' % (args.ds_ms)
    md_cmd += 'p_value_cutoff = %s, ' % (str(args.ds_pvc))
    md_cmd += 'lfc_cutoff = %s, ' % (str(args.ds_lfcc))
    md_cmd += 'overlap_correction = %s, ' % (md_olc)
    md_cmd += 'IHW = %s, ' % (md_ihw)
    md_cmd += 'decide_fit = %s, ' % (md_df)
    md_cmd += 'LRT = %s))' % (md_lrt)
    md_cmd += "'"

    print("Running DEWSeq R markdown file ... ")
    print(md_cmd)
    output = subprocess.getoutput(md_cmd)
    print(output)

    # Report which expected output files were produced.
    print("")
    print("Checking for output files ... ")
    if os.path.exists(html_out):
        print("FOUND HTML report file \"%s\" ... " % (html_out))
    else:
        print("MISSING HTML report file \"%s\" ... " % (html_out))
    if os.path.exists(win_csv_out):
        print("FOUND windows CSV file \"%s\" ... " % (win_csv_out))
    else:
        print("MISSING windows CSV file \"%s\" ... " % (win_csv_out))
    if os.path.exists(sig_reg_csv_out):
        print("FOUND significant regions CSV file \"%s\" ... " % (sig_reg_csv_out))
    else:
        print("MISSING significant regions CSV file \"%s\" ... " % (sig_reg_csv_out))

    assert os.path.exists(html_out) and os.path.exists(win_csv_out), "DEWSeq terminated / did not produce any output files. This could be due to too strict filter settings (e.g., min_sample, min_count ... ). Please try again with more relaxed settings"
    assert os.path.exists(html_out), "output file \"%s\" not found" % (html_out)
    assert os.path.exists(win_csv_out), "output file \"%s\" not found" % (win_csv_out)

    print("")
    if not os.path.exists(sig_reg_csv_out):
        # The regions CSV is only written when DEWSeq finds significant regions.
        print("WARNING: no significant regions found! (missing \"%s\" file)" % (sig_reg_csv_out))
    else:
        assert os.path.exists(sig_win_reg_bed_out), "output file \"%s\" not found" % (sig_win_reg_bed_out)
        c_sig_reg = count_file_rows(sig_reg_csv_out) - 1  # minus header row.
        print("# significant regions: %i" % (c_sig_reg))
        # Save contiguous BED regions in separate file.
        # Column indices below follow the significant_regions.csv layout
        # written by the DEWSeq R markdown — confirm against the Rmd if it
        # changes upstream.
        with open(sig_reg_bed_out, "w") as OUTBED, open(sig_reg_csv_out) as f:
            for line in f:
                cols = line.strip().split("\t")
                if cols[1] == "region_begin":
                    continue  # header row.
                chr_id = cols[0]
                reg_s = cols[1]
                reg_e = cols[2]
                pol = cols[3]
                win_in_reg = int(cols[4])
                padj_mean = cols[7]
                logfc_mean = cols[10]
                reg_id = cols[12]
                new_reg_id = reg_id
                # For regions consisting of > 1 window.
                if win_in_reg > 1:
                    new_reg_id = reg_id + "@region"
                # Print out BED with additional columns (padj, logfc).
                OUTBED.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\n" % (chr_id, reg_s, reg_e, new_reg_id, pol, padj_mean, logfc_mean))

    """
    Report.

    """

    print("")
    print("OUTPUT FILES")
    print("============")
    print("HTML report:\n%s" % (html_out))
    print("Windows CSV:\n%s" % (win_csv_out))
    if os.path.exists(sig_reg_csv_out):
        print("Significant regions CSV:\n%s" % (sig_reg_csv_out))
    if os.path.exists(sig_win_reg_bed_out):
        print("Significant windows + regions BED:\n%s" % (sig_win_reg_bed_out))
    if os.path.exists(sig_reg_bed_out):
        print("Significant regions BED:\n%s" % (sig_reg_bed_out))
    print("")