Mercurial > repos > bgruening > htseq_clip
comparison htsc_create_count_table.py @ 0:94a987a7da69 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author | bgruening |
---|---|
date | Tue, 11 Oct 2022 16:09:23 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:94a987a7da69 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import argparse | |
4 import os | |
5 import subprocess | |
6 | |
7 import pysam | |
8 | |
9 """ | |
10 | |
11 Install deseq-clip | |
12 ================== | |
13 | |
14 conda install -c bioconda pysam | |
15 conda install -c bioconda htseq | |
16 pip install htseq-clip | |
17 | |
18 Or directly by: | |
19 conda install -c bioconda htseq-clip | |
20 | |
21 | |
22 Test call | |
23 ========= | |
24 | |
25 python htsc_create_count_table.py --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper | |
26 | |
27 Compare: | |
28 diff test-data/Rbp_count_matrix.exp.txt test_create_count_table_out/count_matrix.txt | |
29 diff test-data/sample_info.exp.txt test_create_count_table_out/sample_info.txt | |
30 | |
31 This corresponds to: | |
32 htseq-clip extract -i test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 | |
33 htseq-clip extract -i test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 | |
34 htseq-clip extract -i test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 | |
35 htseq-clip count -i test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep1.Synechocystis_pSYSM.csv | |
36 htseq-clip count -i test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep2.Synechocystis_pSYSM.csv | |
37 htseq-clip count -i test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_ctrl_rep1.Synechocystis_pSYSM.csv | |
38 htseq-clip createMatrix -i test_create_count_table_out/counts_Rbp -o test_create_count_table_out/Rbp_count_matrix.txt -b Rbp | |
39 | |
40 | |
41 To get BAM content for single chromosome: | |
42 samtools view -b -h bam_all/Rbp3_uv_rep1.bam Synechocystis_pSYSM > test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam | |
43 | |
44 More tests: | |
45 python htsc_create_count_table.py --hce-f test-data/chr_names.txt --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper | |
46 | |
47 DEWSeq input files: | |
48 test_create_count_table_out/Rbp_count_matrix.txt | |
49 test_create_count_table_out/sample_info.txt | |
50 | |
51 """ | |
52 | |
53 | |
54 ################################################################################ | |
55 | |
56 def setup_argument_parser(): | |
57 """Setup argparse parser.""" | |
58 help_description = """ | |
59 Use htseq-clip to extract crosslink sites from BAM files, and count | |
60 overlaps with window regions. Finally create an R count matrix, e.g. as | |
61 input for DEWSeq. | |
62 """ | |
63 # Define argument parser. | |
64 p = argparse.ArgumentParser(add_help=False, | |
65 prog="htsc_create_count_table.py", | |
66 description=help_description, | |
67 formatter_class=argparse.MetavarTypeHelpFormatter) | |
68 | |
69 # Required arguments. | |
70 p.add_argument("-h", "--help", | |
71 action="help", | |
72 help="Print help message") | |
73 p.add_argument("--win-bed", | |
74 dest="in_win_bed", | |
75 type=str, | |
76 metavar='str', | |
77 required=True, | |
78 help="Sliding windows BED annotation file created with htseq-clip createSlidingWindows (also accepts .bed.gz)") | |
79 p.add_argument("--exp-bams", | |
80 dest="exp_bam_list", | |
81 type=str, | |
82 metavar='str', | |
83 nargs='+', | |
84 required=True, | |
85 help="List of IP BAM files (--exp-bams ip_rep1.bam ip_rep2.bam .. )") | |
86 p.add_argument("--ctr-bams", | |
87 dest="ctr_bam_list", | |
88 type=str, | |
89 metavar='str', | |
90 nargs='+', | |
91 required=True, | |
92 help="List of control (SM input) BAM files (--ctr-bams smi_rep1.bam smi_rep2.bam .. )") | |
93 p.add_argument("--out", | |
94 dest="out_folder", | |
95 type=str, | |
96 metavar='str', | |
97 required=True, | |
98 help="Results output folder") | |
99 # htseq-clip extract. | |
100 p.add_argument("--hce-e", | |
101 dest="hce_e", | |
102 type=int, | |
103 default=1, | |
104 choices=[1, 2], | |
105 help="htseq-clip extract -e parameter. This selects read/mate to extract crosslink sites from (default: 1)") | |
106 p.add_argument("--hce-s", | |
107 dest="hce_s", | |
108 type=str, | |
109 default="m", | |
110 help="htseq-clip extract -s parameter. Choose crosslink site (s: start, m: middle, e: end ... ) (default: m)") | |
111 p.add_argument("--hce-g", | |
112 dest="hce_g", | |
113 type=int, | |
114 metavar='int', | |
115 default=0, | |
116 help="htseq-clip extract -g parameter. Number of nucleotides to offset for crosslink sites (default: 0)") | |
117 p.add_argument("--hce-q", | |
118 dest="hce_q", | |
119 type=int, | |
120 metavar='int', | |
121 default=10, | |
122 help="htseq-clip extract -q parameter. Minimum alignment quality for filtering (default: 10)") | |
123 p.add_argument("--hce-primary", | |
124 dest="hce_primary", | |
125 default=False, | |
126 action="store_true", | |
127 help="htseq-clip extract --primary parameter. Flag to use only primary positions of multimapping reads (default: False)") | |
128 p.add_argument("--hce-c", | |
129 dest="hce_c", | |
130 type=int, | |
131 metavar='int', | |
132 default=1, | |
133 help="htseq-clip extract -c parameter. Number of threads/cores to use (default: 1)") | |
134 p.add_argument("--hce-m", | |
135 dest="hce_m", | |
136 type=int, | |
137 metavar='int', | |
138 default=0, | |
139 help="htseq-clip extract -m parameter. Minimum read length for filtering (default: 0)") | |
140 p.add_argument("--hce-x", | |
141 dest="hce_x", | |
142 type=int, | |
143 metavar='int', | |
144 default=500, | |
145 help="htseq-clip extract -x parameter. Maximum read length for filtering (default: 500)") | |
146 p.add_argument("--hce-l", | |
147 dest="hce_l", | |
148 type=int, | |
149 metavar='int', | |
150 default=10000, | |
151 help="htseq-clip extract -l parameter. Maximum read interval length (default: 10000)") | |
152 p.add_argument("--hce-f", | |
153 dest="hce_f", | |
154 type=str, | |
155 help="htseq-clip extract -f parameter. Extract crosslink sites only from chromosomes given in this file (one chromosome ID per line)") | |
156 # htseq-clip count. | |
157 p.add_argument("--hcc-unstranded", | |
158 dest="hcc_unstranded", | |
159 default=False, | |
160 action="store_true", | |
161 help="htseq-clip count --unstranded parameter. Use this flag for non strand specific crosslink site counting (default: strand-specific counting)") | |
162 # More. | |
163 p.add_argument("--data-id", | |
164 dest="dataset_id", | |
165 type=str, | |
166 default="RBP", | |
167 metavar='str', | |
168 help="Dataset ID used as prefix for naming datasets (default: RBP)") | |
169 p.add_argument("--filter-bed", | |
170 dest="filter_bed", | |
171 type=str, | |
172 metavar='str', | |
173 help="Provide BED file to filter out BAM reads overlapping with --filter-bed regions") | |
174 p.add_argument("--filter-mode", | |
175 dest="filter_mode", | |
176 type=int, | |
177 default=1, | |
178 choices=[1, 2], | |
179 help="Filter mode for --filter-bed file. 1: keep BAM reads not overlapping with --filter-bed regions. 2: Keep only BAM reads overlapping with --filter-bed (default: 1)") | |
180 p.add_argument("--no-zipper", | |
181 dest="no_zipper", | |
182 default=False, | |
183 action="store_true", | |
184 help="Do not gzip output files (default: False)") | |
185 p.add_argument("--keep-imf", | |
186 dest="keep_imf", | |
187 default=False, | |
188 action="store_true", | |
189 help="Keep intermediate files (filtered BAM, BED, CSV) (default: False)") | |
190 return p | |
191 | |
192 | |
193 ################################################################################ | |
194 | |
195 def file_make_symbolic_link(file, file_link, | |
196 force_overwrite=True): | |
197 """ | |
198 Create a symbolic file link file_link (ln -s) of file. | |
199 | |
200 force_overwrite: | |
201 Overwrites existing file link. | |
202 | |
203 """ | |
204 assert os.path.exists(file), "file does not exist" | |
205 # Get absolute path needed for symlink to work. | |
206 file_abs_path = os.path.abspath(file) | |
207 | |
208 check_cmd = "ln -s " | |
209 if force_overwrite: | |
210 check_cmd += "-f " | |
211 check_cmd = check_cmd + file_abs_path + " " + file_link | |
212 output = subprocess.getoutput(check_cmd) | |
213 error = False | |
214 if output: | |
215 error = True | |
216 assert error is False, "ln has problems with your input:\n%s\n%s" % (check_cmd, output) | |
217 | |
218 | |
219 ################################################################################ | |
220 | |
221 def bam_remove_overlap_region_reads(in_bed, in_bam, out_bam, | |
222 params="", | |
223 sort_bed=False): | |
224 | |
225 """ | |
226 Remove BAM reads from in_bam, based on overlap with in_bed BED regions. | |
227 I.e. only BAM reads not overlapping with in_bed regions are stored in | |
228 out_bam. | |
229 | |
230 Using intersectBed instead of samtools view -L, which allows -s for strand | |
231 specific filtering. | |
232 | |
233 """ | |
234 assert os.path.exists(in_bed), "in_bed does not exist" | |
235 assert os.path.exists(in_bam), "in_bam does not exist" | |
236 | |
237 if sort_bed: | |
238 check_cmd = "sort -k1,1 -k2,2n " + in_bed + " | " + "intersectBed -abam " + in_bam + " -b stdin " + params + " -sorted > " + out_bam | |
239 else: | |
240 check_cmd = "intersectBed -abam " + in_bam + " -b " + in_bed + " " + params + " > " + out_bam | |
241 output = subprocess.getoutput(check_cmd) | |
242 error = False | |
243 if output: | |
244 error = True | |
245 assert error is False, "intersectBed has problems with your input:\n%s\n%s" % (check_cmd, output) | |
246 | |
247 | |
248 ################################################################################ | |
249 | |
250 if __name__ == '__main__': | |
251 | |
252 parser = setup_argument_parser() | |
253 args = parser.parse_args() | |
254 | |
255 assert os.path.exists(args.in_win_bed), "--win-bed file \"%s\" not found" % (args.in_win_bed) | |
256 | |
257 for bam_file in args.exp_bam_list: | |
258 assert os.path.exists(bam_file), "--exp-bams BAM file \"%s\" not found" % (bam_file) | |
259 | |
260 for bam_file in args.ctr_bam_list: | |
261 assert os.path.exists(bam_file), "--ctr-bams BAM file \"%s\" not found" % (bam_file) | |
262 | |
263 # assert len(args.exp_bam_list) > 1, "# --exp-bams needs to be > 1" | |
264 | |
265 if args.filter_bed: | |
266 assert os.path.exists(args.filter_bed), "--filter-bed file \"%s\" not found" % (args.filter_bed) | |
267 | |
268 hce_s_dic = {"s": 1, "i": 1, "d": 1, "m": 1, "e": 1} | |
269 assert args.hce_s in hce_s_dic, "invalid --hce-s given (choose between {s,i,d,m,e})" | |
270 | |
271 # Crop dataset ID. | |
272 data_id = args.dataset_id | |
273 if len(args.dataset_id) > 20: | |
274 data_id = args.dataset_id[:20] | |
275 # Remove spaces. | |
276 data_id = data_id.replace(" ", "_") | |
277 | |
278 # Output folders. | |
279 if not os.path.exists(args.out_folder): | |
280 os.makedirs(args.out_folder) | |
281 counts_folder = args.out_folder + "/counts_%s" % (data_id) | |
282 if not os.path.exists(counts_folder): | |
283 os.makedirs(counts_folder) | |
284 | |
285 """ | |
286 Create BAM index files. | |
287 | |
288 """ | |
289 print("# experiment BAMs: %i" % (len(args.exp_bam_list))) | |
290 print("# control BAMs: %i" % (len(args.ctr_bam_list))) | |
291 | |
292 exp_bams = [] | |
293 control_bams = [] | |
294 exp_ids = [] | |
295 control_ids = [] | |
296 params = "-s -v" | |
297 if args.filter_mode == 2: | |
298 params = "-s -u" | |
299 intermediate_files = [] | |
300 | |
301 print("Create BAM index files ... ") | |
302 for i, bam_file in enumerate(args.exp_bam_list): | |
303 | |
304 idx = i + 1 | |
305 | |
306 out_bam = args.out_folder + "/%s_exp_rep%i.bam" % (data_id, idx) | |
307 | |
308 if args.filter_bed: | |
309 print("Filter %s by --filter-bed ... " % (bam_file)) | |
310 bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params) | |
311 intermediate_files.append(out_bam) | |
312 else: | |
313 # out_bam = bam_file | |
314 file_make_symbolic_link(bam_file, out_bam) | |
315 # shutil.move(bam_file, out_bam) | |
316 | |
317 pysam.index(out_bam) | |
318 exp_bams.append(out_bam) | |
319 exp_ids.append("%s_exp_rep%i" % (data_id, idx)) | |
320 | |
321 for i, bam_file in enumerate(args.ctr_bam_list): | |
322 | |
323 idx = i + 1 | |
324 | |
325 out_bam = args.out_folder + "/%s_ctrl_rep%i.bam" % (data_id, idx) | |
326 | |
327 if args.filter_bed: | |
328 print("Filter %s by --filter-bed ... " % (bam_file)) | |
329 bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params) | |
330 intermediate_files.append(out_bam) | |
331 else: | |
332 # out_bam = bam_file | |
333 file_make_symbolic_link(bam_file, out_bam) | |
334 # shutil.move(bam_file, out_bam) | |
335 | |
336 pysam.index(out_bam) | |
337 control_bams.append(out_bam) | |
338 control_ids.append("%s_ctrl_rep%i" % (data_id, idx)) | |
339 | |
340 """ | |
341 htseq-clip extract crosslink sites. | |
342 | |
343 htseq-clip extract -i bam/Rbp3_total_rep1.bam -e 1 -s m -o sites/Rbp3_total_rep1.bed | |
344 | |
345 hce_e : -e [1,2] %i | |
346 hce_s : -s %s | |
347 -s {s,i,d,m,e}, --site {s,i,d,m,e} | |
348 Crosslink site choices, must be one of: s, i, d, m, e | |
349 s: start site | |
350 i: insertion site | |
351 d: deletion site | |
352 m: middle site | |
353 e: end site (default: e). | |
354 hce_g : -g offset nt %i | |
355 hce_q : -q 10 | |
356 hce_primary : --primary | |
357 hce_c : -c 4 | |
358 hce_m : -m 0 | |
359 hce_x : -x 500 | |
360 hce_l : -l 10000 | |
361 hce_f : -f chr_id file | |
362 | |
363 """ | |
364 | |
365 print("Extract crosslink sites from BAM files ... ") | |
366 | |
367 extract_params = " -e %i -s %s -g %i -q %i -c %i -m %i -x %i -l %i" % (args.hce_e, args.hce_s, args.hce_g, args.hce_q, args.hce_c, args.hce_m, args.hce_x, args.hce_l) | |
368 if args.hce_primary: | |
369 extract_params += " --primary " | |
370 if args.hce_f: | |
371 assert os.path.exists(args.hce_f), "--hce-f file \"%s\" not found" % (args.hce_f) | |
372 extract_params += " -f %s " % (args.hce_f) | |
373 | |
374 # Experiment BAMs. | |
375 exp_beds = [] | |
376 for i, bam_file in enumerate(exp_bams): | |
377 | |
378 idx = i + 1 | |
379 | |
380 out_bed = args.out_folder + "/%s_exp_rep%i.bed" % (data_id, idx) | |
381 | |
382 check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params | |
383 output = subprocess.getoutput(check_cmd) | |
384 | |
385 print(check_cmd) | |
386 print(output) | |
387 | |
388 assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed) | |
389 exp_beds.append(out_bed) | |
390 intermediate_files.append(out_bed) | |
391 | |
392 # Control BAMs. | |
393 control_beds = [] | |
394 for i, bam_file in enumerate(control_bams): | |
395 | |
396 idx = i + 1 | |
397 | |
398 out_bed = args.out_folder + "/%s_ctrl_rep%i.bed" % (data_id, idx) | |
399 | |
400 check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params | |
401 output = subprocess.getoutput(check_cmd) | |
402 | |
403 print(check_cmd) | |
404 print(output) | |
405 | |
406 assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed) | |
407 control_beds.append(out_bed) | |
408 intermediate_files.append(out_bed) | |
409 | |
410 """ | |
411 Count crosslink sites in sliding windows. | |
412 | |
413 htseq-clip count -i sites/Rbp3_total_rep1.bed -a annotation/Rbp3_uv_total_w75s10.txt -o counts/Rbp3_total_rep1.csv | |
414 | |
415 """ | |
416 | |
417 print("Count reads overlapping with windows ... ") | |
418 | |
419 count_params = "" | |
420 if args.hcc_unstranded: | |
421 count_params += " --unstranded" | |
422 # Experiment BAMs. | |
423 for i, bed_file in enumerate(exp_beds): | |
424 | |
425 idx = i + 1 | |
426 | |
427 out_csv = counts_folder + "/%s_exp_rep%i.csv.gz" % (data_id, idx) | |
428 if args.no_zipper: | |
429 out_csv = counts_folder + "/%s_exp_rep%i.csv" % (data_id, idx) | |
430 | |
431 check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params | |
432 output = subprocess.getoutput(check_cmd) | |
433 | |
434 print(check_cmd) | |
435 print(output) | |
436 | |
437 assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv) | |
438 intermediate_files.append(out_csv) | |
439 | |
440 # Control BAMs. | |
441 for i, bed_file in enumerate(control_beds): | |
442 | |
443 idx = i + 1 | |
444 | |
445 out_csv = counts_folder + "/%s_ctrl_rep%i.csv.gz" % (data_id, idx) | |
446 if args.no_zipper: | |
447 out_csv = counts_folder + "/%s_ctrl_rep%i.csv" % (data_id, idx) | |
448 | |
449 check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params | |
450 output = subprocess.getoutput(check_cmd) | |
451 | |
452 print(check_cmd) | |
453 print(output) | |
454 | |
455 assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv) | |
456 intermediate_files.append(out_csv) | |
457 | |
458 """ | |
459 Create an R friendly matrix file. | |
460 | |
461 htseq-clip createMatrix -i counts/ -b Rbp3 -o counts/Rbp3_uv_total_w75s10_counts.txt.gz | |
462 | |
463 """ | |
464 | |
465 print("Create R-friendly count matrix file ... ") | |
466 | |
467 # out_matrix = args.out_folder + "/%s_count_matrix.txt.gz" %(data_id) | |
468 out_matrix = args.out_folder + "/count_matrix.txt.gz" | |
469 if args.no_zipper: | |
470 # out_matrix = args.out_folder + "/%s_count_matrix.txt" %(data_id) | |
471 out_matrix = args.out_folder + "/count_matrix.txt" | |
472 | |
473 check_cmd = "htseq-clip createMatrix -i " + counts_folder + " -b " + data_id + " -o " + out_matrix | |
474 output = subprocess.getoutput(check_cmd) | |
475 | |
476 print(check_cmd) | |
477 print(output) | |
478 | |
479 assert os.path.exists(out_matrix), "htseq-clip createMatrix -o file \"%s\" not found" % (out_matrix) | |
480 | |
481 """ | |
482 Create sample info file for DEWSeq. | |
483 | |
484 Sample Info file format: | |
485 Sample name Sample type | |
486 Rbp3_uv_rep1 IP | |
487 Rbp3_uv_rep2 IP | |
488 Rbp3_uv_rep3 IP | |
489 Rbp3_total_rep1 SMI | |
490 Rbp3_total_rep2 SMI | |
491 Rbp3_total_rep3 SMI | |
492 | |
493 """ | |
494 | |
495 print("Write sample info file ... ") | |
496 sample_info_file = args.out_folder + "/sample_info.txt" | |
497 OUTTAB = open(sample_info_file, "w") | |
498 OUTTAB.write("Sample name\tSample type\n") | |
499 for sid in exp_ids: | |
500 OUTTAB.write("%s\tIP\n" % (sid)) | |
501 for sid in control_ids: | |
502 OUTTAB.write("%s\tSMI\n" % (sid)) | |
503 OUTTAB.close() | |
504 | |
505 """ | |
506 Delete intermediate files. | |
507 | |
508 """ | |
509 if not args.keep_imf: | |
510 print("Delete intermediate files ... ") | |
511 for imf in intermediate_files: | |
512 if os.path.exists(imf): | |
513 os.remove(imf) | |
514 | |
515 """ | |
516 Report. | |
517 | |
518 """ | |
519 | |
520 print("") | |
521 print("OUTPUT FILES") | |
522 print("============") | |
523 | |
524 print("Count matrix file (DEWSeq input file):\n%s" % (out_matrix)) | |
525 print("Sample info file (DEWSeq input file):\n%s" % (sample_info_file)) | |
526 print("") |