comparison htsc_create_count_table.py @ 0:94a987a7da69 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author bgruening
date Tue, 11 Oct 2022 16:09:23 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:94a987a7da69
1 #!/usr/bin/env python3
2
3 import argparse
4 import os
5 import subprocess
6
7 import pysam
8
9 """
10
11 Install deseq-clip
12 ==================
13
14 conda install -c bioconda pysam
15 conda install -c bioconda htseq
16 pip install htseq-clip
17
18 Or directly by:
19 conda install -c bioconda htseq-clip
20
21
22 Test call
23 =========
24
25 python htsc_create_count_table.py --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper
26
27 Compare:
28 diff test-data/Rbp_count_matrix.exp.txt test_create_count_table_out/count_matrix.txt
29 diff test-data/sample_info.exp.txt test_create_count_table_out/sample_info.txt
30
31 This corresponds to:
32 htseq-clip extract -i test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000
33 htseq-clip extract -i test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000
34 htseq-clip extract -i test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000
35 htseq-clip count -i test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep1.Synechocystis_pSYSM.csv
36 htseq-clip count -i test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep2.Synechocystis_pSYSM.csv
37 htseq-clip count -i test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_ctrl_rep1.Synechocystis_pSYSM.csv
38 htseq-clip createMatrix -i test_create_count_table_out/counts_Rbp -o test_create_count_table_out/Rbp_count_matrix.txt -b Rbp
39
40
41 To get BAM content for single chromosome:
42 samtools view -b -h bam_all/Rbp3_uv_rep1.bam Synechocystis_pSYSM > test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam
43
44 More tests:
45 python htsc_create_count_table.py --hce-f test-data/chr_names.txt --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper
46
47 DEWSeq input files:
48 test_create_count_table_out/Rbp_count_matrix.txt
49 test_create_count_table_out/sample_info.txt
50
51 """
52
53
54 ################################################################################
55
56 def setup_argument_parser():
57 """Setup argparse parser."""
58 help_description = """
59 Use htseq-clip to extract crosslink sites from BAM files, and count
60 overlaps with window regions. Finally create an R count matrix, e.g. as
61 input for DEWSeq.
62 """
63 # Define argument parser.
64 p = argparse.ArgumentParser(add_help=False,
65 prog="htsc_create_count_table.py",
66 description=help_description,
67 formatter_class=argparse.MetavarTypeHelpFormatter)
68
69 # Required arguments.
70 p.add_argument("-h", "--help",
71 action="help",
72 help="Print help message")
73 p.add_argument("--win-bed",
74 dest="in_win_bed",
75 type=str,
76 metavar='str',
77 required=True,
78 help="Sliding windows BED annotation file created with htseq-clip createSlidingWindows (also accepts .bed.gz)")
79 p.add_argument("--exp-bams",
80 dest="exp_bam_list",
81 type=str,
82 metavar='str',
83 nargs='+',
84 required=True,
85 help="List of IP BAM files (--exp-bams ip_rep1.bam ip_rep2.bam .. )")
86 p.add_argument("--ctr-bams",
87 dest="ctr_bam_list",
88 type=str,
89 metavar='str',
90 nargs='+',
91 required=True,
92 help="List of control (SM input) BAM files (--ctr-bams smi_rep1.bam smi_rep2.bam .. )")
93 p.add_argument("--out",
94 dest="out_folder",
95 type=str,
96 metavar='str',
97 required=True,
98 help="Results output folder")
99 # htseq-clip extract.
100 p.add_argument("--hce-e",
101 dest="hce_e",
102 type=int,
103 default=1,
104 choices=[1, 2],
105 help="htseq-clip extract -e parameter. This selects read/mate to extract crosslink sites from (default: 1)")
106 p.add_argument("--hce-s",
107 dest="hce_s",
108 type=str,
109 default="m",
110 help="htseq-clip extract -s parameter. Choose crosslink site (s: start, m: middle, e: end ... ) (default: m)")
111 p.add_argument("--hce-g",
112 dest="hce_g",
113 type=int,
114 metavar='int',
115 default=0,
116 help="htseq-clip extract -g parameter. Number of nucleotides to offset for crosslink sites (default: 0)")
117 p.add_argument("--hce-q",
118 dest="hce_q",
119 type=int,
120 metavar='int',
121 default=10,
122 help="htseq-clip extract -q parameter. Minimum alignment quality for filtering (default: 10)")
123 p.add_argument("--hce-primary",
124 dest="hce_primary",
125 default=False,
126 action="store_true",
127 help="htseq-clip extract --primary parameter. Flag to use only primary positions of multimapping reads (default: False)")
128 p.add_argument("--hce-c",
129 dest="hce_c",
130 type=int,
131 metavar='int',
132 default=1,
133 help="htseq-clip extract -c parameter. Number of threads/cores to use (default: 1)")
134 p.add_argument("--hce-m",
135 dest="hce_m",
136 type=int,
137 metavar='int',
138 default=0,
139 help="htseq-clip extract -m parameter. Minimum read length for filtering (default: 0)")
140 p.add_argument("--hce-x",
141 dest="hce_x",
142 type=int,
143 metavar='int',
144 default=500,
145 help="htseq-clip extract -x parameter. Maximum read length for filtering (default: 500)")
146 p.add_argument("--hce-l",
147 dest="hce_l",
148 type=int,
149 metavar='int',
150 default=10000,
151 help="htseq-clip extract -l parameter. Maximum read interval length (default: 10000)")
152 p.add_argument("--hce-f",
153 dest="hce_f",
154 type=str,
155 help="htseq-clip extract -f parameter. Extract crosslink sites only from chromosomes given in this file (one chromosome ID per line)")
156 # htseq-clip count.
157 p.add_argument("--hcc-unstranded",
158 dest="hcc_unstranded",
159 default=False,
160 action="store_true",
161 help="htseq-clip count --unstranded parameter. Use this flag for non strand specific crosslink site counting (default: strand-specific counting)")
162 # More.
163 p.add_argument("--data-id",
164 dest="dataset_id",
165 type=str,
166 default="RBP",
167 metavar='str',
168 help="Dataset ID used as prefix for naming datasets (default: RBP)")
169 p.add_argument("--filter-bed",
170 dest="filter_bed",
171 type=str,
172 metavar='str',
173 help="Provide BED file to filter out BAM reads overlapping with --filter-bed regions")
174 p.add_argument("--filter-mode",
175 dest="filter_mode",
176 type=int,
177 default=1,
178 choices=[1, 2],
179 help="Filter mode for --filter-bed file. 1: keep BAM reads not overlapping with --filter-bed regions. 2: Keep only BAM reads overlapping with --filter-bed (default: 1)")
180 p.add_argument("--no-zipper",
181 dest="no_zipper",
182 default=False,
183 action="store_true",
184 help="Do not gzip output files (default: False)")
185 p.add_argument("--keep-imf",
186 dest="keep_imf",
187 default=False,
188 action="store_true",
189 help="Keep intermediate files (filtered BAM, BED, CSV) (default: False)")
190 return p
191
192
193 ################################################################################
194
195 def file_make_symbolic_link(file, file_link,
196 force_overwrite=True):
197 """
198 Create a symbolic file link file_link (ln -s) of file.
199
200 force_overwrite:
201 Overwrites existing file link.
202
203 """
204 assert os.path.exists(file), "file does not exist"
205 # Get absolute path needed for symlink to work.
206 file_abs_path = os.path.abspath(file)
207
208 check_cmd = "ln -s "
209 if force_overwrite:
210 check_cmd += "-f "
211 check_cmd = check_cmd + file_abs_path + " " + file_link
212 output = subprocess.getoutput(check_cmd)
213 error = False
214 if output:
215 error = True
216 assert error is False, "ln has problems with your input:\n%s\n%s" % (check_cmd, output)
217
218
219 ################################################################################
220
221 def bam_remove_overlap_region_reads(in_bed, in_bam, out_bam,
222 params="",
223 sort_bed=False):
224
225 """
226 Remove BAM reads from in_bam, based on overlap with in_bed BED regions.
227 I.e. only BAM reads not overlapping with in_bed regions are stored in
228 out_bam.
229
230 Using intersectBed instead of samtools view -L, which allows -s for strand
231 specific filtering.
232
233 """
234 assert os.path.exists(in_bed), "in_bed does not exist"
235 assert os.path.exists(in_bam), "in_bam does not exist"
236
237 if sort_bed:
238 check_cmd = "sort -k1,1 -k2,2n " + in_bed + " | " + "intersectBed -abam " + in_bam + " -b stdin " + params + " -sorted > " + out_bam
239 else:
240 check_cmd = "intersectBed -abam " + in_bam + " -b " + in_bed + " " + params + " > " + out_bam
241 output = subprocess.getoutput(check_cmd)
242 error = False
243 if output:
244 error = True
245 assert error is False, "intersectBed has problems with your input:\n%s\n%s" % (check_cmd, output)
246
247
248 ################################################################################
249
250 if __name__ == '__main__':
251
252 parser = setup_argument_parser()
253 args = parser.parse_args()
254
255 assert os.path.exists(args.in_win_bed), "--win-bed file \"%s\" not found" % (args.in_win_bed)
256
257 for bam_file in args.exp_bam_list:
258 assert os.path.exists(bam_file), "--exp-bams BAM file \"%s\" not found" % (bam_file)
259
260 for bam_file in args.ctr_bam_list:
261 assert os.path.exists(bam_file), "--ctr-bams BAM file \"%s\" not found" % (bam_file)
262
263 # assert len(args.exp_bam_list) > 1, "# --exp-bams needs to be > 1"
264
265 if args.filter_bed:
266 assert os.path.exists(args.filter_bed), "--filter-bed file \"%s\" not found" % (args.filter_bed)
267
268 hce_s_dic = {"s": 1, "i": 1, "d": 1, "m": 1, "e": 1}
269 assert args.hce_s in hce_s_dic, "invalid --hce-s given (choose between {s,i,d,m,e})"
270
271 # Crop dataset ID.
272 data_id = args.dataset_id
273 if len(args.dataset_id) > 20:
274 data_id = args.dataset_id[:20]
275 # Remove spaces.
276 data_id = data_id.replace(" ", "_")
277
278 # Output folders.
279 if not os.path.exists(args.out_folder):
280 os.makedirs(args.out_folder)
281 counts_folder = args.out_folder + "/counts_%s" % (data_id)
282 if not os.path.exists(counts_folder):
283 os.makedirs(counts_folder)
284
285 """
286 Create BAM index files.
287
288 """
289 print("# experiment BAMs: %i" % (len(args.exp_bam_list)))
290 print("# control BAMs: %i" % (len(args.ctr_bam_list)))
291
292 exp_bams = []
293 control_bams = []
294 exp_ids = []
295 control_ids = []
296 params = "-s -v"
297 if args.filter_mode == 2:
298 params = "-s -u"
299 intermediate_files = []
300
301 print("Create BAM index files ... ")
302 for i, bam_file in enumerate(args.exp_bam_list):
303
304 idx = i + 1
305
306 out_bam = args.out_folder + "/%s_exp_rep%i.bam" % (data_id, idx)
307
308 if args.filter_bed:
309 print("Filter %s by --filter-bed ... " % (bam_file))
310 bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params)
311 intermediate_files.append(out_bam)
312 else:
313 # out_bam = bam_file
314 file_make_symbolic_link(bam_file, out_bam)
315 # shutil.move(bam_file, out_bam)
316
317 pysam.index(out_bam)
318 exp_bams.append(out_bam)
319 exp_ids.append("%s_exp_rep%i" % (data_id, idx))
320
321 for i, bam_file in enumerate(args.ctr_bam_list):
322
323 idx = i + 1
324
325 out_bam = args.out_folder + "/%s_ctrl_rep%i.bam" % (data_id, idx)
326
327 if args.filter_bed:
328 print("Filter %s by --filter-bed ... " % (bam_file))
329 bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params)
330 intermediate_files.append(out_bam)
331 else:
332 # out_bam = bam_file
333 file_make_symbolic_link(bam_file, out_bam)
334 # shutil.move(bam_file, out_bam)
335
336 pysam.index(out_bam)
337 control_bams.append(out_bam)
338 control_ids.append("%s_ctrl_rep%i" % (data_id, idx))
339
340 """
341 htseq-clip extract crosslink sites.
342
343 htseq-clip extract -i bam/Rbp3_total_rep1.bam -e 1 -s m -o sites/Rbp3_total_rep1.bed
344
345 hce_e : -e [1,2] %i
346 hce_s : -s %s
347 -s {s,i,d,m,e}, --site {s,i,d,m,e}
348 Crosslink site choices, must be one of: s, i, d, m, e
349 s: start site
350 i: insertion site
351 d: deletion site
352 m: middle site
353 e: end site (default: e).
354 hce_g : -g offset nt %i
355 hce_q : -q 10
356 hce_primary : --primary
357 hce_c : -c 4
358 hce_m : -m 0
359 hce_x : -x 500
360 hce_l : -l 10000
361 hce_f : -f chr_id file
362
363 """
364
365 print("Extract crosslink sites from BAM files ... ")
366
367 extract_params = " -e %i -s %s -g %i -q %i -c %i -m %i -x %i -l %i" % (args.hce_e, args.hce_s, args.hce_g, args.hce_q, args.hce_c, args.hce_m, args.hce_x, args.hce_l)
368 if args.hce_primary:
369 extract_params += " --primary "
370 if args.hce_f:
371 assert os.path.exists(args.hce_f), "--hce-f file \"%s\" not found" % (args.hce_f)
372 extract_params += " -f %s " % (args.hce_f)
373
374 # Experiment BAMs.
375 exp_beds = []
376 for i, bam_file in enumerate(exp_bams):
377
378 idx = i + 1
379
380 out_bed = args.out_folder + "/%s_exp_rep%i.bed" % (data_id, idx)
381
382 check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params
383 output = subprocess.getoutput(check_cmd)
384
385 print(check_cmd)
386 print(output)
387
388 assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed)
389 exp_beds.append(out_bed)
390 intermediate_files.append(out_bed)
391
392 # Control BAMs.
393 control_beds = []
394 for i, bam_file in enumerate(control_bams):
395
396 idx = i + 1
397
398 out_bed = args.out_folder + "/%s_ctrl_rep%i.bed" % (data_id, idx)
399
400 check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params
401 output = subprocess.getoutput(check_cmd)
402
403 print(check_cmd)
404 print(output)
405
406 assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed)
407 control_beds.append(out_bed)
408 intermediate_files.append(out_bed)
409
410 """
411 Count crosslink sites in sliding windows.
412
413 htseq-clip count -i sites/Rbp3_total_rep1.bed -a annotation/Rbp3_uv_total_w75s10.txt -o counts/Rbp3_total_rep1.csv
414
415 """
416
417 print("Count reads overlapping with windows ... ")
418
419 count_params = ""
420 if args.hcc_unstranded:
421 count_params += " --unstranded"
422 # Experiment BAMs.
423 for i, bed_file in enumerate(exp_beds):
424
425 idx = i + 1
426
427 out_csv = counts_folder + "/%s_exp_rep%i.csv.gz" % (data_id, idx)
428 if args.no_zipper:
429 out_csv = counts_folder + "/%s_exp_rep%i.csv" % (data_id, idx)
430
431 check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params
432 output = subprocess.getoutput(check_cmd)
433
434 print(check_cmd)
435 print(output)
436
437 assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv)
438 intermediate_files.append(out_csv)
439
440 # Control BAMs.
441 for i, bed_file in enumerate(control_beds):
442
443 idx = i + 1
444
445 out_csv = counts_folder + "/%s_ctrl_rep%i.csv.gz" % (data_id, idx)
446 if args.no_zipper:
447 out_csv = counts_folder + "/%s_ctrl_rep%i.csv" % (data_id, idx)
448
449 check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params
450 output = subprocess.getoutput(check_cmd)
451
452 print(check_cmd)
453 print(output)
454
455 assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv)
456 intermediate_files.append(out_csv)
457
458 """
459 Create an R friendly matrix file.
460
461 htseq-clip createMatrix -i counts/ -b Rbp3 -o counts/Rbp3_uv_total_w75s10_counts.txt.gz
462
463 """
464
465 print("Create R-friendly count matrix file ... ")
466
467 # out_matrix = args.out_folder + "/%s_count_matrix.txt.gz" %(data_id)
468 out_matrix = args.out_folder + "/count_matrix.txt.gz"
469 if args.no_zipper:
470 # out_matrix = args.out_folder + "/%s_count_matrix.txt" %(data_id)
471 out_matrix = args.out_folder + "/count_matrix.txt"
472
473 check_cmd = "htseq-clip createMatrix -i " + counts_folder + " -b " + data_id + " -o " + out_matrix
474 output = subprocess.getoutput(check_cmd)
475
476 print(check_cmd)
477 print(output)
478
479 assert os.path.exists(out_matrix), "htseq-clip createMatrix -o file \"%s\" not found" % (out_matrix)
480
481 """
482 Create sample info file for DEWSeq.
483
484 Sample Info file format:
485 Sample name Sample type
486 Rbp3_uv_rep1 IP
487 Rbp3_uv_rep2 IP
488 Rbp3_uv_rep3 IP
489 Rbp3_total_rep1 SMI
490 Rbp3_total_rep2 SMI
491 Rbp3_total_rep3 SMI
492
493 """
494
495 print("Write sample info file ... ")
496 sample_info_file = args.out_folder + "/sample_info.txt"
497 OUTTAB = open(sample_info_file, "w")
498 OUTTAB.write("Sample name\tSample type\n")
499 for sid in exp_ids:
500 OUTTAB.write("%s\tIP\n" % (sid))
501 for sid in control_ids:
502 OUTTAB.write("%s\tSMI\n" % (sid))
503 OUTTAB.close()
504
505 """
506 Delete intermediate files.
507
508 """
509 if not args.keep_imf:
510 print("Delete intermediate files ... ")
511 for imf in intermediate_files:
512 if os.path.exists(imf):
513 os.remove(imf)
514
515 """
516 Report.
517
518 """
519
520 print("")
521 print("OUTPUT FILES")
522 print("============")
523
524 print("Count matrix file (DEWSeq input file):\n%s" % (out_matrix))
525 print("Sample info file (DEWSeq input file):\n%s" % (sample_info_file))
526 print("")