Mercurial > repos > bgruening > htseq_clip
diff htsc_create_count_table.py @ 0:94a987a7da69 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author | bgruening |
---|---|
date | Tue, 11 Oct 2022 16:09:23 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htsc_create_count_table.py Tue Oct 11 16:09:23 2022 +0000 @@ -0,0 +1,526 @@ +#!/usr/bin/env python3 + +import argparse +import os +import subprocess + +import pysam + +""" + +Install deseq-clip +================== + +conda install -c bioconda pysam +conda install -c bioconda htseq +pip install htseq-clip + +Or directly by: +conda install -c bioconda htseq-clip + + +Test call +========= + +python htsc_create_count_table.py --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper + +Compare: +diff test-data/Rbp_count_matrix.exp.txt test_create_count_table_out/count_matrix.txt +diff test-data/sample_info.exp.txt test_create_count_table_out/sample_info.txt + +This corresponds to: +htseq-clip extract -i test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 +htseq-clip extract -i test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 +htseq-clip extract -i test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 +htseq-clip count -i test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep1.Synechocystis_pSYSM.csv +htseq-clip count -i test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep2.Synechocystis_pSYSM.csv +htseq-clip count -i test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_ctrl_rep1.Synechocystis_pSYSM.csv +htseq-clip createMatrix -i test_create_count_table_out/counts_Rbp -o test_create_count_table_out/Rbp_count_matrix.txt -b Rbp + + +To get BAM content for single chromosome: +samtools view -b -h bam_all/Rbp3_uv_rep1.bam Synechocystis_pSYSM > test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam + +More tests: +python htsc_create_count_table.py --hce-f test-data/chr_names.txt --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper + +DEWSeq input files: +test_create_count_table_out/Rbp_count_matrix.txt +test_create_count_table_out/sample_info.txt + +""" + + +################################################################################ + +def setup_argument_parser(): + """Setup argparse parser.""" + help_description = """ + Use htseq-clip to extract crosslink sites from BAM files, and count + overlaps with window regions. Finally create an R count matrix, e.g. as + input for DEWSeq. + """ + # Define argument parser. + p = argparse.ArgumentParser(add_help=False, + prog="htsc_create_count_table.py", + description=help_description, + formatter_class=argparse.MetavarTypeHelpFormatter) + + # Required arguments. + p.add_argument("-h", "--help", + action="help", + help="Print help message") + p.add_argument("--win-bed", + dest="in_win_bed", + type=str, + metavar='str', + required=True, + help="Sliding windows BED annotation file created with htseq-clip createSlidingWindows (also accepts .bed.gz)") + p.add_argument("--exp-bams", + dest="exp_bam_list", + type=str, + metavar='str', + nargs='+', + required=True, + help="List of IP BAM files (--exp-bams ip_rep1.bam ip_rep2.bam .. )") + p.add_argument("--ctr-bams", + dest="ctr_bam_list", + type=str, + metavar='str', + nargs='+', + required=True, + help="List of control (SM input) BAM files (--ctr-bams smi_rep1.bam smi_rep2.bam .. )") + p.add_argument("--out", + dest="out_folder", + type=str, + metavar='str', + required=True, + help="Results output folder") + # htseq-clip extract. + p.add_argument("--hce-e", + dest="hce_e", + type=int, + default=1, + choices=[1, 2], + help="htseq-clip extract -e parameter. This selects read/mate to extract crosslink sites from (default: 1)") + p.add_argument("--hce-s", + dest="hce_s", + type=str, + default="m", + help="htseq-clip extract -s parameter. Choose crosslink site (s: start, m: middle, e: end ... ) (default: m)") + p.add_argument("--hce-g", + dest="hce_g", + type=int, + metavar='int', + default=0, + help="htseq-clip extract -g parameter. Number of nucleotides to offset for crosslink sites (default: 0)") + p.add_argument("--hce-q", + dest="hce_q", + type=int, + metavar='int', + default=10, + help="htseq-clip extract -q parameter. Minimum alignment quality for filtering (default: 10)") + p.add_argument("--hce-primary", + dest="hce_primary", + default=False, + action="store_true", + help="htseq-clip extract --primary parameter. Flag to use only primary positions of multimapping reads (default: False)") + p.add_argument("--hce-c", + dest="hce_c", + type=int, + metavar='int', + default=1, + help="htseq-clip extract -c parameter. Number of threads/cores to use (default: 1)") + p.add_argument("--hce-m", + dest="hce_m", + type=int, + metavar='int', + default=0, + help="htseq-clip extract -m parameter. Minimum read length for filtering (default: 0)") + p.add_argument("--hce-x", + dest="hce_x", + type=int, + metavar='int', + default=500, + help="htseq-clip extract -x parameter. Maximum read length for filtering (default: 500)") + p.add_argument("--hce-l", + dest="hce_l", + type=int, + metavar='int', + default=10000, + help="htseq-clip extract -l parameter. Maximum read interval length (default: 10000)") + p.add_argument("--hce-f", + dest="hce_f", + type=str, + help="htseq-clip extract -f parameter. Extract crosslink sites only from chromosomes given in this file (one chromosome ID per line)") + # htseq-clip count. + p.add_argument("--hcc-unstranded", + dest="hcc_unstranded", + default=False, + action="store_true", + help="htseq-clip count --unstranded parameter. Use this flag for non strand specific crosslink site counting (default: strand-specific counting)") + # More. + p.add_argument("--data-id", + dest="dataset_id", + type=str, + default="RBP", + metavar='str', + help="Dataset ID used as prefix for naming datasets (default: RBP)") + p.add_argument("--filter-bed", + dest="filter_bed", + type=str, + metavar='str', + help="Provide BED file to filter out BAM reads overlapping with --filter-bed regions") + p.add_argument("--filter-mode", + dest="filter_mode", + type=int, + default=1, + choices=[1, 2], + help="Filter mode for --filter-bed file. 1: keep BAM reads not overlapping with --filter-bed regions. 2: Keep only BAM reads overlapping with --filter-bed (default: 1)") + p.add_argument("--no-zipper", + dest="no_zipper", + default=False, + action="store_true", + help="Do not gzip output files (default: False)") + p.add_argument("--keep-imf", + dest="keep_imf", + default=False, + action="store_true", + help="Keep intermediate files (filtered BAM, BED, CSV) (default: False)") + return p + + +################################################################################ + +def file_make_symbolic_link(file, file_link, + force_overwrite=True): + """ + Create a symbolic file link file_link (ln -s) of file. + + force_overwrite: + Overwrites existing file link. + + """ + assert os.path.exists(file), "file does not exist" + # Get absolute path needed for symlink to work. + file_abs_path = os.path.abspath(file) + + check_cmd = "ln -s " + if force_overwrite: + check_cmd += "-f " + check_cmd = check_cmd + file_abs_path + " " + file_link + output = subprocess.getoutput(check_cmd) + error = False + if output: + error = True + assert error is False, "ln has problems with your input:\n%s\n%s" % (check_cmd, output) + + +################################################################################ + +def bam_remove_overlap_region_reads(in_bed, in_bam, out_bam, + params="", + sort_bed=False): + + """ + Remove BAM reads from in_bam, based on overlap with in_bed BED regions. + I.e. only BAM reads not overlapping with in_bed regions are stored in + out_bam. + + Using intersectBed instead of samtools view -L, which allows -s for strand + specific filtering. + + """ + assert os.path.exists(in_bed), "in_bed does not exist" + assert os.path.exists(in_bam), "in_bam does not exist" + + if sort_bed: + check_cmd = "sort -k1,1 -k2,2n " + in_bed + " | " + "intersectBed -abam " + in_bam + " -b stdin " + params + " -sorted > " + out_bam + else: + check_cmd = "intersectBed -abam " + in_bam + " -b " + in_bed + " " + params + " > " + out_bam + output = subprocess.getoutput(check_cmd) + error = False + if output: + error = True + assert error is False, "intersectBed has problems with your input:\n%s\n%s" % (check_cmd, output) + + +################################################################################ + +if __name__ == '__main__': + + parser = setup_argument_parser() + args = parser.parse_args() + + assert os.path.exists(args.in_win_bed), "--win-bed file \"%s\" not found" % (args.in_win_bed) + + for bam_file in args.exp_bam_list: + assert os.path.exists(bam_file), "--exp-bams BAM file \"%s\" not found" % (bam_file) + + for bam_file in args.ctr_bam_list: + assert os.path.exists(bam_file), "--ctr-bams BAM file \"%s\" not found" % (bam_file) + + # assert len(args.exp_bam_list) > 1, "# --exp-bams needs to be > 1" + + if args.filter_bed: + assert os.path.exists(args.filter_bed), "--filter-bed file \"%s\" not found" % (args.filter_bed) + + hce_s_dic = {"s": 1, "i": 1, "d": 1, "m": 1, "e": 1} + assert args.hce_s in hce_s_dic, "invalid --hce-s given (choose between {s,i,d,m,e})" + + # Crop dataset ID. + data_id = args.dataset_id + if len(args.dataset_id) > 20: + data_id = args.dataset_id[:20] + # Remove spaces. + data_id = data_id.replace(" ", "_") + + # Output folders. + if not os.path.exists(args.out_folder): + os.makedirs(args.out_folder) + counts_folder = args.out_folder + "/counts_%s" % (data_id) + if not os.path.exists(counts_folder): + os.makedirs(counts_folder) + + """ + Create BAM index files. + + """ + print("# experiment BAMs: %i" % (len(args.exp_bam_list))) + print("# control BAMs: %i" % (len(args.ctr_bam_list))) + + exp_bams = [] + control_bams = [] + exp_ids = [] + control_ids = [] + params = "-s -v" + if args.filter_mode == 2: + params = "-s -u" + intermediate_files = [] + + print("Create BAM index files ... ") + for i, bam_file in enumerate(args.exp_bam_list): + + idx = i + 1 + + out_bam = args.out_folder + "/%s_exp_rep%i.bam" % (data_id, idx) + + if args.filter_bed: + print("Filter %s by --filter-bed ... " % (bam_file)) + bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params) + intermediate_files.append(out_bam) + else: + # out_bam = bam_file + file_make_symbolic_link(bam_file, out_bam) + # shutil.move(bam_file, out_bam) + + pysam.index(out_bam) + exp_bams.append(out_bam) + exp_ids.append("%s_exp_rep%i" % (data_id, idx)) + + for i, bam_file in enumerate(args.ctr_bam_list): + + idx = i + 1 + + out_bam = args.out_folder + "/%s_ctrl_rep%i.bam" % (data_id, idx) + + if args.filter_bed: + print("Filter %s by --filter-bed ... " % (bam_file)) + bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params) + intermediate_files.append(out_bam) + else: + # out_bam = bam_file + file_make_symbolic_link(bam_file, out_bam) + # shutil.move(bam_file, out_bam) + + pysam.index(out_bam) + control_bams.append(out_bam) + control_ids.append("%s_ctrl_rep%i" % (data_id, idx)) + + """ + htseq-clip extract crosslink sites. + + htseq-clip extract -i bam/Rbp3_total_rep1.bam -e 1 -s m -o sites/Rbp3_total_rep1.bed + + hce_e : -e [1,2] %i + hce_s : -s %s + -s {s,i,d,m,e}, --site {s,i,d,m,e} + Crosslink site choices, must be one of: s, i, d, m, e + s: start site + i: insertion site + d: deletion site + m: middle site + e: end site (default: e). + hce_g : -g offset nt %i + hce_q : -q 10 + hce_primary : --primary + hce_c : -c 4 + hce_m : -m 0 + hce_x : -x 500 + hce_l : -l 10000 + hce_f : -f chr_id file + + """ + + print("Extract crosslink sites from BAM files ... ") + + extract_params = " -e %i -s %s -g %i -q %i -c %i -m %i -x %i -l %i" % (args.hce_e, args.hce_s, args.hce_g, args.hce_q, args.hce_c, args.hce_m, args.hce_x, args.hce_l) + if args.hce_primary: + extract_params += " --primary " + if args.hce_f: + assert os.path.exists(args.hce_f), "--hce-f file \"%s\" not found" % (args.hce_f) + extract_params += " -f %s " % (args.hce_f) + + # Experiment BAMs. + exp_beds = [] + for i, bam_file in enumerate(exp_bams): + + idx = i + 1 + + out_bed = args.out_folder + "/%s_exp_rep%i.bed" % (data_id, idx) + + check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params + output = subprocess.getoutput(check_cmd) + + print(check_cmd) + print(output) + + assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed) + exp_beds.append(out_bed) + intermediate_files.append(out_bed) + + # Control BAMs. + control_beds = [] + for i, bam_file in enumerate(control_bams): + + idx = i + 1 + + out_bed = args.out_folder + "/%s_ctrl_rep%i.bed" % (data_id, idx) + + check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params + output = subprocess.getoutput(check_cmd) + + print(check_cmd) + print(output) + + assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed) + control_beds.append(out_bed) + intermediate_files.append(out_bed) + + """ + Count crosslink sites in sliding windows. + + htseq-clip count -i sites/Rbp3_total_rep1.bed -a annotation/Rbp3_uv_total_w75s10.txt -o counts/Rbp3_total_rep1.csv + + """ + + print("Count reads overlapping with windows ... ") + + count_params = "" + if args.hcc_unstranded: + count_params += " --unstranded" + # Experiment BAMs. + for i, bed_file in enumerate(exp_beds): + + idx = i + 1 + + out_csv = counts_folder + "/%s_exp_rep%i.csv.gz" % (data_id, idx) + if args.no_zipper: + out_csv = counts_folder + "/%s_exp_rep%i.csv" % (data_id, idx) + + check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params + output = subprocess.getoutput(check_cmd) + + print(check_cmd) + print(output) + + assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv) + intermediate_files.append(out_csv) + + # Control BAMs. + for i, bed_file in enumerate(control_beds): + + idx = i + 1 + + out_csv = counts_folder + "/%s_ctrl_rep%i.csv.gz" % (data_id, idx) + if args.no_zipper: + out_csv = counts_folder + "/%s_ctrl_rep%i.csv" % (data_id, idx) + + check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params + output = subprocess.getoutput(check_cmd) + + print(check_cmd) + print(output) + + assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv) + intermediate_files.append(out_csv) + + """ + Create an R friendly matrix file. + + htseq-clip createMatrix -i counts/ -b Rbp3 -o counts/Rbp3_uv_total_w75s10_counts.txt.gz + + """ + + print("Create R-friendly count matrix file ... ") + + # out_matrix = args.out_folder + "/%s_count_matrix.txt.gz" %(data_id) + out_matrix = args.out_folder + "/count_matrix.txt.gz" + if args.no_zipper: + # out_matrix = args.out_folder + "/%s_count_matrix.txt" %(data_id) + out_matrix = args.out_folder + "/count_matrix.txt" + + check_cmd = "htseq-clip createMatrix -i " + counts_folder + " -b " + data_id + " -o " + out_matrix + output = subprocess.getoutput(check_cmd) + + print(check_cmd) + print(output) + + assert os.path.exists(out_matrix), "htseq-clip createMatrix -o file \"%s\" not found" % (out_matrix) + + """ + Create sample info file for DEWSeq. + + Sample Info file format: + Sample name Sample type + Rbp3_uv_rep1 IP + Rbp3_uv_rep2 IP + Rbp3_uv_rep3 IP + Rbp3_total_rep1 SMI + Rbp3_total_rep2 SMI + Rbp3_total_rep3 SMI + + """ + + print("Write sample info file ... ") + sample_info_file = args.out_folder + "/sample_info.txt" + OUTTAB = open(sample_info_file, "w") + OUTTAB.write("Sample name\tSample type\n") + for sid in exp_ids: + OUTTAB.write("%s\tIP\n" % (sid)) + for sid in control_ids: + OUTTAB.write("%s\tSMI\n" % (sid)) + OUTTAB.close() + + """ + Delete intermediate files. + + """ + if not args.keep_imf: + print("Delete intermediate files ... ") + for imf in intermediate_files: + if os.path.exists(imf): + os.remove(imf) + + """ + Report. + + """ + + print("") + print("OUTPUT FILES") + print("============") + + print("Count matrix file (DEWSeq input file):\n%s" % (out_matrix)) + print("Sample info file (DEWSeq input file):\n%s" % (sample_info_file)) + print("")