Mercurial > repos > bgruening > htseq_clip
view htsc_create_count_table.py @ 0:94a987a7da69 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author | bgruening |
---|---|
date | Tue, 11 Oct 2022 16:09:23 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python3 import argparse import os import subprocess import pysam """ Install deseq-clip ================== conda install -c bioconda pysam conda install -c bioconda htseq pip install htseq-clip Or directly by: conda install -c bioconda htseq-clip Test call ========= python htsc_create_count_table.py --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper Compare: diff test-data/Rbp_count_matrix.exp.txt test_create_count_table_out/count_matrix.txt diff test-data/sample_info.exp.txt test_create_count_table_out/sample_info.txt This corresponds to: htseq-clip extract -i test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 htseq-clip extract -i test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 htseq-clip extract -i test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000 htseq-clip count -i test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep1.Synechocystis_pSYSM.csv htseq-clip count -i test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep2.Synechocystis_pSYSM.csv htseq-clip count -i test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_ctrl_rep1.Synechocystis_pSYSM.csv htseq-clip createMatrix -i test_create_count_table_out/counts_Rbp -o test_create_count_table_out/Rbp_count_matrix.txt -b Rbp To get BAM content for single chromosome: samtools view -b -h bam_all/Rbp3_uv_rep1.bam Synechocystis_pSYSM > test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam More tests: python htsc_create_count_table.py --hce-f test-data/chr_names.txt --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper DEWSeq input files: test_create_count_table_out/Rbp_count_matrix.txt test_create_count_table_out/sample_info.txt """ ################################################################################ def setup_argument_parser(): """Setup argparse parser.""" help_description = """ Use htseq-clip to extract crosslink sites from BAM files, and count overlaps with window regions. Finally create an R count matrix, e.g. as input for DEWSeq. """ # Define argument parser. p = argparse.ArgumentParser(add_help=False, prog="htsc_create_count_table.py", description=help_description, formatter_class=argparse.MetavarTypeHelpFormatter) # Required arguments. p.add_argument("-h", "--help", action="help", help="Print help message") p.add_argument("--win-bed", dest="in_win_bed", type=str, metavar='str', required=True, help="Sliding windows BED annotation file created with htseq-clip createSlidingWindows (also accepts .bed.gz)") p.add_argument("--exp-bams", dest="exp_bam_list", type=str, metavar='str', nargs='+', required=True, help="List of IP BAM files (--exp-bams ip_rep1.bam ip_rep2.bam .. )") p.add_argument("--ctr-bams", dest="ctr_bam_list", type=str, metavar='str', nargs='+', required=True, help="List of control (SM input) BAM files (--ctr-bams smi_rep1.bam smi_rep2.bam .. )") p.add_argument("--out", dest="out_folder", type=str, metavar='str', required=True, help="Results output folder") # htseq-clip extract. p.add_argument("--hce-e", dest="hce_e", type=int, default=1, choices=[1, 2], help="htseq-clip extract -e parameter. This selects read/mate to extract crosslink sites from (default: 1)") p.add_argument("--hce-s", dest="hce_s", type=str, default="m", help="htseq-clip extract -s parameter. Choose crosslink site (s: start, m: middle, e: end ... ) (default: m)") p.add_argument("--hce-g", dest="hce_g", type=int, metavar='int', default=0, help="htseq-clip extract -g parameter. Number of nucleotides to offset for crosslink sites (default: 0)") p.add_argument("--hce-q", dest="hce_q", type=int, metavar='int', default=10, help="htseq-clip extract -q parameter. Minimum alignment quality for filtering (default: 10)") p.add_argument("--hce-primary", dest="hce_primary", default=False, action="store_true", help="htseq-clip extract --primary parameter. Flag to use only primary positions of multimapping reads (default: False)") p.add_argument("--hce-c", dest="hce_c", type=int, metavar='int', default=1, help="htseq-clip extract -c parameter. Number of threads/cores to use (default: 1)") p.add_argument("--hce-m", dest="hce_m", type=int, metavar='int', default=0, help="htseq-clip extract -m parameter. Minimum read length for filtering (default: 0)") p.add_argument("--hce-x", dest="hce_x", type=int, metavar='int', default=500, help="htseq-clip extract -x parameter. Maximum read length for filtering (default: 500)") p.add_argument("--hce-l", dest="hce_l", type=int, metavar='int', default=10000, help="htseq-clip extract -l parameter. Maximum read interval length (default: 10000)") p.add_argument("--hce-f", dest="hce_f", type=str, help="htseq-clip extract -f parameter. Extract crosslink sites only from chromosomes given in this file (one chromosome ID per line)") # htseq-clip count. p.add_argument("--hcc-unstranded", dest="hcc_unstranded", default=False, action="store_true", help="htseq-clip count --unstranded parameter. Use this flag for non strand specific crosslink site counting (default: strand-specific counting)") # More. p.add_argument("--data-id", dest="dataset_id", type=str, default="RBP", metavar='str', help="Dataset ID used as prefix for naming datasets (default: RBP)") p.add_argument("--filter-bed", dest="filter_bed", type=str, metavar='str', help="Provide BED file to filter out BAM reads overlapping with --filter-bed regions") p.add_argument("--filter-mode", dest="filter_mode", type=int, default=1, choices=[1, 2], help="Filter mode for --filter-bed file. 1: keep BAM reads not overlapping with --filter-bed regions. 2: Keep only BAM reads overlapping with --filter-bed (default: 1)") p.add_argument("--no-zipper", dest="no_zipper", default=False, action="store_true", help="Do not gzip output files (default: False)") p.add_argument("--keep-imf", dest="keep_imf", default=False, action="store_true", help="Keep intermediate files (filtered BAM, BED, CSV) (default: False)") return p ################################################################################ def file_make_symbolic_link(file, file_link, force_overwrite=True): """ Create a symbolic file link file_link (ln -s) of file. force_overwrite: Overwrites existing file link. """ assert os.path.exists(file), "file does not exist" # Get absolute path needed for symlink to work. file_abs_path = os.path.abspath(file) check_cmd = "ln -s " if force_overwrite: check_cmd += "-f " check_cmd = check_cmd + file_abs_path + " " + file_link output = subprocess.getoutput(check_cmd) error = False if output: error = True assert error is False, "ln has problems with your input:\n%s\n%s" % (check_cmd, output) ################################################################################ def bam_remove_overlap_region_reads(in_bed, in_bam, out_bam, params="", sort_bed=False): """ Remove BAM reads from in_bam, based on overlap with in_bed BED regions. I.e. only BAM reads not overlapping with in_bed regions are stored in out_bam. Using intersectBed instead of samtools view -L, which allows -s for strand specific filtering. """ assert os.path.exists(in_bed), "in_bed does not exist" assert os.path.exists(in_bam), "in_bam does not exist" if sort_bed: check_cmd = "sort -k1,1 -k2,2n " + in_bed + " | " + "intersectBed -abam " + in_bam + " -b stdin " + params + " -sorted > " + out_bam else: check_cmd = "intersectBed -abam " + in_bam + " -b " + in_bed + " " + params + " > " + out_bam output = subprocess.getoutput(check_cmd) error = False if output: error = True assert error is False, "intersectBed has problems with your input:\n%s\n%s" % (check_cmd, output) ################################################################################ if __name__ == '__main__': parser = setup_argument_parser() args = parser.parse_args() assert os.path.exists(args.in_win_bed), "--win-bed file \"%s\" not found" % (args.in_win_bed) for bam_file in args.exp_bam_list: assert os.path.exists(bam_file), "--exp-bams BAM file \"%s\" not found" % (bam_file) for bam_file in args.ctr_bam_list: assert os.path.exists(bam_file), "--ctr-bams BAM file \"%s\" not found" % (bam_file) # assert len(args.exp_bam_list) > 1, "# --exp-bams needs to be > 1" if args.filter_bed: assert os.path.exists(args.filter_bed), "--filter-bed file \"%s\" not found" % (args.filter_bed) hce_s_dic = {"s": 1, "i": 1, "d": 1, "m": 1, "e": 1} assert args.hce_s in hce_s_dic, "invalid --hce-s given (choose between {s,i,d,m,e})" # Crop dataset ID. data_id = args.dataset_id if len(args.dataset_id) > 20: data_id = args.dataset_id[:20] # Remove spaces. data_id = data_id.replace(" ", "_") # Output folders. if not os.path.exists(args.out_folder): os.makedirs(args.out_folder) counts_folder = args.out_folder + "/counts_%s" % (data_id) if not os.path.exists(counts_folder): os.makedirs(counts_folder) """ Create BAM index files. """ print("# experiment BAMs: %i" % (len(args.exp_bam_list))) print("# control BAMs: %i" % (len(args.ctr_bam_list))) exp_bams = [] control_bams = [] exp_ids = [] control_ids = [] params = "-s -v" if args.filter_mode == 2: params = "-s -u" intermediate_files = [] print("Create BAM index files ... ") for i, bam_file in enumerate(args.exp_bam_list): idx = i + 1 out_bam = args.out_folder + "/%s_exp_rep%i.bam" % (data_id, idx) if args.filter_bed: print("Filter %s by --filter-bed ... " % (bam_file)) bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params) intermediate_files.append(out_bam) else: # out_bam = bam_file file_make_symbolic_link(bam_file, out_bam) # shutil.move(bam_file, out_bam) pysam.index(out_bam) exp_bams.append(out_bam) exp_ids.append("%s_exp_rep%i" % (data_id, idx)) for i, bam_file in enumerate(args.ctr_bam_list): idx = i + 1 out_bam = args.out_folder + "/%s_ctrl_rep%i.bam" % (data_id, idx) if args.filter_bed: print("Filter %s by --filter-bed ... " % (bam_file)) bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params) intermediate_files.append(out_bam) else: # out_bam = bam_file file_make_symbolic_link(bam_file, out_bam) # shutil.move(bam_file, out_bam) pysam.index(out_bam) control_bams.append(out_bam) control_ids.append("%s_ctrl_rep%i" % (data_id, idx)) """ htseq-clip extract crosslink sites. htseq-clip extract -i bam/Rbp3_total_rep1.bam -e 1 -s m -o sites/Rbp3_total_rep1.bed hce_e : -e [1,2] %i hce_s : -s %s -s {s,i,d,m,e}, --site {s,i,d,m,e} Crosslink site choices, must be one of: s, i, d, m, e s: start site i: insertion site d: deletion site m: middle site e: end site (default: e). hce_g : -g offset nt %i hce_q : -q 10 hce_primary : --primary hce_c : -c 4 hce_m : -m 0 hce_x : -x 500 hce_l : -l 10000 hce_f : -f chr_id file """ print("Extract crosslink sites from BAM files ... ") extract_params = " -e %i -s %s -g %i -q %i -c %i -m %i -x %i -l %i" % (args.hce_e, args.hce_s, args.hce_g, args.hce_q, args.hce_c, args.hce_m, args.hce_x, args.hce_l) if args.hce_primary: extract_params += " --primary " if args.hce_f: assert os.path.exists(args.hce_f), "--hce-f file \"%s\" not found" % (args.hce_f) extract_params += " -f %s " % (args.hce_f) # Experiment BAMs. exp_beds = [] for i, bam_file in enumerate(exp_bams): idx = i + 1 out_bed = args.out_folder + "/%s_exp_rep%i.bed" % (data_id, idx) check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params output = subprocess.getoutput(check_cmd) print(check_cmd) print(output) assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed) exp_beds.append(out_bed) intermediate_files.append(out_bed) # Control BAMs. control_beds = [] for i, bam_file in enumerate(control_bams): idx = i + 1 out_bed = args.out_folder + "/%s_ctrl_rep%i.bed" % (data_id, idx) check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params output = subprocess.getoutput(check_cmd) print(check_cmd) print(output) assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed) control_beds.append(out_bed) intermediate_files.append(out_bed) """ Count crosslink sites in sliding windows. htseq-clip count -i sites/Rbp3_total_rep1.bed -a annotation/Rbp3_uv_total_w75s10.txt -o counts/Rbp3_total_rep1.csv """ print("Count reads overlapping with windows ... ") count_params = "" if args.hcc_unstranded: count_params += " --unstranded" # Experiment BAMs. for i, bed_file in enumerate(exp_beds): idx = i + 1 out_csv = counts_folder + "/%s_exp_rep%i.csv.gz" % (data_id, idx) if args.no_zipper: out_csv = counts_folder + "/%s_exp_rep%i.csv" % (data_id, idx) check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params output = subprocess.getoutput(check_cmd) print(check_cmd) print(output) assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv) intermediate_files.append(out_csv) # Control BAMs. for i, bed_file in enumerate(control_beds): idx = i + 1 out_csv = counts_folder + "/%s_ctrl_rep%i.csv.gz" % (data_id, idx) if args.no_zipper: out_csv = counts_folder + "/%s_ctrl_rep%i.csv" % (data_id, idx) check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params output = subprocess.getoutput(check_cmd) print(check_cmd) print(output) assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv) intermediate_files.append(out_csv) """ Create an R friendly matrix file. htseq-clip createMatrix -i counts/ -b Rbp3 -o counts/Rbp3_uv_total_w75s10_counts.txt.gz """ print("Create R-friendly count matrix file ... ") # out_matrix = args.out_folder + "/%s_count_matrix.txt.gz" %(data_id) out_matrix = args.out_folder + "/count_matrix.txt.gz" if args.no_zipper: # out_matrix = args.out_folder + "/%s_count_matrix.txt" %(data_id) out_matrix = args.out_folder + "/count_matrix.txt" check_cmd = "htseq-clip createMatrix -i " + counts_folder + " -b " + data_id + " -o " + out_matrix output = subprocess.getoutput(check_cmd) print(check_cmd) print(output) assert os.path.exists(out_matrix), "htseq-clip createMatrix -o file \"%s\" not found" % (out_matrix) """ Create sample info file for DEWSeq. Sample Info file format: Sample name Sample type Rbp3_uv_rep1 IP Rbp3_uv_rep2 IP Rbp3_uv_rep3 IP Rbp3_total_rep1 SMI Rbp3_total_rep2 SMI Rbp3_total_rep3 SMI """ print("Write sample info file ... ") sample_info_file = args.out_folder + "/sample_info.txt" OUTTAB = open(sample_info_file, "w") OUTTAB.write("Sample name\tSample type\n") for sid in exp_ids: OUTTAB.write("%s\tIP\n" % (sid)) for sid in control_ids: OUTTAB.write("%s\tSMI\n" % (sid)) OUTTAB.close() """ Delete intermediate files. """ if not args.keep_imf: print("Delete intermediate files ... ") for imf in intermediate_files: if os.path.exists(imf): os.remove(imf) """ Report. """ print("") print("OUTPUT FILES") print("============") print("Count matrix file (DEWSeq input file):\n%s" % (out_matrix)) print("Sample info file (DEWSeq input file):\n%s" % (sample_info_file)) print("")