diff htsc_create_count_table.py @ 0:94a987a7da69 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/htseq-clip commit 4879439f0df3386b97d8507c5991051fbdda053a
author bgruening
date Tue, 11 Oct 2022 16:09:23 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htsc_create_count_table.py	Tue Oct 11 16:09:23 2022 +0000
@@ -0,0 +1,526 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import subprocess
+
+import pysam
+
+"""
+
+Install deseq-clip
+==================
+
+conda install -c bioconda pysam
+conda install -c bioconda htseq
+pip install htseq-clip
+
+Or directly by:
+conda install -c bioconda htseq-clip
+
+
+Test call
+=========
+
+python htsc_create_count_table.py --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper
+
+Compare:
+diff test-data/Rbp_count_matrix.exp.txt test_create_count_table_out/count_matrix.txt
+diff test-data/sample_info.exp.txt test_create_count_table_out/sample_info.txt
+
+This corresponds to:
+htseq-clip extract -i test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000
+htseq-clip extract -i test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000
+htseq-clip extract -i test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam -o test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -e 1 -s m -g 0 -q 10 -c 4 -m 0 -x 500 -l 10000
+htseq-clip count -i test_create_count_table_out/Rbp_exp_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep1.Synechocystis_pSYSM.csv
+htseq-clip count -i test_create_count_table_out/Rbp_exp_rep2.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_exp_rep2.Synechocystis_pSYSM.csv
+htseq-clip count -i test_create_count_table_out/Rbp_ctrl_rep1.Synechocystis_pSYSM.bed -a test-data/windows.exp.bed -o test_create_count_table_out/counts_Rbp/Rbp_ctrl_rep1.Synechocystis_pSYSM.csv
+htseq-clip createMatrix -i test_create_count_table_out/counts_Rbp -o test_create_count_table_out/Rbp_count_matrix.txt -b Rbp
+
+
+To get BAM content for single chromosome:
+samtools view -b -h bam_all/Rbp3_uv_rep1.bam Synechocystis_pSYSM > test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam
+
+More tests:
+python htsc_create_count_table.py --hce-f test-data/chr_names.txt --data-id Rbp --win-bed test-data/windows.exp.bed --out test_create_count_table_out --exp-bams test-data/Rbp_exp_rep1.Synechocystis_pSYSM.bam test-data/Rbp_exp_rep2.Synechocystis_pSYSM.bam --ctr-bams test-data/Rbp_ctrl_rep1.Synechocystis_pSYSM.bam --no-zipper
+
+DEWSeq input files:
+test_create_count_table_out/Rbp_count_matrix.txt
+test_create_count_table_out/sample_info.txt
+
+"""
+
+
+################################################################################
+
+def setup_argument_parser():
+    """Setup argparse parser."""
+    help_description = """
+    Use htseq-clip to extract crosslink sites from BAM files, and count
+    overlaps with window regions. Finally create an R count matrix, e.g. as
+    input for DEWSeq.
+    """
+    # Define argument parser.
+    p = argparse.ArgumentParser(add_help=False,
+                                prog="htsc_create_count_table.py",
+                                description=help_description,
+                                formatter_class=argparse.MetavarTypeHelpFormatter)
+
+    # Required arguments.
+    p.add_argument("-h", "--help",
+                   action="help",
+                   help="Print help message")
+    p.add_argument("--win-bed",
+                   dest="in_win_bed",
+                   type=str,
+                   metavar='str',
+                   required=True,
+                   help="Sliding windows BED annotation file created with htseq-clip createSlidingWindows (also accepts .bed.gz)")
+    p.add_argument("--exp-bams",
+                   dest="exp_bam_list",
+                   type=str,
+                   metavar='str',
+                   nargs='+',
+                   required=True,
+                   help="List of IP BAM files (--exp-bams ip_rep1.bam ip_rep2.bam .. )")
+    p.add_argument("--ctr-bams",
+                   dest="ctr_bam_list",
+                   type=str,
+                   metavar='str',
+                   nargs='+',
+                   required=True,
+                   help="List of control (SM input) BAM files (--ctr-bams smi_rep1.bam smi_rep2.bam .. )")
+    p.add_argument("--out",
+                   dest="out_folder",
+                   type=str,
+                   metavar='str',
+                   required=True,
+                   help="Results output folder")
+    # htseq-clip extract.
+    p.add_argument("--hce-e",
+                   dest="hce_e",
+                   type=int,
+                   default=1,
+                   choices=[1, 2],
+                   help="htseq-clip extract -e parameter. This selects read/mate to extract crosslink sites from (default: 1)")
+    p.add_argument("--hce-s",
+                   dest="hce_s",
+                   type=str,
+                   default="m",
+                   help="htseq-clip extract -s parameter. Choose crosslink site (s: start, m: middle, e: end ... ) (default: m)")
+    p.add_argument("--hce-g",
+                   dest="hce_g",
+                   type=int,
+                   metavar='int',
+                   default=0,
+                   help="htseq-clip extract -g parameter. Number of nucleotides to offset for crosslink sites (default: 0)")
+    p.add_argument("--hce-q",
+                   dest="hce_q",
+                   type=int,
+                   metavar='int',
+                   default=10,
+                   help="htseq-clip extract -q parameter. Minimum alignment quality for filtering (default: 10)")
+    p.add_argument("--hce-primary",
+                   dest="hce_primary",
+                   default=False,
+                   action="store_true",
+                   help="htseq-clip extract --primary parameter. Flag to use only primary positions of multimapping reads (default: False)")
+    p.add_argument("--hce-c",
+                   dest="hce_c",
+                   type=int,
+                   metavar='int',
+                   default=1,
+                   help="htseq-clip extract -c parameter. Number of threads/cores to use (default: 1)")
+    p.add_argument("--hce-m",
+                   dest="hce_m",
+                   type=int,
+                   metavar='int',
+                   default=0,
+                   help="htseq-clip extract -m parameter. Minimum read length for filtering (default: 0)")
+    p.add_argument("--hce-x",
+                   dest="hce_x",
+                   type=int,
+                   metavar='int',
+                   default=500,
+                   help="htseq-clip extract -x parameter. Maximum read length for filtering (default: 500)")
+    p.add_argument("--hce-l",
+                   dest="hce_l",
+                   type=int,
+                   metavar='int',
+                   default=10000,
+                   help="htseq-clip extract -l parameter. Maximum read interval length (default: 10000)")
+    p.add_argument("--hce-f",
+                   dest="hce_f",
+                   type=str,
+                   help="htseq-clip extract -f parameter. Extract crosslink sites only from chromosomes given in this file (one chromosome ID per line)")
+    # htseq-clip count.
+    p.add_argument("--hcc-unstranded",
+                   dest="hcc_unstranded",
+                   default=False,
+                   action="store_true",
+                   help="htseq-clip count --unstranded parameter. Use this flag for non strand specific crosslink site counting (default: strand-specific counting)")
+    # More.
+    p.add_argument("--data-id",
+                   dest="dataset_id",
+                   type=str,
+                   default="RBP",
+                   metavar='str',
+                   help="Dataset ID used as prefix for naming datasets (default: RBP)")
+    p.add_argument("--filter-bed",
+                   dest="filter_bed",
+                   type=str,
+                   metavar='str',
+                   help="Provide BED file to filter out BAM reads overlapping with --filter-bed regions")
+    p.add_argument("--filter-mode",
+                   dest="filter_mode",
+                   type=int,
+                   default=1,
+                   choices=[1, 2],
+                   help="Filter mode for --filter-bed file. 1: keep BAM reads not overlapping with --filter-bed regions. 2: Keep only BAM reads overlapping with --filter-bed (default: 1)")
+    p.add_argument("--no-zipper",
+                   dest="no_zipper",
+                   default=False,
+                   action="store_true",
+                   help="Do not gzip output files (default: False)")
+    p.add_argument("--keep-imf",
+                   dest="keep_imf",
+                   default=False,
+                   action="store_true",
+                   help="Keep intermediate files (filtered BAM, BED, CSV) (default: False)")
+    return p
+
+
+################################################################################
+
+def file_make_symbolic_link(file, file_link,
+                            force_overwrite=True):
+    """
+    Create a symbolic file link file_link (ln -s) of file.
+
+    force_overwrite:
+        Overwrites existing file link.
+
+    """
+    assert os.path.exists(file), "file does not exist"
+    # Get absolute path needed for symlink to work.
+    file_abs_path = os.path.abspath(file)
+
+    check_cmd = "ln -s "
+    if force_overwrite:
+        check_cmd += "-f "
+    check_cmd = check_cmd + file_abs_path + " " + file_link
+    output = subprocess.getoutput(check_cmd)
+    error = False
+    if output:
+        error = True
+    assert error is False, "ln has problems with your input:\n%s\n%s" % (check_cmd, output)
+
+
+################################################################################
+
+def bam_remove_overlap_region_reads(in_bed, in_bam, out_bam,
+                                    params="",
+                                    sort_bed=False):
+
+    """
+    Remove BAM reads from in_bam, based on overlap with in_bed BED regions.
+    I.e. only BAM reads not overlapping with in_bed regions are stored in
+    out_bam.
+
+    Using intersectBed instead of samtools view -L, which allows -s for strand
+    specific filtering.
+
+    """
+    assert os.path.exists(in_bed), "in_bed does not exist"
+    assert os.path.exists(in_bam), "in_bam does not exist"
+
+    if sort_bed:
+        check_cmd = "sort -k1,1 -k2,2n " + in_bed + " | " + "intersectBed -abam " + in_bam + " -b stdin " + params + " -sorted > " + out_bam
+    else:
+        check_cmd = "intersectBed -abam " + in_bam + " -b " + in_bed + " " + params + " > " + out_bam
+    output = subprocess.getoutput(check_cmd)
+    error = False
+    if output:
+        error = True
+    assert error is False, "intersectBed has problems with your input:\n%s\n%s" % (check_cmd, output)
+
+
+################################################################################
+
+if __name__ == '__main__':
+
+    parser = setup_argument_parser()
+    args = parser.parse_args()
+
+    assert os.path.exists(args.in_win_bed), "--win-bed file \"%s\" not found" % (args.in_win_bed)
+
+    for bam_file in args.exp_bam_list:
+        assert os.path.exists(bam_file), "--exp-bams BAM file \"%s\" not found" % (bam_file)
+
+    for bam_file in args.ctr_bam_list:
+        assert os.path.exists(bam_file), "--ctr-bams BAM file \"%s\" not found" % (bam_file)
+
+    # assert len(args.exp_bam_list) > 1, "# --exp-bams needs to be > 1"
+
+    if args.filter_bed:
+        assert os.path.exists(args.filter_bed), "--filter-bed file \"%s\" not found" % (args.filter_bed)
+
+    hce_s_dic = {"s": 1, "i": 1, "d": 1, "m": 1, "e": 1}
+    assert args.hce_s in hce_s_dic, "invalid --hce-s given (choose between {s,i,d,m,e})"
+
+    # Crop dataset ID.
+    data_id = args.dataset_id
+    if len(args.dataset_id) > 20:
+        data_id = args.dataset_id[:20]
+    # Remove spaces.
+    data_id = data_id.replace(" ", "_")
+
+    # Output folders.
+    if not os.path.exists(args.out_folder):
+        os.makedirs(args.out_folder)
+    counts_folder = args.out_folder + "/counts_%s" % (data_id)
+    if not os.path.exists(counts_folder):
+        os.makedirs(counts_folder)
+
+    """
+    Create BAM index files.
+
+    """
+    print("# experiment BAMs:  %i" % (len(args.exp_bam_list)))
+    print("# control BAMs:     %i" % (len(args.ctr_bam_list)))
+
+    exp_bams = []
+    control_bams = []
+    exp_ids = []
+    control_ids = []
+    params = "-s -v"
+    if args.filter_mode == 2:
+        params = "-s -u"
+    intermediate_files = []
+
+    print("Create BAM index files ... ")
+    for i, bam_file in enumerate(args.exp_bam_list):
+
+        idx = i + 1
+
+        out_bam = args.out_folder + "/%s_exp_rep%i.bam" % (data_id, idx)
+
+        if args.filter_bed:
+            print("Filter %s by --filter-bed ... " % (bam_file))
+            bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params)
+            intermediate_files.append(out_bam)
+        else:
+            # out_bam = bam_file
+            file_make_symbolic_link(bam_file, out_bam)
+            # shutil.move(bam_file, out_bam)
+
+        pysam.index(out_bam)
+        exp_bams.append(out_bam)
+        exp_ids.append("%s_exp_rep%i" % (data_id, idx))
+
+    for i, bam_file in enumerate(args.ctr_bam_list):
+
+        idx = i + 1
+
+        out_bam = args.out_folder + "/%s_ctrl_rep%i.bam" % (data_id, idx)
+
+        if args.filter_bed:
+            print("Filter %s by --filter-bed ... " % (bam_file))
+            bam_remove_overlap_region_reads(args.filter_bed, bam_file, out_bam, params=params)
+            intermediate_files.append(out_bam)
+        else:
+            # out_bam = bam_file
+            file_make_symbolic_link(bam_file, out_bam)
+            # shutil.move(bam_file, out_bam)
+
+        pysam.index(out_bam)
+        control_bams.append(out_bam)
+        control_ids.append("%s_ctrl_rep%i" % (data_id, idx))
+
+    """
+    htseq-clip extract crosslink sites.
+
+    htseq-clip extract -i bam/Rbp3_total_rep1.bam -e 1 -s m -o sites/Rbp3_total_rep1.bed
+
+    hce_e : -e [1,2] %i
+    hce_s : -s %s
+    -s {s,i,d,m,e}, --site {s,i,d,m,e}
+                            Crosslink site choices, must be one of: s, i, d, m, e
+                            s: start site
+                            i: insertion site
+                            d: deletion site
+                            m: middle site
+                            e: end site (default: e).
+    hce_g : -g offset nt %i
+    hce_q : -q 10
+    hce_primary : --primary
+    hce_c : -c 4
+    hce_m : -m 0
+    hce_x : -x 500
+    hce_l : -l 10000
+    hce_f : -f chr_id file
+
+    """
+
+    print("Extract crosslink sites from BAM files ... ")
+
+    extract_params = " -e %i -s %s -g %i -q %i -c %i -m %i -x %i -l %i" % (args.hce_e, args.hce_s, args.hce_g, args.hce_q, args.hce_c, args.hce_m, args.hce_x, args.hce_l)
+    if args.hce_primary:
+        extract_params += " --primary "
+    if args.hce_f:
+        assert os.path.exists(args.hce_f), "--hce-f file \"%s\" not found" % (args.hce_f)
+        extract_params += " -f %s " % (args.hce_f)
+
+    # Experiment BAMs.
+    exp_beds = []
+    for i, bam_file in enumerate(exp_bams):
+
+        idx = i + 1
+
+        out_bed = args.out_folder + "/%s_exp_rep%i.bed" % (data_id, idx)
+
+        check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params
+        output = subprocess.getoutput(check_cmd)
+
+        print(check_cmd)
+        print(output)
+
+        assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed)
+        exp_beds.append(out_bed)
+        intermediate_files.append(out_bed)
+
+    # Control BAMs.
+    control_beds = []
+    for i, bam_file in enumerate(control_bams):
+
+        idx = i + 1
+
+        out_bed = args.out_folder + "/%s_ctrl_rep%i.bed" % (data_id, idx)
+
+        check_cmd = "htseq-clip extract -i " + bam_file + " -o " + out_bed + extract_params
+        output = subprocess.getoutput(check_cmd)
+
+        print(check_cmd)
+        print(output)
+
+        assert os.path.exists(out_bed), "htseq-clip extract -o file \"%s\" not found" % (out_bed)
+        control_beds.append(out_bed)
+        intermediate_files.append(out_bed)
+
+    """
+    Count crosslink sites in sliding windows.
+
+    htseq-clip count -i sites/Rbp3_total_rep1.bed -a annotation/Rbp3_uv_total_w75s10.txt -o counts/Rbp3_total_rep1.csv
+
+    """
+
+    print("Count reads overlapping with windows ... ")
+
+    count_params = ""
+    if args.hcc_unstranded:
+        count_params += " --unstranded"
+    # Experiment BAMs.
+    for i, bed_file in enumerate(exp_beds):
+
+        idx = i + 1
+
+        out_csv = counts_folder + "/%s_exp_rep%i.csv.gz" % (data_id, idx)
+        if args.no_zipper:
+            out_csv = counts_folder + "/%s_exp_rep%i.csv" % (data_id, idx)
+
+        check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params
+        output = subprocess.getoutput(check_cmd)
+
+        print(check_cmd)
+        print(output)
+
+        assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv)
+        intermediate_files.append(out_csv)
+
+    # Control BAMs.
+    for i, bed_file in enumerate(control_beds):
+
+        idx = i + 1
+
+        out_csv = counts_folder + "/%s_ctrl_rep%i.csv.gz" % (data_id, idx)
+        if args.no_zipper:
+            out_csv = counts_folder + "/%s_ctrl_rep%i.csv" % (data_id, idx)
+
+        check_cmd = "htseq-clip count -i " + bed_file + " -a " + args.in_win_bed + " -o " + out_csv + count_params
+        output = subprocess.getoutput(check_cmd)
+
+        print(check_cmd)
+        print(output)
+
+        assert os.path.exists(out_csv), "htseq-clip count -o file \"%s\" not found" % (out_csv)
+        intermediate_files.append(out_csv)
+
+    """
+    Create an R friendly matrix file.
+
+    htseq-clip createMatrix -i counts/ -b Rbp3 -o counts/Rbp3_uv_total_w75s10_counts.txt.gz
+
+    """
+
+    print("Create R-friendly count matrix file ... ")
+
+    # out_matrix = args.out_folder + "/%s_count_matrix.txt.gz" %(data_id)
+    out_matrix = args.out_folder + "/count_matrix.txt.gz"
+    if args.no_zipper:
+        # out_matrix = args.out_folder + "/%s_count_matrix.txt" %(data_id)
+        out_matrix = args.out_folder + "/count_matrix.txt"
+
+    check_cmd = "htseq-clip createMatrix -i " + counts_folder + " -b " + data_id + " -o " + out_matrix
+    output = subprocess.getoutput(check_cmd)
+
+    print(check_cmd)
+    print(output)
+
+    assert os.path.exists(out_matrix), "htseq-clip createMatrix -o file \"%s\" not found" % (out_matrix)
+
+    """
+    Create sample info file for DEWSeq.
+
+    Sample Info file format:
+    Sample name	Sample type
+    Rbp3_uv_rep1	IP
+    Rbp3_uv_rep2	IP
+    Rbp3_uv_rep3	IP
+    Rbp3_total_rep1	SMI
+    Rbp3_total_rep2	SMI
+    Rbp3_total_rep3	SMI
+
+    """
+
+    print("Write sample info file ... ")
+    sample_info_file = args.out_folder + "/sample_info.txt"
+    OUTTAB = open(sample_info_file, "w")
+    OUTTAB.write("Sample name\tSample type\n")
+    for sid in exp_ids:
+        OUTTAB.write("%s\tIP\n" % (sid))
+    for sid in control_ids:
+        OUTTAB.write("%s\tSMI\n" % (sid))
+    OUTTAB.close()
+
+    """
+    Delete intermediate files.
+
+    """
+    if not args.keep_imf:
+        print("Delete intermediate files ... ")
+        for imf in intermediate_files:
+            if os.path.exists(imf):
+                os.remove(imf)
+
+    """
+    Report.
+
+    """
+
+    print("")
+    print("OUTPUT FILES")
+    print("============")
+
+    print("Count matrix file (DEWSeq input file):\n%s" % (out_matrix))
+    print("Sample info file (DEWSeq input file):\n%s" % (sample_info_file))
+    print("")