view fastq_subsampling.py @ 0:bc23da9d464c draft default tip

planemo upload for repository http://172.20.124.12:3000/bvalot3/PythonScript commit 9676573ee48ce5343600ab45cd3ac1a6adddabe4
author bvalot
date Tue, 14 Jun 2022 08:15:55 +0000
parents
children
line wrap: on
line source

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Return a subsampling fastq file"""

import argparse
import gzip
import io
import os
import random
import sys

import pysam

desc = "Subsampling a single or a paired of fastq(.gz) file"
command = argparse.ArgumentParser(
    prog="fastq_subsampling.py", description=desc, usage="%(prog)s [options] genomeSize"
)
command.add_argument("--galaxy", action="store_true", help=argparse.SUPPRESS)
command.add_argument(
    "-d",
    "--outdir",
    default="./",
    type=str,
    nargs="?",
    help="Directory to store subsampling fastq file, default: current directory",
)
command.add_argument(
    "-p",
    "--prefix",
    default="_sub",
    type=str,
    nargs="?",
    help="Output fastq file with prefix, default:_sub",
)
command.add_argument(
    "-c",
    "--coverage",
    type=int,
    nargs="?",
    default=60,
    help="Mean coverage to sampling (default=60)",
)
command.add_argument(
    "--copy",
    action="store_true",
    help="Perform a copy of sample without enough coverage (default=no subsampling)",
)
command.add_argument(
    "-s",
    "--sfastq",
    nargs="?",
    type=argparse.FileType("r"),
    help="Fastq(.gz) file to subsampling (single)",
)
command.add_argument(
    "-r",
    "--rfastq",
    nargs="?",
    type=argparse.FileType("r"),
    help="Fastq(.gz) file to subsampling in paired (right)",
)
command.add_argument(
    "-l",
    "--lfastq",
    nargs="?",
    type=argparse.FileType("r"),
    help="Fastq(.gz) file to subsampling in paired (left)",
)
command.add_argument(
    "genomesize",
    type=int,
    help="Size of the genome (bp) for calculation of subsampling",
)
command.add_argument("-v", "--version", action="version", version="%(prog)s 0.2.0")


def outname(inname, outdir, append):
    inname = outdir + inname.split("/")[-1]
    if isgzip(inname):
        if inname[-5:] == "fq.gz":
            return inname.rstrip(".fq.gz") + append + ".fastq.gz"
        elif inname[-8:] == "fastq.gz":
            return inname.rstrip(".fastq.gz") + append + ".fastq.gz"
        else:
            return inname + append + ".fastq.gz"
    elif inname[-2:] == "fq":
        return inname.rstrip(".fq") + append + ".fastq.gz"
    elif inname[-5:] == "fastq":
        return inname.rstrip(".fastq") + append + ".fastq.gz"
    else:
        return inname + append + ".fastq.gz"


def isgzip(filename):
    if filename[-2:] == "gz":
        return True
    else:
        return False


def complet_count(fastq):
    count = 0
    total = 0.0
    for read in pysam.FastxFile(fastq):
        count += 1
        total += len(read.sequence)
    return count, int(total / count)


def make_copy(infile, outdir, prefix):
    output = outname(infile, outdir, prefix)
    os.popen(" ".join(["cp", infile, output]))


def make_subsampling(infile, outdir, prefix, select):
    output = io.BufferedWriter(gzip.open(outname(infile, outdir, prefix), "wb", 5))
    for i, read in enumerate(pysam.FastxFile(infile)):
        if i in select:
            output.write((str(read) + "\n").encode())
    output.flush()
    output.close()


def make_subsampling_galaxy(infile, number, select):
    output = io.BufferedWriter(gzip.open("./" + str(number) + ".fastq.gz", "wb", 5))
    for i, read in enumerate(pysam.FastxFile(infile)):
        if i in select:
            output.write((str(read) + "\n").encode())
    output.flush()
    output.close()


if __name__ == "__main__":
    """Performed job on execution script"""
    args = command.parse_args()

    # verify input
    if args.sfastq is None and args.lfastq is None and args.rfastq is None:
        raise Exception("At least single or paired reads must be defined")
    outdir = args.outdir
    if not os.path.isdir(outdir):
        raise Exception(outdir + " is not a valid directory")
    else:
        outdir = outdir.rstrip("/") + "/"

    # compute count and length
    fastqname = None
    if args.sfastq is not None:
        fastqname = args.sfastq.name
    elif args.rfastq is not None and args.lfastq is not None:
        fastqname = args.rfastq.name
    else:
        raise Exception("You must defined right and left fastq file for paired data")

    count = None
    length = None
    count, length = complet_count(fastqname)

    # Calculate coverage and report
    coverage = 0
    if args.sfastq is not None:
        coverage = int(float(count) * length / args.genomesize)
        print(
            " : ".join([fastqname, str(length) + "bp", str(count), str(coverage) + "X"])
        )
    else:
        coverage = int(float(count) * length * 2 / args.genomesize)
        print(
            " : ".join(
                [fastqname, str(length) + "bp*2", str(count), str(coverage) + "X"]
            )
        )

    # check coverage
    if coverage <= args.coverage:
        if args.copy and args.galaxy is False:
            if args.sfastq is not None:
                make_copy(args.sfastq.name, outdir, args.prefix)
            else:
                make_copy(args.rfastq.name, outdir, args.prefix)
                make_copy(args.lfastq.name, outdir, args.prefix)
        print("Coverage is less than threshold, no subsampling need")
        sys.exit(0)

    # performed subsampling
    if args.sfastq is not None:
        subread = int((float(args.genomesize) * args.coverage) / length)
        select = set(random.sample(range(0, count), subread))
        if args.galaxy:
            make_subsampling_galaxy(args.sfastq.name, 1, select)
        else:
            make_subsampling(args.sfastq.name, outdir, args.prefix, select)
        print("Subsampling to " + str(subread))
    else:
        subread = int((float(args.genomesize) * args.coverage) / (length * 2))
        select = set(random.sample(range(0, count), subread))
        if args.galaxy:
            make_subsampling_galaxy(args.rfastq.name, 1, select)
            make_subsampling_galaxy(args.lfastq.name, 2, select)
        else:
            make_subsampling(args.rfastq.name, outdir, args.prefix, select)
            make_subsampling(args.lfastq.name, outdir, args.prefix, select)
        print("Subsampling to " + str(subread))