Mercurial > repos > bvalot > fasta_filter
comparison fastq_subsampling.py @ 0:c74e633b40e9 draft default tip
planemo upload for repository https://github.com/bvalot/galaxy commit d57c24d4b2c0c741d572af9ca0d09f8b82689640
| author | bvalot |
|---|---|
| date | Tue, 14 Jun 2022 08:51:44 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c74e633b40e9 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 # -*- coding: utf-8 -*- | |
| 3 | |
| 4 """Return a subsampling fastq file""" | |
| 5 | |
| 6 import argparse | |
| 7 import gzip | |
| 8 import io | |
| 9 import os | |
| 10 import random | |
| 11 import sys | |
| 12 | |
| 13 import pysam | |
| 14 | |
| 15 desc = "Subsampling a single or a paired of fastq(.gz) file" | |
| 16 command = argparse.ArgumentParser( | |
| 17 prog="fastq_subsampling.py", description=desc, usage="%(prog)s [options] genomeSize" | |
| 18 ) | |
| 19 command.add_argument("--galaxy", action="store_true", help=argparse.SUPPRESS) | |
| 20 command.add_argument( | |
| 21 "-d", | |
| 22 "--outdir", | |
| 23 default="./", | |
| 24 type=str, | |
| 25 nargs="?", | |
| 26 help="Directory to store subsampling fastq file, default: current directory", | |
| 27 ) | |
| 28 command.add_argument( | |
| 29 "-p", | |
| 30 "--prefix", | |
| 31 default="_sub", | |
| 32 type=str, | |
| 33 nargs="?", | |
| 34 help="Output fastq file with prefix, default:_sub", | |
| 35 ) | |
| 36 command.add_argument( | |
| 37 "-c", | |
| 38 "--coverage", | |
| 39 type=int, | |
| 40 nargs="?", | |
| 41 default=60, | |
| 42 help="Mean coverage to sampling (default=60)", | |
| 43 ) | |
| 44 command.add_argument( | |
| 45 "--copy", | |
| 46 action="store_true", | |
| 47 help="Perform a copy of sample without enough coverage (default=no subsampling)", | |
| 48 ) | |
| 49 command.add_argument( | |
| 50 "-s", | |
| 51 "--sfastq", | |
| 52 nargs="?", | |
| 53 type=argparse.FileType("r"), | |
| 54 help="Fastq(.gz) file to subsampling (single)", | |
| 55 ) | |
| 56 command.add_argument( | |
| 57 "-r", | |
| 58 "--rfastq", | |
| 59 nargs="?", | |
| 60 type=argparse.FileType("r"), | |
| 61 help="Fastq(.gz) file to subsampling in paired (right)", | |
| 62 ) | |
| 63 command.add_argument( | |
| 64 "-l", | |
| 65 "--lfastq", | |
| 66 nargs="?", | |
| 67 type=argparse.FileType("r"), | |
| 68 help="Fastq(.gz) file to subsampling in paired (left)", | |
| 69 ) | |
| 70 command.add_argument( | |
| 71 "genomesize", | |
| 72 type=int, | |
| 73 help="Size of the genome (bp) for calculation of subsampling", | |
| 74 ) | |
| 75 command.add_argument("-v", "--version", action="version", version="%(prog)s 0.2.0") | |
| 76 | |
| 77 | |
| 78 def outname(inname, outdir, append): | |
| 79 inname = outdir + inname.split("/")[-1] | |
| 80 if isgzip(inname): | |
| 81 if inname[-5:] == "fq.gz": | |
| 82 return inname.rstrip(".fq.gz") + append + ".fastq.gz" | |
| 83 elif inname[-8:] == "fastq.gz": | |
| 84 return inname.rstrip(".fastq.gz") + append + ".fastq.gz" | |
| 85 else: | |
| 86 return inname + append + ".fastq.gz" | |
| 87 elif inname[-2:] == "fq": | |
| 88 return inname.rstrip(".fq") + append + ".fastq.gz" | |
| 89 elif inname[-5:] == "fastq": | |
| 90 return inname.rstrip(".fastq") + append + ".fastq.gz" | |
| 91 else: | |
| 92 return inname + append + ".fastq.gz" | |
| 93 | |
| 94 | |
| 95 def isgzip(filename): | |
| 96 if filename[-2:] == "gz": | |
| 97 return True | |
| 98 else: | |
| 99 return False | |
| 100 | |
| 101 | |
| 102 def complet_count(fastq): | |
| 103 count = 0 | |
| 104 total = 0.0 | |
| 105 for read in pysam.FastxFile(fastq): | |
| 106 count += 1 | |
| 107 total += len(read.sequence) | |
| 108 return count, int(total / count) | |
| 109 | |
| 110 | |
| 111 def make_copy(infile, outdir, prefix): | |
| 112 output = outname(infile, outdir, prefix) | |
| 113 os.popen(" ".join(["cp", infile, output])) | |
| 114 | |
| 115 | |
| 116 def make_subsampling(infile, outdir, prefix, select): | |
| 117 output = io.BufferedWriter(gzip.open(outname(infile, outdir, prefix), "wb", 5)) | |
| 118 for i, read in enumerate(pysam.FastxFile(infile)): | |
| 119 if i in select: | |
| 120 output.write((str(read) + "\n").encode()) | |
| 121 output.flush() | |
| 122 output.close() | |
| 123 | |
| 124 | |
| 125 def make_subsampling_galaxy(infile, number, select): | |
| 126 output = io.BufferedWriter(gzip.open("./" + str(number) + ".fastq.gz", "wb", 5)) | |
| 127 for i, read in enumerate(pysam.FastxFile(infile)): | |
| 128 if i in select: | |
| 129 output.write((str(read) + "\n").encode()) | |
| 130 output.flush() | |
| 131 output.close() | |
| 132 | |
| 133 | |
| 134 if __name__ == "__main__": | |
| 135 """Performed job on execution script""" | |
| 136 args = command.parse_args() | |
| 137 | |
| 138 # verify input | |
| 139 if args.sfastq is None and args.lfastq is None and args.rfastq is None: | |
| 140 raise Exception("At least single or paired reads must be defined") | |
| 141 outdir = args.outdir | |
| 142 if not os.path.isdir(outdir): | |
| 143 raise Exception(outdir + " is not a valid directory") | |
| 144 else: | |
| 145 outdir = outdir.rstrip("/") + "/" | |
| 146 | |
| 147 # compute count and length | |
| 148 fastqname = None | |
| 149 if args.sfastq is not None: | |
| 150 fastqname = args.sfastq.name | |
| 151 elif args.rfastq is not None and args.lfastq is not None: | |
| 152 fastqname = args.rfastq.name | |
| 153 else: | |
| 154 raise Exception("You must defined right and left fastq file for paired data") | |
| 155 | |
| 156 count = None | |
| 157 length = None | |
| 158 count, length = complet_count(fastqname) | |
| 159 | |
| 160 # Calculate coverage and report | |
| 161 coverage = 0 | |
| 162 if args.sfastq is not None: | |
| 163 coverage = int(float(count) * length / args.genomesize) | |
| 164 print( | |
| 165 " : ".join([fastqname, str(length) + "bp", str(count), str(coverage) + "X"]) | |
| 166 ) | |
| 167 else: | |
| 168 coverage = int(float(count) * length * 2 / args.genomesize) | |
| 169 print( | |
| 170 " : ".join( | |
| 171 [fastqname, str(length) + "bp*2", str(count), str(coverage) + "X"] | |
| 172 ) | |
| 173 ) | |
| 174 | |
| 175 # check coverage | |
| 176 if coverage <= args.coverage: | |
| 177 if args.copy and args.galaxy is False: | |
| 178 if args.sfastq is not None: | |
| 179 make_copy(args.sfastq.name, outdir, args.prefix) | |
| 180 else: | |
| 181 make_copy(args.rfastq.name, outdir, args.prefix) | |
| 182 make_copy(args.lfastq.name, outdir, args.prefix) | |
| 183 print("Coverage is less than threshold, no subsampling need") | |
| 184 sys.exit(0) | |
| 185 | |
| 186 # performed subsampling | |
| 187 if args.sfastq is not None: | |
| 188 subread = int((float(args.genomesize) * args.coverage) / length) | |
| 189 select = set(random.sample(range(0, count), subread)) | |
| 190 if args.galaxy: | |
| 191 make_subsampling_galaxy(args.sfastq.name, 1, select) | |
| 192 else: | |
| 193 make_subsampling(args.sfastq.name, outdir, args.prefix, select) | |
| 194 print("Subsampling to " + str(subread)) | |
| 195 else: | |
| 196 subread = int((float(args.genomesize) * args.coverage) / (length * 2)) | |
| 197 select = set(random.sample(range(0, count), subread)) | |
| 198 if args.galaxy: | |
| 199 make_subsampling_galaxy(args.rfastq.name, 1, select) | |
| 200 make_subsampling_galaxy(args.lfastq.name, 2, select) | |
| 201 else: | |
| 202 make_subsampling(args.rfastq.name, outdir, args.prefix, select) | |
| 203 make_subsampling(args.lfastq.name, outdir, args.prefix, select) | |
| 204 print("Subsampling to " + str(subread)) |
