diff fastq_subset.py @ 8:4e625d3672ba draft

Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
author pjbriggs
date Wed, 16 May 2018 07:39:16 -0400
parents
children 52dbe2089d14
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_subset.py	Wed May 16 07:39:16 2018 -0400
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+import argparse
+import random
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
+def count_reads(fastq):
+    """
+    Count number of reads in a Fastq file
+    """
+    n = 0
+    with open(fastq,'r') as fq:
+        while True:
+            buf = fq.read()
+            n += buf.count('\n')
+            if buf == "": break
+    return n/4
+
+def fastq_subset(fastq_in,fastq_out,indices):
+    """
+    Output a subset of reads from a Fastq file
+
+    The reads to output are specifed by a list
+    of integer indices; only reads at those
+    positions in the input file will be written
+    to the output.
+    """
+    with open(fastq_in,'r') as fq_in:
+        fq_out = open(fastq_out,'w')
+        i = 0
+        for title,seq,qual in FastqGeneralIterator(fq_in):
+            if i in indices:
+                fq_out.write("@%s\n%s\n+\n%s\n" % (title,
+                                                   seq,
+                                                   qual))
+            i += 1
+        fq_out.close()
+
+if __name__ == "__main__":
+
+    p = argparse.ArgumentParser()
+    p.add_argument("fastq_r1")
+    p.add_argument("fastq_r2")
+    p.add_argument("-n",
+                   dest="subset_size",
+                   default=None,
+                   help="subset size")
+    p.add_argument("-s",
+                   dest="seed",
+                   type=int,
+                   default=None,
+                   help="seed for random number generator")
+    args = p.parse_args()
+
+    print "Processing fastq pair:"
+    print "\t%s" % args.fastq_r1
+    print "\t%s" % args.fastq_r2
+
+    nreads = count_reads(args.fastq_r1)
+    print "Counted %d reads in %s" % (nreads,args.fastq_r1)
+
+    if args.subset_size is not None:
+        subset_size = float(args.subset_size)
+        if subset_size < 1.0:
+            subset_size = int(nreads*subset_size)
+        else:
+            subset_size = int(subset_size)
+        print "Extracting subset of reads: %s" % subset_size
+        if args.seed is not None:
+            print "Random number generator seed: %d" % args.seed
+            random.seed(args.seed)
+        subset = random.sample(xrange(nreads),subset_size)
+        fastq_subset(args.fastq_r1,"subset_r1.fq",subset)
+        fastq_subset(args.fastq_r2,"subset_r2.fq",subset)