diff scripts/fastq_positional_quality_trimming.py @ 0:965517909457 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Wed, 22 Jan 2020 08:41:44 -0500
parents
children 0cbed1c0a762
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/fastq_positional_quality_trimming.py	Wed Jan 22 08:41:44 2020 -0500
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+"""
+This tool trims paired-end FASTQ files on the basis of quality score or left/right position, retaining mate integrity.
+Reads without mate after filtering are saved in a separate output file.
+"""
+
+import math
+import optparse
+import sys
+
+def average(values):
+    """ Arithmetic mean of a list of values """
+    return math.fsum(values) / len(values) if len(values) else float('nan')
+
+
+def phred2sanger(phred_scores):
+    """ Convert an array of Phred quality scores (integers in [0, 93]) to a Sanger-encoded quality string"""
+    return ''.join([chr(score + 33) for score in phred_scores])
+
+
+def sanger2phred(sanger_string):
+    """ Convert a Sanger-encoded quality string to an array of Phred quality scores"""
+    return [ord(ch) - 33 for ch in sanger_string]
+
+
+def trimming(sequ, qual, maxlengthtrim, lefttrim, righttrim, minqualtrim, avgqualtrim):
+    """ Trimming of sequence and quality of a read"""
+    # Maximum length trimming
+    if maxlengthtrim != -1:
+        if len(sequ) > maxlengthtrim:
+            sequ = sequ[: maxlengthtrim]
+            qual = qual[: maxlengthtrim]
+    # Left- and right-side trimming
+    if righttrim == 0:
+        sequ = sequ[lefttrim :]
+        qual = qual[lefttrim :]
+    else:
+        sequ = sequ[lefttrim : -righttrim]
+        qual = qual[lefttrim : -righttrim]
+    # Minimum quality right-side trimming
+    while len(sequ) and qual[-1] < minqualtrim:
+        qual = qual[:-1]
+        sequ = sequ[:-1]
+    # Average quality right-side trimming
+    while len(sequ) and average(qual) < avgqualtrim:
+        qual = qual[:-1]
+        sequ = sequ[:-1]
+    return sequ, qual
+
+
+def __main__():
+    parser = optparse.OptionParser()
+    parser.add_option('-1', '--input1', dest='input1', help='forward or single-end reads file in Sanger FASTQ format')
+    parser.add_option('-2', '--input2', dest='input2', help='reverse reads file in Sanger FASTQ format')
+    parser.add_option('--maxlt', dest='maxlengthtrim', type='int', default=400, help='maximum length trimming')
+    parser.add_option('--lt', dest='lefttrim', type='int', default=0, help='left-side trimming')
+    parser.add_option('--rt', dest='righttrim', type='int', default=0, help='right-side trimming')
+    parser.add_option('--minqt', dest='minqualtrim', type='int', default=15, help='minimum quality right-side trimming')
+    parser.add_option('--avgqt', dest='avgqualtrim', type='float', default=20, help='average quality right-side trimming')
+    parser.add_option('--minlf', dest='minlen', type='int', default=25, help='minimum length filtering')
+    parser.add_option('--trimmed1', dest='trimmed1', help='trimmed forward FASTQ file')
+    parser.add_option('--trimmed2', dest='trimmed2', help='trimmed reverse FASTQ file')
+    parser.add_option('--trimmedunpaired', dest='trimmedunpaired', help='trimmed unpaired FASTQ file')
+    parser.add_option('--log', dest='logfile', help='log file')
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error('Wrong number of arguments')
+
+    maxlengthtrim = options.maxlengthtrim
+    lefttrim = options.lefttrim
+    righttrim = options.righttrim
+    minqualtrim = options.minqualtrim
+    avgqualtrim = options.avgqualtrim
+    minlen = options.minlen
+
+    total_reads = 0
+    discarded_reads = 0
+    forward = open(options.input1)
+    if options.input2:
+        paired = True
+        passing_paired_reads = 0
+        passing_unpaired_reads = 0
+        reverse = open(options.input2)
+        trimmed_reverse = open(options.trimmed2, 'w')
+        trimmed_unpaired = open(options.trimmedunpaired, 'w')
+    else:
+        paired = False
+        passing_reads = 0
+    trimmed_forward = open(options.trimmed1, 'w')
+    log = open(options.logfile, 'w')
+    try:
+        while True:
+            headL = forward.next().rstrip()
+            sequL = forward.next().rstrip()
+            commL = forward.next().rstrip()
+            sangL = forward.next().rstrip()
+            qualL = sanger2phred(sangL)
+            trimmed_sequL, trimmed_qualL = trimming(sequL, qualL, maxlengthtrim, lefttrim, righttrim, minqualtrim, avgqualtrim)
+            if paired:
+                try:
+                    headR = reverse.next().rstrip()
+                    sequR = reverse.next().rstrip()
+                    commR = reverse.next().rstrip()
+                    sangR = reverse.next().rstrip()
+                except StopIteration:
+                    sys.exit('Reverse FASTQ file contain less reads than forward FASTQ file.')
+                qualR = sanger2phred(sangR)
+                trimmed_sequR, trimmed_qualR = trimming(sequR, qualR, maxlengthtrim, lefttrim, righttrim, minqualtrim, avgqualtrim)
+            # Filter by residual length
+            if paired:
+                if len(trimmed_sequL) >= minlen and len(trimmed_sequR) >= minlen:
+                    trimmed_forward.write(headL + '\n' + trimmed_sequL + '\n' + commL + '\n' + phred2sanger(trimmed_qualL) + '\n')
+                    trimmed_reverse.write(headR + '\n' + trimmed_sequR + '\n' + commR + '\n' + phred2sanger(trimmed_qualR) + '\n')
+                    passing_paired_reads += 1
+                elif len(trimmed_sequL) >= minlen and len(trimmed_sequR) < minlen:
+                    trimmed_unpaired.write(headL + '\n' + trimmed_sequL + '\n' + commL + '\n' + phred2sanger(trimmed_qualL) + '\n')
+                    passing_unpaired_reads += 1
+                elif len(trimmed_sequL) < minlen and len(trimmed_sequR) >= minlen:
+                    trimmed_unpaired.write(headR + '\n' + trimmed_sequR + '\n' + commR + '\n' + phred2sanger(trimmed_qualR) + '\n')
+                    passing_unpaired_reads += 1
+                else:
+                    discarded_reads += 1
+            else:
+                if len(trimmed_sequL) >= minlen:
+                    trimmed_forward.write(headL + '\n' + trimmed_sequL + '\n' + commL + '\n' + phred2sanger(trimmed_qualL) + '\n')
+                    passing_reads += 1
+                else:
+                    discarded_reads += 1
+            total_reads += 1
+    except StopIteration:
+        if paired:
+            try:
+                reverse.next()
+            except StopIteration:
+                log.write("Total paired reads     : %d\n" % total_reads)
+                log.write("Passing paired reads   : %d\n" % passing_paired_reads)
+                log.write("Passing unpaired reads : %d\n" % passing_unpaired_reads)
+                log.write("Discarded paired reads : %d\n" % discarded_reads)
+            else:
+                sys.exit('Forward FASTQ file contain less reads than reverse FASTQ file.')
+        else:
+            log.write("Total reads     : %d\n" % total_reads)
+            log.write("Passing reads   : %d\n" % passing_reads)
+            log.write("Discarded reads : %d\n" % discarded_reads)
+    finally:
+        forward.close()
+        trimmed_forward.close()
+        log.close()
+        if paired:
+            reverse.close()
+            trimmed_reverse.close()
+            trimmed_unpaired.close()
+
+
+if __name__ == "__main__":
+    __main__()