| 1 | 1 #!/usr/bin/env python | 
| 0 | 2 | 
|  | 3 import sys | 
|  | 4 import os | 
|  | 5 import os.path | 
|  | 6 import re | 
|  | 7 | 
|  | 8 # Define the Dictionary & the Reverse Dictionary for barcode | 
|  | 9 | 
|  | 10 # =========================================== | 
|  | 11 # | 
|  | 12 def FilterFastq (infile, outfile) : | 
|  | 13     try: | 
|  | 14         INFILE = open(infile, 'r') | 
|  | 15     except IOError: | 
|  | 16         print "\n[Error]:\n\t File cannot be open: ", infile; | 
|  | 17         exit(-1) | 
|  | 18     try: | 
|  | 19         OUTFILE = open(outfile, 'w') | 
|  | 20     except IOError: | 
|  | 21         print "\n[Error]:\n\t File cannot be open: ", outfile; | 
|  | 22         exit(-1) | 
|  | 23     Dict = {} | 
|  | 24     count = 0 | 
|  | 25     total = 0 | 
|  | 26     while 1 : | 
|  | 27         line1 = INFILE.readline() | 
|  | 28         if not line1 : | 
|  | 29             break | 
|  | 30         line1 = line1 | 
|  | 31         line2 = INFILE.readline() | 
|  | 32         line3 = INFILE.readline() | 
|  | 33         line4 = INFILE.readline() | 
|  | 34         total += 1 | 
|  | 35  #       print line2 | 
|  | 36         if ( line2 in Dict ) : | 
|  | 37             Dict[line2] += 1 | 
|  | 38         else : | 
|  | 39             Dict[line2] = 1 | 
|  | 40             OUTFILE.write(line1) | 
|  | 41             OUTFILE.write(line2) | 
|  | 42             OUTFILE.write(line3) | 
|  | 43             OUTFILE.write(line4) | 
|  | 44             count += 1 | 
|  | 45     INFILE.close() | 
|  | 46     OUTFILE.close() | 
|  | 47     print "Count of total raw reads: ", total | 
|  | 48     print "Count of reads left: ", count | 
|  | 49     print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" | 
|  | 50 | 
|  | 51 | 
|  | 52 # =============================== | 
|  | 53 # | 
|  | 54 | 
|  | 55 def FilterSequence (infile, outfile) : | 
|  | 56     try: | 
|  | 57         INFILE = open(infile, 'r') | 
|  | 58     except IOError: | 
|  | 59         print "\n[Error]:\n\t File cannot be open: ", infile; | 
|  | 60         exit(-1) | 
|  | 61     try: | 
|  | 62         OUTFILE = open(outfile, 'w') | 
|  | 63     except IOError: | 
|  | 64         print "\n[Error]:\n\t File cannot be open: ", outfile; | 
|  | 65         exit(-1) | 
|  | 66     Dict = {} | 
|  | 67     count = 0 | 
|  | 68     total = 0 | 
|  | 69     for line in INFILE: | 
|  | 70         line = line.strip() | 
|  | 71         total += 1 | 
|  | 72         if ( line in Dict ) : | 
|  | 73             Dict[line] += 1 | 
|  | 74         else : | 
|  | 75             Dict[line] = 1 | 
|  | 76             count += 1 | 
|  | 77             OUTFILE.write(line + "\n") | 
|  | 78     INFILE.close() | 
|  | 79     OUTFILE.close() | 
|  | 80     print "Count of total raw reads: ", total | 
|  | 81     print "Count of reads left: ", count | 
|  | 82     print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" | 
|  | 83 | 
|  | 84 # =============================== | 
|  | 85 # SN603   WA047   6       1101    41.40   99.10   0       1       .GGGA.......TTAG..............       @SZ__@@@@@@@RR__@@@@@@@@@@@@@@       0 | 
|  | 86 # | 
|  | 87 def FilterQseq (infile, outfile, keepquality) : | 
|  | 88     if keepquality : | 
|  | 89         print "User specified '-k' and read quality will not be considered" | 
|  | 90     else : | 
|  | 91         print "Reads with PF=0 will be filtered" | 
|  | 92 | 
|  | 93     try: | 
|  | 94         INFILE = open(infile, 'r') | 
|  | 95     except IOError: | 
|  | 96         print "\n[Error]:\n\t File cannot be open: ", infile; | 
|  | 97         exit(-1) | 
|  | 98     try: | 
|  | 99         OUTFILE = open(outfile, 'w') | 
|  | 100     except IOError: | 
|  | 101         print "\n[Error]:\n\t File cannot be open: ", outfile; | 
|  | 102         exit(-1) | 
|  | 103     Dict = {} | 
|  | 104     count = 0 | 
|  | 105     total = 0 | 
|  | 106     for line in INFILE: | 
|  | 107         tokens = line.strip().split() | 
|  | 108         total += 1 | 
|  | 109         if ( (not keepquality and tokens[10] == "1") or keepquality ) : | 
|  | 110             if ( tokens[8] in Dict ) : | 
|  | 111                 Dict[tokens[8]] += 1 | 
|  | 112             else : | 
|  | 113                 Dict[tokens[8]] = 1 | 
|  | 114                 count += 1 | 
|  | 115                 OUTFILE.write(line) | 
|  | 116 | 
|  | 117     INFILE.close() | 
|  | 118     OUTFILE.close() | 
|  | 119     print "Count of total raw reads: ", total | 
|  | 120     print "Count of reads left: ", count | 
|  | 121     print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" | 
|  | 122 | 
|  | 123 | 
|  | 124 # =============================== | 
|  | 125 # | 
|  | 126 | 
|  | 127 def FilterFasta (infile, outfile) : | 
|  | 128     try: | 
|  | 129         INFILE = open(infile, 'r') | 
|  | 130     except IOError: | 
|  | 131         print "\n[Error]:\n\t File cannot be open: ", infile; | 
|  | 132         exit(-1) | 
|  | 133     try: | 
|  | 134         OUTFILE = open(outfile, 'w') | 
|  | 135     except IOError: | 
|  | 136         print "\n[Error]:\n\t File cannot be open: ", outfile; | 
|  | 137         exit(-1) | 
|  | 138     Dict = {} | 
|  | 139     count = 0 | 
|  | 140     total = 0 | 
|  | 141     name = "" | 
|  | 142     read = "" | 
|  | 143     for line in INFILE: | 
|  | 144         line = line.strip() | 
|  | 145         if (line[0] == '>') : | 
|  | 146             total += 1 | 
|  | 147             if ( name != "" ) : | 
|  | 148                 if ( read in Dict ) : | 
|  | 149                     Dict[read] += 1 | 
|  | 150                 else : | 
|  | 151                     Dict[read] = 1 | 
|  | 152                     count += 1 | 
|  | 153                     OUTFILE.write(">" + name + "\n") | 
|  | 154                     OUTFILE.write(read + "\n") | 
|  | 155             name = line[1:] | 
|  | 156             read = "" | 
|  | 157         else : | 
|  | 158             read = read + line | 
|  | 159     if (name != "") : | 
|  | 160         if ( read in Dict ) : | 
|  | 161             Dict[read] += 1 | 
|  | 162         else : | 
|  | 163             Dict[read] = 1 | 
|  | 164             count += 1 | 
|  | 165             OUTFILE.write(">" + name + "\n") | 
|  | 166             OUTFILE.write(read + "\n") | 
|  | 167     print "Count of total raw reads: ", total | 
|  | 168     print "Count of reads left: ", count | 
|  | 169     print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" | 
|  | 170 | 
|  | 171 | 
|  | 172 | 
|  | 173 # ============================== | 
|  | 174 #  Decide branch functions for different format | 
|  | 175 | 
|  | 176 def FilterReads (infile, outfile, keepquality): | 
|  | 177     try: | 
|  | 178         INFILE = open(infile, 'r') | 
|  | 179     except IOError: | 
|  | 180         print "\n[Error]:\n\t File cannot be open: ", infile; | 
|  | 181         exit(-1) | 
|  | 182     i = 0 | 
|  | 183     # Count the numbers of barcodes in first 10000 lines | 
|  | 184     line = INFILE.readline() | 
|  | 185     tokens = line.strip().split() | 
|  | 186     input_format=""; | 
|  | 187     if line[0]=="@" : | 
|  | 188         input_format = "FastQ" | 
|  | 189         n_fastq = 0 | 
|  | 190     elif len(tokens) == 1 and line[0] != ">": | 
|  | 191         input_format = "sequence" | 
|  | 192     elif len(tokens) == 11: | 
|  | 193         input_format = "qseq" | 
|  | 194     elif line[0]==">" : | 
|  | 195         input_format = "fasta" | 
|  | 196     INFILE.close() | 
|  | 197 | 
|  | 198     print "Input file format detected: ", input_format | 
|  | 199 | 
|  | 200     if input_format == "FastQ" : | 
|  | 201         FilterFastq(infile, outfile) | 
|  | 202     elif input_format == "sequence" : | 
|  | 203         FilterSequence(infile, outfile) | 
|  | 204     elif input_format == "qseq" : | 
|  | 205         FilterQseq(infile, outfile, keepquality) | 
|  | 206     elif input_format == "fasta" : | 
|  | 207         FilterFasta(infile, outfile) | 
|  | 208 | 
|  | 209 | 
|  | 210 from optparse import OptionParser | 
|  | 211 | 
|  | 212 # =========================================== | 
|  | 213 def main(): | 
|  | 214     usage = "Usage: %prog -i <input> -o <output> [-k]\n" \ | 
|  | 215             "Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n" \ | 
|  | 216             "Last Update: 2013-04-01" \ | 
|  | 217             "Description: Unique reads for qseq/fastq/fasta/sequencce,\n" \ | 
|  | 218             "       and filter low quality reads in qseq file." | 
|  | 219     parser = OptionParser(usage) | 
|  | 220     parser.add_option("-i", dest="infile", | 
|  | 221                   help="Name of the input qseq/fastq/fasta/sequence file", metavar="FILE") | 
|  | 222     parser.add_option("-o", dest="outfile", | 
|  | 223                   help="Name of the output file", metavar="FILE") | 
|  | 224     parser.add_option("-k", dest="keepquality", default = False, action = "store_true", | 
|  | 225                   help="Would not filter low quality reads if specified") | 
|  | 226     (options, args) = parser.parse_args() | 
|  | 227 | 
|  | 228     if (options.infile is None) or (options.outfile is None) : | 
| 1 | 229         parser.print_help() | 
| 0 | 230         exit(-1) | 
|  | 231     FilterReads(options.infile, options.outfile, options.keepquality) | 
|  | 232 | 
|  | 233 | 
|  | 234 # =========================================== | 
|  | 235 if __name__ == "__main__": | 
|  | 236     main() | 
|  | 237 |