Mercurial > repos > weilong-guo > bs_seeker2
diff BSseeker2/FilterReads.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/FilterReads.py Fri Jul 12 18:47:28 2013 -0400 @@ -0,0 +1,237 @@ +#!/usr/bin/python + +import sys +import os +import os.path +import re + +# Define the Dictionary & the Reverse Dictionary for barcode + +# =========================================== +# +def FilterFastq (infile, outfile) : + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + try: + OUTFILE = open(outfile, 'w') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", outfile; + exit(-1) + Dict = {} + count = 0 + total = 0 + while 1 : + line1 = INFILE.readline() + if not line1 : + break + line1 = line1 + line2 = INFILE.readline() + line3 = INFILE.readline() + line4 = INFILE.readline() + total += 1 + # print line2 + if ( line2 in Dict ) : + Dict[line2] += 1 + else : + Dict[line2] = 1 + OUTFILE.write(line1) + OUTFILE.write(line2) + OUTFILE.write(line3) + OUTFILE.write(line4) + count += 1 + INFILE.close() + OUTFILE.close() + print "Count of total raw reads: ", total + print "Count of reads left: ", count + print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" + + +# =============================== +# + +def FilterSequence (infile, outfile) : + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + try: + OUTFILE = open(outfile, 'w') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", outfile; + exit(-1) + Dict = {} + count = 0 + total = 0 + for line in INFILE: + line = line.strip() + total += 1 + if ( line in Dict ) : + Dict[line] += 1 + else : + Dict[line] = 1 + count += 1 + OUTFILE.write(line + "\n") + INFILE.close() + OUTFILE.close() + print "Count of total raw reads: ", total + print "Count of reads left: ", count + print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" + +# =============================== +# SN603 WA047 6 1101 41.40 99.10 0 1 .GGGA.......TTAG.............. @SZ__@@@@@@@RR__@@@@@@@@@@@@@@ 0 +# +def FilterQseq (infile, outfile, keepquality) : + if keepquality : + print "User specified '-k' and read quality will not be considered" + else : + print "Reads with PF=0 will be filtered" + + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + try: + OUTFILE = open(outfile, 'w') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", outfile; + exit(-1) + Dict = {} + count = 0 + total = 0 + for line in INFILE: + tokens = line.strip().split() + total += 1 + if ( (not keepquality and tokens[10] == "1") or keepquality ) : + if ( tokens[8] in Dict ) : + Dict[tokens[8]] += 1 + else : + Dict[tokens[8]] = 1 + count += 1 + OUTFILE.write(line) + + INFILE.close() + OUTFILE.close() + print "Count of total raw reads: ", total + print "Count of reads left: ", count + print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" + + +# =============================== +# + +def FilterFasta (infile, outfile) : + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + try: + OUTFILE = open(outfile, 'w') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", outfile; + exit(-1) + Dict = {} + count = 0 + total = 0 + name = "" + read = "" + for line in INFILE: + line = line.strip() + if (line[0] == '>') : + total += 1 + if ( name != "" ) : + if ( read in Dict ) : + Dict[read] += 1 + else : + Dict[read] = 1 + count += 1 + OUTFILE.write(">" + name + "\n") + OUTFILE.write(read + "\n") + name = line[1:] + read = "" + else : + read = read + line + if (name != "") : + if ( read in Dict ) : + Dict[read] += 1 + else : + Dict[read] = 1 + count += 1 + OUTFILE.write(">" + name + "\n") + OUTFILE.write(read + "\n") + print "Count of total raw reads: ", total + print "Count of reads left: ", count + print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" + + + +# ============================== +# Decide branch functions for different format + +def FilterReads (infile, outfile, keepquality): + try: + INFILE = open(infile, 'r') + except IOError: + print "\n[Error]:\n\t File cannot be open: ", infile; + exit(-1) + i = 0 + # Count the numbers of barcodes in first 10000 lines + line = INFILE.readline() + tokens = line.strip().split() + input_format=""; + if line[0]=="@" : + input_format = "FastQ" + n_fastq = 0 + elif len(tokens) == 1 and line[0] != ">": + input_format = "sequence" + elif len(tokens) == 11: + input_format = "qseq" + elif line[0]==">" : + input_format = "fasta" + INFILE.close() + + print "Input file format detected: ", input_format + + if input_format == "FastQ" : + FilterFastq(infile, outfile) + elif input_format == "sequence" : + FilterSequence(infile, outfile) + elif input_format == "qseq" : + FilterQseq(infile, outfile, keepquality) + elif input_format == "fasta" : + FilterFasta(infile, outfile) + + +from optparse import OptionParser + +# =========================================== +def main(): + usage = "Usage: %prog -i <input> -o <output> [-k]\n" \ + "Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n" \ + "Last Update: 2013-04-01" \ + "Description: Unique reads for qseq/fastq/fasta/sequencce,\n" \ + " and filter low quality reads in qseq file." + parser = OptionParser(usage) + parser.add_option("-i", dest="infile", + help="Name of the input qseq/fastq/fasta/sequence file", metavar="FILE") + parser.add_option("-o", dest="outfile", + help="Name of the output file", metavar="FILE") + parser.add_option("-k", dest="keepquality", default = False, action = "store_true", + help="Would not filter low quality reads if specified") + (options, args) = parser.parse_args() + + if (options.infile is None) or (options.outfile is None) : + print parser.print_help() + exit(-1) + FilterReads(options.infile, options.outfile, options.keepquality) + + +# =========================================== +if __name__ == "__main__": + main() +