Mercurial > repos > weilong-guo > bs_seeker2
view BSseeker2/FilterReads.py @ 1:8b26adf64adc draft default tip
V2.0.5
author | weilong-guo |
---|---|
date | Tue, 05 Nov 2013 01:55:39 -0500 |
parents | e6df770c0e58 |
children |
line wrap: on
line source
#!/usr/bin/env python import sys import os import os.path import re # Define the Dictionary & the Reverse Dictionary for barcode # =========================================== # def FilterFastq (infile, outfile) : try: INFILE = open(infile, 'r') except IOError: print "\n[Error]:\n\t File cannot be open: ", infile; exit(-1) try: OUTFILE = open(outfile, 'w') except IOError: print "\n[Error]:\n\t File cannot be open: ", outfile; exit(-1) Dict = {} count = 0 total = 0 while 1 : line1 = INFILE.readline() if not line1 : break line1 = line1 line2 = INFILE.readline() line3 = INFILE.readline() line4 = INFILE.readline() total += 1 # print line2 if ( line2 in Dict ) : Dict[line2] += 1 else : Dict[line2] = 1 OUTFILE.write(line1) OUTFILE.write(line2) OUTFILE.write(line3) OUTFILE.write(line4) count += 1 INFILE.close() OUTFILE.close() print "Count of total raw reads: ", total print "Count of reads left: ", count print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" # =============================== # def FilterSequence (infile, outfile) : try: INFILE = open(infile, 'r') except IOError: print "\n[Error]:\n\t File cannot be open: ", infile; exit(-1) try: OUTFILE = open(outfile, 'w') except IOError: print "\n[Error]:\n\t File cannot be open: ", outfile; exit(-1) Dict = {} count = 0 total = 0 for line in INFILE: line = line.strip() total += 1 if ( line in Dict ) : Dict[line] += 1 else : Dict[line] = 1 count += 1 OUTFILE.write(line + "\n") INFILE.close() OUTFILE.close() print "Count of total raw reads: ", total print "Count of reads left: ", count print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" # =============================== # SN603 WA047 6 1101 41.40 99.10 0 1 .GGGA.......TTAG.............. @SZ__@@@@@@@RR__@@@@@@@@@@@@@@ 0 # def FilterQseq (infile, outfile, keepquality) : if keepquality : print "User specified '-k' and read quality will not be considered" else : print "Reads with PF=0 will be filtered" try: INFILE = open(infile, 'r') except IOError: print "\n[Error]:\n\t File cannot be open: ", infile; exit(-1) try: OUTFILE = open(outfile, 'w') except IOError: print "\n[Error]:\n\t File cannot be open: ", outfile; exit(-1) Dict = {} count = 0 total = 0 for line in INFILE: tokens = line.strip().split() total += 1 if ( (not keepquality and tokens[10] == "1") or keepquality ) : if ( tokens[8] in Dict ) : Dict[tokens[8]] += 1 else : Dict[tokens[8]] = 1 count += 1 OUTFILE.write(line) INFILE.close() OUTFILE.close() print "Count of total raw reads: ", total print "Count of reads left: ", count print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" # =============================== # def FilterFasta (infile, outfile) : try: INFILE = open(infile, 'r') except IOError: print "\n[Error]:\n\t File cannot be open: ", infile; exit(-1) try: OUTFILE = open(outfile, 'w') except IOError: print "\n[Error]:\n\t File cannot be open: ", outfile; exit(-1) Dict = {} count = 0 total = 0 name = "" read = "" for line in INFILE: line = line.strip() if (line[0] == '>') : total += 1 if ( name != "" ) : if ( read in Dict ) : Dict[read] += 1 else : Dict[read] = 1 count += 1 OUTFILE.write(">" + name + "\n") OUTFILE.write(read + "\n") name = line[1:] read = "" else : read = read + line if (name != "") : if ( read in Dict ) : Dict[read] += 1 else : Dict[read] = 1 count += 1 OUTFILE.write(">" + name + "\n") OUTFILE.write(read + "\n") print "Count of total raw reads: ", total print "Count of reads left: ", count print "Percentage of kept reads: %.2f" % (count * 100.0 /total), "%" # ============================== # Decide branch functions for different format def FilterReads (infile, outfile, keepquality): try: INFILE = open(infile, 'r') except IOError: print "\n[Error]:\n\t File cannot be open: ", infile; exit(-1) i = 0 # Count the numbers of barcodes in first 10000 lines line = INFILE.readline() tokens = line.strip().split() input_format=""; if line[0]=="@" : input_format = "FastQ" n_fastq = 0 elif len(tokens) == 1 and line[0] != ">": input_format = "sequence" elif len(tokens) == 11: input_format = "qseq" elif line[0]==">" : input_format = "fasta" INFILE.close() print "Input file format detected: ", input_format if input_format == "FastQ" : FilterFastq(infile, outfile) elif input_format == "sequence" : FilterSequence(infile, outfile) elif input_format == "qseq" : FilterQseq(infile, outfile, keepquality) elif input_format == "fasta" : FilterFasta(infile, outfile) from optparse import OptionParser # =========================================== def main(): usage = "Usage: %prog -i <input> -o <output> [-k]\n" \ "Author : Guo, Weilong; guoweilong@gmail.com; 2012-11-10\n" \ "Last Update: 2013-04-01" \ "Description: Unique reads for qseq/fastq/fasta/sequencce,\n" \ " and filter low quality reads in qseq file." parser = OptionParser(usage) parser.add_option("-i", dest="infile", help="Name of the input qseq/fastq/fasta/sequence file", metavar="FILE") parser.add_option("-o", dest="outfile", help="Name of the output file", metavar="FILE") parser.add_option("-k", dest="keepquality", default = False, action = "store_true", help="Would not filter low quality reads if specified") (options, args) = parser.parse_args() if (options.infile is None) or (options.outfile is None) : parser.print_help() exit(-1) FilterReads(options.infile, options.outfile, options.keepquality) # =========================================== if __name__ == "__main__": main()