Mercurial > repos > yufei-luo > s_mart
view SMART/DiffExpAnal/fastq_groomer_parallel.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line source
import sys
import os
import optparse
import random

from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter


def stop_err(msg):
    """Write *msg* followed by a newline to stderr, then exit the script."""
    sys.stderr.write("%s\n" % msg)
    sys.exit()


def _plan_outputs(list_filename, index_filename, out_prefix, suffix_template):
    """Read a list file and write the matching index of groomed file names.

    Each non-blank line of *list_filename* is split on whitespace; field 0 is
    used as a label and field 1 as the path of a FASTQ file to groom.  For
    every such line a groomed file name is built as
    ``out_prefix + label + suffix_template % <random int in [0, 10000)>`` and
    the pair ``label<TAB>groomed name`` is written to *index_filename*.

    Returns a tuple ``(input_names, groomed_names)`` of two parallel lists.
    """
    # Read the whole list before opening the index for writing, as the
    # original script did.
    with open(list_filename, "r") as list_file:
        lines = list_file.readlines()

    input_names = []
    groomed_names = []
    with open(index_filename, "w") as index_file:
        for line in lines:
            tab = line.split()
            if not tab:
                # A blank (e.g. trailing) line carries no entry; skip it
                # instead of failing with IndexError.
                continue
            input_names.append(tab[1])
            groomed_name = out_prefix + tab[0] + suffix_template % random.randrange(0, 10000)
            groomed_names.append(groomed_name)
            index_file.write(tab[0] + '\t' + groomed_name + '\n')
    return input_names, groomed_names


def _report(read_count, input_type, output_type, summarize_input, aggregator, no_reads_message):
    """Print the grooming summary for one file.

    *read_count* is the zero-based index of the last read that was groomed,
    or None when the input yielded no read, in which case only
    *no_reads_message* is printed.
    """
    # print() is always called with one pre-formatted string so that the
    # output is identical under the Python 2 print statement.
    if read_count is None:
        print(no_reads_message)
        return
    print("Groomed %i %s reads into %s reads." % (read_count + 1, input_type, output_type))
    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        print("Based upon quality and sequence, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # print using repr, since \x00 (null) causes info truncation in galaxy when printed
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))


def _groom_files(input_names, output_names, input_type, output_type,
                 force_quality_encoding, summarize_input, no_reads_message):
    """Groom every file of *input_names* into the matching *output_names* entry.

    One aggregator is shared by all files of the call, so when
    *summarize_input* is true the statistics printed after each file cover
    every read consumed so far in this call.  A summary is printed after each
    file.
    """
    aggregator = fastqAggregator()
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader

    for input_name, output_name in zip(input_names, output_names):
        out = fastqWriter(open(output_name, 'wb'), format=output_type, force_quality_encoding=force_quality_encoding)
        read_count = None
        try:
            # The input handle is owned here so that it is closed even when
            # the reader raises on a malformed record.
            with open(input_name) as in_handle:
                for read_count, fastq_read in enumerate(reader(in_handle, format=input_type, apply_galaxy_conventions=True)):
                    if summarize_input:
                        aggregator.consume_read(fastq_read)
                    out.write(fastq_read)
        finally:
            out.close()
        _report(read_count, input_type, output_type, summarize_input, aggregator, no_reads_message)


def main():
    """Groom the FASTQ files listed in one or two list files.

    Command-line arguments (positional, read from sys.argv):
      1. input list file (txt): one ``label path`` entry per line
      2. input FASTQ type
      3. output index file (txt), written as ``label<TAB>groomed path``
      4. output FASTQ type
      5. forced quality encoding, or the string 'None'
      6. 'summarize_input' to print input statistics; any other value disables it
      7. paired-end input list file, or the string 'None'
      8. paired-end output index file (read only when argument 7 is not 'None')

    Groomed files are created in the directory of the output index file.
    """
    input_filename = sys.argv[1]  # a txt file
    input_type = sys.argv[2]
    output_filename = sys.argv[3]  # a txt file
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    pairedEnd_input = sys.argv[7]
    if pairedEnd_input == 'None':
        pairedEnd_input = None
    else:
        output_pairedEndFileName = sys.argv[8]

    if force_quality_encoding == 'None':
        force_quality_encoding = None

    # os.path.join(dir, "") appends the separator only when dir is non-empty,
    # so an index file given without a directory keeps the groomed files in
    # the current directory instead of sending them to "/".
    resDirName = os.path.join(os.path.dirname(output_filename), "")

    # Parse the input txt file(s) and write the output txt file(s) first, so
    # every groomed file name is defined before any grooming starts.
    inputFileNames, outGroomerNames = _plan_outputs(
        input_filename, output_filename, resDirName, '_outGroomer_%s.fastq')
    if pairedEnd_input is not None:
        inputPairedEndFileNames, outGroomerPairedEndNames = _plan_outputs(
            pairedEnd_input, output_pairedEndFileName, resDirName, '_outGroomer_pairedEnd_%s.fastq')

    # Write output files
    _groom_files(inputFileNames, outGroomerNames, input_type, output_type,
                 force_quality_encoding, summarize_input,
                 "No valid FASTQ reads were provided.")

    # Write output pairedEnd files
    if pairedEnd_input is not None:
        _groom_files(inputPairedEndFileNames, outGroomerPairedEndNames, input_type, output_type,
                     force_quality_encoding, summarize_input,
                     "No valid paired-end FASTQ reads were provided.")


if __name__ == "__main__":
    main()