Mercurial > repos > yufei-luo > s_mart
diff SMART/DiffExpAnal/fastq_groomer_parallel.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line diff
# SMART/DiffExpAnal/fastq_groomer_parallel.py
#
# Groom every FASTQ file named in a list file (one "<label> <path>" pair per
# line) and write a matching list file that names the groomed outputs.
import optparse
import os
import random
import sys

from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter


def stop_err(msg):
    """Write *msg* (plus a newline) to stderr and terminate the process."""
    sys.stderr.write("%s\n" % msg)
    sys.exit()


def _plan_outputs(list_filename, table_filename, res_dir, suffix_template):
    """Read a list file and write the table of groomed output names.

    Each non-blank line of *list_filename* is split on whitespace; field 0 is
    the label and field 1 the FASTQ path to groom. For every such line a
    groomed file name is built in *res_dir* and "<label>\\t<groomed name>" is
    written to *table_filename*.

    *suffix_template* must contain one ``%s`` placeholder, filled with a
    random integer in [0, 10000).

    Returns a tuple ``(input_names, output_names)`` of two parallel lists.
    Raises IndexError if a non-blank line has fewer than two fields.
    """
    # os.path.join(dir, "") appends the separator only when dir is non-empty,
    # so an output path without a directory part stays relative instead of
    # being redirected to "/".
    prefix = os.path.join(res_dir, "")
    input_names = []
    output_names = []
    with open(list_filename, "r") as list_handle:
        with open(table_filename, "w") as table_handle:
            for line in list_handle:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                label = fields[0]
                input_names.append(fields[1])
                groomed_name = prefix + label + suffix_template % random.randrange(0, 10000)
                output_names.append(groomed_name)
                table_handle.write(label + '\t' + groomed_name + '\n')
    return input_names, output_names


def _report(read_count, input_type, output_type, summarize_input, aggregator, empty_message):
    """Print the grooming summary for one file.

    *read_count* is the zero-based index of the last read seen, or None when
    no read was produced, in which case only *empty_message* is printed.
    """
    if read_count is None:
        print(empty_message)
        return
    print("Groomed %i %s reads into %s reads." % (read_count + 1, input_type, output_type))
    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        print("Based upon quality and sequence, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # Print using repr, since \x00 (null) causes info truncation in galaxy when printed.
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))


def _groom_files(input_names, output_names, input_type, output_type,
                 force_quality_encoding, summarize_input, empty_message):
    """Groom each file of *input_names* into the matching *output_names* entry.

    One aggregator is shared by all files of the call, so the summary printed
    when *summarize_input* is true is cumulative over the files processed so
    far.
    """
    aggregator = fastqAggregator()
    # The reader class does not depend on the file, so pick it once.
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader
    for input_name, output_name in zip(input_names, output_names):
        writer = fastqWriter(open(output_name, 'wb'), format=output_type, force_quality_encoding=force_quality_encoding)
        read_count = None
        try:
            with open(input_name) as input_handle:
                for read_count, fastq_read in enumerate(reader(input_handle, format=input_type, apply_galaxy_conventions=True)):
                    if summarize_input:
                        aggregator.consume_read(fastq_read)
                    writer.write(fastq_read)
        finally:
            writer.close()
        # NOTE(review): the source's indentation was lost; the report is
        # assumed to be printed once per file because read_count is reset for
        # every file -- confirm against the repository copy.
        _report(read_count, input_type, output_type, summarize_input, aggregator, empty_message)


def main():
    """Command-line entry point.

    Positional arguments (sys.argv):
      1. list file of single-end inputs ("<label> <fastq path>" per line)
      2. input FASTQ type
      3. output list file to write ("<label>\\t<groomed path>" per line)
      4. output FASTQ type
      5. forced quality encoding, or the literal 'None'
      6. 'summarize_input' to print an input summary; anything else disables it
      7. list file of paired-end inputs, or the literal 'None'
      8. output list file for the paired-end files (read only when 7 is given)

    Groomed files are created next to the output list file (argument 3).
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    pairedEnd_input = sys.argv[7]
    output_pairedEndFileName = None
    if pairedEnd_input == 'None':
        pairedEnd_input = None
    else:
        output_pairedEndFileName = sys.argv[8]

    if force_quality_encoding == 'None':
        force_quality_encoding = None

    res_dir = os.path.dirname(output_filename)

    # Parse the list files and write the output tables before any grooming.
    input_names, groomed_names = _plan_outputs(
        input_filename, output_filename, res_dir, '_outGroomer_%s.fastq')
    if pairedEnd_input is not None:
        paired_input_names, paired_groomed_names = _plan_outputs(
            pairedEnd_input, output_pairedEndFileName, res_dir, '_outGroomer_pairedEnd_%s.fastq')

    # Groom the single-end files.
    _groom_files(input_names, groomed_names, input_type, output_type,
                 force_quality_encoding, summarize_input,
                 "No valid FASTQ reads were provided.")

    # Groom the paired-end files. Each read is consumed and written under the
    # name it is bound to in its own loop, so no read leaks in from the
    # single-end pass.
    if pairedEnd_input is not None:
        _groom_files(paired_input_names, paired_groomed_names, input_type, output_type,
                     force_quality_encoding, summarize_input,
                     "No valid paired-end FASTQ reads were provided.")


if __name__ == "__main__":
    main()