Mercurial > repos > yufei-luo > s_mart
comparison SMART/DiffExpAnal/fastq_groomer_parallel.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki | 
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 | 
|---|---|
| 1 import sys, os, optparse, random | |
| 2 from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter | |
| 3 | |
def stop_err(msg):
    """Report a fatal error on stderr and terminate the process.

    msg -- text written to stderr, followed by a newline.

    Raises SystemExit with exit status 1. A bare ``sys.exit()`` would
    report status 0 (success) even though an error was just written, so
    the failure would be invisible to anything that checks the exit code.
    """
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)
| 7 | |
def _read_sample_table(list_path):
    """Return the (sample_name, fastq_path) pairs listed in *list_path*.

    Each non-blank line is split on whitespace; the first field is the
    sample name and the second is the FASTQ path. Blank lines are skipped,
    and a line with fewer than two fields aborts with a clear message
    instead of an IndexError.
    """
    samples = []
    with open(list_path, "r") as handle:
        for line_number, line in enumerate(handle, 1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                stop_err("Malformed line %i in %s: expected a name and a FASTQ path." % (line_number, list_path))
            samples.append((fields[0], fields[1]))
    return samples


def _plan_outputs(samples, res_dir, name_template, manifest_path):
    """Choose one groomed-output path per sample and record them.

    Writes "<sample_name>TAB<output path>" lines to *manifest_path* and
    returns the output paths in sample order. *name_template* must contain
    one ``%s`` slot, filled with a random integer in [0, 10000).
    """
    out_names = []
    with open(manifest_path, "w") as manifest:
        for sample_name, _unused_path in samples:
            # os.path.join keeps the name relative when res_dir is empty;
            # plain 'res_dir + "/"' would point at the filesystem root.
            out_name = os.path.join(res_dir, sample_name + name_template % random.randrange(0, 10000))
            out_names.append(out_name)
            manifest.write(sample_name + '\t' + out_name + '\n')
    return out_names


def _report(read_count, input_type, output_type, summarize_input, aggregator, empty_message):
    """Print the grooming summary for one file.

    Each print takes a single parenthesized argument so the statement
    produces the same output under Python 2 and Python 3.
    """
    if read_count is None:
        print(empty_message)
        return
    print("Groomed %i %s reads into %s reads." % (read_count + 1, input_type, output_type))
    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        print("Based upon quality and sequence, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # print using repr, since \x00 (null) causes info truncation in galaxy when printed
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))


def _groom_files(in_names, out_names, input_type, output_type, force_quality_encoding, summarize_input, empty_message):
    """Groom each input FASTQ into its matching output file.

    One aggregator is shared by every file of the pass, so the summary
    statistics are cumulative across the files processed so far.
    """
    aggregator = fastqAggregator()
    if summarize_input:
        reader = fastqVerboseErrorReader
    else:
        reader = fastqReader
    for in_name, out_name in zip(in_names, out_names):
        out = fastqWriter(open(out_name, 'wb'), format=output_type, force_quality_encoding=force_quality_encoding)
        read_count = None  # stays None when the input holds no reads
        try:
            with open(in_name) as in_handle:
                for read_count, fastq_read in enumerate(reader(in_handle, format=input_type, apply_galaxy_conventions=True)):
                    if summarize_input:
                        aggregator.consume_read(fastq_read)
                    out.write(fastq_read)
        finally:
            out.close()
        # NOTE(review): the extracted source lost its indentation, so it is
        # not certain whether the summary was printed per file or once per
        # pass; per-file is assumed because read_count is reset per file.
        _report(read_count, input_type, output_type, summarize_input, aggregator, empty_message)


def main():
    """Groom every FASTQ file named in a sample list (optionally paired-end).

    Command-line arguments (sys.argv):
      1  input list file: one "<name> <fastq path>" entry per line
      2  input FASTQ type
      3  output list file, rewritten as "<name>TAB<groomed fastq path>"
      4  output FASTQ type
      5  forced quality encoding, or the literal 'None'
      6  'summarize_input' to collect and print input statistics
      7  paired-end input list file, or the literal 'None'
      8  paired-end output list file (read only when argument 7 is given)

    Groomed files are created next to the output list file.
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    pairedEnd_input = sys.argv[7]
    output_pairedEndFileName = None
    if pairedEnd_input == 'None':
        pairedEnd_input = None
    else:
        output_pairedEndFileName = sys.argv[8]

    if force_quality_encoding == 'None':
        force_quality_encoding = None

    res_dir = os.path.dirname(output_filename)

    # Both manifests are written before any grooming starts.
    samples = _read_sample_table(input_filename)
    out_names = _plan_outputs(samples, res_dir, '_outGroomer_%s.fastq', output_filename)

    paired_samples = []
    paired_out_names = []
    if pairedEnd_input is not None:
        paired_samples = _read_sample_table(pairedEnd_input)
        paired_out_names = _plan_outputs(paired_samples, res_dir, '_outGroomer_pairedEnd_%s.fastq', output_pairedEndFileName)

    _groom_files(
        [path for _name, path in samples], out_names,
        input_type, output_type, force_quality_encoding, summarize_input,
        "No valid FASTQ reads were provided.",
    )

    if pairedEnd_input is not None:
        # The shared helper writes the read it has just parsed; the
        # duplicated loop bound 'fastq_reader' but wrote 'fastq_read',
        # i.e. a stale read left over from the single-end pass.
        _groom_files(
            [path for _name, path in paired_samples], paired_out_names,
            input_type, output_type, force_quality_encoding, summarize_input,
            "No valid paired-end FASTQ reads were provided.",
        )
| 114 | |
# Run the groomer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
