Mercurial > repos > yufei-luo > s_mart
diff SMART/DiffExpAnal/fastq_groomer_parallel_unSQL.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/DiffExpAnal/fastq_groomer_parallel_unSQL.py Tue Apr 30 14:33:21 2013 -0400 @@ -0,0 +1,168 @@ +import sys, os, optparse,shutil, random +from commons.core.launcher.Launcher import Launcher +from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory +from commons.core.utils.FileUtils import FileUtils + +def _map(iLauncher, cmd, cmdStart, cmdFinish ): + lCmds = [] + lCmds.extend(cmd) + lCmdStart = [] + lCmdStart.extend(cmdStart) + lCmdFinish = [] + lCmdFinish.extend(cmdFinish) + return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) + +def splitFastQ(fileName, nbOfSeqPerBatch): + nbOfLinesPerFile = nbOfSeqPerBatch * 4 + lOutput = [] + filePrefix, fileExt = os.path.splitext(os.path.basename(fileName)) + resDir = os.path.dirname(fileName) + with open(fileName) as inF: + fileNb = 1 + line = inF.readline() + if not line or nbOfLinesPerFile == 0: + outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt) + lOutput.append(outFileName) + f = open(outFileName, "wb") + shutil.copyfileobj(open(fileName, "rb"), f) + f.close() + else: + while line: + outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt) + lOutput.append(outFileName) + with open(outFileName, "w") as outF: + lineNb = 1 + while lineNb <= nbOfLinesPerFile and line: + outF.write(line) + line = inF.readline() + lineNb += 1 + fileNb += 1 + return lOutput + +def joinFastQ(dCutOut2Out): + for key in dCutOut2Out.keys(): + FileUtils.catFilesFromList(dCutOut2Out[key],key, False) + +def _createFastqGroomerCode(outGroomerNames, inputFileNames, input_type, output_type, force_quality_encoding, summarize_input): + cmd2Launch = [] + cmd2Launch.append("log = 0") + cmd2Launch.append("from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter") + cmd2Launch.append("aggregator = fastqAggregator()") + cmd2Launch.append("out = fastqWriter( open( '%s', 'wb' ), format = '%s', force_quality_encoding = '%s')" % (outGroomerNames,output_type,force_quality_encoding)) + cmd2Launch.append("read_count = None") + if summarize_input: + cmd2Launch.append("reader = fastqVerboseErrorReader") + else: + cmd2Launch.append("reader = fastqReader") + cmd2Launch.append("for read_count, fastq_read in enumerate( reader( open( '%s' ), format = '%s', apply_galaxy_conventions = True ) ):" % (inputFileNames, input_type)) + if summarize_input: + cmd2Launch.append("\taggregator.consume_read( fastq_read )") + cmd2Launch.append("\tout.write( fastq_read )") + cmd2Launch.append("out.close()") + cmd2Launch.append("if read_count is not None:") + #cmd2Launch.append("\tprint 'Groomed %s %s reads into %s reads.' % ( read_count + 1, %s, %s )" % ('%i', '%s', '%s', input_type,output_type)) + cmd2Launch.append("\tif '%s' != '%s' and 'solexa' in [ '%s', '%s' ]:" % (input_type, output_type, input_type, output_type)) + cmd2Launch.append("\t\tprint 'Converted between Solexa and PHRED scores.'") + if summarize_input: + cmd2Launch.append("\tprint 'Based upon quality and sequence, the input data is valid for: %s' % ( ', '.join( aggregator.get_valid_formats() ) or 'None' )") + cmd2Launch.append("\tascii_range = aggregator.get_ascii_range()") + cmd2Launch.append("\tdecimal_range = aggregator.get_decimal_range()") + cmd2Launch.append("\tprint 'Input ASCII range: %s(%i) - %s(%i)' % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) )") + cmd2Launch.append("\tprint 'Input decimal range: %i - %i' % ( decimal_range[0], decimal_range[1] ) ") + cmd2Launch.append("else:") + cmd2Launch.append("\tprint 'No valid FASTQ reads were provided.'") + cmd2Launch.append("\tlog = 255") + return cmd2Launch + +def stop_err(msg): + sys.stderr.write("%s\n" % msg) + sys.exit() + +def main(): + + input_filename = sys.argv[1] #a txt file + input_type = sys.argv[2] + output_filename = sys.argv[3] #a txt file + output_type = sys.argv[4] + force_quality_encoding = sys.argv[5] + summarize_input = sys.argv[6] == 'summarize_input' + pairedEnd_input = sys.argv[7] + if pairedEnd_input == 'None': + pairedEnd_input = None + else: + output_pairedEndFileName = sys.argv[8] + + if force_quality_encoding == 'None': + force_quality_encoding = None + + #Parse the input txt file and read a list of fastq files + file = open(input_filename, "r") + lines = file.readlines() + inputFileNames = [] + outGroomerNames = [] + resDirName = os.path.dirname(output_filename) + "/" + #Write output txt file and define all output groomer file names + outFile = open(output_filename, "w") + for line in lines: + tab = line.split() + inputFileNames.append(tab[1]) + outGroomerName = resDirName + tab[0] + '_outGroomer_%s.fastq' % random.randrange(0, 10000) + outGroomerNames.append(outGroomerName) + outFile.write(tab[0] + '\t' + outGroomerName + '\n') + outFile.close() + file.close() + + if pairedEnd_input != None: + inPairedFile = open(pairedEnd_input, "r") + lines = inPairedFile.readlines() + inputPairedEndFileNames = [] + outGroomerPairedEndNames = [] + outPairedEndFile = open(output_pairedEndFileName, "w") + for line in lines: + tab = line.split() + inputPairedEndFileNames.append(tab[1]) + outGroomerPairedEndName = resDirName + tab[0] + '_outGroomer_pairedEnd_%s.fastq' % random.randrange(0, 10000) + outGroomerPairedEndNames.append(outGroomerPairedEndName) + outPairedEndFile.write(tab[0] + '\t' + outGroomerPairedEndName + '\n') + outPairedEndFile.close() + inPairedFile.close() + + acronym = "fastqGroomer" + jobdb = TableJobAdaptatorFactory.createJobInstance() + iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True) + lCmdsTuples = [] + dCutOut2Out = {} + lAllFile2remove = [] + # Write output file + for i in range(len(outGroomerNames)): + lCutInputFile = splitFastQ(inputFileNames[i], 20000) + lAllFile2remove.extend(lCutInputFile) + lCutOutput = [] + for cutInput in lCutInputFile: + cutOutput = "%s_out" % cutInput + lCutOutput.append(cutOutput) + lAllFile2remove.extend(lCutOutput) + cmd2Launch = _createFastqGroomerCode(cutOutput, cutInput, input_type, output_type, force_quality_encoding, summarize_input) + cmdStart = [] + cmdFinish = [] + lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) + dCutOut2Out[outGroomerNames[i]] = lCutOutput + if pairedEnd_input != None: + lCutInputFile = splitFastQ(inputPairedEndFileNames[i], 20000) + lAllFile2remove.extend(lCutInputFile) + lCutOutput = [] + for cutInput in lCutInputFile: + cutOutput = "%s_out" % cutInput + lCutOutput.append(cutOutput) + lAllFile2remove.extend(lCutOutput) + cmd2Launch = _createFastqGroomerCode(cutOutput, cutInput, input_type, output_type, force_quality_encoding, summarize_input) + cmdStart = [] + cmdFinish = [] + lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) + dCutOut2Out[outGroomerPairedEndNames[i]] = lCutOutput + iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, False) + + joinFastQ(dCutOut2Out) + FileUtils.removeFilesFromListIfExist(lAllFile2remove) + +if __name__ == "__main__": main()