18
|
1 import sys, os, optparse,shutil, random
|
|
2 from commons.core.launcher.Launcher import Launcher
|
|
3 from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
|
|
4 from commons.core.utils.FileUtils import FileUtils
|
|
5
|
|
6 def _map(iLauncher, cmd, cmdStart, cmdFinish ):
|
|
7 lCmds = []
|
|
8 lCmds.extend(cmd)
|
|
9 lCmdStart = []
|
|
10 lCmdStart.extend(cmdStart)
|
|
11 lCmdFinish = []
|
|
12 lCmdFinish.extend(cmdFinish)
|
|
13 return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))
|
|
14
|
|
15 def splitFastQ(fileName, nbOfSeqPerBatch):
|
|
16 nbOfLinesPerFile = nbOfSeqPerBatch * 4
|
|
17 lOutput = []
|
|
18 filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
|
|
19 resDir = os.path.dirname(fileName)
|
|
20 with open(fileName) as inF:
|
|
21 fileNb = 1
|
|
22 line = inF.readline()
|
|
23 if not line or nbOfLinesPerFile == 0:
|
|
24 outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt)
|
|
25 lOutput.append(outFileName)
|
|
26 f = open(outFileName, "wb")
|
|
27 shutil.copyfileobj(open(fileName, "rb"), f)
|
|
28 f.close()
|
|
29 else:
|
|
30 while line:
|
|
31 outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt)
|
|
32 lOutput.append(outFileName)
|
|
33 with open(outFileName, "w") as outF:
|
|
34 lineNb = 1
|
|
35 while lineNb <= nbOfLinesPerFile and line:
|
|
36 outF.write(line)
|
|
37 line = inF.readline()
|
|
38 lineNb += 1
|
|
39 fileNb += 1
|
|
40 return lOutput
|
|
41
|
|
42 def joinFastQ(dCutOut2Out):
|
|
43 for key in dCutOut2Out.keys():
|
|
44 FileUtils.catFilesFromList(dCutOut2Out[key],key, False)
|
|
45
|
|
46 def _createFastqGroomerCode(outGroomerNames, inputFileNames, input_type, output_type, force_quality_encoding, summarize_input):
|
|
47 cmd2Launch = []
|
|
48 cmd2Launch.append("log = 0")
|
|
49 cmd2Launch.append("from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter")
|
|
50 cmd2Launch.append("aggregator = fastqAggregator()")
|
|
51 cmd2Launch.append("out = fastqWriter( open( '%s', 'wb' ), format = '%s', force_quality_encoding = '%s')" % (outGroomerNames,output_type,force_quality_encoding))
|
|
52 cmd2Launch.append("read_count = None")
|
|
53 if summarize_input:
|
|
54 cmd2Launch.append("reader = fastqVerboseErrorReader")
|
|
55 else:
|
|
56 cmd2Launch.append("reader = fastqReader")
|
|
57 cmd2Launch.append("for read_count, fastq_read in enumerate( reader( open( '%s' ), format = '%s', apply_galaxy_conventions = True ) ):" % (inputFileNames, input_type))
|
|
58 if summarize_input:
|
|
59 cmd2Launch.append("\taggregator.consume_read( fastq_read )")
|
|
60 cmd2Launch.append("\tout.write( fastq_read )")
|
|
61 cmd2Launch.append("out.close()")
|
|
62 cmd2Launch.append("if read_count is not None:")
|
|
63 #cmd2Launch.append("\tprint 'Groomed %s %s reads into %s reads.' % ( read_count + 1, %s, %s )" % ('%i', '%s', '%s', input_type,output_type))
|
|
64 cmd2Launch.append("\tif '%s' != '%s' and 'solexa' in [ '%s', '%s' ]:" % (input_type, output_type, input_type, output_type))
|
|
65 cmd2Launch.append("\t\tprint 'Converted between Solexa and PHRED scores.'")
|
|
66 if summarize_input:
|
|
67 cmd2Launch.append("\tprint 'Based upon quality and sequence, the input data is valid for: %s' % ( ', '.join( aggregator.get_valid_formats() ) or 'None' )")
|
|
68 cmd2Launch.append("\tascii_range = aggregator.get_ascii_range()")
|
|
69 cmd2Launch.append("\tdecimal_range = aggregator.get_decimal_range()")
|
|
70 cmd2Launch.append("\tprint 'Input ASCII range: %s(%i) - %s(%i)' % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) )")
|
|
71 cmd2Launch.append("\tprint 'Input decimal range: %i - %i' % ( decimal_range[0], decimal_range[1] ) ")
|
|
72 cmd2Launch.append("else:")
|
|
73 cmd2Launch.append("\tprint 'No valid FASTQ reads were provided.'")
|
|
74 cmd2Launch.append("\tlog = 255")
|
|
75 return cmd2Launch
|
|
76
|
|
77 def stop_err(msg):
|
|
78 sys.stderr.write("%s\n" % msg)
|
|
79 sys.exit()
|
|
80
|
|
81 def main():
|
|
82
|
|
83 input_filename = sys.argv[1] #a txt file
|
|
84 input_type = sys.argv[2]
|
|
85 output_filename = sys.argv[3] #a txt file
|
|
86 output_type = sys.argv[4]
|
|
87 force_quality_encoding = sys.argv[5]
|
|
88 summarize_input = sys.argv[6] == 'summarize_input'
|
|
89 pairedEnd_input = sys.argv[7]
|
|
90 if pairedEnd_input == 'None':
|
|
91 pairedEnd_input = None
|
|
92 else:
|
|
93 output_pairedEndFileName = sys.argv[8]
|
|
94
|
|
95 if force_quality_encoding == 'None':
|
|
96 force_quality_encoding = None
|
|
97
|
|
98 #Parse the input txt file and read a list of fastq files
|
|
99 file = open(input_filename, "r")
|
|
100 lines = file.readlines()
|
|
101 inputFileNames = []
|
|
102 outGroomerNames = []
|
|
103 resDirName = os.path.dirname(output_filename) + "/"
|
|
104 #Write output txt file and define all output groomer file names
|
|
105 outFile = open(output_filename, "w")
|
|
106 for line in lines:
|
|
107 tab = line.split()
|
|
108 inputFileNames.append(tab[1])
|
|
109 outGroomerName = resDirName + tab[0] + '_outGroomer_%s.fastq' % random.randrange(0, 10000)
|
|
110 outGroomerNames.append(outGroomerName)
|
|
111 outFile.write(tab[0] + '\t' + outGroomerName + '\n')
|
|
112 outFile.close()
|
|
113 file.close()
|
|
114
|
|
115 if pairedEnd_input != None:
|
|
116 inPairedFile = open(pairedEnd_input, "r")
|
|
117 lines = inPairedFile.readlines()
|
|
118 inputPairedEndFileNames = []
|
|
119 outGroomerPairedEndNames = []
|
|
120 outPairedEndFile = open(output_pairedEndFileName, "w")
|
|
121 for line in lines:
|
|
122 tab = line.split()
|
|
123 inputPairedEndFileNames.append(tab[1])
|
|
124 outGroomerPairedEndName = resDirName + tab[0] + '_outGroomer_pairedEnd_%s.fastq' % random.randrange(0, 10000)
|
|
125 outGroomerPairedEndNames.append(outGroomerPairedEndName)
|
|
126 outPairedEndFile.write(tab[0] + '\t' + outGroomerPairedEndName + '\n')
|
|
127 outPairedEndFile.close()
|
|
128 inPairedFile.close()
|
|
129
|
|
130 acronym = "fastqGroomer"
|
|
131 jobdb = TableJobAdaptatorFactory.createJobInstance()
|
|
132 iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
|
|
133 lCmdsTuples = []
|
|
134 dCutOut2Out = {}
|
|
135 lAllFile2remove = []
|
|
136 # Write output file
|
|
137 for i in range(len(outGroomerNames)):
|
|
138 lCutInputFile = splitFastQ(inputFileNames[i], 20000)
|
|
139 lAllFile2remove.extend(lCutInputFile)
|
|
140 lCutOutput = []
|
|
141 for cutInput in lCutInputFile:
|
|
142 cutOutput = "%s_out" % cutInput
|
|
143 lCutOutput.append(cutOutput)
|
|
144 lAllFile2remove.extend(lCutOutput)
|
|
145 cmd2Launch = _createFastqGroomerCode(cutOutput, cutInput, input_type, output_type, force_quality_encoding, summarize_input)
|
|
146 cmdStart = []
|
|
147 cmdFinish = []
|
|
148 lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
|
|
149 dCutOut2Out[outGroomerNames[i]] = lCutOutput
|
|
150 if pairedEnd_input != None:
|
|
151 lCutInputFile = splitFastQ(inputPairedEndFileNames[i], 20000)
|
|
152 lAllFile2remove.extend(lCutInputFile)
|
|
153 lCutOutput = []
|
|
154 for cutInput in lCutInputFile:
|
|
155 cutOutput = "%s_out" % cutInput
|
|
156 lCutOutput.append(cutOutput)
|
|
157 lAllFile2remove.extend(lCutOutput)
|
|
158 cmd2Launch = _createFastqGroomerCode(cutOutput, cutInput, input_type, output_type, force_quality_encoding, summarize_input)
|
|
159 cmdStart = []
|
|
160 cmdFinish = []
|
|
161 lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))
|
|
162 dCutOut2Out[outGroomerPairedEndNames[i]] = lCutOutput
|
|
163 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, False)
|
|
164
|
|
165 joinFastQ(dCutOut2Out)
|
|
166 FileUtils.removeFilesFromListIfExist(lAllFile2remove)
|
|
167
|
|
168 if __name__ == "__main__": main()
|