comparison _modules/IData_handling.py @ 0:69e8f12c8b31 draft

"planemo upload"
author bioit_sciensano
date Fri, 11 Mar 2022 15:06:20 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:69e8f12c8b31
1 ## @file IData_handling.py
2 #
3 # VL: Gather here the classes and functions useful for handling input data.
4 from __future__ import print_function
5
6 import gzip
7 from _modules.utilities import reverseComplement,changeCase
8 from time import gmtime, strftime
9 import datetime
10
11 try:
12 import cPickle as pickle
13 except ImportError: # python 3.x
14 import pickle
15
16
17 ## This class encapsulates the reference sequences, the host sequence if any and all useful information about the sequences.
18 #
19 # It is used both for searching the read extracts in the sequences and for exploiting the results
20 class refData:
21 def __init__(self,refseq_list,seed,hostseq):
22 self.refseq_list=refseq_list
23 self.seed=seed
24 self.hostseq=hostseq
25 if hostseq!="":
26 self.refseq_list.insert(0,hostseq)
27 self.nb_sequences=len(refseq_list)
28
29 def getIdxSeq(self,refseq):
30 idx=-1
31 found=False
32 for s in self.refseq_list:
33 idx += 1
34 if s==refseq:
35 found=True
36 break
37 if not found:
38 raise RuntimeError("Couldn't find sequence in list of ref sequences.")
39 return idx
40
41
42 def IdxIsHostseq(self,idx_seq):
43 if (((self.hostseq == "") and (idx_seq <= self.nb_sequences - 1)) or (
44 (self.hostseq != "") and (idx_seq >0))):
45 return False
46 return True
47
48 def getSeqSizesList(self):
49 seq_sizes_list = []
50 for seq in self.refseq_list:
51 seq_sizes_list.append(len(seq))
52 return seq_sizes_list
53
54
55 ## Base class for handling read extracts.
56 #
57 # This class should not be used directly.
58 class ReadExtracts:
59 def __init__(self,seed):
60 self.seed = seed
61 self.r_extracts_list = []
62 self.nb_reads = 0
63 self.nb_extracts=0
64
65 ## Returns the list of read extracts from the loaded dataset, the number of reads and the total number of extracts
66 def getRExtracts(self):
67 return self.r_extracts_list,self.nb_reads,self.nb_extracts
68
69 ## Class containing all the read extracts (PE reads) that must be mapped against a sequence.
70 class readExtractsPE(ReadExtracts):
71 def __init__(self,seed):
72 self.__init__(seed)
73
74
75 def addRead(self, whole_PE1,whole_PE2):
76 self.r_extracts_list.append(whole_PE1[:self.seed])
77 self.r_extracts_list.append(whole_PE1[-self.seed:])
78 self.r_extracts_list.append(whole_PE2[:self.seed])
79 self.r_extracts_list.append(reverseComplement(whole_PE2)[:self.seed])
80 self.r_extracts_list.append(reverseComplement(whole_PE2)[-self.seed:])
81 self.nb_reads += 1
82 self.nb_extracts += 5 # Number of extracts per read: 2 extracts for PE1 and 3 for PE2.
83
84
85
86 ## Class containing all the read extracts (single reads) that must be mapped against a sequence.
87 class readsExtractsS(ReadExtracts):
88 def __init__(self,seed):
89 ReadExtracts.__init__(self,seed)
90
91 ## Adds a read to the list of extracts
92 #
93 # @param whole_read The read as extracted from the fastq file
94 # @param no_pair This paramenter s only used to make the distinction between Single and paired.
95 # Note VL: I didn't use meta programming here because I thought that it would have a negative impact on performance.
96 # TODO: test it when all the rest works.
97 def addRead(self,whole_read,no_pair=""):
98 read_part = whole_read[:self.seed]
99 self.r_extracts_list.append(read_part)
100 self.r_extracts_list.append(whole_read[-self.seed:])
101 self.r_extracts_list.append(reverseComplement(whole_read)[:self.seed])
102 self.r_extracts_list.append(reverseComplement(whole_read)[-self.seed:])
103 self.nb_reads+=1
104 self.nb_extracts += 4
105
106 ## use objects of this class to store read offset (PE1 and PE2) in files.
107 class ReadInfo:
108 def __init__(self, off_PE1,whole_read,seed,off_PE2=None):
109 self.offset1=off_PE1
110 self.offset2=off_PE2
111 self.corlen = len(whole_read) - seed
112
113 ## Gets the number of reads in the fastq file
114 # def getNbReads(fastq):
115 # with open(fastq) as f:
116 # for i, l in enumerate(f):
117 # pass
118 # nb_r=i+1
119 # nb_r=nb_r/4
120 # return nb_r
121
122
123
124 ## loads a chunk of reads for mapping on GPU.
125 # Yields a ReadExtracts object plus a dictionnary of ReadInfo.
126 # keeps in memory the parsing state of the file.
127 # @param ch_size is in number of reads
128 # @reset_ids indicates whether or not we want read index to be reset to 0 at the beginning of each chunk.
129 def getChunk(fastq,seed,paired,ch_siz,reset_ids=True):
130 new_chunk = False
131 d_rinfo=dict()
132 idx_read=0
133 off2=None
134 filin = open(fastq)
135 line = filin.readline()
136 read_paired=""
137 if paired != "":
138 RE=readExtractsPE(seed)
139 filin_paired = open(paired)
140 line_paired = filin_paired.readline()
141 else:
142 RE=readsExtractsS(seed)
143
144 start = False
145 num_line=0
146 while line:
147 # Read sequence
148 read = line.split("\n")[0].split("\r")[0]
149 if paired != "":
150 read_paired = line_paired.split("\n")[0].split("\r")[0]
151 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
152 start = True
153 off1=filin.tell()
154 line = filin.readline()
155 if paired != "":
156 off2=filin_paired.tell()
157 line_paired = filin_paired.readline()
158 continue
159 if (start == True):
160 start = False
161 readlen = len(read)
162 if readlen < seed:
163 line = filin.readline()
164 if paired !="":
165 line_paired = filin_paired.readline() # also skip PE2 in that case
166 continue
167 RE.addRead(read,read_paired)
168 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
169 if (idx_read>0 and ((idx_read+1)%(ch_siz)==0)):
170 yield RE,d_rinfo
171 if (reset_ids):
172 idx_read=0
173 new_chunk=True
174 if paired != "":
175 RE = readExtractsPE(seed)
176 else:
177 RE = readsExtractsS(seed)
178 d_rinfo = dict()
179 if not new_chunk:
180 idx_read+=1
181 else:
182 new_chunk=False
183
184 line = filin.readline()
185 if paired!="":
186 line_paired = filin_paired.readline()
187 filin.close()
188 if paired !="":
189 filin_paired.close()
190 yield RE, d_rinfo
191
192 ## dumps a dictionnary of ReadInfo objects indexed on read index.
193 #
194 # @param d_rinfo dictionnary to dump
195 # @param fic filename (incl. full path) where to dump
196 def dump_d_rinfo(d_rinfo,fic):
197 with open(fic, 'wb') as fp:
198 pickle.dump(d_rinfo, fp, protocol=pickle.HIGHEST_PROTOCOL)
199
200 ## Loads a dictionnary of ReadInfo objects.
201 def load_d_rinfo(fic):
202 with open(fic, 'rb') as fp:
203 d_rinfo = pickle.load(fp)
204 return d_rinfo
205
206
207 ## loads all extracts of reads into a list for processing on GPU.
208 #
209 # returns 1 or 2 readExtracts objects plus a dictionnary of ReadInfo.
210 def getAllReads(fastq,seed,paired):
211 d_rinfo=dict()
212 idx_read=0
213 off2=None
214 filin = open(fastq)
215 line = filin.readline()
216 read_paired=""
217
218 if paired != "":
219 RE=readExtractsPE(seed)
220 filin_paired = open(paired)
221 line_paired = filin_paired.readline()
222 else:
223 RE=readsExtractsS(seed)
224
225 start = False
226 num_line=0
227 while line:
228 # Read sequence
229 read = line.split("\n")[0].split("\r")[0]
230 if paired != "":
231 read_paired = line_paired.split("\n")[0].split("\r")[0]
232 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
233 start = True
234 off1=filin.tell()
235 line = filin.readline()
236 if paired != "":
237 off2=filin_paired.tell()
238 line_paired = filin_paired.readline()
239 continue
240 if (start == True):
241 start = False
242 readlen = len(read)
243 if readlen < seed:
244 line = filin.readline()
245 if paired !="":
246 line_paired = filin_paired.readline() # also skip PE2 in that case
247 continue
248 RE.addRead(read,read_paired)
249 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
250 idx_read+=1
251
252 line = filin.readline()
253 if paired!="":
254 line_paired = filin_paired.readline()
255 filin.close()
256 if paired !="":
257 filin_paired.close()
258 return RE,d_rinfo
259
260 ## use this class to retrieve reads from fastq file.
261 class ReadGetter:
262 ## constructor
263 #
264 # @param fastq Name of the fastq file that contains the read
265 # @param d_rinfo A dictionnary of ReadInfo objects that contains the offset indicating where the read starts in the file.
266 # @param paired The name of the file containing the PE2 (defaults to "").
267 def __init__(self,fastq,d_rinfo,paired=""):
268 self.filin=open(fastq)
269 self.d_rinfo=d_rinfo
270 self.paired=paired
271 if paired!="":
272 self.filinp=open(fastq)
273
274 def getOneRead(self,idx_read):
275 read_paired=""
276 self.filin.seek(self.d_rinfo[idx_read].offset1)
277 read=self.filin.readline()
278 if self.paired!="":
279 self.filinp.seek(self.d_rinfo[idx_read].offset2)
280 read_paired = self.filinp.readline()
281 return read,read_paired
282
283
284 ### READS Functions
285 def totReads(filin):
286 """Verify and retrieve the number of reads in the fastq file before alignment"""
287 if filin.endswith('.gz'):
288 filein = gzip.open(filin, 'rb')
289 else:
290 filein = open(filin, 'r')
291
292 line = 0
293 while filein.readline():
294 line += 1
295 seq = float(round(line / 4))
296 filein.close()
297 return seq
298
299 ### SEQUENCE parsing function
300 def genomeFastaRecovery(filin, limit_reference, edge, host_test = 0):
301 """Get genome sequence from fasta file"""
302 print("recovering genome from: ",filin)
303 print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
304 if filin == "":
305 return "", "", ""
306
307 infile = open(filin, 'r')
308 name = []
309 genome = []
310 genome_line = ""
311 genome_rejected = 0
312 for line in infile:
313 if line[0] == ">":
314 if name != []:
315 if len(genome_line) >= limit_reference:
316 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
317 else:
318 genome_rejected += 1
319 name = name[:-1]
320 genome_line = ""
321 name.append(line[1:].split('\r')[0].split('\n')[0])
322 else:
323 genome_line += changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0]
324
325 if len(genome_line) >= limit_reference:
326 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
327 genome_line = ""
328 else:
329 genome_rejected += 1
330 name = name[:-1]
331
332 infile.close()
333
334 if host_test:
335 return "".join(genome)
336 else:
337 return genome, name, genome_rejected
338 close(filin)
339