Mercurial > repos > bioit_sciensano > phagetermvirome
comparison _modules/IData_handling.py @ 0:69e8f12c8b31 draft
"planemo upload"
| author | bioit_sciensano |
|---|---|
| date | Fri, 11 Mar 2022 15:06:20 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:69e8f12c8b31 |
|---|---|
| 1 ## @file IData_handling.py | |
| 2 # | |
| 3 # VL: Gather here the classes and functions useful for handling input data. | |
| 4 from __future__ import print_function | |
| 5 | |
| 6 import gzip | |
| 7 from _modules.utilities import reverseComplement,changeCase | |
| 8 from time import gmtime, strftime | |
| 9 import datetime | |
| 10 | |
| 11 try: | |
| 12 import cPickle as pickle | |
| 13 except ImportError: # python 3.x | |
| 14 import pickle | |
| 15 | |
| 16 | |
| 17 ## This class encapsulates the reference sequences, the host sequence if any and all useful information about the sequences. | |
| 18 # | |
| 19 # It is used both for searching the read extracts in the sequences and for exploiting the results | |
| 20 class refData: | |
| 21 def __init__(self,refseq_list,seed,hostseq): | |
| 22 self.refseq_list=refseq_list | |
| 23 self.seed=seed | |
| 24 self.hostseq=hostseq | |
| 25 if hostseq!="": | |
| 26 self.refseq_list.insert(0,hostseq) | |
| 27 self.nb_sequences=len(refseq_list) | |
| 28 | |
| 29 def getIdxSeq(self,refseq): | |
| 30 idx=-1 | |
| 31 found=False | |
| 32 for s in self.refseq_list: | |
| 33 idx += 1 | |
| 34 if s==refseq: | |
| 35 found=True | |
| 36 break | |
| 37 if not found: | |
| 38 raise RuntimeError("Couldn't find sequence in list of ref sequences.") | |
| 39 return idx | |
| 40 | |
| 41 | |
| 42 def IdxIsHostseq(self,idx_seq): | |
| 43 if (((self.hostseq == "") and (idx_seq <= self.nb_sequences - 1)) or ( | |
| 44 (self.hostseq != "") and (idx_seq >0))): | |
| 45 return False | |
| 46 return True | |
| 47 | |
| 48 def getSeqSizesList(self): | |
| 49 seq_sizes_list = [] | |
| 50 for seq in self.refseq_list: | |
| 51 seq_sizes_list.append(len(seq)) | |
| 52 return seq_sizes_list | |
| 53 | |
| 54 | |
| 55 ## Base class for handling read extracts. | |
| 56 # | |
| 57 # This class should not be used directly. | |
| 58 class ReadExtracts: | |
| 59 def __init__(self,seed): | |
| 60 self.seed = seed | |
| 61 self.r_extracts_list = [] | |
| 62 self.nb_reads = 0 | |
| 63 self.nb_extracts=0 | |
| 64 | |
| 65 ## Returns the list of read extracts from the loaded dataset, the number of reads and the total number of extracts | |
| 66 def getRExtracts(self): | |
| 67 return self.r_extracts_list,self.nb_reads,self.nb_extracts | |
| 68 | |
| 69 ## Class containing all the read extracts (PE reads) that must be mapped against a sequence. | |
| 70 class readExtractsPE(ReadExtracts): | |
| 71 def __init__(self,seed): | |
| 72 self.__init__(seed) | |
| 73 | |
| 74 | |
| 75 def addRead(self, whole_PE1,whole_PE2): | |
| 76 self.r_extracts_list.append(whole_PE1[:self.seed]) | |
| 77 self.r_extracts_list.append(whole_PE1[-self.seed:]) | |
| 78 self.r_extracts_list.append(whole_PE2[:self.seed]) | |
| 79 self.r_extracts_list.append(reverseComplement(whole_PE2)[:self.seed]) | |
| 80 self.r_extracts_list.append(reverseComplement(whole_PE2)[-self.seed:]) | |
| 81 self.nb_reads += 1 | |
| 82 self.nb_extracts += 5 # Number of extracts per read: 2 extracts for PE1 and 3 for PE2. | |
| 83 | |
| 84 | |
| 85 | |
| 86 ## Class containing all the read extracts (single reads) that must be mapped against a sequence. | |
| 87 class readsExtractsS(ReadExtracts): | |
| 88 def __init__(self,seed): | |
| 89 ReadExtracts.__init__(self,seed) | |
| 90 | |
| 91 ## Adds a read to the list of extracts | |
| 92 # | |
| 93 # @param whole_read The read as extracted from the fastq file | |
| 94 # @param no_pair This paramenter s only used to make the distinction between Single and paired. | |
| 95 # Note VL: I didn't use meta programming here because I thought that it would have a negative impact on performance. | |
| 96 # TODO: test it when all the rest works. | |
| 97 def addRead(self,whole_read,no_pair=""): | |
| 98 read_part = whole_read[:self.seed] | |
| 99 self.r_extracts_list.append(read_part) | |
| 100 self.r_extracts_list.append(whole_read[-self.seed:]) | |
| 101 self.r_extracts_list.append(reverseComplement(whole_read)[:self.seed]) | |
| 102 self.r_extracts_list.append(reverseComplement(whole_read)[-self.seed:]) | |
| 103 self.nb_reads+=1 | |
| 104 self.nb_extracts += 4 | |
| 105 | |
| 106 ## use objects of this class to store read offset (PE1 and PE2) in files. | |
| 107 class ReadInfo: | |
| 108 def __init__(self, off_PE1,whole_read,seed,off_PE2=None): | |
| 109 self.offset1=off_PE1 | |
| 110 self.offset2=off_PE2 | |
| 111 self.corlen = len(whole_read) - seed | |
| 112 | |
| 113 ## Gets the number of reads in the fastq file | |
| 114 # def getNbReads(fastq): | |
| 115 # with open(fastq) as f: | |
| 116 # for i, l in enumerate(f): | |
| 117 # pass | |
| 118 # nb_r=i+1 | |
| 119 # nb_r=nb_r/4 | |
| 120 # return nb_r | |
| 121 | |
| 122 | |
| 123 | |
| 124 ## loads a chunk of reads for mapping on GPU. | |
| 125 # Yields a ReadExtracts object plus a dictionnary of ReadInfo. | |
| 126 # keeps in memory the parsing state of the file. | |
| 127 # @param ch_size is in number of reads | |
| 128 # @reset_ids indicates whether or not we want read index to be reset to 0 at the beginning of each chunk. | |
| 129 def getChunk(fastq,seed,paired,ch_siz,reset_ids=True): | |
| 130 new_chunk = False | |
| 131 d_rinfo=dict() | |
| 132 idx_read=0 | |
| 133 off2=None | |
| 134 filin = open(fastq) | |
| 135 line = filin.readline() | |
| 136 read_paired="" | |
| 137 if paired != "": | |
| 138 RE=readExtractsPE(seed) | |
| 139 filin_paired = open(paired) | |
| 140 line_paired = filin_paired.readline() | |
| 141 else: | |
| 142 RE=readsExtractsS(seed) | |
| 143 | |
| 144 start = False | |
| 145 num_line=0 | |
| 146 while line: | |
| 147 # Read sequence | |
| 148 read = line.split("\n")[0].split("\r")[0] | |
| 149 if paired != "": | |
| 150 read_paired = line_paired.split("\n")[0].split("\r")[0] | |
| 151 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read. | |
| 152 start = True | |
| 153 off1=filin.tell() | |
| 154 line = filin.readline() | |
| 155 if paired != "": | |
| 156 off2=filin_paired.tell() | |
| 157 line_paired = filin_paired.readline() | |
| 158 continue | |
| 159 if (start == True): | |
| 160 start = False | |
| 161 readlen = len(read) | |
| 162 if readlen < seed: | |
| 163 line = filin.readline() | |
| 164 if paired !="": | |
| 165 line_paired = filin_paired.readline() # also skip PE2 in that case | |
| 166 continue | |
| 167 RE.addRead(read,read_paired) | |
| 168 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2) | |
| 169 if (idx_read>0 and ((idx_read+1)%(ch_siz)==0)): | |
| 170 yield RE,d_rinfo | |
| 171 if (reset_ids): | |
| 172 idx_read=0 | |
| 173 new_chunk=True | |
| 174 if paired != "": | |
| 175 RE = readExtractsPE(seed) | |
| 176 else: | |
| 177 RE = readsExtractsS(seed) | |
| 178 d_rinfo = dict() | |
| 179 if not new_chunk: | |
| 180 idx_read+=1 | |
| 181 else: | |
| 182 new_chunk=False | |
| 183 | |
| 184 line = filin.readline() | |
| 185 if paired!="": | |
| 186 line_paired = filin_paired.readline() | |
| 187 filin.close() | |
| 188 if paired !="": | |
| 189 filin_paired.close() | |
| 190 yield RE, d_rinfo | |
| 191 | |
| 192 ## dumps a dictionnary of ReadInfo objects indexed on read index. | |
| 193 # | |
| 194 # @param d_rinfo dictionnary to dump | |
| 195 # @param fic filename (incl. full path) where to dump | |
| 196 def dump_d_rinfo(d_rinfo,fic): | |
| 197 with open(fic, 'wb') as fp: | |
| 198 pickle.dump(d_rinfo, fp, protocol=pickle.HIGHEST_PROTOCOL) | |
| 199 | |
| 200 ## Loads a dictionnary of ReadInfo objects. | |
| 201 def load_d_rinfo(fic): | |
| 202 with open(fic, 'rb') as fp: | |
| 203 d_rinfo = pickle.load(fp) | |
| 204 return d_rinfo | |
| 205 | |
| 206 | |
| 207 ## loads all extracts of reads into a list for processing on GPU. | |
| 208 # | |
| 209 # returns 1 or 2 readExtracts objects plus a dictionnary of ReadInfo. | |
| 210 def getAllReads(fastq,seed,paired): | |
| 211 d_rinfo=dict() | |
| 212 idx_read=0 | |
| 213 off2=None | |
| 214 filin = open(fastq) | |
| 215 line = filin.readline() | |
| 216 read_paired="" | |
| 217 | |
| 218 if paired != "": | |
| 219 RE=readExtractsPE(seed) | |
| 220 filin_paired = open(paired) | |
| 221 line_paired = filin_paired.readline() | |
| 222 else: | |
| 223 RE=readsExtractsS(seed) | |
| 224 | |
| 225 start = False | |
| 226 num_line=0 | |
| 227 while line: | |
| 228 # Read sequence | |
| 229 read = line.split("\n")[0].split("\r")[0] | |
| 230 if paired != "": | |
| 231 read_paired = line_paired.split("\n")[0].split("\r")[0] | |
| 232 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read. | |
| 233 start = True | |
| 234 off1=filin.tell() | |
| 235 line = filin.readline() | |
| 236 if paired != "": | |
| 237 off2=filin_paired.tell() | |
| 238 line_paired = filin_paired.readline() | |
| 239 continue | |
| 240 if (start == True): | |
| 241 start = False | |
| 242 readlen = len(read) | |
| 243 if readlen < seed: | |
| 244 line = filin.readline() | |
| 245 if paired !="": | |
| 246 line_paired = filin_paired.readline() # also skip PE2 in that case | |
| 247 continue | |
| 248 RE.addRead(read,read_paired) | |
| 249 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2) | |
| 250 idx_read+=1 | |
| 251 | |
| 252 line = filin.readline() | |
| 253 if paired!="": | |
| 254 line_paired = filin_paired.readline() | |
| 255 filin.close() | |
| 256 if paired !="": | |
| 257 filin_paired.close() | |
| 258 return RE,d_rinfo | |
| 259 | |
| 260 ## use this class to retrieve reads from fastq file. | |
| 261 class ReadGetter: | |
| 262 ## constructor | |
| 263 # | |
| 264 # @param fastq Name of the fastq file that contains the read | |
| 265 # @param d_rinfo A dictionnary of ReadInfo objects that contains the offset indicating where the read starts in the file. | |
| 266 # @param paired The name of the file containing the PE2 (defaults to ""). | |
| 267 def __init__(self,fastq,d_rinfo,paired=""): | |
| 268 self.filin=open(fastq) | |
| 269 self.d_rinfo=d_rinfo | |
| 270 self.paired=paired | |
| 271 if paired!="": | |
| 272 self.filinp=open(fastq) | |
| 273 | |
| 274 def getOneRead(self,idx_read): | |
| 275 read_paired="" | |
| 276 self.filin.seek(self.d_rinfo[idx_read].offset1) | |
| 277 read=self.filin.readline() | |
| 278 if self.paired!="": | |
| 279 self.filinp.seek(self.d_rinfo[idx_read].offset2) | |
| 280 read_paired = self.filinp.readline() | |
| 281 return read,read_paired | |
| 282 | |
| 283 | |
| 284 ### READS Functions | |
| 285 def totReads(filin): | |
| 286 """Verify and retrieve the number of reads in the fastq file before alignment""" | |
| 287 if filin.endswith('.gz'): | |
| 288 filein = gzip.open(filin, 'rb') | |
| 289 else: | |
| 290 filein = open(filin, 'r') | |
| 291 | |
| 292 line = 0 | |
| 293 while filein.readline(): | |
| 294 line += 1 | |
| 295 seq = float(round(line / 4)) | |
| 296 filein.close() | |
| 297 return seq | |
| 298 | |
| 299 ### SEQUENCE parsing function | |
| 300 def genomeFastaRecovery(filin, limit_reference, edge, host_test = 0): | |
| 301 """Get genome sequence from fasta file""" | |
| 302 print("recovering genome from: ",filin) | |
| 303 print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())) | |
| 304 if filin == "": | |
| 305 return "", "", "" | |
| 306 | |
| 307 infile = open(filin, 'r') | |
| 308 name = [] | |
| 309 genome = [] | |
| 310 genome_line = "" | |
| 311 genome_rejected = 0 | |
| 312 for line in infile: | |
| 313 if line[0] == ">": | |
| 314 if name != []: | |
| 315 if len(genome_line) >= limit_reference: | |
| 316 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge]) | |
| 317 else: | |
| 318 genome_rejected += 1 | |
| 319 name = name[:-1] | |
| 320 genome_line = "" | |
| 321 name.append(line[1:].split('\r')[0].split('\n')[0]) | |
| 322 else: | |
| 323 genome_line += changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0] | |
| 324 | |
| 325 if len(genome_line) >= limit_reference: | |
| 326 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge]) | |
| 327 genome_line = "" | |
| 328 else: | |
| 329 genome_rejected += 1 | |
| 330 name = name[:-1] | |
| 331 | |
| 332 infile.close() | |
| 333 | |
| 334 if host_test: | |
| 335 return "".join(genome) | |
| 336 else: | |
| 337 return genome, name, genome_rejected | |
| 338 close(filin) | |
| 339 |
