annotate _modules/IData_handling.py @ 0:69e8f12c8b31 draft

"planemo upload"
author bioit_sciensano
date Fri, 11 Mar 2022 15:06:20 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
1 ## @file IData_handling.py
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
2 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
3 # VL: Gather here the classes and functions useful for handling input data.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
4 from __future__ import print_function
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
5
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
6 import gzip
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
7 from _modules.utilities import reverseComplement,changeCase
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
8 from time import gmtime, strftime
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
9 import datetime
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
10
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
11 try:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
12 import cPickle as pickle
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
13 except ImportError: # python 3.x
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
14 import pickle
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
15
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
16
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
17 ## This class encapsulates the reference sequences, the host sequence if any and all useful information about the sequences.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
18 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
19 # It is used both for searching the read extracts in the sequences and for exploiting the results
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
20 class refData:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
21 def __init__(self,refseq_list,seed,hostseq):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
22 self.refseq_list=refseq_list
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
23 self.seed=seed
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
24 self.hostseq=hostseq
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
25 if hostseq!="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
26 self.refseq_list.insert(0,hostseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
27 self.nb_sequences=len(refseq_list)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
28
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
29 def getIdxSeq(self,refseq):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
30 idx=-1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
31 found=False
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
32 for s in self.refseq_list:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
33 idx += 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
34 if s==refseq:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
35 found=True
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
36 break
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
37 if not found:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
38 raise RuntimeError("Couldn't find sequence in list of ref sequences.")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
39 return idx
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
40
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
41
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
42 def IdxIsHostseq(self,idx_seq):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
43 if (((self.hostseq == "") and (idx_seq <= self.nb_sequences - 1)) or (
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
44 (self.hostseq != "") and (idx_seq >0))):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
45 return False
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
46 return True
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
47
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
48 def getSeqSizesList(self):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
49 seq_sizes_list = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
50 for seq in self.refseq_list:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
51 seq_sizes_list.append(len(seq))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
52 return seq_sizes_list
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
53
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
54
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
55 ## Base class for handling read extracts.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
56 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
57 # This class should not be used directly.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
58 class ReadExtracts:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
59 def __init__(self,seed):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
60 self.seed = seed
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
61 self.r_extracts_list = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
62 self.nb_reads = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
63 self.nb_extracts=0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
64
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
65 ## Returns the list of read extracts from the loaded dataset, the number of reads and the total number of extracts
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
66 def getRExtracts(self):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
67 return self.r_extracts_list,self.nb_reads,self.nb_extracts
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
68
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
69 ## Class containing all the read extracts (PE reads) that must be mapped against a sequence.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
70 class readExtractsPE(ReadExtracts):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
71 def __init__(self,seed):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
72 self.__init__(seed)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
73
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
74
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
75 def addRead(self, whole_PE1,whole_PE2):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
76 self.r_extracts_list.append(whole_PE1[:self.seed])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
77 self.r_extracts_list.append(whole_PE1[-self.seed:])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
78 self.r_extracts_list.append(whole_PE2[:self.seed])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
79 self.r_extracts_list.append(reverseComplement(whole_PE2)[:self.seed])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
80 self.r_extracts_list.append(reverseComplement(whole_PE2)[-self.seed:])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
81 self.nb_reads += 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
82 self.nb_extracts += 5 # Number of extracts per read: 2 extracts for PE1 and 3 for PE2.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
83
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
84
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
85
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
86 ## Class containing all the read extracts (single reads) that must be mapped against a sequence.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
87 class readsExtractsS(ReadExtracts):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
88 def __init__(self,seed):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
89 ReadExtracts.__init__(self,seed)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
90
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
91 ## Adds a read to the list of extracts
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
92 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
93 # @param whole_read The read as extracted from the fastq file
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
94 # @param no_pair This paramenter s only used to make the distinction between Single and paired.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
95 # Note VL: I didn't use meta programming here because I thought that it would have a negative impact on performance.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
96 # TODO: test it when all the rest works.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
97 def addRead(self,whole_read,no_pair=""):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
98 read_part = whole_read[:self.seed]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
99 self.r_extracts_list.append(read_part)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
100 self.r_extracts_list.append(whole_read[-self.seed:])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
101 self.r_extracts_list.append(reverseComplement(whole_read)[:self.seed])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
102 self.r_extracts_list.append(reverseComplement(whole_read)[-self.seed:])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
103 self.nb_reads+=1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
104 self.nb_extracts += 4
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
105
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
106 ## use objects of this class to store read offset (PE1 and PE2) in files.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
107 class ReadInfo:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
108 def __init__(self, off_PE1,whole_read,seed,off_PE2=None):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
109 self.offset1=off_PE1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
110 self.offset2=off_PE2
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
111 self.corlen = len(whole_read) - seed
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
112
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
113 ## Gets the number of reads in the fastq file
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
114 # def getNbReads(fastq):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
115 # with open(fastq) as f:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
116 # for i, l in enumerate(f):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
117 # pass
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
118 # nb_r=i+1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
119 # nb_r=nb_r/4
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
120 # return nb_r
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
121
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
122
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
123
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
124 ## loads a chunk of reads for mapping on GPU.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
125 # Yields a ReadExtracts object plus a dictionnary of ReadInfo.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
126 # keeps in memory the parsing state of the file.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
127 # @param ch_size is in number of reads
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
128 # @reset_ids indicates whether or not we want read index to be reset to 0 at the beginning of each chunk.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
129 def getChunk(fastq,seed,paired,ch_siz,reset_ids=True):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
130 new_chunk = False
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
131 d_rinfo=dict()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
132 idx_read=0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
133 off2=None
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
134 filin = open(fastq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
135 line = filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
136 read_paired=""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
137 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
138 RE=readExtractsPE(seed)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
139 filin_paired = open(paired)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
140 line_paired = filin_paired.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
141 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
142 RE=readsExtractsS(seed)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
143
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
144 start = False
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
145 num_line=0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
146 while line:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
147 # Read sequence
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
148 read = line.split("\n")[0].split("\r")[0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
149 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
150 read_paired = line_paired.split("\n")[0].split("\r")[0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
151 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
152 start = True
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
153 off1=filin.tell()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
154 line = filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
155 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
156 off2=filin_paired.tell()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
157 line_paired = filin_paired.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
158 continue
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
159 if (start == True):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
160 start = False
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
161 readlen = len(read)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
162 if readlen < seed:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
163 line = filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
164 if paired !="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
165 line_paired = filin_paired.readline() # also skip PE2 in that case
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
166 continue
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
167 RE.addRead(read,read_paired)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
168 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
169 if (idx_read>0 and ((idx_read+1)%(ch_siz)==0)):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
170 yield RE,d_rinfo
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
171 if (reset_ids):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
172 idx_read=0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
173 new_chunk=True
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
174 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
175 RE = readExtractsPE(seed)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
176 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
177 RE = readsExtractsS(seed)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
178 d_rinfo = dict()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
179 if not new_chunk:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
180 idx_read+=1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
181 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
182 new_chunk=False
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
183
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
184 line = filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
185 if paired!="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
186 line_paired = filin_paired.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
187 filin.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
188 if paired !="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
189 filin_paired.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
190 yield RE, d_rinfo
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
191
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
192 ## dumps a dictionnary of ReadInfo objects indexed on read index.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
193 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
194 # @param d_rinfo dictionnary to dump
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
195 # @param fic filename (incl. full path) where to dump
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
196 def dump_d_rinfo(d_rinfo,fic):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
197 with open(fic, 'wb') as fp:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
198 pickle.dump(d_rinfo, fp, protocol=pickle.HIGHEST_PROTOCOL)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
199
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
200 ## Loads a dictionnary of ReadInfo objects.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
201 def load_d_rinfo(fic):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
202 with open(fic, 'rb') as fp:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
203 d_rinfo = pickle.load(fp)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
204 return d_rinfo
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
205
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
206
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
207 ## loads all extracts of reads into a list for processing on GPU.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
208 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
209 # returns 1 or 2 readExtracts objects plus a dictionnary of ReadInfo.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
210 def getAllReads(fastq,seed,paired):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
211 d_rinfo=dict()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
212 idx_read=0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
213 off2=None
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
214 filin = open(fastq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
215 line = filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
216 read_paired=""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
217
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
218 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
219 RE=readExtractsPE(seed)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
220 filin_paired = open(paired)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
221 line_paired = filin_paired.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
222 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
223 RE=readsExtractsS(seed)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
224
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
225 start = False
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
226 num_line=0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
227 while line:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
228 # Read sequence
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
229 read = line.split("\n")[0].split("\r")[0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
230 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
231 read_paired = line_paired.split("\n")[0].split("\r")[0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
232 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
233 start = True
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
234 off1=filin.tell()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
235 line = filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
236 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
237 off2=filin_paired.tell()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
238 line_paired = filin_paired.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
239 continue
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
240 if (start == True):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
241 start = False
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
242 readlen = len(read)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
243 if readlen < seed:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
244 line = filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
245 if paired !="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
246 line_paired = filin_paired.readline() # also skip PE2 in that case
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
247 continue
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
248 RE.addRead(read,read_paired)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
249 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
250 idx_read+=1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
251
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
252 line = filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
253 if paired!="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
254 line_paired = filin_paired.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
255 filin.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
256 if paired !="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
257 filin_paired.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
258 return RE,d_rinfo
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
259
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
260 ## use this class to retrieve reads from fastq file.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
261 class ReadGetter:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
262 ## constructor
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
263 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
264 # @param fastq Name of the fastq file that contains the read
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
265 # @param d_rinfo A dictionnary of ReadInfo objects that contains the offset indicating where the read starts in the file.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
266 # @param paired The name of the file containing the PE2 (defaults to "").
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
267 def __init__(self,fastq,d_rinfo,paired=""):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
268 self.filin=open(fastq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
269 self.d_rinfo=d_rinfo
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
270 self.paired=paired
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
271 if paired!="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
272 self.filinp=open(fastq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
273
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
274 def getOneRead(self,idx_read):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
275 read_paired=""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
276 self.filin.seek(self.d_rinfo[idx_read].offset1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
277 read=self.filin.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
278 if self.paired!="":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
279 self.filinp.seek(self.d_rinfo[idx_read].offset2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
280 read_paired = self.filinp.readline()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
281 return read,read_paired
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
282
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
283
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
284 ### READS Functions
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
285 def totReads(filin):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
286 """Verify and retrieve the number of reads in the fastq file before alignment"""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
287 if filin.endswith('.gz'):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
288 filein = gzip.open(filin, 'rb')
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
289 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
290 filein = open(filin, 'r')
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
291
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
292 line = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
293 while filein.readline():
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
294 line += 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
295 seq = float(round(line / 4))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
296 filein.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
297 return seq
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
298
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
299 ### SEQUENCE parsing function
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
300 def genomeFastaRecovery(filin, limit_reference, edge, host_test = 0):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
301 """Get genome sequence from fasta file"""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
302 print("recovering genome from: ",filin)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
303 print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
304 if filin == "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
305 return "", "", ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
306
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
307 infile = open(filin, 'r')
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
308 name = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
309 genome = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
310 genome_line = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
311 genome_rejected = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
312 for line in infile:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
313 if line[0] == ">":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
314 if name != []:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
315 if len(genome_line) >= limit_reference:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
316 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
317 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
318 genome_rejected += 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
319 name = name[:-1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
320 genome_line = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
321 name.append(line[1:].split('\r')[0].split('\n')[0])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
322 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
323 genome_line += changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
324
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
325 if len(genome_line) >= limit_reference:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
326 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
327 genome_line = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
328 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
329 genome_rejected += 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
330 name = name[:-1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
331
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
332 infile.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
333
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
334 if host_test:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
335 return "".join(genome)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
336 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
337 return genome, name, genome_rejected
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
338 close(filin)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
339