0
|
1 import fileinput, os, random, math
|
|
2 from bs_utils.utils import *
|
|
3 from bs_align_utils import *
|
|
4 #----------------------------------------------------------------
|
|
5 def extract_mapping(ali_file):
|
|
6 unique_hits = {}
|
|
7 non_unique_hits = {}
|
|
8 header0 = ""
|
|
9 family = []
|
|
10 for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True):
|
|
11 #------------------------
|
|
12 if header != header0:
|
|
13
|
|
14 # --- output ----
|
|
15 if len(family) == 1:
|
|
16 unique_hits[header0] = family[0]
|
|
17 elif len(family) > 1:
|
|
18 min_lst = min(family, key = lambda x: x[0])
|
|
19 max_lst = max(family, key = lambda x: x[0])
|
|
20
|
|
21 if min_lst[0] < max_lst[0]:
|
|
22 unique_hits[header0] = min_lst
|
|
23 else:
|
|
24 non_unique_hits[header0] = min_lst[0]
|
|
25
|
|
26 header0 = header
|
|
27 family = []
|
|
28 family.append((no_mismatch, chr, location1, cigar1, location2, cigar2))
|
|
29
|
|
30 if len(family) == 1:
|
|
31 unique_hits[header0] = family[0]
|
|
32 elif len(family) > 1:
|
|
33 min_lst = min(family, key = lambda x: x[0])
|
|
34 max_lst = max(family, key = lambda x: x[0])
|
|
35
|
|
36 if min_lst[0] < max_lst[0]:
|
|
37 unique_hits[header0] = min_lst
|
|
38 else:
|
|
39 non_unique_hits[header0] = min_lst[0]
|
|
40
|
|
41 return unique_hits, non_unique_hits
|
|
42
|
|
43 def _extract_mapping(ali_file):
|
|
44 U = {}
|
|
45 R = {}
|
|
46 header0 = ""
|
|
47 family = []
|
|
48 for line in fileinput.input(ali_file):
|
|
49 l = line.split()
|
|
50 header = l[0][:-2]
|
|
51 chr = str(l[1])
|
|
52 location = int(l[2])
|
|
53 #no_hits=int(l[4])
|
|
54 #-------- mismatches -----------
|
|
55 if len(l) == 4:
|
|
56 no_mismatch = 0
|
|
57 elif len(l) == 5:
|
|
58 no_mismatch = l[4].count(":")
|
|
59 else:
|
|
60 print l
|
|
61 #------------------------
|
|
62 if header != header0:
|
|
63 #--------------------
|
|
64 if header0 != "":
|
|
65 # --- output ----
|
|
66 if len(family) == 1:
|
|
67 U[header0] = family[0]
|
|
68 else:
|
|
69 if family[0][0] < family[1][0]:
|
|
70 U[header0] = family[0]
|
|
71 elif family[1][0] < family[0][0]:
|
|
72 U[header0] = family[1]
|
|
73 else:
|
|
74 R[header0] = family[0][0]
|
|
75 family=[]
|
|
76 # ---------------
|
|
77 header0 = header
|
|
78 family = [[no_mismatch, chr, location]]
|
|
79 member = 1
|
|
80 elif header == header0:
|
|
81 if member == 1:
|
|
82 family[-1][0] += no_mismatch
|
|
83 family[-1].append(location)
|
|
84 member = 2
|
|
85 elif member == 2:
|
|
86 family.append([no_mismatch, chr, location])
|
|
87 member = 1
|
|
88 #------------------------------
|
|
89
|
|
90 fileinput.close()
|
|
91 return U, R
|
|
92
|
|
93
|
|
94 #----------------------------------------------------------------
|
|
95
|
|
96 def bs_pair_end(main_read_file_1,
|
|
97 main_read_file_2,
|
|
98 asktag,
|
|
99 adapter_file,
|
|
100 cut1,
|
|
101 cut2, # add cut3 and cut4?
|
|
102 no_small_lines,
|
|
103 max_mismatch_no,
|
|
104 aligner_command,
|
|
105 db_path,
|
|
106 tmp_path,
|
1
|
107 outfile, XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
|
0
|
108
|
|
109
|
|
110 #----------------------------------------------------------------
|
1
|
111 adapter = ""
|
|
112 adapterA = ""
|
|
113 adapterB = ""
|
0
|
114 if adapter_file !="":
|
|
115 try :
|
|
116 adapter_inf=open(adapter_file,"r")
|
|
117 if asktag=="N": #<--- directional library
|
|
118 adapter=adapter_inf.readline()
|
|
119 adapter_inf.close()
|
|
120 adapter=adapter.rstrip("\n")
|
|
121 elif asktag=="Y":#<--- un-directional library
|
|
122 adapterA=adapter_inf.readline()
|
|
123 adapterB=adapter_inf.readline()
|
|
124 adapter_inf.close()
|
|
125 adapterA=adapterA.rstrip("\n")
|
|
126 adapterB=adapterB.rstrip("\n")
|
|
127 except IOError :
|
|
128 print "[Error] Cannot find adapter file : %s" % adapter_file
|
|
129 exit(-1)
|
|
130
|
|
131
|
|
132 #----------------------------------------------------------------
|
|
133
|
|
134 logm("End 1 filename: %s"% main_read_file_1 )
|
|
135 logm("End 2 filename: %s"% main_read_file_2 )
|
|
136 logm("The first base (for mapping): %d"% cut1 )
|
1
|
137 logm("The last base (for mapping): %d"% cut2 )
|
0
|
138
|
|
139 logm("Path for short reads aligner: %s"% aligner_command + '\n')
|
|
140 logm("Reference genome library path: %s"% db_path )
|
1
|
141
|
|
142 if asktag == "Y" :
|
|
143 logm("Un-directional library" )
|
|
144 else :
|
|
145 logm("Directional library")
|
0
|
146 logm("Number of mismatches allowed: %s"% max_mismatch_no )
|
|
147
|
|
148 if adapter_file !="":
|
|
149 if asktag=="Y":
|
|
150 logm("Adapters to be removed from 3' of the reads:" )
|
|
151 logm("-- A: %s" % adapterA )
|
|
152 logm("-- B: %s" % adapterB )
|
|
153 elif asktag=="N":
|
|
154 logm("Adapter to be removed from 3' of the reads:" )
|
|
155 logm("-- %s" % adapter )
|
|
156
|
|
157 logm("-------------------------------- " )
|
|
158
|
|
159 #----------------------------------------------------------------
|
|
160
|
|
161 # helper method to join fname with tmp_path
|
|
162 tmp_d = lambda fname: os.path.join(tmp_path, fname)
|
|
163
|
|
164 db_d = lambda fname: os.path.join(db_path, fname)
|
|
165
|
|
166
|
|
167 #----------------------------------------------------------------
|
|
168 # splitting the 2 big read files
|
|
169
|
|
170 input_fname1 = os.path.split(main_read_file_1)[1]
|
|
171 input_fname2 = os.path.split(main_read_file_2)[1]
|
|
172
|
|
173 # TODO: run these in parallel with a subprocess
|
|
174 split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines)
|
|
175 split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines)
|
|
176
|
|
177 dirList=os.listdir(tmp_path)
|
|
178 my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)),
|
|
179 sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList)))
|
|
180
|
|
181
|
|
182
|
|
183 #---- Stats ------------------------------------------------------------
|
|
184 all_raw_reads=0
|
|
185 all_trimmed=0
|
|
186 all_mapped=0
|
|
187 all_mapped_passed=0
|
1
|
188 all_base_before_trim=0
|
|
189 all_base_after_trim=0
|
|
190 all_base_mapped=0
|
0
|
191
|
|
192 all_unmapped=0
|
|
193
|
|
194 numbers_premapped_lst=[0,0,0,0]
|
|
195 numbers_mapped_lst=[0,0,0,0]
|
|
196
|
|
197 mC_lst=[0,0,0]
|
|
198 uC_lst=[0,0,0]
|
|
199
|
|
200 no_my_files=0
|
1
|
201 read_id_lst_1 = dict()
|
|
202 read_id_lst_2 = dict()
|
0
|
203
|
|
204 #----------------------------------------------------------------
|
|
205 print "== Start mapping =="
|
|
206
|
|
207 for read_file_1, read_file_2 in my_files:
|
|
208
|
|
209 no_my_files+=1
|
|
210
|
|
211 random_id=".tmp-"+str(random.randint(1000000,9999999))
|
|
212
|
|
213 original_bs_reads_1 = {}
|
|
214 original_bs_reads_2 = {}
|
|
215 original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2]
|
|
216
|
|
217
|
|
218 if asktag == "Y" :
|
|
219
|
|
220 #----------------------------------------------------------------
|
|
221 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id)
|
|
222 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id)
|
1
|
223 outfile_2RGA = tmp_d('Trimmed_FCT_2.fa'+random_id)
|
0
|
224 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id)
|
|
225
|
|
226 try :
|
1
|
227 if read_file_1.endswith(".gz") : # support input file ending with ".gz"
|
|
228 read_inf = gzip.open(tmp_d(read_file_1), "rb")
|
|
229 else :
|
|
230 read_inf = open(tmp_d(read_file_1),"r")
|
0
|
231 except IOError :
|
|
232 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
|
|
233 exit(-1)
|
|
234 oneline = read_inf.readline()
|
|
235 l = oneline.split()
|
|
236 input_format = ""
|
|
237
|
|
238 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
|
|
239 input_format="fastq"
|
|
240 elif len(l)==1 and oneline[0]!=">": # pure sequences
|
|
241 input_format="seq"
|
|
242 elif len(l)==11: # Illumina GAII qseq file
|
|
243 input_format="qseq"
|
|
244 elif oneline[0]==">": # fasta
|
|
245 input_format="fasta"
|
|
246 read_inf.close()
|
|
247
|
|
248 print "Detected data format: %s" % input_format
|
|
249
|
|
250 #----------------------------------------------------------------
|
|
251 read_file_list = [read_file_1, read_file_2]
|
1
|
252 outfile_FCT_list = [outfile_1FCT, outfile_2RGA]
|
0
|
253 outfile_RCT_list = [outfile_1RCT, outfile_2RCT]
|
|
254 n_list = [0, 0]
|
|
255 for f in range(2):
|
|
256 read_file = read_file_list[f]
|
|
257 outf_FCT = open(outfile_FCT_list[f], 'w')
|
1
|
258 outf_RGA = open(outfile_RCT_list[f], 'w')
|
0
|
259 original_bs_reads = original_bs_reads_lst[f]
|
|
260 n = n_list[f]
|
1
|
261 read_id = ""
|
0
|
262 seq = ""
|
|
263 seq_ready = "N"
|
1
|
264 line_no = 0
|
0
|
265 for line in fileinput.input(tmp_d(read_file)):
|
1
|
266 l = line.split()
|
|
267 line_no += 1
|
0
|
268 if input_format=="seq":
|
1
|
269 n += 1
|
|
270 read_id = str(n)
|
|
271 read_id = read_id.zfill(12)
|
|
272 if f == 1 :
|
|
273 read_id_lst_1[read_id] = read_id
|
|
274 else :
|
|
275 read_id_lst_2[read_id] = read_id
|
|
276 seq = l[0]
|
|
277 seq_ready = "Y"
|
0
|
278 elif input_format=="fastq":
|
1
|
279 l_fastq = math.fmod(line_no, 4)
|
|
280 if l_fastq==1 :
|
|
281 all_raw_reads += 1
|
|
282 read_id = str(line_no/4+1).zfill(12)
|
|
283 if f==1 :
|
|
284 read_id_lst_1[read_id] = l[0][1:]
|
|
285 else :
|
|
286 read_id_lst_2[read_id] = l[0][1:]
|
|
287 seq_ready="N"
|
|
288 elif l_fastq==2 :
|
|
289 seq = l[0]
|
|
290 seq_ready = "Y"
|
|
291 else :
|
|
292 seq = ""
|
|
293 seq_ready = "N"
|
0
|
294 elif input_format=="qseq":
|
1
|
295 all_raw_reads += 1
|
|
296 read_id = str(line_no)
|
|
297 read_id = read_id.zfill(12)
|
|
298 if f == 1 :
|
|
299 read_id_lst_1[read_id] = l[0][1:]
|
|
300 else :
|
|
301 read_id_lst_2[read_id] = l[0][1:]
|
|
302 seq = l[8]
|
|
303 seq_ready = "Y"
|
0
|
304 elif input_format=="fasta":
|
1
|
305 l_fasta = math.fmod(line_no,2)
|
|
306 seq_ready = "N"
|
|
307 if l_fasta==1:
|
|
308 all_raw_reads += 1
|
|
309 read_id = str(line_no/2+1).zfill(12)
|
|
310 if f==1 :
|
|
311 read_id_lst_1[read_id] = l[0][1:]
|
|
312 else :
|
|
313 read_id_lst_2[read_id] = l[0][1:]
|
|
314 seq = ""
|
|
315 elif l_fasta==0 :
|
|
316 seq = l[0]
|
|
317 seq_ready = "Y"
|
0
|
318 #----------------------------------------------------------------
|
|
319 if seq_ready=="Y":
|
1
|
320 seq = seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52
|
|
321 seq = seq.upper()
|
|
322 seq = seq.replace(".","N")
|
0
|
323
|
|
324 #--striping BS adapter from 3' read --------------------------------------------------------------
|
1
|
325 all_base_before_trim += len(seq)
|
|
326 if f==0 and adapterA!="" :
|
|
327 new_read = RemoveAdapter(seq, adapterA, adapter_mismatch)
|
|
328 if len(new_read)<len(seq) :
|
|
329 all_trimmed += 1
|
|
330 seq = new_read
|
|
331 if f==1 and adapterB!="" :
|
|
332 new_read = RemoveAdapter(seq, adapterB, adapter_mismatch)
|
|
333 if len(new_read)<len(seq) :
|
|
334 all_trimmed += 1
|
|
335 seq = new_read
|
|
336 all_base_after_trim += len(seq)
|
0
|
337
|
1
|
338 if len(seq)<=4:
|
0
|
339 seq = "N" * (cut2-cut1+1)
|
|
340 #--------- trimmed_raw_BS_read ------------------
|
1
|
341 original_bs_reads[read_id] = seq
|
0
|
342
|
|
343 #--------- FW_C2T ------------------
|
1
|
344 outf_FCT.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
|
0
|
345 #--------- RC_G2A ------------------
|
1
|
346 outf_RGA.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
|
0
|
347
|
1
|
348 n_list[f] = n
|
0
|
349
|
|
350 outf_FCT.close()
|
1
|
351 outf_RGA.close()
|
0
|
352
|
|
353 fileinput.close()
|
|
354
|
|
355 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
|
1
|
356 all_raw_reads += n
|
0
|
357
|
|
358 #--------------------------------------------------------------------------------
|
|
359 # Bowtie mapping
|
|
360 #--------------------------------------------------------------------------------
|
1
|
361 WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
|
|
362 WC2T_rf = tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
|
|
363 CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
|
|
364 CC2T_rf = tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
|
0
|
365
|
|
366 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
367 'input_file_1' : outfile_1FCT,
|
|
368 'input_file_2' : outfile_2RCT,
|
|
369 'output_file' : WC2T_fr},
|
|
370
|
|
371 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
372 'input_file_1' : outfile_1FCT,
|
|
373 'input_file_2' : outfile_2RCT,
|
1
|
374 'output_file' : CG2A_fr},
|
0
|
375
|
|
376 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
1
|
377 'input_file_1' : outfile_2RGA,
|
0
|
378 'input_file_2' : outfile_1RCT,
|
|
379 'output_file' : WC2T_rf},
|
|
380
|
|
381 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
1
|
382 'input_file_1' : outfile_2RGA,
|
0
|
383 'input_file_2' : outfile_1RCT,
|
|
384 'output_file' : CC2T_rf}])
|
|
385
|
1
|
386 delete_files(outfile_1FCT, outfile_2RGA, outfile_1RCT, outfile_2RCT)
|
0
|
387
|
|
388
|
|
389 #--------------------------------------------------------------------------------
|
|
390 # Post processing
|
|
391 #--------------------------------------------------------------------------------
|
|
392
|
|
393 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
|
|
394 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf)
|
1
|
395 RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr)
|
0
|
396 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf)
|
|
397
|
1
|
398 delete_files(WC2T_fr, WC2T_rf, CG2A_fr, CC2T_rf)
|
0
|
399
|
|
400 #----------------------------------------------------------------
|
|
401 # get uniq-hit reads
|
|
402 #----------------------------------------------------------------
|
1
|
403 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys())
|
0
|
404
|
1
|
405 Unique_FW_fr_C2T = set() # +
|
|
406 Unique_FW_rf_C2T = set() # +
|
|
407 Unique_RC_fr_G2A = set() # -
|
|
408 Unique_RC_rf_C2T = set() # -
|
|
409 Multiple_hits = set()
|
0
|
410
|
|
411
|
|
412 for x in Union_set:
|
1
|
413 _list = []
|
|
414 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_G2A_fr_U, RC_C2T_rf_U]:
|
|
415 mis_lst = d.get(x,[99])
|
|
416 mis = int(mis_lst[0])
|
0
|
417 _list.append(mis)
|
1
|
418 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_G2A_fr_R, RC_C2T_rf_R]:
|
|
419 mis = d.get(x,99)
|
0
|
420 _list.append(mis)
|
1
|
421 mini = min(_list)
|
0
|
422 if _list.count(mini)==1:
|
1
|
423 mini_index = _list.index(mini)
|
0
|
424 if mini_index==0:
|
|
425 Unique_FW_fr_C2T.add(x)
|
|
426 elif mini_index==1:
|
|
427 Unique_FW_rf_C2T.add(x)
|
|
428 elif mini_index==2:
|
1
|
429 Unique_RC_fr_G2A.add(x)
|
0
|
430 elif mini_index==3:
|
|
431 Unique_RC_rf_C2T.add(x)
|
|
432 else :
|
|
433 Multiple_hits.add(x)
|
|
434 else :
|
|
435 Multiple_hits.add(x)
|
|
436
|
|
437 # write reads rejected by Multiple Hits to file
|
|
438 if show_multiple_hit :
|
1
|
439 outf_MH = open("Multiple_hit.fa",'w')
|
0
|
440 for i in Multiple_hits :
|
|
441 outf_MH.write(">%s\n" % i)
|
|
442 outf_MH.write("%s\n" % original_bs_reads[i])
|
|
443 outf_MH.close()
|
|
444
|
|
445 del Union_set
|
|
446 del FW_C2T_fr_R
|
|
447 del FW_C2T_rf_R
|
1
|
448 del RC_G2A_fr_R
|
0
|
449 del RC_C2T_rf_R
|
|
450
|
1
|
451 FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
|
|
452 FW_C2T_rf_uniq_lst = [[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T]
|
|
453 RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A]
|
|
454 RC_C2T_rf_uniq_lst = [[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T]
|
0
|
455
|
|
456 FW_C2T_fr_uniq_lst.sort()
|
|
457 FW_C2T_rf_uniq_lst.sort()
|
|
458 RC_C2T_fr_uniq_lst.sort()
|
|
459 RC_C2T_rf_uniq_lst.sort()
|
1
|
460 FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst]
|
|
461 FW_C2T_rf_uniq_lst = [x[1] for x in FW_C2T_rf_uniq_lst]
|
|
462 RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst]
|
|
463 RC_C2T_rf_uniq_lst = [x[1] for x in RC_C2T_rf_uniq_lst]
|
0
|
464 #----------------------------------------------------------------
|
|
465
|
1
|
466 numbers_premapped_lst[0] += len(Unique_FW_fr_C2T)
|
|
467 numbers_premapped_lst[1] += len(Unique_FW_rf_C2T)
|
|
468 numbers_premapped_lst[2] += len(Unique_RC_fr_G2A)
|
|
469 numbers_premapped_lst[3] += len(Unique_RC_rf_C2T)
|
0
|
470
|
|
471 del Unique_FW_fr_C2T
|
|
472 del Unique_FW_rf_C2T
|
1
|
473 del Unique_RC_fr_G2A
|
0
|
474 del Unique_RC_rf_C2T
|
|
475
|
|
476 #----------------------------------------------------------------
|
|
477
|
|
478 nn = 0
|
|
479 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U),
|
|
480 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U),
|
1
|
481 (RC_C2T_fr_uniq_lst,RC_G2A_fr_U),
|
0
|
482 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]:
|
|
483 nn += 1
|
|
484
|
|
485 mapped_chr0 = ""
|
|
486 for header in ali_unique_lst:
|
|
487
|
|
488 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
|
|
489
|
|
490 #-------------------------------------
|
1
|
491 if mapped_chr!=mapped_chr0:
|
|
492 my_gseq = deserialize(db_d(mapped_chr))
|
|
493 chr_length = len(my_gseq)
|
|
494 mapped_chr0 = mapped_chr
|
0
|
495 #-------------------------------------
|
1
|
496 if nn==1 or nn==3 :
|
0
|
497 original_BS_1 = original_bs_reads_1[header]
|
|
498 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
|
1
|
499 else :
|
0
|
500 original_BS_1 = original_bs_reads_2[header]
|
|
501 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
|
|
502
|
|
503 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
|
|
504 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
|
|
505
|
|
506 all_mapped += 1
|
|
507
|
1
|
508 if nn==1 : # FW-RC mapped to + strand:
|
0
|
509 FR = "+FR"
|
|
510 mapped_strand_1 = "+"
|
|
511 mapped_strand_2 = "+"
|
1
|
512 elif nn==2 : # RC-FW mapped to + strand:
|
0
|
513 FR = "+RF"
|
|
514 mapped_strand_1 = "+"
|
|
515 mapped_strand_2 = "+"
|
1
|
516 elif nn==3 : # FW-RC mapped to - strand:
|
0
|
517 FR = "-FR"
|
|
518 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
|
|
519 mapped_strand_1 = "-"
|
|
520 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
|
|
521 mapped_strand_2 = "-"
|
1
|
522 elif nn==4 : # RC-FW mapped to - strand:
|
0
|
523 FR = "-RF"
|
|
524 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
|
|
525 mapped_strand_1 = "-"
|
|
526 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
|
|
527 mapped_strand_2 = "-"
|
|
528
|
|
529 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
|
|
530 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
|
|
531
|
|
532 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
|
|
533 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
|
|
534
|
|
535 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
|
|
536 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
|
|
537
|
1
|
538 N_mismatch = max(N_mismatch_1, N_mismatch_2)
|
|
539 mm_no=float(max_mismatch_no)
|
|
540 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1)
|
|
541 and N_mismatch_2<=mm_no*len(original_BS_2)):
|
0
|
542
|
|
543 all_mapped_passed += 1
|
|
544 numbers_mapped_lst[nn-1] += 1
|
|
545 #---- unmapped -------------------------
|
|
546 del original_bs_reads_1[header]
|
|
547 del original_bs_reads_2[header]
|
|
548 #---------------------------------------
|
|
549
|
1
|
550 methy_1 = methy_seq(r_aln_1, g_aln_1 + next_1)
|
|
551 methy_2 = methy_seq(r_aln_2, g_aln_2 + next_2)
|
0
|
552
|
|
553 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
|
|
554 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)
|
|
555
|
|
556 #---XS FILTER----------------
|
|
557 XS_1 = 0
|
|
558 nCH_1 = methy_1.count('y') + methy_1.count('z')
|
|
559 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
|
|
560 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
|
|
561 XS_1 = 1
|
|
562 XS_2 = 0
|
|
563 nCH_2 = methy_2.count('y') + methy_2.count('z')
|
|
564 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
|
|
565 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
|
|
566 XS_2 = 1
|
|
567
|
1
|
568 if mapped_strand_1=='+' :
|
|
569 flag_1 = 67 # 1000011 : 1st read, + strand
|
|
570 flag_2 = 131 #10000011 : 2nd read, + strand
|
|
571 else :
|
|
572 flag_1 = 115 # 1110011 : 1st read, - strand
|
|
573 flag_2 = 179 # 10110011 : 2nd read, - strand
|
0
|
574
|
1
|
575 outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
|
0
|
576 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
|
1
|
577 all_base_mapped += len(original_BS_1)
|
|
578
|
|
579 outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
|
0
|
580 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
|
1
|
581 all_base_mapped += len(original_BS_2)
|
|
582
|
0
|
583
|
|
584 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
|
|
585 #----------------------------------------------------------------
|
|
586 # output unmapped pairs
|
|
587 #----------------------------------------------------------------
|
|
588
|
1
|
589 unmapped_lst = original_bs_reads_1.keys()
|
0
|
590 unmapped_lst.sort()
|
|
591
|
|
592 # for u in unmapped_lst:
|
|
593 # outf_u1.write("%s\n"%original_bs_reads_1[u])
|
|
594 # outf_u2.write("%s\n"%original_bs_reads_2[u])
|
|
595
|
|
596 all_unmapped += len(unmapped_lst)
|
|
597
|
|
598
|
1
|
599 # Directional library
|
0
|
600 if asktag=="N":
|
|
601
|
|
602 #----------------------------------------------------------------
|
1
|
603 outfile_1FCT = tmp_d('Trimed_FCT_1.fa'+random_id)
|
|
604 outfile_2RGA = tmp_d('Trimed_RGA_2.fa'+random_id)
|
0
|
605
|
|
606 try :
|
1
|
607 if read_file_1.endswith(".gz") : # support input file ending with ".gz"
|
|
608 read_inf = gzip.open(tmp_d(read_file_1), "rb")
|
|
609 else :
|
|
610 read_inf = open(tmp_d(read_file_1),"r")
|
0
|
611 except IOError :
|
|
612 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
|
|
613 exit(-1)
|
|
614
|
1
|
615 oneline = read_inf.readline()
|
|
616 l = oneline.split()
|
|
617 input_format = ""
|
0
|
618
|
1
|
619 if oneline[0]=="@" :
|
|
620 input_format = "fastq"
|
|
621 elif len(l)==1 and oneline[0]!=">" :
|
|
622 input_format = "seq"
|
|
623 elif len(l)==11 :
|
|
624 input_format = "qseq"
|
|
625 elif oneline[0]==">" :
|
|
626 input_format = "fasta"
|
0
|
627
|
|
628 read_inf.close()
|
|
629
|
|
630 print "Detected data format: %s" % input_format
|
|
631
|
|
632 #----------------------------------------------------------------
|
1
|
633 read_file_list = [read_file_1,read_file_2]
|
|
634 outfile_FCT_list = [outfile_1FCT,outfile_2RGA]
|
|
635 n_list = [0,0]
|
0
|
636
|
|
637 for f in range(2):
|
1
|
638 read_file = read_file_list[f]
|
|
639 outf_FCT = open(outfile_FCT_list[f],'w')
|
0
|
640 original_bs_reads = original_bs_reads_lst[f]
|
1
|
641 n = n_list[f]
|
0
|
642
|
1
|
643 read_id = ""
|
|
644 seq = ""
|
|
645 seq_ready = "N"
|
|
646 line_no = 0
|
0
|
647 for line in fileinput.input(tmp_d(read_file)):
|
1
|
648 l = line.split()
|
|
649 line_no += 1
|
|
650 if input_format=="seq":
|
|
651 n += 1
|
|
652 read_id = str(n)
|
|
653 read_id = read_id.zfill(12)
|
|
654 if f == 1 :
|
|
655 read_id_lst_1[read_id] = read_id
|
|
656 else :
|
|
657 read_id_lst_2[read_id] = read_id
|
|
658 seq = l[0]
|
|
659 seq_ready = "Y"
|
|
660 elif input_format=="fastq":
|
|
661 l_fastq = math.fmod(line_no, 4)
|
|
662 if l_fastq==1 :
|
|
663 all_raw_reads += 1
|
|
664 read_id = str(line_no/4+1).zfill(12)
|
|
665 if f==1 :
|
|
666 read_id_lst_1[read_id] = l[0][1:]
|
|
667 else :
|
|
668 read_id_lst_2[read_id] = l[0][1:]
|
|
669 seq_ready = "N"
|
|
670 elif l_fastq==2 :
|
|
671 seq = l[0]
|
|
672 seq_ready = "Y"
|
|
673 else :
|
|
674 seq = ""
|
|
675 seq_ready = "N"
|
|
676 elif input_format=="qseq":
|
|
677 all_raw_reads += 1
|
|
678 read_id = str(line_no)
|
|
679 read_id = read_id.zfill(12)
|
|
680 if f == 1 :
|
|
681 read_id_lst_1[read_id] = l[0][1:]
|
|
682 else :
|
|
683 read_id_lst_2[read_id] = l[0][1:]
|
|
684 seq = l[8]
|
|
685 seq_ready = "Y"
|
0
|
686 elif input_format=="fasta":
|
1
|
687 l_fasta = math.fmod(line_no,2)
|
|
688 if l_fasta==1:
|
|
689 seq_ready = "N"
|
|
690 all_raw_reads += 1
|
|
691 read_id = str(line_no/2+1).zfill(12)
|
|
692 if f == 1 :
|
|
693 read_id_lst_1[read_id] = l[0][1:]
|
|
694 else :
|
|
695 read_id_lst_2[read_id] = l[0][1:]
|
|
696 seq = ""
|
|
697 elif l_fasta==0 :
|
|
698 seq = l[0]
|
|
699 seq_ready = "Y"
|
0
|
700 #----------------------------------------------------------------
|
|
701 if seq_ready=="Y":
|
1
|
702 seq = seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52
|
|
703 seq = seq.upper()
|
|
704 seq = seq.replace(".","N")
|
0
|
705
|
|
706 #--striping BS adapter from 3' read --------------------------------------------------------------
|
1
|
707 all_base_before_trim += len(seq)
|
|
708 if f==0 and adapterA!="" :
|
|
709 new_read = RemoveAdapter(seq, adapterA, adapter_mismatch)
|
|
710 if len(new_read) < len(seq) :
|
|
711 all_trimmed += 1
|
|
712 seq = new_read
|
|
713 if f==1 and adapterB!="" :
|
|
714 new_read = RemoveAdapter(seq, adapterB, adapter_mismatch)
|
|
715 if len(new_read)<len(seq) :
|
|
716 all_trimmed += 1
|
|
717 seq = new_read
|
|
718 all_base_after_trim += len(seq)
|
0
|
719
|
1
|
720 if len(seq)<=4:
|
0
|
721 seq = "N" * (cut2-cut1+1)
|
|
722 #--------- trimmed_raw_BS_read ------------------
|
1
|
723 original_bs_reads[read_id] = seq
|
0
|
724
|
|
725 #--------- FW_C2T ------------------
|
|
726 if f==0:
|
1
|
727 outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("C","T")))
|
0
|
728 elif f==1:
|
1
|
729 outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("G","A")))
|
0
|
730
|
1
|
731 n_list[f] = n
|
0
|
732 outf_FCT.close()
|
|
733 fileinput.close()
|
|
734
|
|
735 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
|
1
|
736 all_raw_reads += n
|
0
|
737
|
|
738 #--------------------------------------------------------------------------------
|
|
739 # Bowtie mapping
|
|
740 #--------------------------------------------------------------------------------
|
1
|
741 WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
|
|
742 CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
|
0
|
743
|
|
744 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
745 'input_file_1' : outfile_1FCT,
|
1
|
746 'input_file_2' : outfile_2RGA,
|
0
|
747 'output_file' : WC2T_fr},
|
|
748
|
|
749 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
750 'input_file_1' : outfile_1FCT,
|
1
|
751 'input_file_2' : outfile_2RGA,
|
|
752 'output_file' : CG2A_fr} ])
|
0
|
753
|
|
754
|
1
|
755 delete_files(outfile_1FCT, outfile_2RGA)
|
0
|
756
|
|
757 #--------------------------------------------------------------------------------
|
|
758 # Post processing
|
|
759 #--------------------------------------------------------------------------------
|
|
760
|
|
761 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
|
1
|
762 RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr)
|
0
|
763
|
|
764 #----------------------------------------------------------------
|
|
765 # get uniq-hit reads
|
|
766 #----------------------------------------------------------------
|
1
|
767 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys())
|
0
|
768
|
|
769 Unique_FW_fr_C2T = set() # +
|
1
|
770 Unique_RC_fr_G2A = set() # -
|
0
|
771 Multiple_hits=set()
|
|
772
|
|
773
|
|
774 for x in Union_set:
|
|
775 _list = []
|
1
|
776 for d in [FW_C2T_fr_U, RC_G2A_fr_U]:
|
0
|
777 mis_lst = d.get(x,[99])
|
|
778 mis = int(mis_lst[0])
|
|
779 _list.append(mis)
|
1
|
780 for d in [FW_C2T_fr_R, RC_G2A_fr_R]:
|
0
|
781 mis = d.get(x,99)
|
|
782 _list.append(mis)
|
|
783 mini = min(_list)
|
|
784 if _list.count(mini) == 1:
|
|
785 mini_index = _list.index(mini)
|
|
786 if mini_index == 0:
|
|
787 Unique_FW_fr_C2T.add(x)
|
|
788 elif mini_index == 1:
|
1
|
789 Unique_RC_fr_G2A.add(x)
|
0
|
790 else :
|
|
791 Multiple_hits.add(x)
|
|
792 else :
|
|
793 Multiple_hits.add(x)
|
|
794
|
|
795 # write reads rejected by Multiple Hits to file
|
|
796 if show_multiple_hit :
|
1
|
797 outf_MH = open("Multiple_hit.fa",'w')
|
0
|
798 for i in Multiple_hits :
|
|
799 outf_MH.write(">%s\n" % i)
|
|
800 outf_MH.write("%s\n" % original_bs_reads[i])
|
|
801 outf_MH.close()
|
|
802
|
1
|
803 FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
|
|
804 RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A]
|
0
|
805
|
|
806 FW_C2T_fr_uniq_lst.sort()
|
|
807 RC_C2T_fr_uniq_lst.sort()
|
1
|
808 FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst]
|
|
809 RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst]
|
0
|
810 #----------------------------------------------------------------
|
|
811
|
1
|
812 numbers_premapped_lst[0] += len(Unique_FW_fr_C2T)
|
|
813 numbers_premapped_lst[1] += len(Unique_RC_fr_G2A)
|
0
|
814
|
|
815
|
|
816 #----------------------------------------------------------------
|
|
817
|
|
818 nn = 0
|
1
|
819 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_G2A_fr_U)]:
|
0
|
820 nn += 1
|
|
821 mapped_chr0 = ""
|
|
822 for header in ali_unique_lst:
|
|
823 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
|
|
824
|
|
825 #-------------------------------------
|
|
826 if mapped_chr != mapped_chr0:
|
|
827 my_gseq = deserialize(db_d(mapped_chr))
|
|
828 chr_length = len(my_gseq)
|
|
829 mapped_chr0 = mapped_chr
|
|
830 #-------------------------------------
|
|
831
|
|
832 original_BS_1 = original_bs_reads_1[header]
|
|
833 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
|
1
|
834 #original_BS_2 = original_bs_reads_2[header]
|
0
|
835
|
|
836 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
|
|
837 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
|
|
838
|
|
839 all_mapped += 1
|
|
840
|
|
841 if nn == 1: # FW-RC mapped to + strand:
|
|
842 FR = "+FR"
|
|
843 mapped_strand_1 = "+"
|
|
844 mapped_strand_2 = "+"
|
|
845 elif nn == 2: # FW-RC mapped to - strand:
|
|
846 FR="-FR"
|
|
847 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
|
|
848 mapped_strand_1 = "-"
|
|
849 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
|
|
850 mapped_strand_2 = "-"
|
|
851
|
|
852 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
|
|
853 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
|
|
854
|
|
855 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
|
|
856 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
|
|
857
|
|
858 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
|
|
859 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
|
|
860
|
1
|
861 # if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no):
|
|
862 # if N_mismatch <= int(max_mismatch_no) :
|
|
863 N_mismatch = max(N_mismatch_1, N_mismatch_2)
|
|
864 mm_no=float(max_mismatch_no)
|
|
865 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1)
|
|
866 and N_mismatch_2<=mm_no*len(original_BS_2)):
|
0
|
867
|
|
868 numbers_mapped_lst[nn-1] += 1
|
|
869 all_mapped_passed += 1
|
|
870
|
|
871 #---- unmapped -------------------------
|
|
872 del original_bs_reads_1[header]
|
|
873 del original_bs_reads_2[header]
|
|
874
|
|
875 #---------------------------------------
|
|
876 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
|
|
877 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
|
|
878 mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst)
|
|
879 mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst)
|
|
880
|
|
881 #
|
|
882 #---XS FILTER----------------
|
|
883 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
|
|
884 XS_1 = 0
|
|
885 nCH_1 = methy_1.count('y') + methy_1.count('z')
|
|
886 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
|
|
887 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
|
|
888 XS_1 = 1
|
|
889 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
|
|
890 XS_2 = 0
|
|
891 nCH_2 = methy_2.count('y') + methy_2.count('z')
|
|
892 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
|
|
893 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
|
|
894 XS_2 = 1
|
|
895
|
1
|
896 if mapped_strand_1 == '+' :
|
|
897 flag_1 = 67 # 1000011 : 1st read, + strand
|
|
898 flag_2 = 131 # 10000011 : 2nd read, + strand
|
|
899 else :
|
|
900 flag_1 = 115 # 1110011 : 1st read, - strand
|
|
901 flag_2 = 179 # 10110011 : 2nd read, - strand
|
0
|
902
|
1
|
903 outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
|
0
|
904 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
|
1
|
905 all_base_mapped += len(original_BS_1)
|
|
906
|
|
907 outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
|
0
|
908 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
|
1
|
909 all_base_mapped += len(original_BS_2)
|
0
|
910
|
|
911 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
|
|
912 #----------------------------------------------------------------
|
|
913 # output unmapped pairs
|
|
914 #----------------------------------------------------------------
|
|
915
|
|
916 unmapped_lst=original_bs_reads_1.keys()
|
|
917 unmapped_lst.sort()
|
|
918
|
|
919 # for u in unmapped_lst:
|
|
920 # outf_u1.write("%s\n"%(original_bs_reads_1[u]))
|
|
921 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) )
|
|
922
|
|
923 all_unmapped+=len(unmapped_lst)
|
|
924
|
|
925 #==================================================================================================
|
|
926 delete_files(tmp_path)
|
|
927
|
|
928 logm("-------------------------------- " )
|
1
|
929 logm("Number of raw BS-read pairs: %d " %(all_raw_reads/2) )
|
|
930 if adapterA != "" or adapterB != "" :
|
|
931 logm("Number of reads having adapter removed: %d \n"% all_trimmed)
|
|
932 logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
|
0
|
933
|
|
934 if all_raw_reads >0:
|
|
935
|
1
|
936 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
|
|
937 logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped + "\n")
|
0
|
938 if asktag=="Y":
|
1
|
939 logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
|
|
940 logm("-- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
|
|
941 logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
|
|
942 logm("-- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
|
0
|
943 elif asktag=="N":
|
1
|
944 logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
|
|
945 logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
|
|
946 logm("--- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
|
|
947 if asktag=="Y":
|
|
948 logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
|
|
949 logm("----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) )
|
|
950 logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) )
|
|
951 logm("----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) )
|
|
952 elif asktag=="N":
|
|
953 logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
|
|
954 logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) )
|
0
|
955
|
1
|
956 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)*2/all_raw_reads) )
|
|
957 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
|
|
958 logm("Unmapped read pairs: %d"% all_unmapped+"\n")
|
0
|
959
|
|
960 n_CG=mC_lst[0]+uC_lst[0]
|
|
961 n_CHG=mC_lst[1]+uC_lst[1]
|
|
962 n_CHH=mC_lst[2]+uC_lst[2]
|
|
963
|
|
964 logm("-------------------------------- " )
|
|
965 logm("Methylated C in mapped reads " )
|
|
966 logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) )
|
|
967 logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) )
|
|
968 logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) )
|
|
969
|
|
970 logm("----------------------------------------------" )
|
|
971 logm("------------------- END ----------------------" )
|
|
972 elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))
|