0
|
1 import fileinput, os, random, math
|
|
2 from bs_utils.utils import *
|
|
3 from bs_align_utils import *
|
|
4 #----------------------------------------------------------------
|
|
def extract_mapping(ali_file):
    """Parse paired-end aligner output and classify each read pair.

    Groups consecutive records sharing a header into a "family" of
    candidate alignments, then keeps the pair only if a single candidate
    achieves the minimal mismatch count.

    Parameters:
        ali_file: path to the aligner output file, consumed via
            process_aligner_output(..., pair_end=True).

    Returns:
        (unique_hits, non_unique_hits) where
        unique_hits:     header -> (no_mismatch, chr, location1, cigar1,
                                    location2, cigar2) for the single best hit
        non_unique_hits: header -> minimal mismatch count, for reads whose
                         best score is shared by several alignments
    """
    unique_hits = {}
    non_unique_hits = {}

    def _flush(header, family):
        # Classify the candidate alignments collected for one read pair.
        # No-op for the initial empty family (header == "").
        if len(family) == 1:
            unique_hits[header] = family[0]
        elif len(family) > 1:
            best = min(family, key=lambda x: x[0])
            # Unique only if exactly ONE alignment reaches the minimal
            # mismatch count. The previous code compared min vs max, which
            # mis-classified e.g. scores [0, 0, 1] as a unique hit even
            # though the best score is tied.
            if sum(1 for x in family if x[0] == best[0]) == 1:
                unique_hits[header] = best
            else:
                non_unique_hits[header] = best[0]

    header0 = ""
    family = []
    for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in \
            process_aligner_output(ali_file, pair_end=True):
        if header != header0:
            # A new read pair starts: emit the previous family.
            _flush(header0, family)
            header0 = header
            family = []
        family.append((no_mismatch, chr, location1, cigar1, location2, cigar2))
    # Emit the final family (single flush path replaces the duplicated
    # post-loop copy of the classification code).
    _flush(header0, family)

    return unique_hits, non_unique_hits
|
|
42
|
|
def _extract_mapping(ali_file):
    """Legacy parser for raw (non-SAM) bowtie paired-end output.

    Two consecutive lines whose headers differ only in the trailing mate
    suffix ("/1", "/2") are merged into one candidate alignment
    [summed_no_mismatch, chr, location_mate1, location_mate2]; further
    lines with the same header add more candidates for the same pair.

    Parameters:
        ali_file: path of the bowtie output file (whitespace-separated
            columns: header, chr, location, [strand?], [mismatch string]).

    Returns:
        (U, R) where
        U: header -> best candidate alignment (unique best hit)
        R: header -> mismatch count, for pairs whose two candidates tie
    """
    U = {}
    R = {}
    header0 = ""
    family = []
    member = 0

    def _flush():
        # Classify the candidates collected for the previous read pair.
        if header0 == "":
            return
        if len(family) == 1:
            U[header0] = family[0]
        elif family[0][0] < family[1][0]:
            U[header0] = family[0]
        elif family[1][0] < family[0][0]:
            U[header0] = family[1]
        else:
            R[header0] = family[0][0]

    for line in fileinput.input(ali_file):
        l = line.split()
        header = l[0][:-2]      # strip the "/1" or "/2" mate suffix
        chr = str(l[1])
        location = int(l[2])
        # -------- mismatches -----------
        if len(l) == 4:
            no_mismatch = 0
        elif len(l) == 5:
            no_mismatch = l[4].count(":")
        else:
            # Malformed line: warn and skip it. The previous code fell
            # through here and silently reused the no_mismatch value from
            # an earlier line (or raised NameError on the first line).
            print(l)
            continue
        # ------------------------
        if header != header0:
            _flush()            # emit the previous read pair
            header0 = header
            family = [[no_mismatch, chr, location]]
            member = 1
        else:
            if member == 1:
                # Second mate of the current candidate: accumulate the
                # mismatch count and record the mate's location.
                family[-1][0] += no_mismatch
                family[-1].append(location)
                member = 2
            else:
                # A new candidate alignment for the same read pair.
                family.append([no_mismatch, chr, location])
                member = 1
    # Fix: the last read pair in the file was previously never emitted,
    # because flushing only happened when the NEXT header appeared.
    _flush()
    fileinput.close()
    return U, R
|
|
92
|
|
93
|
|
94 #----------------------------------------------------------------
|
|
95
|
|
96 def bs_pair_end(main_read_file_1,
|
|
97 main_read_file_2,
|
|
98 asktag,
|
|
99 adapter_file,
|
|
100 cut1,
|
|
101 cut2, # add cut3 and cut4?
|
|
102 no_small_lines,
|
|
103 max_mismatch_no,
|
|
104 aligner_command,
|
|
105 db_path,
|
|
106 tmp_path,
|
|
107 outfile, XS_pct, XS_count, show_multiple_hit=False):
|
|
108
|
|
109
|
|
110 #----------------------------------------------------------------
|
|
111 adapter=""
|
|
112 adapterA=""
|
|
113 adapterB=""
|
|
114 if adapter_file !="":
|
|
115 try :
|
|
116 adapter_inf=open(adapter_file,"r")
|
|
117 if asktag=="N": #<--- directional library
|
|
118 adapter=adapter_inf.readline()
|
|
119 adapter_inf.close()
|
|
120 adapter=adapter.rstrip("\n")
|
|
121 elif asktag=="Y":#<--- un-directional library
|
|
122 adapterA=adapter_inf.readline()
|
|
123 adapterB=adapter_inf.readline()
|
|
124 adapter_inf.close()
|
|
125 adapterA=adapterA.rstrip("\n")
|
|
126 adapterB=adapterB.rstrip("\n")
|
|
127 except IOError :
|
|
128 print "[Error] Cannot find adapter file : %s" % adapter_file
|
|
129 exit(-1)
|
|
130
|
|
131
|
|
132 #----------------------------------------------------------------
|
|
133
|
|
134 logm("End 1 filename: %s"% main_read_file_1 )
|
|
135 logm("End 2 filename: %s"% main_read_file_2 )
|
|
136 logm("The first base (for mapping): %d"% cut1 )
|
|
137 logm("The last base (for mapping): %d"% cut2 )
|
|
138
|
|
139 logm("-------------------------------- " )
|
|
140 logm("Un-directional library: %s" % asktag )
|
|
141 logm("Path for short reads aligner: %s"% aligner_command + '\n')
|
|
142 logm("Reference genome library path: %s"% db_path )
|
|
143 logm("Number of mismatches allowed: %s"% max_mismatch_no )
|
|
144
|
|
145 if adapter_file !="":
|
|
146 if asktag=="Y":
|
|
147 logm("Adapters to be removed from 3' of the reads:" )
|
|
148 logm("-- A: %s" % adapterA )
|
|
149 logm("-- B: %s" % adapterB )
|
|
150 elif asktag=="N":
|
|
151 logm("Adapter to be removed from 3' of the reads:" )
|
|
152 logm("-- %s" % adapter )
|
|
153
|
|
154 logm("-------------------------------- " )
|
|
155
|
|
156 #----------------------------------------------------------------
|
|
157
|
|
158 # helper method to join fname with tmp_path
|
|
159 tmp_d = lambda fname: os.path.join(tmp_path, fname)
|
|
160
|
|
161 db_d = lambda fname: os.path.join(db_path, fname)
|
|
162
|
|
163
|
|
164 #----------------------------------------------------------------
|
|
165 # splitting the 2 big read files
|
|
166
|
|
167 input_fname1 = os.path.split(main_read_file_1)[1]
|
|
168 input_fname2 = os.path.split(main_read_file_2)[1]
|
|
169
|
|
170 # TODO: run these in parallel with a subprocess
|
|
171 split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines)
|
|
172 split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines)
|
|
173
|
|
174 dirList=os.listdir(tmp_path)
|
|
175 my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)),
|
|
176 sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList)))
|
|
177
|
|
178
|
|
179
|
|
180 #---- Stats ------------------------------------------------------------
|
|
181 all_raw_reads=0
|
|
182 all_trimmed=0
|
|
183 all_mapped=0
|
|
184 all_mapped_passed=0
|
|
185
|
|
186 all_unmapped=0
|
|
187
|
|
188 numbers_premapped_lst=[0,0,0,0]
|
|
189 numbers_mapped_lst=[0,0,0,0]
|
|
190
|
|
191 mC_lst=[0,0,0]
|
|
192 uC_lst=[0,0,0]
|
|
193
|
|
194 no_my_files=0
|
|
195
|
|
196 #----------------------------------------------------------------
|
|
197 print "== Start mapping =="
|
|
198
|
|
199 for read_file_1, read_file_2 in my_files:
|
|
200
|
|
201 no_my_files+=1
|
|
202
|
|
203 random_id=".tmp-"+str(random.randint(1000000,9999999))
|
|
204
|
|
205 original_bs_reads_1 = {}
|
|
206 original_bs_reads_2 = {}
|
|
207 original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2]
|
|
208
|
|
209
|
|
210 if asktag == "Y" :
|
|
211
|
|
212 #----------------------------------------------------------------
|
|
213 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id)
|
|
214 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id)
|
|
215 outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id)
|
|
216 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id)
|
|
217
|
|
218 try :
|
|
219 read_inf = open(tmp_d(read_file_1),"r")
|
|
220 except IOError :
|
|
221 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
|
|
222 exit(-1)
|
|
223 oneline = read_inf.readline()
|
|
224 l = oneline.split()
|
|
225 input_format = ""
|
|
226
|
|
227 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
|
|
228 input_format="fastq"
|
|
229 n_fastq=0
|
|
230 elif len(l)==1 and oneline[0]!=">": # pure sequences
|
|
231 input_format="seq"
|
|
232 elif len(l)==11: # Illumina GAII qseq file
|
|
233 input_format="qseq"
|
|
234 elif oneline[0]==">": # fasta
|
|
235 input_format="fasta"
|
|
236 n_fasta=0
|
|
237 read_inf.close()
|
|
238
|
|
239 print "Detected data format: %s" % input_format
|
|
240
|
|
241 #----------------------------------------------------------------
|
|
242 read_file_list = [read_file_1, read_file_2]
|
|
243 outfile_FCT_list = [outfile_1FCT, outfile_2FCT]
|
|
244 outfile_RCT_list = [outfile_1RCT, outfile_2RCT]
|
|
245 n_list = [0, 0]
|
|
246
|
|
247
|
|
248 for f in range(2):
|
|
249 read_file = read_file_list[f]
|
|
250 outf_FCT = open(outfile_FCT_list[f], 'w')
|
|
251 outf_RCT = open(outfile_RCT_list[f], 'w')
|
|
252 original_bs_reads = original_bs_reads_lst[f]
|
|
253 n = n_list[f]
|
|
254
|
|
255 seq_id = ""
|
|
256 seq = ""
|
|
257 seq_ready = "N"
|
|
258 for line in fileinput.input(tmp_d(read_file)):
|
|
259 l=line.split()
|
|
260 if input_format=="seq":
|
|
261 n+=1
|
|
262 seq_id=str(n)
|
|
263 seq_id=seq_id.zfill(12)
|
|
264 seq=l[0]
|
|
265 seq_ready="Y"
|
|
266 elif input_format=="fastq":
|
|
267 m_fastq=math.fmod(n_fastq,4)
|
|
268 n_fastq+=1
|
|
269 seq_ready="N"
|
|
270 if m_fastq==0:
|
|
271 n+=1
|
|
272 seq_id=str(n)
|
|
273 seq_id=seq_id.zfill(12)
|
|
274 seq=""
|
|
275 elif m_fastq==1:
|
|
276 seq=l[0]
|
|
277 seq_ready="Y"
|
|
278 else:
|
|
279 seq=""
|
|
280 elif input_format=="qseq":
|
|
281 n+=1
|
|
282 seq_id=str(n)
|
|
283 seq_id=seq_id.zfill(12)
|
|
284 seq=l[8]
|
|
285 seq_ready="Y"
|
|
286 elif input_format=="fasta":
|
|
287 m_fasta=math.fmod(n_fasta,2)
|
|
288 n_fasta+=1
|
|
289 seq_ready="N"
|
|
290 if m_fasta==0:
|
|
291 n+=1
|
|
292 seq_id=l[0][1:]
|
|
293 seq_id=seq_id.zfill(17)
|
|
294 seq=""
|
|
295 elif m_fasta==1:
|
|
296 seq=l[0]
|
|
297 seq_ready="Y"
|
|
298 else:
|
|
299 seq=""
|
|
300 #----------------------------------------------------------------
|
|
301 if seq_ready=="Y":
|
|
302 seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52
|
|
303 seq=seq.upper()
|
|
304 seq=seq.replace(".","N")
|
|
305
|
|
306 #--striping BS adapter from 3' read --------------------------------------------------------------
|
|
307 if (adapterA !="") and (adapterB !=""):
|
|
308 signature=adapterA[:6]
|
|
309 if signature in seq:
|
|
310 signature_pos=seq.index(signature)
|
|
311 if seq[signature_pos:] in adapterA:
|
|
312 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
|
|
313 all_trimmed+=1
|
|
314 else:
|
|
315 signature=adapterB[:6]
|
|
316 if signature in seq:
|
|
317 #print seq_id,seq,signature;
|
|
318 signature_pos=seq.index(signature)
|
|
319 if seq[signature_pos:] in adapterB:
|
|
320 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
|
|
321 all_trimmed+=1
|
|
322
|
|
323 if len(seq) <= 4:
|
|
324 seq = "N" * (cut2-cut1+1)
|
|
325 #--------- trimmed_raw_BS_read ------------------
|
|
326 original_bs_reads[seq_id] = seq
|
|
327
|
|
328 #--------- FW_C2T ------------------
|
|
329 outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T")))
|
|
330 #--------- RC_G2A ------------------
|
|
331 outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
|
|
332
|
|
333 n_list[f]=n
|
|
334
|
|
335 outf_FCT.close()
|
|
336 outf_RCT.close()
|
|
337
|
|
338 fileinput.close()
|
|
339
|
|
340
|
|
341 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
|
|
342 all_raw_reads+=n
|
|
343
|
|
344 #--------------------------------------------------------------------------------
|
|
345 # Bowtie mapping
|
|
346 #--------------------------------------------------------------------------------
|
|
347 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
|
|
348 WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
|
|
349 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
|
|
350 CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
|
|
351
|
|
352 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
353 'input_file_1' : outfile_1FCT,
|
|
354 'input_file_2' : outfile_2RCT,
|
|
355 'output_file' : WC2T_fr},
|
|
356
|
|
357 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
358 'input_file_1' : outfile_1FCT,
|
|
359 'input_file_2' : outfile_2RCT,
|
|
360 'output_file' : CC2T_fr},
|
|
361
|
|
362 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
363 'input_file_1' : outfile_2FCT,
|
|
364 'input_file_2' : outfile_1RCT,
|
|
365 'output_file' : WC2T_rf},
|
|
366
|
|
367 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
368 'input_file_1' : outfile_2FCT,
|
|
369 'input_file_2' : outfile_1RCT,
|
|
370 'output_file' : CC2T_rf}])
|
|
371
|
|
372
|
|
373 delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT)
|
|
374
|
|
375
|
|
376 #--------------------------------------------------------------------------------
|
|
377 # Post processing
|
|
378 #--------------------------------------------------------------------------------
|
|
379
|
|
380
|
|
381 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
|
|
382 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf)
|
|
383 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
|
|
384 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf)
|
|
385
|
|
386 delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf)
|
|
387
|
|
388 #----------------------------------------------------------------
|
|
389 # get uniq-hit reads
|
|
390 #----------------------------------------------------------------
|
|
391 Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys())
|
|
392
|
|
393 Unique_FW_fr_C2T=set() # +
|
|
394 Unique_FW_rf_C2T=set() # +
|
|
395 Unique_RC_fr_C2T=set() # -
|
|
396 Unique_RC_rf_C2T=set() # -
|
|
397 Multiple_hits=set()
|
|
398
|
|
399
|
|
400 for x in Union_set:
|
|
401 _list=[]
|
|
402 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]:
|
|
403 mis_lst=d.get(x,[99])
|
|
404 mis=int(mis_lst[0])
|
|
405 _list.append(mis)
|
|
406 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]:
|
|
407 mis=d.get(x,99)
|
|
408 _list.append(mis)
|
|
409 mini=min(_list)
|
|
410 if _list.count(mini)==1:
|
|
411 mini_index=_list.index(mini)
|
|
412 if mini_index==0:
|
|
413 Unique_FW_fr_C2T.add(x)
|
|
414 elif mini_index==1:
|
|
415 Unique_FW_rf_C2T.add(x)
|
|
416 elif mini_index==2:
|
|
417 Unique_RC_fr_C2T.add(x)
|
|
418 elif mini_index==3:
|
|
419 Unique_RC_rf_C2T.add(x)
|
|
420 else :
|
|
421 Multiple_hits.add(x)
|
|
422 else :
|
|
423 Multiple_hits.add(x)
|
|
424
|
|
425 # write reads rejected by Multiple Hits to file
|
|
426 if show_multiple_hit :
|
|
427 outf_MH=open("Multiple_hit.fa",'w')
|
|
428 for i in Multiple_hits :
|
|
429 outf_MH.write(">%s\n" % i)
|
|
430 outf_MH.write("%s\n" % original_bs_reads[i])
|
|
431 outf_MH.close()
|
|
432
|
|
433 del Union_set
|
|
434 del FW_C2T_fr_R
|
|
435 del FW_C2T_rf_R
|
|
436 del RC_C2T_fr_R
|
|
437 del RC_C2T_rf_R
|
|
438
|
|
439 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
|
|
440 FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T]
|
|
441 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
|
|
442 RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T]
|
|
443
|
|
444 FW_C2T_fr_uniq_lst.sort()
|
|
445 FW_C2T_rf_uniq_lst.sort()
|
|
446 RC_C2T_fr_uniq_lst.sort()
|
|
447 RC_C2T_rf_uniq_lst.sort()
|
|
448 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
|
|
449 FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst]
|
|
450 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
|
|
451 RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst]
|
|
452 #----------------------------------------------------------------
|
|
453
|
|
454 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
|
|
455 numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T)
|
|
456 numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T)
|
|
457 numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T)
|
|
458
|
|
459 del Unique_FW_fr_C2T
|
|
460 del Unique_FW_rf_C2T
|
|
461 del Unique_RC_fr_C2T
|
|
462 del Unique_RC_rf_C2T
|
|
463
|
|
464 #----------------------------------------------------------------
|
|
465
|
|
466 nn = 0
|
|
467 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U),
|
|
468 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U),
|
|
469 (RC_C2T_fr_uniq_lst,RC_C2T_fr_U),
|
|
470 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]:
|
|
471 nn += 1
|
|
472
|
|
473 mapped_chr0 = ""
|
|
474 for header in ali_unique_lst:
|
|
475
|
|
476 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
|
|
477
|
|
478 #-------------------------------------
|
|
479 if mapped_chr != mapped_chr0:
|
|
480 my_gseq=deserialize(db_d(mapped_chr))
|
|
481 chr_length=len(my_gseq)
|
|
482 mapped_chr0=mapped_chr
|
|
483 #-------------------------------------
|
|
484 if nn == 1 or nn == 3:
|
|
485 original_BS_1 = original_bs_reads_1[header]
|
|
486 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
|
|
487 else:
|
|
488 original_BS_1 = original_bs_reads_2[header]
|
|
489 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
|
|
490
|
|
491 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
|
|
492 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
|
|
493
|
|
494 all_mapped += 1
|
|
495
|
|
496 if nn == 1: # FW-RC mapped to + strand:
|
|
497
|
|
498 FR = "+FR"
|
|
499
|
|
500 # mapped_location_1 += 1
|
|
501 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
|
|
502 # origin_genome_1 = origin_genome_long_1[2:-2]
|
|
503 mapped_strand_1 = "+"
|
|
504
|
|
505 # mapped_location_2 += 1
|
|
506 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
|
|
507 # origin_genome_2 = origin_genome_long_2[2:-2]
|
|
508 mapped_strand_2 = "+"
|
|
509
|
|
510 elif nn==2: # RC-FW mapped to + strand:
|
|
511
|
|
512 # original_BS_1 = original_bs_reads_2[header]
|
|
513 # original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
|
|
514 FR = "+RF"
|
|
515 # mapped_location_1 += 1
|
|
516 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
|
|
517 # origin_genome_1 = origin_genome_long_1[2:-2]
|
|
518 mapped_strand_1 = "+"
|
|
519
|
|
520 # mapped_location_2 += 1
|
|
521 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
|
|
522 # origin_genome_2 = origin_genome_long_2[2:-2]
|
|
523 mapped_strand_2 = "+"
|
|
524
|
|
525
|
|
526 elif nn==3: # FW-RC mapped to - strand:
|
|
527 # original_BS_1=original_bs_reads_1[header]
|
|
528 # original_BS_2=reverse_compl_seq(original_bs_reads_2[header])
|
|
529
|
|
530 FR = "-FR"
|
|
531 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
|
|
532 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
|
|
533 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
|
|
534 # origin_genome_1 = origin_genome_long_1[2:-2]
|
|
535 mapped_strand_1 = "-"
|
|
536
|
|
537 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
|
|
538 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ])
|
|
539 # origin_genome_2 = origin_genome_long_2[2:-2]
|
|
540 mapped_strand_2 = "-"
|
|
541
|
|
542 elif nn==4: # RC-FW mapped to - strand:
|
|
543 # original_BS_1=original_bs_reads_2[header]
|
|
544 # original_BS_2=reverse_compl_seq(original_bs_reads_1[header])
|
|
545
|
|
546 FR = "-RF"
|
|
547 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
|
|
548 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
|
|
549 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
|
|
550 # origin_genome_1 = origin_genome_long_1[2:-2]
|
|
551 mapped_strand_1 = "-"
|
|
552
|
|
553 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
|
|
554 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
|
|
555 # origin_genome_2 = origin_genome_long_2[2:-2]
|
|
556 mapped_strand_2 = "-"
|
|
557
|
|
558
|
|
559
|
|
560
|
|
561 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
|
|
562 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
|
|
563
|
|
564 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
|
|
565 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
|
|
566
|
|
567
|
|
568 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
|
|
569 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
|
|
570
|
|
571
|
|
572 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) :
|
|
573 all_mapped_passed += 1
|
|
574 numbers_mapped_lst[nn-1] += 1
|
|
575 #---- unmapped -------------------------
|
|
576 del original_bs_reads_1[header]
|
|
577 del original_bs_reads_2[header]
|
|
578 #---------------------------------------
|
|
579
|
|
580 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:]
|
|
581 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:]
|
|
582
|
|
583 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
|
|
584 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
|
|
585
|
|
586 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
|
|
587 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)
|
|
588
|
|
589
|
|
590 #
|
|
591 #---XS FILTER----------------
|
|
592 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
|
|
593 XS_1 = 0
|
|
594 nCH_1 = methy_1.count('y') + methy_1.count('z')
|
|
595 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
|
|
596 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
|
|
597 XS_1 = 1
|
|
598 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
|
|
599 XS_2 = 0
|
|
600 nCH_2 = methy_2.count('y') + methy_2.count('z')
|
|
601 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
|
|
602 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
|
|
603 XS_2 = 1
|
|
604
|
|
605
|
|
606 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
|
|
607 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
|
|
608 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
|
|
609 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
|
|
610
|
|
611 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
|
|
612 #----------------------------------------------------------------
|
|
613 # output unmapped pairs
|
|
614 #----------------------------------------------------------------
|
|
615
|
|
616 unmapped_lst=original_bs_reads_1.keys()
|
|
617 unmapped_lst.sort()
|
|
618
|
|
619 # for u in unmapped_lst:
|
|
620 # outf_u1.write("%s\n"%original_bs_reads_1[u])
|
|
621 # outf_u2.write("%s\n"%original_bs_reads_2[u])
|
|
622
|
|
623 all_unmapped += len(unmapped_lst)
|
|
624
|
|
625
|
|
626 if asktag=="N":
|
|
627
|
|
628 #----------------------------------------------------------------
|
|
629 outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id)
|
|
630 outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id)
|
|
631
|
|
632 try :
|
|
633 read_inf=open(tmp_d(read_file_1),"r")
|
|
634 except IOError :
|
|
635 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
|
|
636 exit(-1)
|
|
637
|
|
638 oneline=read_inf.readline()
|
|
639 l=oneline.split()
|
|
640 input_format=""
|
|
641
|
|
642 #if len(l)==5: # old solexa format
|
|
643 # input_format="old Solexa Seq file"
|
|
644
|
|
645 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
|
|
646 input_format="FastQ"
|
|
647 n_fastq=0
|
|
648 elif len(l)==1 and oneline[0]!=">": # pure sequences
|
|
649 input_format="_list of sequences"
|
|
650 elif len(l)==11: # Illumina GAII qseq file
|
|
651 input_format="Illumina GAII qseq file"
|
|
652 elif oneline[0]==">": # fasta
|
|
653 input_format="fasta"
|
|
654 n_fasta=0
|
|
655
|
|
656 read_inf.close()
|
|
657
|
|
658 print "Detected data format: %s" % input_format
|
|
659
|
|
660 #----------------------------------------------------------------
|
|
661 read_file_list=[read_file_1,read_file_2]
|
|
662 outfile_FCT_list=[outfile_1FCT,outfile_2FCT]
|
|
663 n_list=[0,0]
|
|
664
|
|
665 for f in range(2):
|
|
666 read_file=read_file_list[f]
|
|
667 outf_FCT=open(outfile_FCT_list[f],'w')
|
|
668 original_bs_reads = original_bs_reads_lst[f]
|
|
669 n=n_list[f]
|
|
670
|
|
671 seq_id=""
|
|
672 seq=""
|
|
673 seq_ready="N"
|
|
674 for line in fileinput.input(tmp_d(read_file)):
|
|
675 l=line.split()
|
|
676 if input_format=="old Solexa Seq file":
|
|
677 n+=1
|
|
678 seq_id=str(n)
|
|
679 seq_id=seq_id.zfill(12)
|
|
680 seq=l[4]
|
|
681 seq_ready="Y"
|
|
682 elif input_format=="_list of sequences":
|
|
683 n+=1
|
|
684 seq_id=str(n)
|
|
685 seq_id=seq_id.zfill(12)
|
|
686 seq=l[0]
|
|
687 seq_ready="Y"
|
|
688 elif input_format=="FastQ":
|
|
689 m_fastq=math.fmod(n_fastq,4)
|
|
690 n_fastq+=1
|
|
691 seq_ready="N"
|
|
692 if m_fastq==0:
|
|
693 n+=1
|
|
694 seq_id=str(n)
|
|
695 seq_id=seq_id.zfill(12)
|
|
696 seq=""
|
|
697 elif m_fastq==1:
|
|
698 seq=l[0]
|
|
699 seq_ready="Y"
|
|
700 else:
|
|
701 seq=""
|
|
702 elif input_format=="Illumina GAII qseq file":
|
|
703 n+=1
|
|
704 seq_id=str(n)
|
|
705 seq_id=seq_id.zfill(12)
|
|
706 seq=l[8]
|
|
707 seq_ready="Y"
|
|
708 elif input_format=="fasta":
|
|
709 m_fasta=math.fmod(n_fasta,2)
|
|
710 n_fasta+=1
|
|
711 seq_ready="N"
|
|
712 if m_fasta==0:
|
|
713 n+=1
|
|
714 seq_id=l[0][1:]
|
|
715 seq_id=seq_id.zfill(17)
|
|
716 seq=""
|
|
717 elif m_fasta==1:
|
|
718 seq=l[0]
|
|
719 seq_ready="Y"
|
|
720 else:
|
|
721 seq=""
|
|
722 #----------------------------------------------------------------
|
|
723 if seq_ready=="Y":
|
|
724 seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52
|
|
725 seq=seq.upper()
|
|
726 seq=seq.replace(".","N")
|
|
727
|
|
728 #--striping BS adapter from 3' read --------------------------------------------------------------
|
|
729 if (adapterA !="") and (adapterB !=""):
|
|
730 signature=adapterA[:6]
|
|
731 if signature in seq:
|
|
732 signature_pos=seq.index(signature)
|
|
733 if seq[signature_pos:] in adapterA:
|
|
734 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
|
|
735 all_trimmed+=1
|
|
736 else:
|
|
737 signature=adapterB[:6]
|
|
738 if signature in seq:
|
|
739 #print seq_id,seq,signature;
|
|
740 signature_pos=seq.index(signature)
|
|
741 if seq[signature_pos:] in adapterB:
|
|
742 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
|
|
743 all_trimmed+=1
|
|
744
|
|
745 if len(seq) <= 4:
|
|
746 seq = "N" * (cut2-cut1+1)
|
|
747 #--------- trimmed_raw_BS_read ------------------
|
|
748 original_bs_reads[seq_id] = seq
|
|
749
|
|
750 #--------- FW_C2T ------------------
|
|
751 if f==0:
|
|
752 outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T")))
|
|
753 elif f==1:
|
|
754 outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T")))
|
|
755
|
|
756
|
|
757 n_list[f]=n
|
|
758 outf_FCT.close()
|
|
759
|
|
760 fileinput.close()
|
|
761
|
|
762
|
|
763 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
|
|
764 all_raw_reads+=n
|
|
765
|
|
766 #--------------------------------------------------------------------------------
|
|
767 # Bowtie mapping
|
|
768 #--------------------------------------------------------------------------------
|
|
769 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
|
|
770 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
|
|
771
|
|
772 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
773 'input_file_1' : outfile_1FCT,
|
|
774 'input_file_2' : outfile_2FCT,
|
|
775 'output_file' : WC2T_fr},
|
|
776
|
|
777 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
778 'input_file_1' : outfile_1FCT,
|
|
779 'input_file_2' : outfile_2FCT,
|
|
780 'output_file' : CC2T_fr} ])
|
|
781
|
|
782
|
|
783 delete_files(outfile_1FCT, outfile_2FCT)
|
|
784
|
|
785 #--------------------------------------------------------------------------------
|
|
786 # Post processing
|
|
787 #--------------------------------------------------------------------------------
|
|
788
|
|
789 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
|
|
790 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
|
|
791
|
|
792 #----------------------------------------------------------------
|
|
793 # get uniq-hit reads
|
|
794 #----------------------------------------------------------------
|
|
795 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys())
|
|
796
|
|
797 Unique_FW_fr_C2T = set() # +
|
|
798 Unique_RC_fr_C2T = set() # -
|
|
799 Multiple_hits=set()
|
|
800
|
|
801
|
|
802 for x in Union_set:
|
|
803 _list = []
|
|
804 for d in [FW_C2T_fr_U, RC_C2T_fr_U]:
|
|
805 mis_lst = d.get(x,[99])
|
|
806 mis = int(mis_lst[0])
|
|
807 _list.append(mis)
|
|
808 for d in [FW_C2T_fr_R, RC_C2T_fr_R]:
|
|
809 mis = d.get(x,99)
|
|
810 _list.append(mis)
|
|
811 mini = min(_list)
|
|
812 if _list.count(mini) == 1:
|
|
813 mini_index = _list.index(mini)
|
|
814 if mini_index == 0:
|
|
815 Unique_FW_fr_C2T.add(x)
|
|
816 elif mini_index == 1:
|
|
817 Unique_RC_fr_C2T.add(x)
|
|
818 else :
|
|
819 Multiple_hits.add(x)
|
|
820 else :
|
|
821 Multiple_hits.add(x)
|
|
822
|
|
823 # write reads rejected by Multiple Hits to file
|
|
824 if show_multiple_hit :
|
|
825 outf_MH=open("Multiple_hit.fa",'w')
|
|
826 for i in Multiple_hits :
|
|
827 outf_MH.write(">%s\n" % i)
|
|
828 outf_MH.write("%s\n" % original_bs_reads[i])
|
|
829 outf_MH.close()
|
|
830
|
|
831 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
|
|
832 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
|
|
833
|
|
834 FW_C2T_fr_uniq_lst.sort()
|
|
835 RC_C2T_fr_uniq_lst.sort()
|
|
836 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
|
|
837 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
|
|
838 #----------------------------------------------------------------
|
|
839
|
|
840 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
|
|
841 numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T)
|
|
842
|
|
843
|
|
844 #----------------------------------------------------------------
|
|
845
|
|
846 nn = 0
|
|
#--------------------------------------------------------------------
# Emit the final alignment for every uniquely-mapped read pair.
# nn identifies which orientation we are processing:
#   nn == 1 : FW-RC pairs mapped to the + (Watson) strand
#   nn == 2 : FW-RC pairs mapped to the - (Crick) strand
# Fix-ups vs. previous revision: the unused r_start_*/r_end_* locals
# (their only consumer was commented out) are dropped, and the
# loop-invariant int(max_mismatch_no) conversion is hoisted.
#--------------------------------------------------------------------
max_mm = int(max_mismatch_no)  # hoisted: was re-converted for every read pair
for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst, FW_C2T_fr_U), (RC_C2T_fr_uniq_lst, RC_C2T_fr_U)]:
    nn += 1
    mapped_chr0 = ""
    for header in ali_unique_lst:
        _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]

        #-------------------------------------
        # Deserialize the reference chromosome lazily: hits arrive
        # grouped by chromosome, so each one is loaded at most once
        # per consecutive run of headers.
        #-------------------------------------
        if mapped_chr != mapped_chr0:
            my_gseq = deserialize(db_d(mapped_chr))
            chr_length = len(my_gseq)
            mapped_chr0 = mapped_chr
        #-------------------------------------

        original_BS_1 = original_bs_reads_1[header]
        original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])

        # Only the genomic span lengths are used below; the read
        # start/end values were bound but never read.
        _, _, g_len_1 = get_read_start_end_and_genome_length(cigar1)
        _, _, g_len_2 = get_read_start_end_and_genome_length(cigar2)

        all_mapped += 1

        if nn == 1:  # FW-RC mapped to + strand
            FR = "+FR"
            mapped_strand_1 = "+"
            mapped_strand_2 = "+"
        elif nn == 2:  # FW-RC mapped to - strand
            FR = "-FR"
            # Locations were reported on the reverse-complemented
            # reference; mirror them back onto forward coordinates.
            mapped_location_1 = chr_length - mapped_location_1 - g_len_1
            mapped_strand_1 = "-"
            mapped_location_2 = chr_length - mapped_location_2 - g_len_2
            mapped_strand_2 = "-"

        origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
        origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)

        r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
        r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)

        # Mismatches in the alignment (soft-clipped bases deliberately
        # not counted — see project history).
        N_mismatch_1 = N_MIS(r_aln_1, g_aln_1)
        N_mismatch_2 = N_MIS(r_aln_2, g_aln_2)

        if max(N_mismatch_1, N_mismatch_2) <= max_mm:

            numbers_mapped_lst[nn-1] += 1
            all_mapped_passed += 1

            # Remove the pair from the read pools; whatever remains in
            # original_bs_reads_* afterwards is counted as unmapped.
            del original_bs_reads_1[header]
            del original_bs_reads_2[header]

            #---------------------------------------
            methy_1 = methy_seq(r_aln_1, g_aln_1 + next_1)
            methy_2 = methy_seq(r_aln_2, g_aln_2 + next_2)
            mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
            mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)

            #---- XS FILTER: flag mates with too many unconverted
            #---- non-CpG cytosines (likely conversion failure).
            # nmCH_* > XS_count guarantees a non-zero denominator below.
            XS_1 = 0
            nCH_1 = methy_1.count('y') + methy_1.count('z')
            nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
            if nmCH_1 > XS_count and nmCH_1 / float(nCH_1 + nmCH_1) > XS_pct:
                XS_1 = 1
            XS_2 = 0
            nCH_2 = methy_2.count('y') + methy_2.count('z')
            nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
            if nmCH_2 > XS_count and nmCH_2 / float(nCH_2 + nmCH_2) > XS_pct:
                XS_2 = 1

            # Each mate is stored with the other mate's coordinates as
            # its RNEXT/PNEXT fields.
            outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
                          output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
            outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
                          output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
|
|
944
|
|
945 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
|
|
946 #----------------------------------------------------------------
|
|
947 # output unmapped pairs
|
|
948 #----------------------------------------------------------------
|
|
949
|
|
950 unmapped_lst=original_bs_reads_1.keys()
|
|
951 unmapped_lst.sort()
|
|
952
|
|
953 # for u in unmapped_lst:
|
|
954 # outf_u1.write("%s\n"%(original_bs_reads_1[u]))
|
|
955 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) )
|
|
956
|
|
957 all_unmapped+=len(unmapped_lst)
|
|
958
|
|
959
|
|
#==================================================================================================
# All input files processed: clean up and write the run summary.
delete_files(tmp_path)

logm("-------------------------------- " )
logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) )
logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n")

if all_raw_reads > 0:

    logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n")

    # One (pair-orientation, strand) row per counter slot; asktag "Y"
    # reports both orientations, "N" only FW-RC.  Any other tag logs
    # nothing, as before.
    if asktag == "Y":
        labels = (("FW-RC", "Watson"), ("RC-FW", "Watson"), ("FW-RC", "Crick"), ("RC-FW", "Crick"))
    elif asktag == "N":
        labels = (("FW-RC", "Watson"), ("FW-RC", "Crick"))
    else:
        labels = ()

    for i, (ends, strand) in enumerate(labels):
        logm("O -- %7d %s pairs mapped to %s strand (before post-filtering)" % (numbers_premapped_lst[i], ends, strand))

    logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
    logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )

    for i, (ends, strand) in enumerate(labels):
        logm("O ----- %7d %s pairs mapped to %s strand" % (numbers_mapped_lst[i], ends, strand))

    logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
    logm("O Unmapped read pairs: %d"% all_unmapped+"\n")

# Methylation-call totals per context (methylated + unmethylated).
n_CG = mC_lst[0] + uC_lst[0]
n_CHG = mC_lst[1] + uC_lst[1]
n_CHH = mC_lst[2] + uC_lst[2]

logm("-------------------------------- " )
logm("Methylated C in mapped reads " )
# Guard against empty contexts to avoid division by zero.
logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) )
logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) )
logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) )

logm("----------------------------------------------" )
logm("------------------- END ----------------------" )
elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))
|