0
|
1 import fileinput, os, time, random, math
|
|
2 from bs_utils.utils import *
|
|
3 from bs_align_utils import *
|
|
4
|
|
5 #----------------------------------------------------------------
|
|
6 # Read from the mapped results, return lists of unique / multiple-hit reads
|
|
7 # The function suppose at most 2 hits will be reported in single file
|
|
def extract_mapping(ali_file):
    """Read aligner output and split reads into unique / multiple hits.

    Assumes the aligner reports at most 2 hits per read in a single file,
    with all hits for one read id appearing on consecutive lines.

    Parameters
    ----------
    ali_file : path of the aligner output parsed by process_aligner_output().

    Returns
    -------
    (unique_hits, non_unique_hits) :
        unique_hits maps read id -> (no_mismatch, chr, location, cigar)
        for the single best hit; non_unique_hits maps read id -> the
        (tied) minimum mismatch count for reads with ambiguous best hits.
    """
    unique_hits = {}
    non_unique_hits = {}

    header0 = ""
    lst = []

    def _flush(header, hits):
        # Classify the hits accumulated for one read id.
        if len(hits) == 1:
            # single hit -> unique by definition
            unique_hits[header] = hits[0]
        elif len(hits) > 1:
            best = min(hits, key=lambda x: x[0])
            worst = max(hits, key=lambda x: x[0])
            if best[0] < worst[0]:
                # a strictly best hit exists -> treat as unique
                unique_hits[header] = best
            else:
                # all hits tie on mismatch count -> ambiguous
                non_unique_hits[header] = best[0]

    for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):
        if header != header0:
            # a new read id starts: flush the previous read's hits
            _flush(header0, lst)
            header0 = header
            lst = [(no_mismatch, chr, location, cigar)]
        else:  # same read id: accumulate
            lst.append((no_mismatch, chr, location, cigar))

    # flush the hits of the last read in the file
    _flush(header0, lst)

    return unique_hits, non_unique_hits
|
|
48
|
|
49
|
|
50 def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines,
|
|
51 max_mismatch_no, aligner_command, db_path, tmp_path, outfile,
|
|
52 XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
|
|
53 #----------------------------------------------------------------
|
|
54 # adapter : strand-specific or not
|
|
55 adapter=""
|
|
56 adapter_fw=""
|
|
57 adapter_rc=""
|
|
58 if adapter_file !="":
|
|
59 try :
|
|
60 adapter_inf=open(adapter_file,"r")
|
|
61 except IOError:
|
|
62 print "[Error] Cannot open adapter file : %s" % adapter_file
|
|
63 exit(-1)
|
|
64 if asktag == "N": #<--- directional library
|
|
65 adapter=adapter_inf.readline()
|
|
66 adapter_inf.close()
|
|
67 adapter=adapter.rstrip("\n")
|
|
68 elif asktag == "Y":#<--- un-directional library
|
|
69 adapter_fw=adapter_inf.readline()
|
|
70 adapter_rc=adapter_inf.readline()
|
|
71 adapter_inf.close()
|
|
72 adapter_fw=adapter_fw.rstrip("\n")
|
|
73 adapter_rc=adapter_rc.rstrip("\n")
|
|
74 adapter_inf.close()
|
|
75 #----------------------------------------------------------------
|
|
76
|
|
77
|
|
78
|
|
79 #----------------------------------------------------------------
|
|
80 logm("Read filename: %s"% main_read_file )
|
|
81 logm("Un-directional library: %s" % asktag )
|
|
82 logm("The first base (for mapping): %d" % cut1)
|
|
83 logm("The last base (for mapping): %d" % cut2)
|
|
84 logm("Max. lines per mapping: %d"% no_small_lines)
|
|
85 logm("Aligner: %s" % aligner_command)
|
|
86 logm("Reference genome library path: %s" % db_path )
|
|
87 logm("Number of mismatches allowed: %s" % max_mismatch_no )
|
|
88 if adapter_file !="":
|
|
89 if asktag=="N":
|
|
90 logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n")))
|
|
91 elif asktag=="Y":
|
|
92 logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) )
|
|
93 logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) )
|
|
94 #----------------------------------------------------------------
|
|
95
|
|
96 # helper method to join fname with tmp_path
|
|
97 tmp_d = lambda fname: os.path.join(tmp_path, fname)
|
|
98
|
|
99 db_d = lambda fname: os.path.join(db_path, fname)
|
|
100
|
|
101 #----------------------------------------------------------------
|
|
102 # splitting the big read file
|
|
103
|
|
104 input_fname = os.path.split(main_read_file)[1]
|
|
105
|
|
106 # split_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines)
|
|
107 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path)
|
|
108 # if splitted_file.startswith("%s-s-" % input_fname))
|
|
109
|
|
110 #---- Stats ------------------------------------------------------------
|
|
111 all_raw_reads=0
|
|
112 all_trimed=0
|
|
113 all_mapped=0
|
|
114 all_mapped_passed=0
|
|
115
|
|
116 numbers_premapped_lst=[0,0,0,0]
|
|
117 numbers_mapped_lst=[0,0,0,0]
|
|
118
|
|
119 mC_lst=[0,0,0]
|
|
120 uC_lst=[0,0,0]
|
|
121
|
|
122
|
|
123 no_my_files=0
|
|
124
|
|
125 #----------------------------------------------------------------
|
|
126 logm("== Start mapping ==")
|
|
127
|
|
128 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
|
|
129 # for read_file in my_files:
|
|
130 original_bs_reads = {}
|
|
131 no_my_files+=1
|
|
132 random_id = ".tmp-"+str(random.randint(1000000,9999999))
|
|
133
|
|
134 #-------------------------------------------------------------------
|
|
135 # undirectional sequencing
|
|
136 #-------------------------------------------------------------------
|
|
137 if asktag=="Y":
|
|
138
|
|
139 #----------------------------------------------------------------
|
|
140 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
|
|
141 outfile3=tmp_d('Trimmed_G2A.fa'+random_id)
|
|
142
|
|
143 outf2=open(outfile2,'w')
|
|
144 outf3=open(outfile3,'w')
|
|
145
|
|
146 #----------------------------------------------------------------
|
|
147 # detect format of input file
|
|
148 try :
|
|
149 read_inf=open(read_file,"r")
|
|
150 except IOError :
|
|
151 print "[Error] Cannot open input file : %s" % read_file
|
|
152 exit(-1)
|
|
153
|
|
154 oneline=read_inf.readline()
|
|
155 l=oneline.split()
|
|
156 input_format=""
|
|
157 if oneline[0]=="@": # fastq
|
|
158 input_format="fastq"
|
|
159 n_fastq=0
|
|
160 elif len(l)==1 and oneline[0]!=">": # pure sequences
|
|
161 input_format="seq"
|
|
162 elif len(l)==11: # qseq
|
|
163 input_format="qseq"
|
|
164 elif oneline[0]==">": # fasta
|
|
165 input_format="fasta"
|
|
166 n_fasta=0
|
|
167 read_inf.close()
|
|
168
|
|
169 #----------------------------------------------------------------
|
|
170 # read sequence, remove adapter and convert
|
|
171 read_id=""
|
|
172 seq=""
|
|
173 seq_ready="N"
|
|
174 for line in fileinput.input(read_file):
|
|
175 l=line.split()
|
|
176
|
|
177 if input_format=="seq":
|
|
178 all_raw_reads+=1
|
|
179 read_id=str(all_raw_reads)
|
|
180 read_id=read_id.zfill(12)
|
|
181 seq=l[0]
|
|
182 seq_ready="Y"
|
|
183 elif input_format=="fastq":
|
|
184 m_fastq=math.fmod(n_fastq,4)
|
|
185 n_fastq+=1
|
|
186 seq_ready="N"
|
|
187 if m_fastq==0:
|
|
188 all_raw_reads+=1
|
|
189 read_id=str(all_raw_reads)
|
|
190 read_id=read_id.zfill(12)
|
|
191 seq=""
|
|
192 elif m_fastq==1:
|
|
193 seq=l[0]
|
|
194 seq_ready="Y"
|
|
195 else:
|
|
196 seq=""
|
|
197 elif input_format=="qseq":
|
|
198 all_raw_reads+=1
|
|
199 read_id=str(all_raw_reads)
|
|
200 read_id=read_id.zfill(12)
|
|
201 seq=l[8]
|
|
202 seq_ready="Y"
|
|
203 elif input_format=="fasta":
|
|
204 m_fasta=math.fmod(n_fasta,2)
|
|
205 n_fasta+=1
|
|
206 seq_ready="N"
|
|
207 if m_fasta==0:
|
|
208 all_raw_reads+=1
|
|
209 #read_id=str(all_raw_reads)
|
|
210 read_id=l[0][1:]
|
|
211 seq=""
|
|
212 elif m_fasta==1:
|
|
213 seq=l[0]
|
|
214 seq_ready="Y"
|
|
215 else:
|
|
216 seq=""
|
|
217
|
|
218 #----------------------------------------------------------------
|
|
219 if seq_ready=="Y":
|
|
220 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52
|
|
221 seq=seq.upper()
|
|
222 seq=seq.replace(".","N")
|
|
223
|
|
224 # striping BS adapter from 3' read
|
|
225 if (adapter_fw !="") and (adapter_rc !="") :
|
|
226 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
|
|
227 new_read = Remove_5end_Adapter(new_read, adapter_rc)
|
|
228 if len(new_read) < len(seq) :
|
|
229 all_trimed += 1
|
|
230 seq = new_read
|
|
231
|
|
232 if len(seq)<=4:
|
|
233 seq=''.join(["N" for x in xrange(cut2-cut1+1)])
|
|
234
|
|
235 #--------- trimmed_raw_BS_read ------------------
|
|
236 original_bs_reads[read_id] = seq
|
|
237
|
|
238 #--------- FW_C2T ------------------
|
|
239 outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
|
|
240 #--------- RC_G2A ------------------
|
|
241 outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
|
|
242
|
|
243 fileinput.close()
|
|
244
|
|
245 outf2.close()
|
|
246 outf3.close()
|
|
247
|
|
248 delete_files(read_file)
|
|
249
|
|
250 #--------------------------------------------------------------------------------
|
|
251 # Bowtie mapping
|
|
252 #-------------------------------------------------------------------------------
|
|
253 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
|
|
254 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
|
|
255 WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
|
|
256 CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
|
|
257
|
|
258 # print aligner_command % {'int_no_mismatches' : int_no_mismatches,
|
|
259 # 'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
260 # 'input_file' : outfile2,
|
|
261 # 'output_file' : WC2T}
|
|
262
|
|
263 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
264 'input_file' : outfile2,
|
|
265 'output_file' : WC2T},
|
|
266
|
|
267 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
268 'input_file' : outfile2,
|
|
269 'output_file' : CC2T},
|
|
270
|
|
271 aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
|
|
272 'input_file' : outfile3,
|
|
273 'output_file' : WG2A},
|
|
274
|
|
275 aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
|
|
276 'input_file' : outfile3,
|
|
277 'output_file' : CG2A} ])
|
|
278
|
|
279
|
|
280 delete_files(outfile2, outfile3)
|
|
281
|
|
282
|
|
283 #--------------------------------------------------------------------------------
|
|
284 # Post processing
|
|
285 #--------------------------------------------------------------------------------
|
|
286
|
|
287 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
|
|
288 RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)
|
|
289
|
|
290 FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
|
|
291 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
|
|
292
|
|
293 #----------------------------------------------------------------
|
|
294 # get unique-hit reads
|
|
295 #----------------------------------------------------------------
|
|
296 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
|
|
297
|
|
298 Unique_FW_C2T=set() # +
|
|
299 Unique_RC_G2A=set() # +
|
|
300 Unique_FW_G2A=set() # -
|
|
301 Unique_RC_C2T=set() # -
|
|
302 Multiple_hits=set()
|
|
303
|
|
304
|
|
305 for x in Union_set:
|
|
306 _list=[]
|
|
307 for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
|
|
308 mis_lst=d.get(x,[99])
|
|
309 mis=int(mis_lst[0])
|
|
310 _list.append(mis)
|
|
311 for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
|
|
312 mis=d.get(x,99)
|
|
313 _list.append(mis)
|
|
314 mini=min(_list)
|
|
315 if _list.count(mini) == 1:
|
|
316 mini_index=_list.index(mini)
|
|
317 if mini_index == 0:
|
|
318 Unique_FW_C2T.add(x)
|
|
319 elif mini_index == 1:
|
|
320 Unique_RC_G2A.add(x)
|
|
321 elif mini_index == 2:
|
|
322 Unique_FW_G2A.add(x)
|
|
323 elif mini_index == 3:
|
|
324 Unique_RC_C2T.add(x)
|
|
325 # if mini_index = 4,5,6,7, indicating multiple hits
|
|
326 else :
|
|
327 Multiple_hits.add(x)
|
|
328 else :
|
|
329 Multiple_hits.add(x)
|
|
330 # write reads rejected by Multiple Hits to file
|
|
331 if show_multiple_hit :
|
|
332 outf_MH=open("Multiple_hit.fa",'w')
|
|
333 for i in Multiple_hits :
|
|
334 outf_MH.write(">%s\n" % i)
|
|
335 outf_MH.write("%s\n" % original_bs_reads[i])
|
|
336 outf_MH.close()
|
|
337
|
|
338 del Union_set
|
|
339 del FW_C2T_R
|
|
340 del FW_G2A_R
|
|
341 del RC_C2T_R
|
|
342 del RC_G2A_R
|
|
343
|
|
344 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
|
|
345 FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
|
|
346 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
|
|
347 RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
|
|
348 FW_C2T_uniq_lst.sort()
|
|
349 RC_C2T_uniq_lst.sort()
|
|
350 FW_G2A_uniq_lst.sort()
|
|
351 RC_G2A_uniq_lst.sort()
|
|
352 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
|
|
353 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
|
|
354 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
|
|
355 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
|
|
356
|
|
357 del Unique_FW_C2T
|
|
358 del Unique_FW_G2A
|
|
359 del Unique_RC_C2T
|
|
360 del Unique_RC_G2A
|
|
361
|
|
362 #----------------------------------------------------------------
|
|
363 numbers_premapped_lst[0] += len(Unique_FW_C2T)
|
|
364 numbers_premapped_lst[1] += len(Unique_RC_G2A)
|
|
365 numbers_premapped_lst[2] += len(Unique_FW_G2A)
|
|
366 numbers_premapped_lst[3] += len(Unique_RC_C2T)
|
|
367
|
|
368
|
|
369 #----------------------------------------------------------------
|
|
370
|
|
371 nn=0
|
|
372 gseq = dict()
|
|
373 chr_length = dict()
|
|
374 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),
|
|
375 (RC_G2A_uniq_lst,RC_G2A_U),
|
|
376 (FW_G2A_uniq_lst,FW_G2A_U),
|
|
377 (RC_C2T_uniq_lst,RC_C2T_U)]:
|
|
378 nn += 1
|
|
379 mapped_chr0 = ""
|
|
380
|
|
381 for header in ali_unique_lst:
|
|
382
|
|
383 _, mapped_chr, mapped_location, cigar = ali_dic[header]
|
|
384
|
|
385 original_BS = original_bs_reads[header]
|
|
386 #-------------------------------------
|
|
387 if mapped_chr not in gseq:
|
|
388 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
|
|
389 chr_length[mapped_chr] = len(gseq[mapped_chr])
|
|
390
|
|
391 if nn == 2 or nn == 3:
|
|
392 cigar = list(reversed(cigar))
|
|
393 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
|
|
394
|
|
395
|
|
396 all_mapped += 1
|
|
397
|
|
398 if nn == 1: # +FW mapped to + strand:
|
|
399 FR = "+FW"
|
|
400 mapped_strand="+"
|
|
401
|
|
402 elif nn == 2: # +RC mapped to + strand:
|
|
403 FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+)
|
|
404 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
|
|
405 mapped_strand = "+"
|
|
406 original_BS = reverse_compl_seq(original_BS) # for RC reads
|
|
407
|
|
408 elif nn == 3: # -RC mapped to - strand:
|
|
409 mapped_strand = "-"
|
|
410 FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-)
|
|
411 original_BS = reverse_compl_seq(original_BS) # for RC reads
|
|
412
|
|
413 elif nn == 4: # -FW mapped to - strand:
|
|
414 mapped_strand = "-"
|
|
415 FR = "-FW"
|
|
416 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
|
|
417
|
|
418 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
|
|
419
|
|
420 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
|
|
421
|
|
422
|
|
423 if len(r_aln)==len(g_aln):
|
|
424 N_mismatch = N_MIS(r_aln, g_aln)
|
|
425 if N_mismatch <= int(max_mismatch_no):
|
|
426 numbers_mapped_lst[nn-1] += 1
|
|
427 all_mapped_passed += 1
|
|
428 methy = methy_seq(r_aln, g_aln + next)
|
|
429 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
|
|
430
|
|
431 #---XS FILTER----------------
|
|
432 XS = 0
|
|
433 nCH = methy.count('y') + methy.count('z')
|
|
434 nmCH = methy.count('Y') + methy.count('Z')
|
|
435 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
|
|
436 XS = 1
|
|
437
|
|
438 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
|
|
439
|
|
440 #----------------------------------------------------------------
|
|
441 logm("--> %s (%d) "%(read_file, no_my_files))
|
|
442 delete_files(WC2T, WG2A, CC2T, CG2A)
|
|
443
|
|
444
|
|
445
|
|
446 #--------------------------------------------------------------------
|
|
447 # directional sequencing
|
|
448 #--------------------------------------------------------------------
|
|
449
|
|
450 if asktag=="N":
|
|
451 #----------------------------------------------------------------
|
|
452 outfile2=tmp_d('Trimed_C2T.fa'+random_id)
|
|
453 outf2=open(outfile2,'w')
|
|
454
|
|
455 n=0
|
|
456 #----------------------------------------------------------------
|
|
457 try :
|
|
458 read_inf=open(read_file,"r")
|
|
459 except IOError :
|
|
460 print "[Error] Cannot open input file : %s" % read_file
|
|
461 exit(-1)
|
|
462
|
|
463 oneline=read_inf.readline()
|
|
464 l=oneline.split()
|
|
465 input_format=""
|
|
466 if oneline[0]=="@": # FastQ
|
|
467 input_format="fastq"
|
|
468 n_fastq=0
|
|
469 elif len(l)==1 and oneline[0]!=">": # pure sequences
|
|
470 input_format="seq"
|
|
471 elif len(l)==11: # Illumina GAII qseq file
|
|
472 input_format="qseq"
|
|
473 elif oneline[0]==">": # fasta
|
|
474 input_format="fasta"
|
|
475 n_fasta=0
|
|
476 read_inf.close()
|
|
477 #print "detected data format: %s"%(input_format)
|
|
478 #----------------------------------------------------------------
|
|
479 read_id=""
|
|
480 seq=""
|
|
481 seq_ready="N"
|
|
482 for line in fileinput.input(read_file):
|
|
483 l=line.split()
|
|
484 if input_format=="seq":
|
|
485 all_raw_reads+=1
|
|
486 read_id=str(all_raw_reads)
|
|
487 read_id=read_id.zfill(12)
|
|
488 seq=l[0]
|
|
489 seq_ready="Y"
|
|
490 elif input_format=="fastq":
|
|
491 m_fastq=math.fmod(n_fastq,4)
|
|
492 n_fastq+=1
|
|
493 seq_ready="N"
|
|
494 if m_fastq==0:
|
|
495 all_raw_reads+=1
|
|
496 read_id=str(all_raw_reads)
|
|
497 read_id=read_id.zfill(12)
|
|
498 seq=""
|
|
499 elif m_fastq==1:
|
|
500 seq=l[0]
|
|
501 seq_ready="Y"
|
|
502 else:
|
|
503 seq=""
|
|
504 elif input_format=="qseq":
|
|
505 all_raw_reads+=1
|
|
506 read_id=str(all_raw_reads)
|
|
507 read_id=read_id.zfill(12)
|
|
508 seq=l[8]
|
|
509 seq_ready="Y"
|
|
510 elif input_format=="fasta":
|
|
511 m_fasta=math.fmod(n_fasta,2)
|
|
512 n_fasta+=1
|
|
513 seq_ready="N"
|
|
514 if m_fasta==0:
|
|
515 all_raw_reads+=1
|
|
516 read_id=l[0][1:]
|
|
517 seq=""
|
|
518 elif m_fasta==1:
|
|
519 seq=l[0]
|
|
520 seq_ready="Y"
|
|
521 else:
|
|
522 seq=""
|
|
523
|
|
524 #--------------------------------
|
|
525 if seq_ready=="Y":
|
|
526 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52
|
|
527 seq=seq.upper()
|
|
528 seq=seq.replace(".","N")
|
|
529
|
|
530 #--striping adapter from 3' read -------
|
|
531 if adapter != "":
|
|
532 new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
|
|
533 if len(new_read) < len(seq) :
|
|
534 all_trimed += 1
|
|
535 seq = new_read
|
|
536
|
|
537 if len(seq)<=4:
|
|
538 seq = "N" * (cut2-cut1+1)
|
|
539
|
|
540 #--------- trimmed_raw_BS_read ------------------
|
|
541 original_bs_reads[read_id] = seq
|
|
542
|
|
543
|
|
544 #--------- FW_C2T ------------------
|
|
545 outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
|
|
546
|
|
547 fileinput.close()
|
|
548
|
|
549 outf2.close()
|
|
550 delete_files(read_file)
|
|
551
|
|
552 #--------------------------------------------------------------------------------
|
|
553 # Bowtie mapping
|
|
554 #--------------------------------------------------------------------------------
|
|
555 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
|
|
556 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
|
|
557
|
|
558 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
559 'input_file' : outfile2,
|
|
560 'output_file' : WC2T},
|
|
561 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
562 'input_file' : outfile2,
|
|
563 'output_file' : CC2T} ])
|
|
564
|
|
565 delete_files(outfile2)
|
|
566
|
|
567 #--------------------------------------------------------------------------------
|
|
568 # Post processing
|
|
569 #--------------------------------------------------------------------------------
|
|
570
|
|
571
|
|
572 FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
|
|
573 RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)
|
|
574
|
|
575 #----------------------------------------------------------------
|
|
576 # get uniq-hit reads
|
|
577 #----------------------------------------------------------------
|
|
578 Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())
|
|
579
|
|
580 Unique_FW_C2T = set() # +
|
|
581 Unique_RC_C2T = set() # -
|
|
582 Multiple_hits=set()
|
|
583 # write reads rejected by Multiple Hits to file
|
|
584
|
|
585 for x in Union_set:
|
|
586 _list=[]
|
|
587 for d in [FW_C2T_U,RC_C2T_U]:
|
|
588 mis_lst=d.get(x,[99])
|
|
589 mis=int(mis_lst[0])
|
|
590 _list.append(mis)
|
|
591 for d in [FW_C2T_R,RC_C2T_R]:
|
|
592 mis=d.get(x,99)
|
|
593 _list.append(mis)
|
|
594 mini=min(_list)
|
|
595 #print _list
|
|
596 if _list.count(mini)==1:
|
|
597 mini_index=_list.index(mini)
|
|
598 if mini_index==0:
|
|
599 Unique_FW_C2T.add(x)
|
|
600 elif mini_index==1:
|
|
601 Unique_RC_C2T.add(x)
|
|
602 else:
|
|
603 Multiple_hits.add(x)
|
|
604 else :
|
|
605 Multiple_hits.add(x)
|
|
606 # write reads rejected by Multiple Hits to file
|
|
607 if show_multiple_hit :
|
|
608 outf_MH=open("Multiple_hit.fa",'w')
|
|
609 for i in Multiple_hits :
|
|
610 outf_MH.write(">%s\n" % i)
|
|
611 outf_MH.write("%s\n" % original_bs_reads[i])
|
|
612 outf_MH.close()
|
|
613
|
|
614
|
|
615
|
|
616 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
|
|
617 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
|
|
618 FW_C2T_uniq_lst.sort()
|
|
619 RC_C2T_uniq_lst.sort()
|
|
620 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
|
|
621 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
|
|
622
|
|
623
|
|
624 #----------------------------------------------------------------
|
|
625
|
|
626 numbers_premapped_lst[0] += len(Unique_FW_C2T)
|
|
627 numbers_premapped_lst[1] += len(Unique_RC_C2T)
|
|
628
|
|
629 #----------------------------------------------------------------
|
|
630
|
|
631 nn = 0
|
|
632 gseq = dict()
|
|
633 chr_length = dict()
|
|
634 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
|
|
635 nn += 1
|
|
636 mapped_chr0 = ""
|
|
637 for header in ali_unique_lst:
|
|
638 _, mapped_chr, mapped_location, cigar = ali_dic[header]
|
|
639 original_BS = original_bs_reads[header]
|
|
640 #-------------------------------------
|
|
641 if mapped_chr not in gseq :
|
|
642 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
|
|
643 chr_length[mapped_chr] = len(gseq[mapped_chr])
|
|
644 #if mapped_chr != mapped_chr0:
|
|
645 # my_gseq = deserialize(db_d(mapped_chr))
|
|
646 # chr_length = len(my_gseq)
|
|
647 # mapped_chr0 = mapped_chr
|
|
648 #-------------------------------------
|
|
649
|
|
650 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
|
|
651
|
|
652 all_mapped+=1
|
|
653 if nn == 1: # +FW mapped to + strand:
|
|
654 FR = "+FW"
|
|
655 mapped_strand = "+"
|
|
656 elif nn == 2: # -FW mapped to - strand:
|
|
657 mapped_strand = "-"
|
|
658 FR = "-FW"
|
|
659 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
|
|
660
|
|
661
|
|
662 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
|
|
663 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
|
|
664
|
|
665 if len(r_aln) == len(g_aln):
|
|
666 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides
|
|
667 if N_mismatch <= int(max_mismatch_no):
|
|
668 numbers_mapped_lst[nn-1] += 1
|
|
669 all_mapped_passed += 1
|
|
670 methy = methy_seq(r_aln, g_aln+next)
|
|
671 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
|
|
672
|
|
673 #---XS FILTER----------------
|
|
674 XS = 0
|
|
675 nCH = methy.count('y') + methy.count('z')
|
|
676 nmCH = methy.count('Y') + methy.count('Z')
|
|
677 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
|
|
678 XS = 1
|
|
679
|
|
680 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
|
|
681
|
|
682 #----------------------------------------------------------------
|
|
683 logm("--> %s (%d) "%(read_file,no_my_files))
|
|
684 delete_files(WC2T, CC2T)
|
|
685
|
|
686
|
|
687 #----------------------------------------------------------------
|
|
688
|
|
689 # outf.close()
|
|
690 delete_files(tmp_path)
|
|
691
|
|
692 logm("Number of raw reads: %d \n"% all_raw_reads)
|
|
693 if all_raw_reads >0:
|
|
694 logm("Number of reads having adapter removed: %d \n" % all_trimed )
|
|
695 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
|
|
696 logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped)
|
|
697 if asktag=="Y":
|
|
698 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
|
|
699 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
|
|
700 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
|
|
701 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
|
|
702 elif asktag=="N":
|
|
703 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
|
|
704 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
|
|
705
|
|
706 logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
|
|
707 if asktag=="Y":
|
|
708 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
|
|
709 logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) )
|
|
710 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) )
|
|
711 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) )
|
|
712 elif asktag=="N":
|
|
713 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
|
|
714 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )
|
|
715 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
|
|
716
|
|
717 n_CG=mC_lst[0]+uC_lst[0]
|
|
718 n_CHG=mC_lst[1]+uC_lst[1]
|
|
719 n_CHH=mC_lst[2]+uC_lst[2]
|
|
720
|
|
721 logm("----------------------------------------------" )
|
|
722 logm("Methylated C in mapped reads ")
|
|
723
|
|
724 logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
|
|
725 logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
|
|
726 logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
|
|
727
|
|
728 logm("------------------- END --------------------" )
|
|
729 elapsed("=== END %s ===" % main_read_file)
|
|
730
|
|
731
|
|
732
|