0
|
1 import fileinput, os, time, random, math
|
|
2 from bs_utils.utils import *
|
|
3 from bs_align_utils import *
|
1
|
4 import gzip
|
0
|
5
|
|
6 #----------------------------------------------------------------
|
|
7 # Read from the mapped results, return lists of unique / multiple-hit reads
|
|
# The function assumes at most 2 hits will be reported in a single file
|
|
9 def extract_mapping(ali_file):
|
|
10 unique_hits = {}
|
|
11 non_unique_hits = {}
|
|
12
|
|
13 header0 = ""
|
|
14 lst = []
|
|
15
|
|
16 for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):
|
|
17 #------------------------------
|
|
18 if header != header0:
|
|
19 #---------- output -----------
|
|
20 if len(lst) == 1:
|
|
21 unique_hits[header0] = lst[0] # [no_mismatch, chr, location]
|
|
22 elif len(lst) > 1:
|
|
23 min_lst = min(lst, key = lambda x: x[0])
|
|
24 max_lst = max(lst, key = lambda x: x[0])
|
|
25
|
|
26 if min_lst[0] < max_lst[0]:
|
|
27 unique_hits[header0] = min_lst
|
|
28 else:
|
|
29 non_unique_hits[header0] = min_lst[0]
|
|
30 #print "multiple hit", header, chr, location, no_mismatch, cigar # test
|
|
31 header0 = header
|
|
32 lst = [(no_mismatch, chr, location, cigar)]
|
|
33 else: # header == header0, same header (read id)
|
|
34 lst.append((no_mismatch, chr, location, cigar))
|
|
35
|
|
36 if len(lst) == 1:
|
|
37 unique_hits[header0] = lst[0] # [no_mismatch, chr, location]
|
|
38 elif len(lst) > 1:
|
|
39 min_lst = min(lst, key = lambda x: x[0])
|
|
40 max_lst = max(lst, key = lambda x: x[0])
|
|
41
|
|
42 if min_lst[0] < max_lst[0]:
|
|
43 unique_hits[header0] = min_lst
|
|
44 else:
|
|
45 non_unique_hits[header0] = min_lst[0]
|
|
46
|
|
47
|
|
48 return unique_hits, non_unique_hits
|
|
49
|
|
50
|
|
def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines,
                  max_mismatch_no, aligner_command, db_path, tmp_path, outfile,
                  XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
    """Map single-end bisulfite reads and store passing alignments via `outfile`.

    Flow: split the input read file into chunks; for each chunk trim adapters,
    C->T convert the reads (plus G->A for un-directional libraries), run the
    external aligner against the converted genomes in parallel, keep only reads
    with a single best hit, reconstruct the alignment against the original
    genome, call methylation, and write each passing read with outfile.store().

    asktag  -- "Y": un-directional library (4 mapping strands);
               "N": directional library (2 mapping strands).
    cut1, cut2 -- 1-based first/last base of each read used for mapping.
    max_mismatch_no -- (string) mismatch limit; interpreted below as absolute
               count when >= 1, else as a fraction of the aligned read length.
    XS_pct, XS_count -- thresholds for flagging reads as XS=1 (likely
               incomplete bisulfite conversion).

    Relies on helpers from bs_utils/bs_align_utils (logm, isplit_file,
    RemoveAdapter, run_in_parallel, deserialize, methy_seq, mcounts, ...);
    their exact contracts are not visible in this file.
    """
    #----------------------------------------------------------------
    # adapter : strand-specific or not
    adapter=""
    adapter_fw=""
    adapter_rc=""
    if adapter_file !="":
        try :
            adapter_inf=open(adapter_file,"r")
        except IOError:
            print "[Error] Cannot open adapter file : %s" % adapter_file
            exit(-1)
        if asktag == "N": #<--- directional library
            adapter=adapter_inf.readline()
            adapter_inf.close()
            adapter=adapter.rstrip("\n")
        elif asktag == "Y":#<--- un-directional library
            adapter_fw=adapter_inf.readline()
            adapter_rc=adapter_inf.readline()
            adapter_inf.close()
            adapter_fw=adapter_fw.rstrip("\n")
            adapter_rc=adapter_rc.rstrip("\n")
        adapter_inf.close() # NOTE(review): already closed in both branches above; redundant but harmless
    #----------------------------------------------------------------



    #----------------------------------------------------------------
    logm("Read filename: %s" % main_read_file)
    logm("The first base (for mapping): %d"% cut1 )
    logm("The last base (for mapping): %d"% cut2 )

    logm("Path for short reads aligner: %s"% aligner_command + '\n')
    logm("Reference genome library path: %s"% db_path )

    if asktag == "Y" :
        logm("Un-directional library" )
    else :
        logm("Directional library")

    logm("Number of mismatches allowed: %s"% max_mismatch_no )

    if adapter_file !="":
        # NOTE(review): adapter_fw is only filled for asktag == "Y"; for a
        # directional library the sequence lives in `adapter` and this logs
        # an empty string -- confirm whether that is intended.
        logm("Adapter seq: %s" % adapter_fw)
    logm("-------------------------------- " )
    #----------------------------------------------------------------

    # helper method to join fname with tmp_path
    tmp_d = lambda fname: os.path.join(tmp_path, fname)

    db_d = lambda fname: os.path.join(db_path, fname)

    #----------------------------------------------------------------
    # splitting the big read file

    input_fname = os.path.split(main_read_file)[1]

    # split_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines)
    # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path)
    #                   if splitted_file.startswith("%s-s-" % input_fname))

    #---- Stats ------------------------------------------------------------
    all_raw_reads=0
    all_trimmed=0
    all_mapped=0
    all_mapped_passed=0
    all_base_before_trim=0
    all_base_after_trim=0
    all_base_mapped=0

    # per-strand read counts; index order: FW->Watson, RC->Watson, FW->Crick, RC->Crick
    numbers_premapped_lst=[0,0,0,0]
    numbers_mapped_lst=[0,0,0,0]

    # methylated / un-methylated cytosine counts per context: [CG, CHG, CHH]
    mC_lst=[0,0,0]
    uC_lst=[0,0,0]

    no_my_files=0

    #----------------------------------------------------------------
    logm("== Start mapping ==")

    for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
    # for read_file in my_files:
        original_bs_reads = {}
        no_my_files+=1
        random_id = ".tmp-"+str(random.randint(1000000,9999999))

        #-------------------------------------------------------------------
        # un-directional sequencing
        #-------------------------------------------------------------------
        if asktag=="Y":

            #----------------------------------------------------------------
            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
            outfile3=tmp_d('Trimmed_G2A.fa'+random_id)

            outf2=open(outfile2,'w')
            outf3=open(outfile3,'w')

            #----------------------------------------------------------------
            # detect format of input file
            try :
                if read_file.endswith(".gz") : # support input file ending with ".gz"
                    read_inf = gzip.open(read_file, "rb")
                else :
                    read_inf=open(read_file,"r")
            except IOError :
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            # sniff the format from the first line only, then reopen below
            oneline = read_inf.readline()
            l = oneline.split()
            input_format = ""
            if oneline[0]=="@":
                input_format = "fastq"
            elif len(l)==1 and oneline[0]!=">":
                input_format = "seq"
            elif len(l)==11:
                input_format = "qseq"
            elif oneline[0]==">":
                input_format = "fasta"
            read_inf.close()

            #----------------------------------------------------------------
            # read sequence, remove adapter and convert
            read_id = ""
            seq = ""
            seq_ready = "N"   # becomes "Y" once a complete record (id + sequence) is in hand
            line_no = 0
            for line in fileinput.input(read_file, openhook=fileinput.hook_compressed): # allow input with .gz
                l = line.split()
                line_no += 1
                if input_format=="seq":
                    all_raw_reads += 1
                    read_id = str(all_raw_reads)
                    read_id = read_id.zfill(12)
                    seq = l[0]
                    seq_ready = "Y"
                elif input_format=="fastq":
                    l_fastq = math.fmod(line_no, 4)
                    if l_fastq == 1 :
                        all_raw_reads += 1
                        read_id = l[0][1:]
                        seq_ready = "N"
                    elif l_fastq == 2 :
                        seq = l[0]
                        seq_ready = "Y"
                    else :
                        seq = ""
                        seq_ready = "N"
                elif input_format=="qseq":
                    all_raw_reads += 1
                    read_id = str(all_raw_reads)
                    read_id = read_id.zfill(12)
                    seq = l[8]
                    seq_ready = "Y"
                elif input_format=="fasta" :
                    l_fasta = math.fmod(line_no,2)
                    if l_fasta==1:
                        all_raw_reads += 1
                        read_id = l[0][1:]
                        seq = ""
                        seq_ready = "N"
                    elif l_fasta==0 :
                        seq = l[0]
                        seq_ready = "Y"

                #----------------------------------------------------------------
                if seq_ready=="Y":
                    seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52
                    seq=seq.upper()
                    seq=seq.replace(".","N")

                    # striping BS adapter from 3' read
                    all_base_before_trim += len(seq)
                    if (adapter_fw !="") and (adapter_rc !="") :
                        new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
                        new_read = Remove_5end_Adapter(new_read, adapter_rc)
                        if len(new_read) < len(seq) :
                            all_trimmed += 1
                        seq = new_read
                    all_base_after_trim += len(seq)
                    if len(seq)<=4:
                        # too short after trimming: mask to all-N so it cannot map
                        seq=''.join(["N" for x in xrange(cut2-cut1+1)])

                    #--------- trimmed_raw_BS_read ------------------
                    original_bs_reads[read_id] = seq

                    #--------- FW_C2T ------------------
                    outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
                    #--------- RC_G2A ------------------
                    outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))

            fileinput.close()

            outf2.close()
            outf3.close()

            delete_files(read_file)

            #--------------------------------------------------------------------------------
            # Bowtie mapping
            #-------------------------------------------------------------------------------
            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
            WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
            CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)

            # print aligner_command % {'int_no_mismatches' : int_no_mismatches,
            #                          'reference_genome' : os.path.join(db_path,'W_C2T'),
            #                          'input_file' : outfile2,
            #                          'output_file' : WC2T}

            # map the C2T-converted reads to both converted Watson/Crick genomes,
            # and the G2A-converted reads likewise, all four jobs in parallel
            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : WC2T},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : CC2T},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
                                                 'input_file' : outfile3,
                                                 'output_file' : WG2A},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
                                                 'input_file' : outfile3,
                                                 'output_file' : CG2A} ])


            delete_files(outfile2, outfile3)


            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------

            FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
            RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)

            FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
            RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)

            #----------------------------------------------------------------
            # get unique-hit reads
            #----------------------------------------------------------------
            Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T=set() # +
            Unique_RC_G2A=set() # +
            Unique_FW_G2A=set() # -
            Unique_RC_C2T=set() # -
            Multiple_hits=set()


            # a read is kept only if its best mismatch count is achieved by
            # exactly one of the 8 candidate alignments (4 unique + 4 rejected)
            for x in Union_set:
                _list=[]
                for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
                    mis_lst=d.get(x,[99])   # 99 = sentinel "no hit on this strand"
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
                    mis=d.get(x,99)
                    _list.append(mis)
                mini=min(_list)
                if _list.count(mini) == 1:
                    mini_index=_list.index(mini)
                    if mini_index == 0:
                        Unique_FW_C2T.add(x)
                    elif mini_index == 1:
                        Unique_RC_G2A.add(x)
                    elif mini_index == 2:
                        Unique_FW_G2A.add(x)
                    elif mini_index == 3:
                        Unique_RC_C2T.add(x)
                    # if mini_index = 4,5,6,7, indicating multiple hits
                    else :
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)
            # write reads rejected by Multiple Hits to file
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'w')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            del Union_set
            del FW_C2T_R
            del FW_G2A_R
            del RC_C2T_R
            del RC_G2A_R

            # sort read ids by mapped chromosome so each chromosome is
            # deserialized at most once in the loop below
            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
            FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
            RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
            FW_C2T_uniq_lst.sort()
            RC_C2T_uniq_lst.sort()
            FW_G2A_uniq_lst.sort()
            RC_G2A_uniq_lst.sort()
            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
            FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
            RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
            #----------------------------------------------------------------

            numbers_premapped_lst[0] += len(Unique_FW_C2T)
            numbers_premapped_lst[1] += len(Unique_RC_G2A)
            numbers_premapped_lst[2] += len(Unique_FW_G2A)
            numbers_premapped_lst[3] += len(Unique_RC_C2T)

            del Unique_FW_C2T
            del Unique_FW_G2A
            del Unique_RC_C2T
            del Unique_RC_G2A


            #----------------------------------------------------------------

            nn=0   # strand index: 1=+FW, 2=+RC, 3=-RC, 4=-FW
            gseq = dict()        # chromosome name -> genome sequence (lazy-loaded)
            chr_length = dict()  # chromosome name -> length
            for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),
                                            (RC_G2A_uniq_lst,RC_G2A_U),
                                            (FW_G2A_uniq_lst,FW_G2A_U),
                                            (RC_C2T_uniq_lst,RC_C2T_U)]:
                nn += 1

                for header in ali_unique_lst:

                    _, mapped_chr, mapped_location, cigar = ali_dic[header]

                    original_BS = original_bs_reads[header]
                    #-------------------------------------
                    if mapped_chr not in gseq:
                        gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                        chr_length[mapped_chr] = len(gseq[mapped_chr])

                    # strands 2 and 3 were mapped against reversed references,
                    # so the CIGAR must be reversed before interpreting it
                    if nn == 2 or nn == 3:
                        cigar = list(reversed(cigar))
                    r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)


                    all_mapped += 1

                    if nn == 1: # +FW mapped to + strand:
                        FR = "+FW"
                        mapped_strand="+"

                    elif nn == 2: # +RC mapped to + strand:
                        FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+)
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len
                        mapped_strand = "+"
                        original_BS = reverse_compl_seq(original_BS) # for RC reads

                    elif nn == 3: # -RC mapped to - strand:
                        mapped_strand = "-"
                        FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-)
                        original_BS = reverse_compl_seq(original_BS) # for RC reads

                    elif nn == 4: # -FW mapped to - strand:
                        mapped_strand = "-"
                        FR = "-FW"
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len

                    # NOTE(review): `next` shadows the Python builtin here
                    origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)

                    r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)


                    if len(r_aln)==len(g_aln):
                        N_mismatch = N_MIS(r_aln, g_aln)
                        # if N_mismatch <= int(max_mismatch_no):
                        # mismatch limit: absolute when >= 1, else fraction of aligned length
                        mm_no=float(max_mismatch_no)
                        if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                            numbers_mapped_lst[nn-1] += 1
                            all_mapped_passed += 1
                            methy = methy_seq(r_aln, g_aln + next)
                            mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                            #---XS FILTER----------------
                            # flag likely incomplete bisulfite conversion:
                            # too many methylated CH ('Y'/'Z') vs unmethylated ('y'/'z')
                            XS = 0
                            nCH = methy.count('y') + methy.count('z')
                            nmCH = methy.count('Y') + methy.count('Z')
                            if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                                XS = 1

                            outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
                            all_base_mapped += len(original_BS)

            #----------------------------------------------------------------
            logm("--> %s (%d) "%(read_file, no_my_files))
            delete_files(WC2T, WG2A, CC2T, CG2A)



        #--------------------------------------------------------------------
        # directional sequencing
        #--------------------------------------------------------------------

        if asktag=="N":
            #----------------------------------------------------------------
            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
            outf2=open(outfile2,'w')
            #----------------------------------------------------------------
            try :
                if read_file.endswith(".gz") : # support input file ending with ".gz"
                    read_inf = gzip.open(read_file, "rb")
                else :
                    read_inf=open(read_file,"r")
            except IOError :
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            # sniff the format from the first line only, then reopen below
            oneline = read_inf.readline()
            l = oneline.split()
            input_format = ""
            if oneline[0]=="@":
                input_format = "fastq"
            elif len(l)==1 and oneline[0]!=">":
                input_format = "seq"
            elif len(l)==11:
                input_format = "qseq"
            elif oneline[0]==">":
                input_format = "fasta"
            read_inf.close()

            #print "detected data format: %s"%(input_format)
            #----------------------------------------------------------------
            read_id=""
            seq=""
            seq_ready="N"
            line_no = 0
            for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
                l = line.split()
                line_no += 1
                if input_format=="seq":
                    all_raw_reads += 1
                    read_id = str(all_raw_reads)
                    read_id = read_id.zfill(12)
                    seq = l[0]
                    seq_ready = "Y"
                elif input_format=="fastq":
                    l_fastq = math.fmod(line_no, 4)
                    if l_fastq == 1 :
                        all_raw_reads += 1
                        read_id = l[0][1:]
                        seq_ready = "N"
                    elif l_fastq == 2 :
                        seq = l[0]
                        seq_ready = "Y"
                    else :
                        seq = ""
                        seq_ready = "N"
                elif input_format=="qseq":
                    all_raw_reads += 1
                    read_id = str(all_raw_reads)
                    read_id = read_id.zfill(12)
                    seq = l[8]
                    seq_ready = "Y"
                elif input_format=="fasta" :
                    l_fasta = math.fmod(line_no,2)
                    if l_fasta==1:
                        all_raw_reads += 1
                        read_id = l[0][1:]
                        seq = ""
                        seq_ready = "N"
                    elif l_fasta==0 :
                        seq = l[0]
                        seq_ready = "Y"
                #--------------------------------
                if seq_ready=="Y":
                    seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52
                    seq=seq.upper()
                    seq=seq.replace(".","N")

                    #--striping adapter from 3' read -------
                    all_base_before_trim += len(seq)
                    if adapter != "":
                        new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
                        if len(new_read) < len(seq) :
                            all_trimmed += 1
                        seq = new_read
                    all_base_after_trim += len(seq)
                    if len(seq)<=4:
                        # too short after trimming: mask to all-N so it cannot map
                        seq = "N" * (cut2-cut1+1)

                    #--------- trimmed_raw_BS_read ------------------
                    original_bs_reads[read_id] = seq


                    #--------- FW_C2T ------------------
                    outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))

            fileinput.close()

            outf2.close()
            delete_files(read_file)

            #--------------------------------------------------------------------------------
            # Bowtie mapping
            #--------------------------------------------------------------------------------
            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)

            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : WC2T},
                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : CC2T} ])

            delete_files(outfile2)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------


            FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
            RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)

            #----------------------------------------------------------------
            # get uniq-hit reads
            #----------------------------------------------------------------
            Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T = set() # +
            Unique_RC_C2T = set() # -
            Multiple_hits=set()
            # write reads rejected by Multiple Hits to file

            # keep a read only if its best mismatch count is achieved by
            # exactly one of the 4 candidate alignments (2 unique + 2 rejected)
            for x in Union_set:
                _list=[]
                for d in [FW_C2T_U,RC_C2T_U]:
                    mis_lst=d.get(x,[99])   # 99 = sentinel "no hit on this strand"
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for d in [FW_C2T_R,RC_C2T_R]:
                    mis=d.get(x,99)
                    _list.append(mis)
                mini=min(_list)
                #print _list
                if _list.count(mini)==1:
                    mini_index=_list.index(mini)
                    if mini_index==0:
                        Unique_FW_C2T.add(x)
                    elif mini_index==1:
                        Unique_RC_C2T.add(x)
                    else:
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)
            # write reads rejected by Multiple Hits to file
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'w')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()


            # sort read ids by mapped chromosome so each chromosome is
            # deserialized at most once in the loop below
            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
            FW_C2T_uniq_lst.sort()
            RC_C2T_uniq_lst.sort()
            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]


            #----------------------------------------------------------------

            numbers_premapped_lst[0] += len(Unique_FW_C2T)
            numbers_premapped_lst[1] += len(Unique_RC_C2T)

            #----------------------------------------------------------------

            nn = 0   # strand index: 1=+FW (Watson), 2=-FW (Crick)
            gseq = dict()        # chromosome name -> genome sequence (lazy-loaded)
            chr_length = dict()  # chromosome name -> length
            for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
                nn += 1
                for header in ali_unique_lst:
                    _, mapped_chr, mapped_location, cigar = ali_dic[header]
                    original_BS = original_bs_reads[header]
                    #-------------------------------------
                    if mapped_chr not in gseq :
                        gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                        chr_length[mapped_chr] = len(gseq[mapped_chr])

                    r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)

                    all_mapped+=1
                    if nn == 1: # +FW mapped to + strand:
                        FR = "+FW"
                        mapped_strand = "+"
                    elif nn == 2: # -FW mapped to - strand:
                        mapped_strand = "-"
                        FR = "-FW"
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len


                    # NOTE(review): `next` shadows the Python builtin here
                    origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
                    r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                    if len(r_aln) == len(g_aln):
                        N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides
                        # mismatch limit: absolute when >= 1, else fraction of aligned length
                        mm_no=float(max_mismatch_no)
                        if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                            numbers_mapped_lst[nn-1] += 1
                            all_mapped_passed += 1
                            methy = methy_seq(r_aln, g_aln+next)
                            mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                            #---XS FILTER----------------
                            # flag likely incomplete bisulfite conversion
                            XS = 0
                            nCH = methy.count('y') + methy.count('z')
                            nmCH = methy.count('Y') + methy.count('Z')
                            if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                                XS = 1

                            outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
                            all_base_mapped += len(original_BS)

            #----------------------------------------------------------------
            logm("--> %s (%d) "%(read_file,no_my_files))
            delete_files(WC2T, CC2T)


    #----------------------------------------------------------------

    delete_files(tmp_path)

    # final summary statistics
    logm("Number of raw reads: %d \n"% all_raw_reads)
    if all_raw_reads >0:
        logm("Number of bases in total: %d "%all_base_before_trim)
        if adapter != "" :
            logm("Number of reads having adapter removed: %d \n" % all_trimmed )
            logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
        logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
        logm("Number of unique-hits reads (before post-filtering): %d\n" % all_mapped)
        if asktag=="Y":
            logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
            logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
            logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
            logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
        elif asktag=="N":
            logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
            logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )

        logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
        if asktag=="Y":
            logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
            logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) )
            logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) )
            logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) )
        elif asktag=="N":
            logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
            logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )
        logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
        logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )


        # totals per context (methylated + unmethylated)
        n_CG=mC_lst[0]+uC_lst[0]
        n_CHG=mC_lst[1]+uC_lst[1]
        n_CHH=mC_lst[2]+uC_lst[2]

        logm("----------------------------------------------" )
        logm("Methylated C in mapped reads ")

        logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
        logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
        logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))

        logm("------------------- END --------------------" )
    elapsed("=== END %s ===" % main_read_file)
|
|
731
|
|
732
|
|
733
|