import fileinput, random, math, os.path
from bs_index.rrbs_build import FWD_MAPPABLE_REGIONS, REV_MAPPABLE_REGIONS
from bs_utils.utils import *

from bs_align.bs_single_end import extract_mapping
from bs_align_utils import *

def my_mappable_region(chr_regions, mapped_location, FR): # start_position (first C), end_position (last G), serial, sequence
    #print len(chr_regions)
    out_serial = 0
    out_start = -1
    out_end = -1
    #print "mapped_location:", mapped_location
    if FR == "+FW" or FR == "-RC":
        my_location = str(mapped_location)
        if my_location in chr_regions:
            my_lst = chr_regions[my_location]
            out_start = int(my_location)
            out_end = my_lst[0]
            out_serial = my_lst[1]
        #else :
        #    print "[For debug]: +FW location %s cannot be found" % my_location
    elif FR == "-FW" or FR == "+RC":
        my_location = str(mapped_location)
        if my_location in chr_regions:
            my_lst = chr_regions[my_location]
            out_end = int(my_location)
            out_start = my_lst[0]
            out_serial = my_lst[1]
        #else :
        #    print "[For debug]: -FW location %s cannot be found" % my_location

    return out_serial, out_start, out_end


#----------------------------------------------------------------
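
# Illustrative sketch only (hypothetical numbers, not taken from a real index):
# as used above, chr_regions maps the string form of one fragment boundary to
# [other_boundary, serial_no], so my_mappable_region() resolves a candidate cut
# position to its RRBS fragment with a single dict lookup:
#
#   regions = {"1000": [1085, 7]}                 # fragment #7 spanning 1000..1085
#   my_mappable_region(regions, 1000, "+FW")      # -> (7, 1000, 1085)
#   my_mappable_region(regions, 999,  "+FW")      # -> (0, -1, -1)  not a boundary
#
# For "-FW"/"+RC" the key is interpreted as the fragment end and my_lst[0] as the
# start, so the two coordinates swap roles.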

def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines, max_mismatch_no,
            aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch, cut_format="C-CGG",
            show_multiple_hit=False):
    #----------------------------------------------------------------
    # For double enzyme: cut_format="C-CGG,A-CTG"; ApeKI: "G^CWGC"
    #cut_context = re.sub("-", "", cut_format)
    # Ex. cut_format="C-CGG,AT-CG,G-CWGC"
    """
    Align RRBS reads and extract methylation calls.

    :param main_read_file: input read file (FASTA, FASTQ, qseq, or plain sequences)
    :param asktag: "Y" for an un-directional library, "N" for a directional library
    :param adapter_file: file whose first line is the adapter sequence (optional)
    :param cut_s: first sequencing cycle to keep (1-based)
    :param cut_e: last sequencing cycle to keep (for mapping)
    :param no_small_lines: number of lines per temporary split of the input file
    :param max_mismatch_no: maximum number of mismatches allowed per read
    :param aligner_command: aligner command template with the named placeholders
        reference_genome, input_file and output_file
    :param db_path: path to the reference genome library (RRBS index)
    :param tmp_path: directory for temporary files
    :param outfile: output writer; accepted alignments are passed to outfile.store()
    :param XS_pct: XS filter threshold on the fraction of unconverted non-CpG cytosines
    :param XS_count: XS filter threshold on the number of unconverted non-CpG cytosines
    :param adapter_mismatch: number of mismatches tolerated when matching the adapter
    :param cut_format: enzyme cut-site format, e.g. "C-CGG" (MspI); several sites may be
        given comma-separated, and IUPAC codes are expanded
    :param show_multiple_hit: if True, write reads rejected as multiple hits to "Multiple_hit.fa"
    """
    cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC']
    cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC']
    cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G']
    cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC']
    cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5]
    min_cut5_len = min([len(i) for i in cut5_context])
    #print cut_format_lst
    #print cut_format
    #print cut5_context

    cut_tag_lst = Enumerate_C_to_CT(cut_format_lst) # ['G-TTGC', 'AT-TG', 'G-CAGT', 'T-CGG', 'G-TAGC', 'C-TGG', 'G-CAGC', 'G-CTGC', 'AT-CG', 'T-TGG', 'G-TTGT', 'G-TAGT', 'C-CGG', 'G-CTGT']
    cut5_tag_lst = [re.match(r'(.*)\-(.*)', i).group(1) for i in cut_tag_lst]
    cut3_tag_lst = [re.match(r'(.*)\-(.*)', i).group(2) for i in cut_tag_lst]
    check_pattern = [ i[-2:]+"_"+j for i,j in zip(cut5_tag_lst, cut3_tag_lst) ]

    #print "======="
    #print cut_tag_lst
    #print cut3_tag_lst
    #print cut5_tag_lst
    #print check_pattern

    # set region[gx,gy] for checking_genome_context
    gx = [ 0 if j>2 else 2-j for j in [len(i) for i in cut5_tag_lst] ] # [XC-CGG]
    gy = [ 3+len(i) for i in cut3_tag_lst ]
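
    # Worked example for the default cut_format="C-CGG" (MspI), illustrative only;
    # the values follow from the comprehensions above, though the ordering returned
    # by Enumerate_C_to_CT may differ:
    #   cut_format_lst -> ['C-CGG']                                (no IUPAC codes to expand)
    #   cut_tag_lst    -> ['C-CGG', 'T-CGG', 'C-TGG', 'T-TGG']     (each C also read as T)
    #   check_pattern  -> ['C_CGG', 'T_CGG', 'C_TGG', 'T_TGG']
    #   gx, gy         -> [1, 1, 1, 1], [6, 6, 6, 6]
    # so checking_genome_context below compares output_genome[1:6] against each pattern,
    # the "_" matching the separator that output_genome carries between the 2-bp
    # upstream flank and the mapped fragment.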

    #----------------------------------------------------------------

    # helper method to join fname with tmp_path
    tmp_d = lambda fname: os.path.join(tmp_path, fname)
    db_d = lambda fname: os.path.join(db_path, fname)

    MAX_TRY = 500 # For finding the serial_no
    whole_adapter_seq = ""
    #----------------------------------------------------------------
    adapter_seq=""
    if adapter_file:
        try :
            adapter_inf = open(adapter_file,"r")
            whole_adapter_seq = adapter_inf.readline().strip()
            adapter_seq = whole_adapter_seq[0:10] # only use first 10bp of adapter
            adapter_inf.close()
        except IOError:
            print "[Error] Cannot find adapter file : %s !" % adapter_file
            exit(-1)

    logm("I Read filename: %s" % main_read_file)
    logm("I The last cycle (for mapping): %d" % cut_e )
    logm("I Bowtie path: %s" % aligner_command )
    logm("I Reference genome library path: %s" % db_path )
    logm("I Number of mismatches allowed: %s" % max_mismatch_no)
    logm("I Adapter seq: %s" % whole_adapter_seq)
    logm("----------------------------------------------")

    #----------------------------------------------------------------
    all_raw_reads=0
    all_tagged=0
    all_tagged_trimmed=0
    all_mapped=0
    all_mapped_passed=0
    n_cut_tag_lst={}
    #print cut3_tag_lst
    for x in cut3_tag_lst:
        n_cut_tag_lst[x]=0

    mC_lst=[0,0,0]
    uC_lst=[0,0,0]

    no_my_files=0

    num_mapped_FW_C2T = 0
    num_mapped_RC_C2T = 0
    num_mapped_FW_G2A = 0
    num_mapped_RC_G2A = 0

    #===============================================
    # directional sequencing
    #===============================================

    if asktag=="N" :
        #----------------------------------------------------------------
        logm("== Start mapping ==")

        input_fname = os.path.split(main_read_file)[1]
        for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):

            logm("Processing read file: %s" % read_file)
            original_bs_reads = {}
            no_my_files+=1
            random_id = ".tmp-"+str(random.randint(1000000,9999999))
            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)

            outf2=open(outfile2,'w')

            #--- Checking input format ------------------------------------------
            try :
                read_inf=open(read_file,"r")
            except IOError:
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            oneline=read_inf.readline()
            l=oneline.split()
            n_fastq=0
            n_fasta=0
            input_format=""
            if oneline[0]=="@": # FastQ
                input_format="fastq"
            elif len(l)==1 and oneline[0]!=">": # pure sequences
                input_format="seq"
            elif len(l)==11: # Illumina qseq
                input_format="qseq"
            elif oneline[0]==">": # fasta
                input_format="fasta"
            read_inf.close()

            #----------------------------------------------------------------
            seq_id=""
            seq=""
            seq_ready=0
            for line in fileinput.input(read_file):
                l=line.split()

                if input_format=="seq":
                    all_raw_reads+=1
                    seq_id=str(all_raw_reads)
                    seq_id=seq_id.zfill(12)
                    seq=l[0]
                    seq_ready="Y"
                elif input_format=="fastq":
                    m_fastq=math.fmod(n_fastq,4)
                    n_fastq+=1
                    seq_ready="N"
                    if m_fastq==0:
                        all_raw_reads+=1
                        seq_id=str(all_raw_reads)
                        seq_id=seq_id.zfill(12)
                        seq=""
                    elif m_fastq==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""
                elif input_format=="qseq":
                    all_raw_reads+=1
                    seq_id=str(all_raw_reads)
                    seq_id=seq_id.zfill(12)
                    seq=l[8]
                    seq_ready="Y"
                elif input_format=="fasta":
                    m_fasta=math.fmod(n_fasta,2)
                    n_fasta+=1
                    seq_ready="N"
                    if m_fasta==0:
                        all_raw_reads+=1
                        seq_id=l[0][1:]
                        seq=""
                    elif m_fasta==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""
                #---------------------------------------------------------------
                if seq_ready=="Y":
                    # Normalize the characters
                    seq=seq.upper().replace(".","N")

                    read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
                    if len(read_tag) > 0 :
                        all_tagged += 1
                        for i in read_tag :
                            n_cut_tag_lst[i] += 1

                    seq = seq[(cut_s-1):cut_e] # cut_s starts from cycle 1 by default

                    #-- Trimming adapter sequence ---
                    if adapter_seq != "" :
                        new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
                        if len(new_read) < len(seq) :
                            all_tagged_trimmed += 1
                        seq = new_read
                        if len(seq) <= 4 :
                            seq = "N" * (cut_e - cut_s)

                    # all reads will be considered, regardless of tags
                    #--------- trimmed_raw_BS_read and qscore ------------------
                    original_bs_reads[seq_id] = seq
                    #--------- FW_C2T ------------------
                    outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
            fileinput.close()

            outf2.close()

            delete_files(read_file)
            logm("Processing input is done")
            #--------------------------------------------------------------------------------

            # mapping
            #--------------------------------------------------------------------------------
            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)

            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : WC2T},
                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : CC2T} ])
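
            # Note: aligner_command is a template string supplied by the caller and is
            # only filled in here through its named placeholders; a hypothetical
            # Bowtie-style template would look roughly like
            #   "bowtie ... %(reference_genome)s -f %(input_file)s %(output_file)s"
            # (the actual program and flags are assembled elsewhere and are not assumed here).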

            logm("Aligning reads is done")

            delete_files(outfile2)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------

            FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
            RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
            logm("Extracting alignments is done")

            #----------------------------------------------------------------
            # get uniq-hit reads
            #----------------------------------------------------------------
            Union_set=set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T=set() # +
            Unique_RC_C2T=set() # -
            Multiple_hits=set()


            for x in Union_set:
                _list=[]
                for dx in [FW_C2T_U, RC_C2T_U]:
                    mis_lst=dx.get(x,[99])
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for dx in [FW_C2T_R, RC_C2T_R]:
                    mis=dx.get(x,99)
                    _list.append(mis)
                mini=min(_list)
                if _list.count(mini)==1:
                    mini_index=_list.index(mini)
                    if mini_index==0:
                        Unique_FW_C2T.add(x)
                    elif mini_index==1:
                        Unique_RC_C2T.add(x)
                    else :
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)
            # write reads rejected by Multiple Hits to file
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'w')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            del Union_set
            del FW_C2T_R
            del RC_C2T_R

            FW_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
            RC_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
            FW_uniq_lst.sort()
            RC_uniq_lst.sort()
            FW_uniq_lst=[x[1] for x in FW_uniq_lst]
            RC_uniq_lst=[x[1] for x in RC_uniq_lst]

            del Unique_FW_C2T
            del Unique_RC_C2T

            #----------------------------------------------------------------
            # Post-filtering reads

            # ---- FW ----
            FW_regions = dict()
            gseq = dict()
            chr_length = dict()
            for header in FW_uniq_lst :
                _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in FW_regions :
                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq :
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                all_mapped+=1
                FR = "+FW"
                mapped_strand = "+"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln) :
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context :
                        try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
                                                                                              try_pos, FR)
                    if my_region_serial == 0 : # still 0
                        # for some cases, read has no tags; searching the upstream sequence for tags
                        # print "[For debug]: FW read has no tags"
                        try_count = 0
                        try_pos = mapped_location - min_cut5_len + 1
                        while my_region_serial == 0 and try_count < MAX_TRY :
                            my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
                                                                                                  try_pos, FR)
                            try_pos -= 1
                            try_count += 1

                        #if my_region_serial == 0 :
                        #    print "[For debug]: chr=", mapped_chr
                        #    print "[For debug]: +FW read still cannot find fragment serial"
                        # Tip: sometimes "my_region_serial" is still 0 ...


                    N_mismatch = N_MIS(r_aln, g_aln)
                    if N_mismatch <= int(max_mismatch_no) :
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
                        #---XS FILTER----------------
                        # Flag XS=1 when the read has more than XS_count non-CpG cytosines that
                        # look methylated and they exceed XS_pct of all non-CpG cytosines,
                        # a common sign of incomplete bisulfite conversion.
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                            XS = 1
                        num_mapped_FW_C2T += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
                                      mapped_location, cigar, original_BS, methy, XS,
                                      output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                else :
                    print "[For debug]: reads not in same lengths"

            #print "start RC"
            # ---- RC ----
            RC_regions = dict()
            for header in RC_uniq_lst :
                _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in RC_regions :
                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq :
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                mapped_location = chr_length[mapped_chr] - mapped_location - g_len
                all_mapped+=1
                FR = "-FW"
                mapped_strand = "-"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                #checking_genome_context = (output_genome[gx:gy] == check_pattern)
                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln) : # and checking_genome_context:
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context :
                        try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                              try_pos , FR)
                    if my_region_serial == 0 : # still 0
                        # for some cases, read has no tags; searching the upstream sequence for tags
                        #print "[For debug]: RC Read has no tags"
                        try_count = 0
                        try_pos = mapped_location + g_len + min_cut5_len - 2
                        while my_region_serial == 0 and try_count < MAX_TRY :
                            my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                                  try_pos, FR)
                            try_pos += 1
                            try_count += 1

                        #if my_region_serial == 0 :
                        #    print "[For debug]: chr=", mapped_chr
                        #    print "[For debug]: -FW read still cannot find fragment serial"


                    N_mismatch = N_MIS(r_aln, g_aln)
                    if N_mismatch <= int(max_mismatch_no) :
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                            XS = 1
                        num_mapped_RC_C2T += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
                                      mapped_location, cigar, original_BS, methy, XS,
                                      output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                else :
                    print "[For debug]: reads not in same lengths"


            # Finished both FW and RC
            logm("Done: %s (%d) \n" % (read_file, no_my_files))
            print "--> %s (%d) "%(read_file, no_my_files)
            del original_bs_reads
            delete_files(WC2T, CC2T)

    # End of directional library


    # ====================================================
    # un-directional library
    # ====================================================

    elif asktag=="Y" :
        #----------------------------------------------------------------
        logm("== Start mapping ==")

        input_fname = os.path.split(main_read_file)[1]
        for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):

            logm("Processing read file: %s" % read_file)
            original_bs_reads = {}
            no_my_files+=1
            random_id = ".tmp-"+str(random.randint(1000000,9999999))
            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
            outfile3=tmp_d('Trimmed_G2A.fa'+random_id)

            outf2=open(outfile2,'w')
            outf3=open(outfile3,'w')

            #--- Checking input format ------------------------------------------
            try :
                read_inf=open(read_file,"r")
            except IOError:
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            oneline=read_inf.readline()
            l=oneline.split()
            n_fastq=0
            n_fasta=0
            input_format=""
            if oneline[0]=="@": # FastQ
                input_format="fastq"
            elif len(l)==1 and oneline[0]!=">": # pure sequences
                input_format="seq"
            elif len(l)==11: # Illumina qseq
                input_format="qseq"
            elif oneline[0]==">": # fasta
                input_format="fasta"
            read_inf.close()

            #----------------------------------------------------------------
            seq_id = ""
            seq = ""
            seq_ready=0
            for line in fileinput.input(read_file):
                l=line.split()

                if input_format == "seq":
                    all_raw_reads+=1
                    seq_id=str(all_raw_reads)
                    seq_id=seq_id.zfill(12)
                    seq=l[0]
                    seq_ready="Y"
                elif input_format=="fastq":
                    m_fastq=math.fmod(n_fastq,4)
                    n_fastq+=1
                    seq_ready="N"
                    if m_fastq==0:
                        all_raw_reads+=1
                        seq_id=str(all_raw_reads)
                        seq_id=seq_id.zfill(12)
                        seq=""
                    elif m_fastq==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""
                elif input_format=="qseq":
                    all_raw_reads+=1
                    seq_id=str(all_raw_reads)
                    seq_id=seq_id.zfill(12)
                    seq=l[8]
                    seq_ready="Y"
                elif input_format=="fasta":
                    m_fasta=math.fmod(n_fasta,2)
                    n_fasta+=1
                    seq_ready="N"
                    if m_fasta==0:
                        all_raw_reads+=1
                        seq_id=l[0][1:]
                        seq=""
                    elif m_fasta==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""
                #---------------------------------------------------------------
                if seq_ready=="Y":
                    # Normalize the characters
                    seq=seq.upper().replace(".","N")

                    read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
                    if len(read_tag) > 0 :
                        all_tagged += 1
                        for i in read_tag :
                            n_cut_tag_lst[i] += 1

                    seq = seq[(cut_s-1):cut_e] # cut_s starts from cycle 1 by default

                    #-- Trimming adapter sequence ---
                    if adapter_seq != "" :
                        new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch)
                        if len(new_read) < len(seq) :
                            all_tagged_trimmed += 1
                        seq = new_read
                        if len(seq) <= 4 :
                            seq = "N" * (cut_e - cut_s)

                    # all reads will be considered, regardless of tags
                    #--------- trimmed_raw_BS_read and qscore ------------------
                    original_bs_reads[seq_id] = seq
                    #--------- FW_C2T ------------------
                    outf2.write('>%s\n%s\n'%(seq_id, seq.replace('C', 'T')))
                    #--------- RC_G2A ------------------
                    outf3.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
            fileinput.close()

            outf2.close()
            outf3.close()

            delete_files(read_file)
            logm("Processing input is done")
            #--------------------------------------------------------------------------------

            # mapping
            #--------------------------------------------------------------------------------
            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
            WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
            CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)

            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : WC2T},
                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                 'input_file' : outfile2,
                                                 'output_file' : CC2T},
                              aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
                                                 'input_file' : outfile3,
                                                 'output_file' : WG2A},
                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
                                                 'input_file' : outfile3,
                                                 'output_file' : CG2A} ])

            logm("Aligning reads is done")

            delete_files(outfile2, outfile3)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------

            FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
            RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)

            FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
            RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)

            logm("Extracting alignments is done")

            #----------------------------------------------------------------
            # get unique-hit reads
            #----------------------------------------------------------------
            Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T=set() # +
            Unique_RC_G2A=set() # +
            Unique_FW_G2A=set() # -
            Unique_RC_C2T=set() # -
            Multiple_hits=set()

            for x in Union_set:
                _list=[]
                for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
                    mis_lst=dx.get(x,[99])
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
                    mis=dx.get(x,99)
                    _list.append(mis)
                mini=min(_list)
                if _list.count(mini) == 1:
                    mini_index=_list.index(mini)
                    if mini_index == 0:
                        Unique_FW_C2T.add(x)
                    elif mini_index == 1:
                        Unique_RC_G2A.add(x)
                    elif mini_index == 2:
                        Unique_FW_G2A.add(x)
                    elif mini_index == 3:
                        Unique_RC_C2T.add(x)
                    else :
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)
            # write reads rejected by Multiple Hits to file
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'w')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            del Union_set
            del FW_C2T_R
            del FW_G2A_R
            del RC_C2T_R
            del RC_G2A_R

            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
            FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
            RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
            FW_C2T_uniq_lst.sort()
            RC_C2T_uniq_lst.sort()
            FW_G2A_uniq_lst.sort()
            RC_G2A_uniq_lst.sort()
            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
            FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
            RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]

            del Unique_FW_C2T
            del Unique_FW_G2A
            del Unique_RC_C2T
            del Unique_RC_G2A


            #----------------------------------------------------------------
            # Post-filtering reads
            # ---- FW_C2T ---- undirectional
            FW_regions = dict()
            gseq = dict()
            chr_length = dict()
            for header in FW_C2T_uniq_lst :
                _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in FW_regions :
                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq :
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                all_mapped+=1
                FR = "+FW"
                mapped_strand = "+"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln) :
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context :
                        try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
                                                                                              try_pos, FR)
                    if my_region_serial == 0 : # still 0
                        # for some cases, read has no tags; searching the upstream sequence for tags
                        # print "[For debug]: FW read has no tags"
                        try_count = 0
                        try_pos = mapped_location - min_cut5_len + 1
                        while my_region_serial == 0 and try_count < MAX_TRY :
                            my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
                                                                                                  try_pos, FR)
                            try_pos -= 1
                            try_count += 1

                    N_mismatch = N_MIS(r_aln, g_aln)
                    if N_mismatch <= int(max_mismatch_no) :
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                            XS = 1
                        num_mapped_FW_C2T += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
                                      mapped_location, cigar, original_BS, methy, XS,
                                      output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                else :
                    print "[For debug]: reads not in same lengths"


            # ---- RC_C2T ---- undirectional
            RC_regions = dict()
            for header in RC_C2T_uniq_lst :
                _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in RC_regions :
                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq :
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                mapped_location = chr_length[mapped_chr] - mapped_location - g_len
                all_mapped+=1
                FR = "-FW"
                mapped_strand = "-"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln) : # and checking_genome_context:
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context :
                        try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                              try_pos , FR)
                    if my_region_serial == 0 : # still 0
                        # for some cases, read has no tags; searching the upstream sequence for tags
                        #print "[For debug]: RC Read has no tags"
                        try_count = 0
                        try_pos = mapped_location + g_len + min_cut5_len - 2
                        while my_region_serial == 0 and try_count < MAX_TRY :
                            my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                                  try_pos, FR)
                            try_pos += 1
                            try_count += 1

                    N_mismatch = N_MIS(r_aln, g_aln)
                    if N_mismatch <= int(max_mismatch_no) :
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                            XS = 1
                        num_mapped_RC_C2T += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
                                      mapped_location, cigar, original_BS, methy, XS,
                                      output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)

                else :
                    print "[For debug]: reads not in same lengths"


            # ---- FW_G2A ---- undirectional
            FW_regions = dict()
            gseq = dict()
            chr_length = dict()
            for header in FW_G2A_uniq_lst :
                _, mapped_chr, mapped_location, cigar = FW_G2A_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in FW_regions :
                    FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq :
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])
                cigar = list(reversed(cigar))

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                all_mapped+=1
                FR = "-RC"
                mapped_strand = "-"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                original_BS = reverse_compl_seq(original_BS) # for RC reads
                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln) :
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context :
                        try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
                                                                                              try_pos, FR)
                    if my_region_serial == 0 : # still 0
                        # for some cases, read has no tags; searching the upstream sequence for tags
                        #print "[For debug]: FW read has no tags"
                        try_count = 0
                        try_pos = mapped_location - min_cut5_len + 1
                        while my_region_serial == 0 and try_count < MAX_TRY :
                            my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
                                                                                                  try_pos, FR)
                            try_pos += 1
                            try_count += 1
                        #if my_region_serial == 0 :
                        #    print "[For debug]: chr=", mapped_chr
                        #    print "[For debug]: FW_G2A read still cannot find fragment serial"
                        # Tip: sometimes "my_region_serial" is still 0 ...


                    N_mismatch = N_MIS(r_aln, g_aln)
                    if N_mismatch <= int(max_mismatch_no) :
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                            XS = 1
                        num_mapped_FW_G2A += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
                                      mapped_location, cigar, original_BS, methy, XS,
                                      output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                else :
                    print "[For debug]: reads not in same lengths"


            # ---- RC_G2A ---- undirectional
            RC_regions = dict()
            for header in RC_G2A_uniq_lst :
                _, mapped_chr, mapped_location, cigar = RC_G2A_U[header]
                original_BS = original_bs_reads[header]
                if mapped_chr not in RC_regions :
                    RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
                if mapped_chr not in gseq :
                    gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                    chr_length[mapped_chr] = len(gseq[mapped_chr])
                cigar = list(reversed(cigar))

                r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
                mapped_location = chr_length[mapped_chr] - mapped_location - g_len
                all_mapped+=1
                FR = "+RC"
                mapped_strand = "+"
                origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
                                                                             mapped_location,
                                                                             mapped_location + g_len,
                                                                             mapped_strand)
                original_BS = reverse_compl_seq(original_BS) # for RC reads
                checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
                r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                if len(r_aln) == len(g_aln) : # and checking_genome_context:
                    my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
                    if True in checking_genome_context :
                        try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
                        my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                              try_pos, FR)
                    if my_region_serial == 0 : # still 0
                        # for some cases, read has no tags; searching the upstream sequence for tags
                        #print "[For debug]: RC Read has no tags"
                        try_count = 0
                        try_pos = mapped_location + g_len + min_cut5_len - 2
                        while my_region_serial == 0 and try_count < MAX_TRY :
                            my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
                                                                                                  try_pos, FR)
                            try_pos += 1
                            try_count += 1

                        #if my_region_serial == 0 :
                        #    print "[For debug]: chr=", mapped_chr
                        #    print "[For debug]: RC_G2A read still cannot find fragment serial"


                    N_mismatch = N_MIS(r_aln, g_aln)
                    if N_mismatch <= int(max_mismatch_no) :
                        all_mapped_passed += 1
                        methy = methy_seq(r_aln, g_aln + next2bp)
                        mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
                        #---XS FILTER----------------
                        XS = 0
                        nCH = methy.count('y') + methy.count('z')
                        nmCH = methy.count('Y') + methy.count('Z')
                        if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                            XS = 1
                        num_mapped_RC_G2A += 1
                        outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
                                      mapped_location, cigar, original_BS, methy, XS,
                                      output_genome = output_genome,
                                      rrbs = True,
                                      my_region_serial = my_region_serial,
                                      my_region_start = my_region_start,
                                      my_region_end = my_region_end)
                else :
                    print "[For debug]: reads not in same lengths"



            # Finished both FW and RC
            logm("Done: %s (%d) \n" % (read_file, no_my_files))
            print "--> %s (%d) "%(read_file, no_my_files)
            del original_bs_reads
            delete_files(WC2T, CC2T, WG2A, CG2A)



    # End of un-directional library

    delete_files(tmp_path)


    logm("O Number of raw reads: %d "% all_raw_reads)
    if all_raw_reads >0:
        logm("O Number of CGG/TGG tagged reads: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
        for kk in range(len(n_cut_tag_lst)):
            logm("O Number of raw reads with %s tag: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
        logm("O Number of CGG/TGG reads having adapter removed: %d "%all_tagged_trimmed)
        logm("O Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
        logm("O Number of unique-hits reads for post-filtering: %d"%all_mapped)

        logm("O ------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
        logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))

        if asktag=="Y": # undirectional
            logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
            logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )
            logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
            logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) )
            # Note: the counter names follow the converted reference each read was aligned to
            # (e.g. num_mapped_RC_G2A), so they read counter-intuitively against the strand
            # wording in the log lines above.
        elif asktag=="N": # directional
            logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
            logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )

    n_CG=mC_lst[0]+uC_lst[0]
    n_CHG=mC_lst[1]+uC_lst[1]
    n_CHH=mC_lst[2]+uC_lst[2]

    logm("----------------------------------------------")
    logm("M Methylated C in mapped reads ")
    logm("M mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
    logm("M mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
    logm("M mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
    logm("----------------------------------------------")
    logm("------------------- END ----------------------")

    elapsed(main_read_file)

    close_log()