0
|
1 import fileinput, random, math, os.path
|
|
2 from bs_index.rrbs_build import FWD_MAPPABLE_REGIONS, REV_MAPPABLE_REGIONS
|
|
3 from bs_utils.utils import *
|
|
4
|
|
5 from bs_align.bs_single_end import extract_mapping
|
|
6 from bs_align_utils import *
|
|
7
|
|
8 def my_mappable_region(chr_regions, mapped_location, FR): # start_position (first C), end_position (last G), serial, sequence
|
|
9 #print len(chr_regions)
|
|
10 out_serial = 0
|
|
11 out_start = -1
|
|
12 out_end = -1
|
|
13 #print "mapped_location:", mapped_location
|
|
14 if FR == "+FW" or FR == "-RC":
|
|
15 my_location = str(mapped_location)
|
|
16 if my_location in chr_regions:
|
|
17 my_lst = chr_regions[my_location]
|
|
18 out_start = int(my_location)
|
|
19 out_end = my_lst[0]
|
|
20 out_serial = my_lst[1]
|
|
21 #else :
|
|
22 # print "[For debug]: +FW location %s cannot be found" % my_location
|
|
23 elif FR == "-FW" or FR == "+RC":
|
|
24 my_location = str(mapped_location)
|
|
25 if my_location in chr_regions:
|
|
26 my_lst = chr_regions[my_location]
|
|
27 out_end = int(my_location)
|
|
28 out_start = my_lst[0]
|
|
29 out_serial = my_lst[1]
|
|
30 #else :
|
|
31 # print "[For debug]: -FW location %s cannot be found" % my_location
|
|
32
|
|
33 return out_serial, out_start, out_end
|
|
34
|
|
35
|
|
36 #----------------------------------------------------------------
|
|
37
|
|
38 def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines, max_mismatch_no,
|
|
39 aligner_command, db_path, tmp_path, outfile, XS_pct, XS_count, adapter_mismatch, cut_format="C-CGG",
|
|
40 show_multiple_hit=False):
|
|
41 #----------------------------------------------------------------
|
|
42 # For double enzyme: cut_format="C-CGG,A-CTG"; ApekI:"G^CWGC"
|
|
43 #cut_context = re.sub("-", "", cut_format)
|
|
44 # Ex. cut_format="C-CGG,AT-CG,G-CWGC"
|
|
45
|
|
46 cut_format_lst = EnumerateIUPAC(cut_format.upper().split(",")) # ['G-CAGC', 'AT-CG', 'C-CGG', 'G-CTGC']
|
|
47 cut_context = [i.replace("-","") for i in cut_format_lst] # ['GCAGC', 'ATCG', 'CCGG', 'GCTGC']
|
|
48 cut5_context = [re.match( r'(.*)\-(.*)', i).group(1) for i in cut_format_lst] # ['G', 'AT', 'C', 'G']
|
|
49 cut3_context = [re.match( r'(.*)\-(.*)', i).group(2) for i in cut_format_lst] # ['CAGC', 'CG', 'CGG', 'CTGC']
|
|
50 cut_len = [len(i) for i in cut_context] # [5, 4, 4, 5]
|
|
51 min_cut5_len = min([len(i) for i in cut5_context])
|
|
52 #print cut_format_lst
|
|
53 #print cut_format
|
|
54 #print cut5_context
|
|
55
|
|
56 cut_tag_lst = Enumerate_C_to_CT(cut_format_lst) # ['G-TTGC', 'AT-TG', 'G-CAGT', 'T-CGG', 'G-TAGC', 'C-TGG', 'G-CAGC', 'G-CTGC', 'AT-CG', 'T-TGG', 'G-TTGT', 'G-TAGT', 'C-CGG', 'G-CTGT']
|
|
57 cut5_tag_lst = [re.match(r'(.*)\-(.*)', i).group(1) for i in cut_tag_lst]
|
|
58 cut3_tag_lst = [re.match(r'(.*)\-(.*)', i).group(2) for i in cut_tag_lst]
|
|
59 check_pattern = [ i[-2:]+"_"+j for i,j in zip(cut5_tag_lst, cut3_tag_lst) ]
|
|
60
|
|
61 #print "======="
|
|
62 #print cut_tag_lst
|
|
63 #print cut3_tag_lst
|
|
64 #print cut5_tag_lst
|
|
65 #print check_pattern
|
|
66
|
|
67 # set region[gx,gy] for checking_genome_context
|
|
68 gx = [ 0 if j>2 else 2-j for j in [len(i) for i in cut5_tag_lst] ] # [XC-CGG]
|
|
69 gy = [ 3+len(i) for i in cut3_tag_lst ]
|
|
70
|
|
71
|
|
72 #----------------------------------------------------------------
|
|
73
|
|
74 # helper method to join fname with tmp_path
|
|
75 tmp_d = lambda fname: os.path.join(tmp_path, fname)
|
|
76 db_d = lambda fname: os.path.join(db_path, fname)
|
|
77
|
|
78 MAX_TRY = 500 # For finding the serial_no
|
|
79 whole_adapter_seq = ""
|
|
80 #----------------------------------------------------------------
|
|
81 adapter_seq=""
|
|
82 if adapter_file:
|
|
83 try :
|
|
84 adapter_inf = open(adapter_file,"r")
|
|
85 whole_adapter_seq = adapter_inf.readline().strip()
|
|
86 adapter_seq = whole_adapter_seq[0:10] # only use first 10bp of adapter
|
|
87 adapter_inf.close()
|
|
88 except IOError:
|
|
89 print "[Error] Cannot find adapter file : %s !" % adapter_file
|
|
90 exit(-1)
|
|
91
|
1
|
92 logm("Read filename: %s" % main_read_file)
|
|
93 logm("The first base (for mapping): %d"% cut_s )
|
|
94 logm("The last base (for mapping): %d"% cut_e )
|
|
95
|
|
96 logm("Path for short reads aligner: %s"% aligner_command + '\n')
|
|
97 logm("Reference genome library path: %s"% db_path )
|
|
98
|
|
99 if asktag == "Y" :
|
|
100 logm("Un-directional library" )
|
|
101 else :
|
|
102 logm("Directional library")
|
|
103
|
|
104 logm("Number of mismatches allowed: %s"% max_mismatch_no )
|
|
105
|
|
106 if adapter_file !="":
|
|
107 logm("Adapter seq: %s" % whole_adapter_seq)
|
|
108 logm("-------------------------------- " )
|
0
|
109
|
|
110 #----------------------------------------------------------------
|
|
111 all_raw_reads=0
|
|
112 all_tagged=0
|
|
113 all_tagged_trimmed=0
|
1
|
114 all_base_before_trim=0
|
|
115 all_base_after_trim=0
|
|
116 all_base_mapped=0
|
0
|
117 all_mapped=0
|
|
118 all_mapped_passed=0
|
|
119 n_cut_tag_lst={}
|
|
120 #print cut3_tag_lst
|
|
121 for x in cut3_tag_lst:
|
|
122 n_cut_tag_lst[x]=0
|
|
123
|
|
124 mC_lst=[0,0,0]
|
|
125 uC_lst=[0,0,0]
|
|
126
|
|
127 no_my_files=0
|
|
128
|
|
129 num_mapped_FW_C2T = 0
|
|
130 num_mapped_RC_C2T = 0
|
|
131 num_mapped_FW_G2A = 0
|
|
132 num_mapped_RC_G2A = 0
|
|
133
|
1
|
134 # Count of nucleotides, which should be cut before the adapters
|
|
135 Extra_base_cut_5end_adapter = max([ abs(len(i)-len(j)) for i,j in zip(cut5_context, cut3_context)])
|
|
136
|
0
|
137 #===============================================
|
|
138 # directional sequencing
|
|
139 #===============================================
|
|
140
|
|
141 if asktag=="N" :
|
|
142 #----------------------------------------------------------------
|
|
143 logm("== Start mapping ==")
|
|
144
|
|
145 input_fname = os.path.split(main_read_file)[1]
|
|
146 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
|
|
147
|
|
148 logm("Processing read file: %s" % read_file)
|
|
149 original_bs_reads = {}
|
|
150 no_my_files+=1
|
|
151 random_id = ".tmp-"+str(random.randint(1000000,9999999))
|
|
152 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
|
|
153
|
|
154 outf2=open(outfile2,'w')
|
|
155
|
|
156 #--- Checking input format ------------------------------------------
|
|
157 try :
|
1
|
158 if read_file.endswith(".gz") : # support input file ending with ".gz"
|
|
159 read_inf = gzip.open(read_file, "rb")
|
|
160 else :
|
|
161 read_inf=open(read_file,"r")
|
0
|
162 except IOError:
|
|
163 print "[Error] Cannot open input file : %s" % read_file
|
|
164 exit(-1)
|
|
165
|
1
|
166 oneline = read_inf.readline()
|
|
167 l = oneline.split()
|
|
168 input_format = ""
|
|
169 if oneline[0]=="@":
|
|
170 input_format = "fastq"
|
|
171 elif len(l)==1 and oneline[0]!=">" :
|
|
172 input_format = "seq"
|
|
173 elif len(l)==11:
|
|
174 input_format = "qseq"
|
|
175 elif oneline[0]==">" :
|
|
176 input_format = "fasta"
|
0
|
177 read_inf.close()
|
|
178
|
1
|
179
|
0
|
180 #----------------------------------------------------------------
|
1
|
181 read_id = ""
|
|
182 seq = ""
|
|
183 seq_ready = 0
|
|
184 line_no = 0
|
|
185 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
|
|
186 l = line.split()
|
|
187 line_no += 1
|
0
|
188 if input_format=="seq":
|
1
|
189 all_raw_reads += 1
|
|
190 read_id = str(all_raw_reads)
|
|
191 read_id = read_id.zfill(12)
|
|
192 seq = l[0]
|
|
193 seq_ready = "Y"
|
0
|
194 elif input_format=="fastq":
|
1
|
195 l_fastq = math.fmod(line_no, 4)
|
|
196 if l_fastq == 1 :
|
|
197 all_raw_reads += 1
|
|
198 read_id = l[0][1:]
|
|
199 seq_ready = "N"
|
|
200 elif l_fastq == 2 :
|
|
201 seq = l[0]
|
|
202 seq_ready = "Y"
|
|
203 else :
|
|
204 seq = ""
|
|
205 seq_ready = "N"
|
0
|
206 elif input_format=="qseq":
|
1
|
207 all_raw_reads += 1
|
|
208 read_id = str(all_raw_reads)
|
|
209 read_id = read_id.zfill(12)
|
|
210 seq = l[8]
|
|
211 seq_ready = "Y"
|
|
212 elif input_format=="fasta" :
|
|
213 l_fasta = math.fmod(line_no,2)
|
|
214 if l_fasta==1:
|
|
215 all_raw_reads += 1
|
|
216 read_id = l[0][1:]
|
|
217 seq = ""
|
|
218 seq_ready = "N"
|
|
219 elif l_fasta==0 :
|
|
220 seq = l[0]
|
|
221 seq_ready = "Y"
|
|
222
|
0
|
223 #---------------------------------------------------------------
|
|
224 if seq_ready=="Y":
|
|
225 # Normalize the characters
|
|
226 seq=seq.upper().replace(".","N")
|
|
227
|
|
228 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
|
|
229 if len(read_tag) > 0 :
|
|
230 all_tagged += 1
|
|
231 for i in read_tag :
|
|
232 n_cut_tag_lst[i] += 1
|
|
233
|
|
234 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default
|
|
235
|
|
236 #-- Trimming adapter sequence ---
|
1
|
237
|
|
238 all_base_before_trim += len(seq)
|
0
|
239 if adapter_seq != "" :
|
1
|
240 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
|
0
|
241 if len(new_read) < len(seq) :
|
|
242 all_tagged_trimmed += 1
|
|
243 seq = new_read
|
1
|
244 all_base_after_trim += len(seq)
|
0
|
245 if len(seq) <= 4 :
|
|
246 seq = "N" * (cut_e - cut_s)
|
|
247
|
|
248 # all reads will be considered, regardless of tags
|
|
249 #--------- trimmed_raw_BS_read and qscore ------------------
|
1
|
250 original_bs_reads[read_id] = seq
|
0
|
251 #--------- FW_C2T ------------------
|
1
|
252 outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T')))
|
0
|
253 fileinput.close()
|
|
254
|
|
255 outf2.close()
|
|
256
|
|
257 delete_files(read_file)
|
|
258 logm("Processing input is done")
|
|
259 #--------------------------------------------------------------------------------
|
|
260
|
|
261 # mapping
|
|
262 #--------------------------------------------------------------------------------
|
|
263 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
|
|
264 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
|
|
265
|
|
266 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
267 'input_file' : outfile2,
|
|
268 'output_file' : WC2T},
|
|
269 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
270 'input_file' : outfile2,
|
|
271 'output_file' : CC2T} ])
|
|
272
|
|
273 logm("Aligning reads is done")
|
|
274
|
|
275 delete_files(outfile2)
|
|
276
|
|
277 #--------------------------------------------------------------------------------
|
|
278 # Post processing
|
|
279 #--------------------------------------------------------------------------------
|
|
280
|
|
281 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
|
|
282 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
|
|
283 logm("Extracting alignments is done")
|
|
284
|
|
285 #----------------------------------------------------------------
|
|
286 # get uniq-hit reads
|
|
287 #----------------------------------------------------------------
|
|
288 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())
|
|
289
|
|
290 Unique_FW_C2T=set() # +
|
|
291 Unique_RC_C2T=set() # -
|
|
292 Multiple_hits=set()
|
|
293
|
|
294
|
|
295 for x in Union_set:
|
|
296 _list=[]
|
|
297 for dx in [FW_C2T_U, RC_C2T_U]:
|
|
298 mis_lst=dx.get(x,[99])
|
|
299 mis=int(mis_lst[0])
|
|
300 _list.append(mis)
|
|
301 for dx in [FW_C2T_R, RC_C2T_R]:
|
|
302 mis=dx.get(x,99)
|
|
303 _list.append(mis)
|
|
304 mini=min(_list)
|
|
305 if _list.count(mini)==1:
|
|
306 mini_index=_list.index(mini)
|
|
307 if mini_index==0:
|
|
308 Unique_FW_C2T.add(x)
|
|
309 elif mini_index==1:
|
|
310 Unique_RC_C2T.add(x)
|
|
311 else :
|
|
312 Multiple_hits.add(x)
|
|
313 else :
|
|
314 Multiple_hits.add(x)
|
|
315 # write reads rejected by Multiple Hits to file
|
|
316 if show_multiple_hit :
|
|
317 outf_MH=open("Multiple_hit.fa",'w')
|
|
318 for i in Multiple_hits :
|
|
319 outf_MH.write(">%s\n" % i)
|
|
320 outf_MH.write("%s\n" % original_bs_reads[i])
|
|
321 outf_MH.close()
|
|
322
|
|
323 del Union_set
|
|
324 del FW_C2T_R
|
|
325 del RC_C2T_R
|
|
326
|
|
327 FW_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
|
|
328 RC_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
|
|
329 FW_uniq_lst.sort()
|
|
330 RC_uniq_lst.sort()
|
|
331 FW_uniq_lst=[x[1] for x in FW_uniq_lst]
|
|
332 RC_uniq_lst=[x[1] for x in RC_uniq_lst]
|
|
333
|
|
334 del Unique_FW_C2T
|
|
335 del Unique_RC_C2T
|
|
336
|
|
337 #----------------------------------------------------------------
|
|
338 # Post-filtering reads
|
|
339
|
|
340 # ---- FW ----
|
|
341 FW_regions = dict()
|
|
342 gseq = dict()
|
|
343 chr_length = dict()
|
|
344 for header in FW_uniq_lst :
|
|
345 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
|
|
346 original_BS = original_bs_reads[header]
|
|
347 if mapped_chr not in FW_regions :
|
|
348 FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
|
|
349 if mapped_chr not in gseq :
|
|
350 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
|
|
351 chr_length[mapped_chr] = len(gseq[mapped_chr])
|
|
352
|
|
353 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
|
|
354 all_mapped+=1
|
|
355 FR = "+FW"
|
|
356 mapped_strand = "+"
|
|
357 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
|
|
358 mapped_location,
|
|
359 mapped_location + g_len,
|
|
360 mapped_strand)
|
|
361 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
|
|
362 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
|
|
363
|
|
364 if len(r_aln) == len(g_aln) :
|
|
365 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
|
|
366 if True in checking_genome_context :
|
|
367 try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
|
|
368 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
|
|
369 try_pos, FR)
|
|
370 if my_region_serial == 0 : # still be 0
|
|
371 # for some cases, read has no tags; searching the upstream sequence for tags
|
|
372 # print "[For debug]: FW read has no tags"
|
|
373 try_count = 0
|
|
374 try_pos = mapped_location - min_cut5_len + 1
|
|
375 while my_region_serial == 0 and try_count < MAX_TRY :
|
|
376 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
|
|
377 try_pos, FR)
|
|
378 try_pos -= 1
|
|
379 try_count += 1
|
|
380
|
|
381 #if my_region_serial == 0 :
|
|
382 # print "[For debug]: chr=", mapped_chr
|
|
383 # print "[For debug]: +FW read still can not find fragment serial"
|
|
384 # Tip: sometimes "my_region_serial" is still 0 ...
|
|
385
|
|
386
|
|
387 N_mismatch = N_MIS(r_aln, g_aln)
|
1
|
388 # if N_mismatch <= int(max_mismatch_no) :
|
|
389 mm_no=float(max_mismatch_no)
|
|
390 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
|
0
|
391 all_mapped_passed += 1
|
|
392 methy = methy_seq(r_aln, g_aln + next2bp)
|
|
393 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
|
|
394 #---XS FILTER----------------
|
|
395 XS = 0
|
|
396 nCH = methy.count('y') + methy.count('z')
|
|
397 nmCH = methy.count('Y') + methy.count('Z')
|
|
398 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
|
|
399 XS = 1
|
|
400 num_mapped_FW_C2T += 1
|
|
401 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
|
|
402 mapped_location, cigar, original_BS, methy, XS,
|
|
403 output_genome = output_genome,
|
|
404 rrbs = True,
|
|
405 my_region_serial = my_region_serial,
|
|
406 my_region_start = my_region_start,
|
|
407 my_region_end = my_region_end)
|
1
|
408 all_base_mapped += len(original_BS)
|
0
|
409 else :
|
|
410 print "[For debug]: reads not in same lengths"
|
|
411
|
|
412 #print "start RC"
|
|
413 # ---- RC ----
|
|
414 RC_regions = dict()
|
|
415 for header in RC_uniq_lst :
|
|
416 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
|
|
417 original_BS = original_bs_reads[header]
|
|
418 if mapped_chr not in RC_regions :
|
|
419 RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
|
|
420 if mapped_chr not in gseq :
|
|
421 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
|
|
422 chr_length[mapped_chr] = len(gseq[mapped_chr])
|
|
423
|
|
424 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
|
|
425 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
|
|
426 all_mapped+=1
|
|
427 FR = "-FW"
|
|
428 mapped_strand = "-"
|
|
429 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
|
|
430 mapped_location,
|
|
431 mapped_location + g_len,
|
|
432 mapped_strand)
|
|
433 #checking_genome_context = (output_genome[gx:gy] == check_pattern)
|
|
434 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
|
|
435 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
|
|
436
|
|
437 if len(r_aln) == len(g_aln) : # and checking_genome_context:
|
|
438 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
|
|
439 if True in checking_genome_context :
|
|
440 try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
|
|
441 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
|
|
442 try_pos , FR)
|
|
443 if my_region_serial == 0 : # still be 0
|
|
444 # for some cases, read has no tags; searching the upstream sequence for tags
|
|
445 #print "[For debug]: RC Read has no tags"
|
|
446 try_count = 0
|
|
447 try_pos = mapped_location + g_len + min_cut5_len - 2
|
|
448 while my_region_serial == 0 and try_count < MAX_TRY :
|
|
449 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
|
|
450 try_pos, FR)
|
|
451 try_pos += 1
|
|
452 try_count += 1
|
|
453
|
|
454 #if my_region_serial == 0 :
|
|
455 # print "[For debug]: chr=", mapped_chr
|
|
456 # print "[For debug]: -FW read still cannot find fragment serial"
|
|
457
|
|
458
|
|
459 N_mismatch = N_MIS(r_aln, g_aln)
|
1
|
460 # if N_mismatch <= int(max_mismatch_no) :
|
|
461 mm_no=float(max_mismatch_no)
|
|
462 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
|
0
|
463 all_mapped_passed += 1
|
|
464 methy = methy_seq(r_aln, g_aln + next2bp)
|
|
465 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
|
|
466 #---XS FILTER----------------
|
|
467 XS = 0
|
|
468 nCH = methy.count('y') + methy.count('z')
|
|
469 nmCH = methy.count('Y') + methy.count('Z')
|
|
470 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
|
|
471 XS = 1
|
|
472 num_mapped_RC_C2T += 1
|
|
473 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
|
|
474 mapped_location, cigar, original_BS, methy, XS,
|
|
475 output_genome = output_genome,
|
|
476 rrbs = True,
|
|
477 my_region_serial = my_region_serial,
|
|
478 my_region_start = my_region_start,
|
|
479 my_region_end = my_region_end)
|
1
|
480 all_base_mapped += len(original_BS)
|
0
|
481 else :
|
|
482 print "[For debug]: reads not in same lengths"
|
|
483
|
|
484
|
|
485 # Finished both FW and RC
|
|
486 logm("Done: %s (%d) \n" % (read_file, no_my_files))
|
|
487 print "--> %s (%d) "%(read_file, no_my_files)
|
|
488 del original_bs_reads
|
|
489 delete_files(WC2T, CC2T)
|
|
490
|
|
491 # End of directional library
|
|
492
|
|
493
|
|
494 # ====================================================
|
|
495 # un-directional library
|
|
496 # ====================================================
|
|
497
|
|
498 elif asktag=="Y" :
|
|
499 #----------------------------------------------------------------
|
|
500 logm("== Start mapping ==")
|
|
501
|
|
502 input_fname = os.path.split(main_read_file)[1]
|
|
503 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
|
|
504
|
|
505 logm("Processing read file: %s" % read_file)
|
|
506 original_bs_reads = {}
|
|
507 no_my_files+=1
|
|
508 random_id = ".tmp-"+str(random.randint(1000000,9999999))
|
|
509 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
|
|
510 outfile3=tmp_d('Trimmed_G2A.fa'+random_id)
|
|
511
|
|
512 outf2=open(outfile2,'w')
|
|
513 outf3=open(outfile3,'w')
|
|
514
|
|
515 #--- Checking input format ------------------------------------------
|
|
516 try :
|
1
|
517 if read_file.endswith(".gz") : # support input file ending with ".gz"
|
|
518 read_inf = gzip.open(read_file, "rb")
|
|
519 else :
|
|
520 read_inf=open(read_file,"r")
|
0
|
521 except IOError:
|
|
522 print "[Error] Cannot open input file : %s" % read_file
|
|
523 exit(-1)
|
|
524
|
1
|
525 oneline = read_inf.readline()
|
|
526 l = oneline.split()
|
|
527 input_format = ""
|
|
528 if oneline[0]=="@":
|
|
529 input_format = "fastq"
|
|
530 elif len(l)==1 and oneline[0]!=">" :
|
|
531 input_format = "seq"
|
|
532 elif len(l)==11:
|
|
533 input_format = "qseq"
|
|
534 elif oneline[0]==">" :
|
|
535 input_format = "fasta"
|
0
|
536 read_inf.close()
|
|
537
|
|
538 #----------------------------------------------------------------
|
1
|
539 read_id = ""
|
0
|
540 seq = ""
|
1
|
541 seq_ready = 0
|
|
542 line_no = 0
|
|
543 for line in fileinput.input(read_file, openhook=fileinput.hook_compressed):
|
|
544 l = line.split()
|
|
545 line_no += 1
|
|
546 if input_format=="seq":
|
|
547 all_raw_reads += 1
|
|
548 read_id = str(all_raw_reads)
|
|
549 read_id = read_id.zfill(12)
|
|
550 seq = l[0]
|
|
551 seq_ready = "Y"
|
0
|
552 elif input_format=="fastq":
|
1
|
553 l_fastq = math.fmod(line_no, 4)
|
|
554 if l_fastq == 1 :
|
|
555 all_raw_reads += 1
|
|
556 read_id = l[0][1:]
|
|
557 seq_ready = "N"
|
|
558 elif l_fastq == 2 :
|
|
559 seq = l[0]
|
|
560 seq_ready = "Y"
|
|
561 else :
|
|
562 seq = ""
|
|
563 seq_ready = "N"
|
0
|
564 elif input_format=="qseq":
|
1
|
565 all_raw_reads += 1
|
|
566 read_id = str(all_raw_reads)
|
|
567 read_id = read_id.zfill(12)
|
|
568 seq = l[8]
|
|
569 seq_ready = "Y"
|
|
570 elif input_format=="fasta" :
|
|
571 l_fasta = math.fmod(line_no,2)
|
|
572 if l_fasta==1:
|
|
573 all_raw_reads += 1
|
|
574 read_id = l[0][1:]
|
|
575 seq = ""
|
|
576 seq_ready = "N"
|
|
577 elif l_fasta==0 :
|
|
578 seq = l[0]
|
|
579 seq_ready = "Y"
|
|
580
|
|
581 #---------------------------------------------------------------
|
0
|
582 if seq_ready=="Y":
|
|
583 # Normalize the characters
|
1
|
584 seq = seq.upper().replace(".","N")
|
0
|
585
|
|
586 read_tag = [ m for m,n in [ (i, len(i)) for i in uniq(cut3_tag_lst)] if seq[0:n] == m ]
|
|
587 if len(read_tag) > 0 :
|
|
588 all_tagged += 1
|
|
589 for i in read_tag :
|
|
590 n_cut_tag_lst[i] += 1
|
|
591
|
|
592 seq = seq[(cut_s-1):cut_e] # cut_s start from 1 cycle by default
|
|
593
|
|
594 #-- Trimming adapter sequence ---
|
1
|
595 all_base_before_trim += len(seq)
|
0
|
596 if adapter_seq != "" :
|
1
|
597 new_read = RemoveAdapter(seq, adapter_seq, adapter_mismatch, Extra_base_cut_5end_adapter)
|
0
|
598 if len(new_read) < len(seq) :
|
|
599 all_tagged_trimmed += 1
|
|
600 seq = new_read
|
1
|
601 all_base_after_trim += len(seq)
|
0
|
602 if len(seq) <= 4 :
|
|
603 seq = "N" * (cut_e - cut_s)
|
|
604
|
|
605 # all reads will be considered, regardless of tags
|
|
606 #--------- trimmed_raw_BS_read and qscore ------------------
|
1
|
607 original_bs_reads[read_id] = seq
|
0
|
608 #--------- FW_C2T ------------------
|
1
|
609 outf2.write('>%s\n%s\n'%(read_id, seq.replace('C', 'T')))
|
0
|
610 #--------- RC_G2A ------------------
|
1
|
611 outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
|
0
|
612 fileinput.close()
|
|
613
|
|
614 outf2.close()
|
|
615
|
|
616 delete_files(read_file)
|
|
617 logm("Processing input is done")
|
|
618 #--------------------------------------------------------------------------------
|
|
619
|
|
620 # mapping
|
|
621 #--------------------------------------------------------------------------------
|
1
|
622 WC2T = tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
|
|
623 CC2T = tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
|
|
624 WG2A = tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
|
|
625 CG2A = tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
|
0
|
626
|
|
627 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
|
|
628 'input_file' : outfile2,
|
|
629 'output_file' : WC2T},
|
|
630 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
|
|
631 'input_file' : outfile2,
|
|
632 'output_file' : CC2T},
|
|
633 aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
|
|
634 'input_file' : outfile3,
|
|
635 'output_file' : WG2A},
|
|
636 aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
|
|
637 'input_file' : outfile3,
|
|
638 'output_file' : CG2A} ])
|
|
639
|
|
640 logm("Aligning reads is done")
|
|
641
|
|
642 delete_files(outfile2)
|
|
643
|
|
644 #--------------------------------------------------------------------------------
|
|
645 # Post processing
|
|
646 #--------------------------------------------------------------------------------
|
|
647
|
1
|
648 FW_C2T_U,FW_C2T_R = extract_mapping(WC2T)
|
|
649 RC_G2A_U,RC_G2A_R = extract_mapping(CG2A)
|
0
|
650
|
1
|
651 FW_G2A_U,FW_G2A_R = extract_mapping(WG2A)
|
|
652 RC_C2T_U,RC_C2T_R = extract_mapping(CC2T)
|
0
|
653
|
|
654 logm("Extracting alignments is done")
|
|
655
|
|
656 #----------------------------------------------------------------
|
|
657 # get unique-hit reads
|
|
658 #----------------------------------------------------------------
|
1
|
659 Union_set = set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
|
0
|
660
|
1
|
661 Unique_FW_C2T = set() # +
|
|
662 Unique_RC_G2A = set() # +
|
|
663 Unique_FW_G2A = set() # -
|
|
664 Unique_RC_C2T = set() # -
|
|
665 Multiple_hits = set()
|
0
|
666
|
1
|
667 for x in Union_set :
|
|
668 _list = []
|
0
|
669 for dx in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
|
1
|
670 mis_lst = dx.get(x,[99])
|
|
671 mis = int(mis_lst[0])
|
0
|
672 _list.append(mis)
|
|
673 for dx in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
|
1
|
674 mis = dx.get(x,99)
|
0
|
675 _list.append(mis)
|
1
|
676 mini = min(_list)
|
0
|
677 if _list.count(mini) == 1:
|
1
|
678 mini_index = _list.index(mini)
|
0
|
679 if mini_index == 0:
|
|
680 Unique_FW_C2T.add(x)
|
|
681 elif mini_index == 1:
|
|
682 Unique_RC_G2A.add(x)
|
|
683 elif mini_index == 2:
|
|
684 Unique_FW_G2A.add(x)
|
|
685 elif mini_index == 3:
|
|
686 Unique_RC_C2T.add(x)
|
|
687 else :
|
|
688 Multiple_hits.add(x)
|
|
689 else :
|
|
690 Multiple_hits.add(x)
|
|
691 # write reads rejected by Multiple Hits to file
|
|
692 if show_multiple_hit :
|
1
|
693 outf_MH = open("Multiple_hit.fa",'w')
|
0
|
694 for i in Multiple_hits :
|
|
695 outf_MH.write(">%s\n" % i)
|
|
696 outf_MH.write("%s\n" % original_bs_reads[i])
|
|
697 outf_MH.close()
|
|
698
|
|
699 del Union_set
|
|
700 del FW_C2T_R
|
|
701 del FW_G2A_R
|
|
702 del RC_C2T_R
|
|
703 del RC_G2A_R
|
|
704
|
1
|
705 FW_C2T_uniq_lst = [[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
|
|
706 FW_G2A_uniq_lst = [[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
|
|
707 RC_C2T_uniq_lst = [[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
|
|
708 RC_G2A_uniq_lst = [[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
|
0
|
709 FW_C2T_uniq_lst.sort()
|
|
710 RC_C2T_uniq_lst.sort()
|
|
711 FW_G2A_uniq_lst.sort()
|
|
712 RC_G2A_uniq_lst.sort()
|
1
|
713 FW_C2T_uniq_lst = [x[1] for x in FW_C2T_uniq_lst]
|
|
714 RC_C2T_uniq_lst = [x[1] for x in RC_C2T_uniq_lst]
|
|
715 FW_G2A_uniq_lst = [x[1] for x in FW_G2A_uniq_lst]
|
|
716 RC_G2A_uniq_lst = [x[1] for x in RC_G2A_uniq_lst]
|
0
|
717
|
|
718 del Unique_FW_C2T
|
|
719 del Unique_FW_G2A
|
|
720 del Unique_RC_C2T
|
|
721 del Unique_RC_G2A
|
|
722
|
|
723
|
|
724 #----------------------------------------------------------------
|
|
725 # Post-filtering reads
|
1
|
726 # ---- FW_C2T ---- un-directional
|
0
|
727 FW_regions = dict()
|
|
728 gseq = dict()
|
|
729 chr_length = dict()
|
|
730 for header in FW_C2T_uniq_lst :
|
|
731 _, mapped_chr, mapped_location, cigar = FW_C2T_U[header]
|
|
732 original_BS = original_bs_reads[header]
|
|
733 if mapped_chr not in FW_regions :
|
|
734 FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
|
|
735 if mapped_chr not in gseq :
|
|
736 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
|
|
737 chr_length[mapped_chr] = len(gseq[mapped_chr])
|
|
738
|
|
739 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
|
|
740 all_mapped+=1
|
|
741 FR = "+FW"
|
|
742 mapped_strand = "+"
|
|
743 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
|
|
744 mapped_location,
|
|
745 mapped_location + g_len,
|
|
746 mapped_strand)
|
|
747 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
|
|
748 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
|
|
749
|
|
750 if len(r_aln) == len(g_aln) :
|
|
751 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
|
|
752 if True in checking_genome_context :
|
|
753 try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
|
|
754 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
|
|
755 try_pos, FR)
|
|
756 if my_region_serial == 0 : # still be 0
|
|
757 # for some cases, read has no tags; searching the upstream sequence for tags
|
|
758 # print "[For debug]: FW read has no tags"
|
|
759 try_count = 0
|
|
760 try_pos = mapped_location - min_cut5_len + 1
|
|
761 while my_region_serial == 0 and try_count < MAX_TRY :
|
|
762 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
|
|
763 try_pos, FR)
|
|
764 try_pos -= 1
|
|
765 try_count += 1
|
|
766
|
|
767 N_mismatch = N_MIS(r_aln, g_aln)
|
1
|
768 # if N_mismatch <= int(max_mismatch_no) :
|
|
769 mm_no=float(max_mismatch_no)
|
|
770 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
|
0
|
771 all_mapped_passed += 1
|
|
772 methy = methy_seq(r_aln, g_aln + next2bp)
|
|
773 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
|
|
774 #---XS FILTER----------------
|
|
775 XS = 0
|
|
776 nCH = methy.count('y') + methy.count('z')
|
|
777 nmCH = methy.count('Y') + methy.count('Z')
|
|
778 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
|
|
779 XS = 1
|
|
780 num_mapped_FW_C2T += 1
|
|
781 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
|
|
782 mapped_location, cigar, original_BS, methy, XS,
|
|
783 output_genome = output_genome,
|
|
784 rrbs = True,
|
|
785 my_region_serial = my_region_serial,
|
|
786 my_region_start = my_region_start,
|
|
787 my_region_end = my_region_end)
|
1
|
788 all_base_mapped += len(original_BS)
|
0
|
789 else :
|
|
790 print "[For debug]: reads not in same lengths"
|
|
791
|
|
792
|
1
|
793 # ---- RC_C2T ---- un-directional
|
0
|
794 RC_regions = dict()
|
|
795 for header in RC_C2T_uniq_lst :
|
|
796 _, mapped_chr, mapped_location, cigar = RC_C2T_U[header]
|
|
797 original_BS = original_bs_reads[header]
|
|
798 if mapped_chr not in RC_regions :
|
|
799 RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
|
|
800 if mapped_chr not in gseq :
|
|
801 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
|
|
802 chr_length[mapped_chr] = len(gseq[mapped_chr])
|
|
803
|
|
804 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
|
|
805 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
|
|
806 all_mapped+=1
|
|
807 FR = "-FW"
|
|
808 mapped_strand = "-"
|
|
809 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
|
|
810 mapped_location,
|
|
811 mapped_location + g_len,
|
|
812 mapped_strand)
|
|
813 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
|
|
814 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
|
|
815
|
|
816 if len(r_aln) == len(g_aln) : # and checking_genome_context:
|
|
817 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
|
|
818 if True in checking_genome_context :
|
|
819 try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
|
|
820 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
|
|
821 try_pos , FR)
|
|
822 if my_region_serial == 0 : # still be 0
|
|
823 # for some cases, read has no tags; searching the upstream sequence for tags
|
|
824 #print "[For debug]: RC Read has no tags"
|
|
825 try_count = 0
|
|
826 try_pos = mapped_location + g_len + min_cut5_len - 2
|
|
827 while my_region_serial == 0 and try_count < MAX_TRY :
|
|
828 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
|
|
829 try_pos, FR)
|
|
830 try_pos += 1
|
|
831 try_count += 1
|
|
832
|
|
833 N_mismatch = N_MIS(r_aln, g_aln)
|
1
|
834 # if N_mismatch <= int(max_mismatch_no) :
|
|
835 mm_no=float(max_mismatch_no)
|
|
836 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
|
0
|
837 all_mapped_passed += 1
|
|
838 methy = methy_seq(r_aln, g_aln + next2bp)
|
|
839 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
|
|
840 #---XS FILTER----------------
|
|
841 XS = 0
|
|
842 nCH = methy.count('y') + methy.count('z')
|
|
843 nmCH = methy.count('Y') + methy.count('Z')
|
|
844 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
|
|
845 XS = 1
|
|
846 num_mapped_RC_C2T += 1
|
|
847 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
|
|
848 mapped_location, cigar, original_BS, methy, XS,
|
|
849 output_genome = output_genome,
|
|
850 rrbs = True,
|
|
851 my_region_serial = my_region_serial,
|
|
852 my_region_start = my_region_start,
|
|
853 my_region_end = my_region_end)
|
1
|
854 all_base_mapped += len(original_BS)
|
0
|
855
|
|
856 else :
|
|
857 print "[For debug]: reads not in same lengths"
|
|
858
|
|
859
|
1
|
860 # ---- FW_G2A ---- un-directional
|
0
|
861 FW_regions = dict()
|
|
862 gseq = dict()
|
|
863 chr_length = dict()
|
|
864 for header in FW_G2A_uniq_lst :
|
|
865 _, mapped_chr, mapped_location, cigar = FW_G2A_U[header]
|
|
866 original_BS = original_bs_reads[header]
|
|
867 if mapped_chr not in FW_regions :
|
|
868 FW_regions[mapped_chr] = deserialize(db_d(FWD_MAPPABLE_REGIONS(mapped_chr)))
|
|
869 if mapped_chr not in gseq :
|
|
870 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
|
|
871 chr_length[mapped_chr] = len(gseq[mapped_chr])
|
|
872 cigar = list(reversed(cigar))
|
|
873
|
|
874 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
|
|
875 all_mapped+=1
|
|
876 FR = "-RC"
|
|
877 mapped_strand = "-"
|
|
878 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
|
|
879 mapped_location,
|
|
880 mapped_location + g_len,
|
|
881 mapped_strand)
|
|
882 original_BS = reverse_compl_seq(original_BS) # for RC reads
|
|
883 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
|
|
884 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
|
|
885
|
|
886 if len(r_aln) == len(g_aln) :
|
|
887 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
|
|
888 if True in checking_genome_context :
|
|
889 try_pos = [mapped_location - len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
|
|
890 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
|
|
891 try_pos, FR)
|
|
892 if my_region_serial == 0 : # still be 0
|
|
893 # for some cases, read has no tags; searching the upstream sequence for tags
|
|
894 #print "[For debug]: FW read has no tags"
|
|
895 try_count = 0
|
|
896 try_pos = mapped_location - min_cut5_len + 1
|
|
897 while my_region_serial == 0 and try_count < MAX_TRY :
|
|
898 my_region_serial, my_region_start, my_region_end = my_mappable_region(FW_regions[mapped_chr],
|
|
899 try_pos, FR)
|
|
900 try_pos += 1
|
|
901 try_count += 1
|
|
902 #if my_region_serial == 0 :
|
|
903 # print "[For debug]: chr=", mapped_chr
|
|
904 # print "[For debug]: FW_G2A read still can not find fragment serial"
|
|
905 # Tip: sometimes "my_region_serial" is still 0 ...
|
|
906
|
|
907 N_mismatch = N_MIS(r_aln, g_aln)
|
1
|
908 # if N_mismatch <= int(max_mismatch_no) :
|
|
909 max_mismatch_no=float(max_mismatch_no)
|
|
910 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
|
0
|
911 all_mapped_passed += 1
|
|
912 methy = methy_seq(r_aln, g_aln + next2bp)
|
|
913 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
|
|
914 #---XS FILTER----------------
|
|
915 XS = 0
|
|
916 nCH = methy.count('y') + methy.count('z')
|
|
917 nmCH = methy.count('Y') + methy.count('Z')
|
|
918 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
|
|
919 XS = 1
|
|
920 num_mapped_FW_G2A += 1
|
|
921 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
|
|
922 mapped_location, cigar, original_BS, methy, XS,
|
|
923 output_genome = output_genome,
|
|
924 rrbs = True,
|
|
925 my_region_serial = my_region_serial,
|
|
926 my_region_start = my_region_start,
|
|
927 my_region_end = my_region_end)
|
1
|
928 all_base_mapped += len(original_BS)
|
0
|
929 else :
|
|
930 print "[For debug]: reads not in same lengths"
|
|
931
|
|
932
|
1
|
933 # ---- RC_G2A ---- un-directional
|
0
|
934 RC_regions = dict()
|
|
935 for header in RC_G2A_uniq_lst :
|
|
936 _, mapped_chr, mapped_location, cigar = RC_G2A_U[header]
|
|
937 original_BS = original_bs_reads[header]
|
|
938 if mapped_chr not in RC_regions :
|
|
939 RC_regions[mapped_chr] = deserialize(db_d(REV_MAPPABLE_REGIONS(mapped_chr)))
|
|
940 if mapped_chr not in gseq :
|
|
941 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
|
|
942 chr_length[mapped_chr] = len(gseq[mapped_chr])
|
|
943 cigar = list(reversed(cigar))
|
|
944
|
|
945 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
|
|
946 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
|
|
947 all_mapped+=1
|
|
948 FR = "+RC"
|
|
949 mapped_strand = "+"
|
|
950 origin_genome, next2bp, output_genome = get_genomic_sequence(gseq[mapped_chr],
|
|
951 mapped_location,
|
|
952 mapped_location + g_len,
|
|
953 mapped_strand)
|
|
954 original_BS = reverse_compl_seq(original_BS) # for RC reads
|
|
955 checking_genome_context = [output_genome[i:j] == k for i,j,k in zip(gx,gy,check_pattern) ]
|
|
956 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
|
|
957
|
|
958 if len(r_aln) == len(g_aln) : # and checking_genome_context:
|
|
959 my_region_serial, my_region_start, my_region_end = [-1, 0, 0]
|
|
960 if True in checking_genome_context :
|
|
961 try_pos = [mapped_location + g_len - 1 + len(i) for i,j in zip(cut5_tag_lst, checking_genome_context) if j][0]
|
|
962 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
|
|
963 mapped_location + g_len + min_cut5_len -1, FR)
|
|
964 if my_region_serial == 0 : # still be 0
|
|
965 # for some cases, read has no tags; searching the upstream sequence for tags
|
|
966 #print "[For debug]: RC Read has no tags"
|
|
967 try_count = 0
|
|
968 try_pos = mapped_location + g_len + min_cut5_len - 2
|
|
969 while try_count < MAX_TRY :
|
|
970 my_region_serial, my_region_start, my_region_end = my_mappable_region(RC_regions[mapped_chr],
|
|
971 try_pos, FR)
|
|
972 try_pos += 1
|
|
973 try_count += 1
|
|
974
|
|
975 #if my_region_serial == 0 :
|
|
976 # print "[For debug]: chr=", mapped_chr
|
|
977 # print "[For debug]: RC_C2A read still cannot find fragment serial"
|
|
978
|
|
979 N_mismatch = N_MIS(r_aln, g_aln)
|
1
|
980 # if N_mismatch <= int(max_mismatch_no) :
|
|
981 mm_no=float(max_mismatch_no)
|
|
982 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
|
0
|
983 all_mapped_passed += 1
|
|
984 methy = methy_seq(r_aln, g_aln + next2bp)
|
|
985 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
|
|
986 #---XS FILTER----------------
|
|
987 XS = 0
|
|
988 nCH = methy.count('y') + methy.count('z')
|
|
989 nmCH = methy.count('Y') + methy.count('Z')
|
|
990 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
|
|
991 XS = 1
|
|
992 num_mapped_RC_G2A += 1
|
|
993 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand,
|
|
994 mapped_location, cigar, original_BS, methy, XS,
|
|
995 output_genome = output_genome,
|
|
996 rrbs = True,
|
|
997 my_region_serial = my_region_serial,
|
|
998 my_region_start = my_region_start,
|
|
999 my_region_end = my_region_end)
|
1
|
1000 all_base_mapped += len(original_BS)
|
0
|
1001 else :
|
|
1002 print "[For debug]: reads not in same lengths"
|
|
1003
|
|
1004 # Finished both FW and RC
|
|
1005 logm("Done: %s (%d) \n" % (read_file, no_my_files))
|
|
1006 print "--> %s (%d) "%(read_file, no_my_files)
|
|
1007 del original_bs_reads
|
|
1008 delete_files(WC2T, CC2T, WG2A, CG2A)
|
|
1009 # End of un-directional library
|
|
1010
|
|
1011 delete_files(tmp_path)
|
|
1012
|
|
1013
|
1
|
1014 logm("Number of raw reads: %d "% all_raw_reads)
|
|
1015 if all_raw_reads>0:
|
|
1016 logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)"%(all_tagged,float(all_tagged)/all_raw_reads))
|
0
|
1017 for kk in range(len(n_cut_tag_lst)):
|
1
|
1018 logm("Number of raw reads with tag %s: %d (%1.3f)"%(cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
|
|
1019 if adapter_seq!="" :
|
|
1020 logm("Number of reads having adapter removed: %d "%all_tagged_trimmed)
|
|
1021 logm("Number of bases in total: %d "%all_base_before_trim)
|
|
1022 if adapter_seq!="" :
|
|
1023 logm("Number of bases after trimming the adapters: %d (%1.3f)"%(all_base_after_trim, float(all_base_after_trim)/all_base_before_trim ) )
|
|
1024 logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
|
|
1025 logm("Number of unique-hits reads (before post-filtering): %d"%all_mapped)
|
0
|
1026
|
1
|
1027 logm("------ %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
|
|
1028 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads))
|
|
1029 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
|
0
|
1030
|
1
|
1031 if asktag=="Y": # un-diretional
|
0
|
1032 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
|
|
1033 logm(" ---- %7d RC reads mapped to Watson strand"%(num_mapped_FW_G2A) )
|
|
1034 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
|
|
1035 logm(" ---- %7d RC reads mapped to Crick strand"%(num_mapped_RC_G2A) )
|
|
1036 # the variable name 'num_mapped_RC_G2A' seems not consistent with illustration
|
|
1037 # according to literal meaning
|
|
1038 elif asktag=="N": # directional
|
|
1039 logm(" ---- %7d FW reads mapped to Watson strand"%(num_mapped_FW_C2T) )
|
|
1040 logm(" ---- %7d FW reads mapped to Crick strand"%(num_mapped_RC_C2T) )
|
1
|
1041 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
|
0
|
1042
|
1
|
1043 n_CG = mC_lst[0] + uC_lst[0]
|
|
1044 n_CHG = mC_lst[1] + uC_lst[1]
|
|
1045 n_CHH = mC_lst[2] + uC_lst[2]
|
0
|
1046
|
|
1047 logm("----------------------------------------------")
|
1
|
1048 logm("Methylated C in mapped reads ")
|
|
1049 logm(" mCG %1.3f%%"%( (100 * float(mC_lst[0]) / n_CG) if n_CG!=0 else 0))
|
|
1050 logm(" mCHG %1.3f%%"%( (100 * float(mC_lst[1]) / n_CHG) if n_CHG!=0 else 0))
|
|
1051 logm(" mCHH %1.3f%%"%( (100 * float(mC_lst[2]) / n_CHH) if n_CHH!=0 else 0))
|
0
|
1052 logm("----------------------------------------------")
|
|
1053 logm("------------------- END ----------------------")
|
|
1054
|
|
1055 elapsed(main_read_file)
|
|
1056
|
|
1057 close_log()
|
|
1058
|
|
1059
|