annotate BSseeker2/bs_index/rrbs_build.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 import os
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3 from bs_utils.utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
4
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
5
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
6 FWD_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.fwd_mappable_regions'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
7 REV_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.rev_mappable_regions'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
8
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
9
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
10 # example: T-CGA
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
11
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12 def rrbs_build(fasta_file, build_command, ref_path, low_bound, up_bound, aligner, cut_format="C-CGG"):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
13 # ref_path is a string that contains the directory where the reference genomes are stored with
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
14 # the input fasta filename appended
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
15
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
16 cut_format = cut_format.upper() # Ex. "C-CGG,C-TAG"; MspI&ApekI:"G^CWGC"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
17 if cut_format == "C-CGG" :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
18 ref_path = os.path.join(ref_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
19 os.path.split(fasta_file)[1] + '_rrbs_%d_%d' % (low_bound, up_bound) +'_' + aligner)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21 ref_path = os.path.join(ref_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22 os.path.split(fasta_file)[1] + '_rrbs_%s_%d_%d' % ( cut_format.replace("-","").replace(",","-"), low_bound, up_bound) +'_' + aligner)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
24 ref_p = lambda filename: os.path.join(ref_path, filename)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
25
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
26 clear_dir(ref_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
27 open_log(ref_p('log'))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29 refd = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30 w_c2t = open(ref_p('W_C2T.fa'),'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 c_c2t = open(ref_p('C_C2T.fa'),'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
32
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 w_g2a = open(ref_p('W_G2A.fa'),'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
34 c_g2a = open(ref_p('C_G2A.fa'),'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
35
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 mappable_regions_output_file = open(ref_p("RRBS_mappable_regions.txt"),"w")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 all_L = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 all_mappable_length = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40 all_unmappable_length = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
41
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
42 no_mappable_region = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
43 total_chromosomes = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
44
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
45 # cut_context = re.sub("-", "", cut_format).split(",")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
46 cut_context = EnumerateIUPAC(cut_format.replace("-","").split(","))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
47 cut_len = [len(i) for i in cut_context]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
48 cut_len_max = max(cut_len)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
49
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
50
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51 for chrom_id, chrom_seq in read_fasta(fasta_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 total_chromosomes += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 refd[chrom_id] = len(chrom_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 fwd_chr_regions = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56 rev_chr_regions = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
57
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
58 L = len(chrom_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
59 XXXX_sites = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
60 XXXX_XXXX = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
61
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
62 #-- collect all "XXXX sites ---------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
63 i = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
64 while i <= L - cut_len_max:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
65 j = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
66 while j < len(cut_len) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
67 if chrom_seq[i : i + cut_len[j]] == cut_context[j]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
68 XXXX_sites.append( (i, i + cut_len[j] - 1) ) # add (1st position, last position)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
69 j += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
70 i += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
71
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
72 #-- find "XXXX" pairs that are within the length of fragment ----
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
73 for j in xrange(len(XXXX_sites) - 1):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
74 DD = (XXXX_sites[j+1][0] - XXXX_sites[j][1]) - 1 # NOT including both XXXX; DD: fragment length
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
75 if low_bound <= DD <= up_bound:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
76 XXXX_XXXX.append([XXXX_sites[j][0], XXXX_sites[j+1][1]]) # leftmost <--> rightmost
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
77 mappable_seq = chrom_seq[XXXX_sites[j][0] : XXXX_sites[j+1][1] + 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
78 no_mappable_region += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
79
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
80 fwd_chr_regions[str(XXXX_sites[j][0])] = [XXXX_sites[j+1][1], no_mappable_region]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
81 rev_chr_regions[str(XXXX_sites[j+1][1])] = [XXXX_sites[j][0], no_mappable_region]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
82
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
83 # start_position, end_position, serial, sequence
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
84 mappable_regions_output_file.write("%s\t%d\t%d\t%d\t%s\n"%(chrom_id, no_mappable_region,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
85 XXXX_sites[j][0], XXXX_sites[j+1][1], mappable_seq))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
86 # storing region information to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
87 # format: A[left_CCGG_pos]=[right_CCGG_pos, number_of_mappable_region]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
88 serialize(fwd_chr_regions, ref_p(FWD_MAPPABLE_REGIONS(chrom_id)))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
89 serialize(rev_chr_regions, ref_p(REV_MAPPABLE_REGIONS(chrom_id)))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
90
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
91 #-----------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
92 # mask the genome
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
93 _map_seq = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
94 mappable_length = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
95 unmappable_length = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
96 m = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
97 mark = False
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
98 while m < L: # for every nucleotide in chr
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
99 if len(XXXX_XXXX) > 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
100 pair = XXXX_XXXX[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
101 p1 = pair[0] # left end of fragment
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
102 p2 = pair[1] # right end of fragment
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
103 if p1 <= m < p2 + 1 :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104 _map_seq.append(chrom_seq[m])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105 mappable_length+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
106 mark = True
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
107 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
108 if not mark:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
109 _map_seq.append("-")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
110 unmappable_length += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
111 else: # the last eligible base
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
112 _ = XXXX_XXXX.pop(0)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
113 if len(XXXX_XXXX)>0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
114 pair = XXXX_XXXX[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
115 p1 = pair[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
116 p2 = pair[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
117 if p1 <= m < p2 + 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
118 _map_seq.append(chrom_seq[m])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
119 mappable_length += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
120 mark = True
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
121 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
122 _map_seq.append("-")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
123 unmappable_length += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
124 mark = False
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
125 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
126 _map_seq.append("-")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
127 unmappable_length+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
128 mark = False
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
129 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
130 if not mark:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
131 _map_seq.append("-")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
132 unmappable_length+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
133 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
134 _map_seq.append("-")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
135 unmappable_length+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
136 mark = False
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
137
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
138 m+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
139
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
140 #-----------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
141
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
142 chrom_seq = ''.join(_map_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
143 serialize(chrom_seq, ref_p(chrom_id))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
144
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
145 w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
146 w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
147
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
148 chrom_seq = reverse_compl_seq(chrom_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
149
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
150 c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
151 c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
152
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
153 #-----------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
154 logm("# %s : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)"%(chrom_id,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
155 L,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
156 unmappable_length,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
157 mappable_length,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
158 float(mappable_length)/L) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
159 all_L += L
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
160 all_mappable_length += mappable_length
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
161 all_unmappable_length += unmappable_length
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
163 elapsed('Finished initial pre-processing of ' + chrom_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
164
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
165
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
166 for outf in [w_c2t, c_c2t, w_g2a, c_g2a]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
167 outf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
168
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
169
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
170 logm("# Total %d chromosome(s) : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)" %(total_chromosomes,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
171 all_L,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
172 all_unmappable_length,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
173 all_mappable_length,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
174 float(all_mappable_length)/all_L) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
175 logm("# eligible fragments : %d" % no_mappable_region )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
176
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
177 serialize(refd, ref_p("refname"))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
178
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
179 mappable_regions_output_file.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
180 elapsed('Storing mappable regions and genome')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
181
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
182 #---------------- bowtie-build -------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
183
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
184 # append ref_path to all elements of to_bowtie
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
185 to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A'])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
186
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
187 run_in_parallel([(build_command % { 'fname' : fname }, fname + '.log') for fname in to_bowtie])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
188
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
189 elapsed('Index building')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
190 # delete all fasta files
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
191 delete_files( f +'.fa' for f in to_bowtie)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
192
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
193 elapsed('END')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
194