Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_index/rrbs_build.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 import os | |
2 | |
3 from bs_utils.utils import * | |
4 | |
5 | |
6 FWD_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.fwd_mappable_regions' | |
7 REV_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.rev_mappable_regions' | |
8 | |
9 | |
10 # example: T-CGA | |
11 | |
12 def rrbs_build(fasta_file, build_command, ref_path, low_bound, up_bound, aligner, cut_format="C-CGG"): | |
13 # ref_path is a string that contains the directory where the reference genomes are stored with | |
14 # the input fasta filename appended | |
15 | |
16 cut_format = cut_format.upper() # Ex. "C-CGG,C-TAG"; MspI&ApekI:"G^CWGC" | |
17 if cut_format == "C-CGG" : | |
18 ref_path = os.path.join(ref_path, | |
19 os.path.split(fasta_file)[1] + '_rrbs_%d_%d' % (low_bound, up_bound) +'_' + aligner) | |
20 else : | |
21 ref_path = os.path.join(ref_path, | |
22 os.path.split(fasta_file)[1] + '_rrbs_%s_%d_%d' % ( cut_format.replace("-","").replace(",","-"), low_bound, up_bound) +'_' + aligner) | |
23 | |
24 ref_p = lambda filename: os.path.join(ref_path, filename) | |
25 | |
26 clear_dir(ref_path) | |
27 open_log(ref_p('log')) | |
28 | |
29 refd = {} | |
30 w_c2t = open(ref_p('W_C2T.fa'),'w') | |
31 c_c2t = open(ref_p('C_C2T.fa'),'w') | |
32 | |
33 w_g2a = open(ref_p('W_G2A.fa'),'w') | |
34 c_g2a = open(ref_p('C_G2A.fa'),'w') | |
35 | |
36 mappable_regions_output_file = open(ref_p("RRBS_mappable_regions.txt"),"w") | |
37 | |
38 all_L = 0 | |
39 all_mappable_length = 0 | |
40 all_unmappable_length = 0 | |
41 | |
42 no_mappable_region = 0 | |
43 total_chromosomes = 0 | |
44 | |
45 # cut_context = re.sub("-", "", cut_format).split(",") | |
46 cut_context = EnumerateIUPAC(cut_format.replace("-","").split(",")) | |
47 cut_len = [len(i) for i in cut_context] | |
48 cut_len_max = max(cut_len) | |
49 | |
50 | |
51 for chrom_id, chrom_seq in read_fasta(fasta_file): | |
52 total_chromosomes += 1 | |
53 refd[chrom_id] = len(chrom_seq) | |
54 | |
55 fwd_chr_regions = {} | |
56 rev_chr_regions = {} | |
57 | |
58 L = len(chrom_seq) | |
59 XXXX_sites = [] | |
60 XXXX_XXXX = [] | |
61 | |
62 #-- collect all "XXXX sites --------------------------------- | |
63 i = 1 | |
64 while i <= L - cut_len_max: | |
65 j = 0 | |
66 while j < len(cut_len) : | |
67 if chrom_seq[i : i + cut_len[j]] == cut_context[j]: | |
68 XXXX_sites.append( (i, i + cut_len[j] - 1) ) # add (1st position, last position) | |
69 j += 1 | |
70 i += 1 | |
71 | |
72 #-- find "XXXX" pairs that are within the length of fragment ---- | |
73 for j in xrange(len(XXXX_sites) - 1): | |
74 DD = (XXXX_sites[j+1][0] - XXXX_sites[j][1]) - 1 # NOT including both XXXX; DD: fragment length | |
75 if low_bound <= DD <= up_bound: | |
76 XXXX_XXXX.append([XXXX_sites[j][0], XXXX_sites[j+1][1]]) # leftmost <--> rightmost | |
77 mappable_seq = chrom_seq[XXXX_sites[j][0] : XXXX_sites[j+1][1] + 1] | |
78 no_mappable_region += 1 | |
79 | |
80 fwd_chr_regions[str(XXXX_sites[j][0])] = [XXXX_sites[j+1][1], no_mappable_region] | |
81 rev_chr_regions[str(XXXX_sites[j+1][1])] = [XXXX_sites[j][0], no_mappable_region] | |
82 | |
83 # start_position, end_position, serial, sequence | |
84 mappable_regions_output_file.write("%s\t%d\t%d\t%d\t%s\n"%(chrom_id, no_mappable_region, | |
85 XXXX_sites[j][0], XXXX_sites[j+1][1], mappable_seq)) | |
86 # storing region information to file | |
87 # format: A[left_CCGG_pos]=[right_CCGG_pos, number_of_mappable_region] | |
88 serialize(fwd_chr_regions, ref_p(FWD_MAPPABLE_REGIONS(chrom_id))) | |
89 serialize(rev_chr_regions, ref_p(REV_MAPPABLE_REGIONS(chrom_id))) | |
90 | |
91 #----------------------------------- | |
92 # mask the genome | |
93 _map_seq = [] | |
94 mappable_length = 0 | |
95 unmappable_length = 0 | |
96 m = 0 | |
97 mark = False | |
98 while m < L: # for every nucleotide in chr | |
99 if len(XXXX_XXXX) > 0: | |
100 pair = XXXX_XXXX[0] | |
101 p1 = pair[0] # left end of fragment | |
102 p2 = pair[1] # right end of fragment | |
103 if p1 <= m < p2 + 1 : | |
104 _map_seq.append(chrom_seq[m]) | |
105 mappable_length+=1 | |
106 mark = True | |
107 else : | |
108 if not mark: | |
109 _map_seq.append("-") | |
110 unmappable_length += 1 | |
111 else: # the last eligible base | |
112 _ = XXXX_XXXX.pop(0) | |
113 if len(XXXX_XXXX)>0: | |
114 pair = XXXX_XXXX[0] | |
115 p1 = pair[0] | |
116 p2 = pair[1] | |
117 if p1 <= m < p2 + 1: | |
118 _map_seq.append(chrom_seq[m]) | |
119 mappable_length += 1 | |
120 mark = True | |
121 else: | |
122 _map_seq.append("-") | |
123 unmappable_length += 1 | |
124 mark = False | |
125 else: | |
126 _map_seq.append("-") | |
127 unmappable_length+=1 | |
128 mark = False | |
129 else: | |
130 if not mark: | |
131 _map_seq.append("-") | |
132 unmappable_length+=1 | |
133 else: | |
134 _map_seq.append("-") | |
135 unmappable_length+=1 | |
136 mark = False | |
137 | |
138 m+=1 | |
139 | |
140 #----------------------------------- | |
141 | |
142 chrom_seq = ''.join(_map_seq) | |
143 serialize(chrom_seq, ref_p(chrom_id)) | |
144 | |
145 w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T"))) | |
146 w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A"))) | |
147 | |
148 chrom_seq = reverse_compl_seq(chrom_seq) | |
149 | |
150 c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T"))) | |
151 c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A"))) | |
152 | |
153 #----------------------------------- | |
154 logm("# %s : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)"%(chrom_id, | |
155 L, | |
156 unmappable_length, | |
157 mappable_length, | |
158 float(mappable_length)/L) ) | |
159 all_L += L | |
160 all_mappable_length += mappable_length | |
161 all_unmappable_length += unmappable_length | |
162 | |
163 elapsed('Finished initial pre-processing of ' + chrom_id) | |
164 | |
165 | |
166 for outf in [w_c2t, c_c2t, w_g2a, c_g2a]: | |
167 outf.close() | |
168 | |
169 | |
170 logm("# Total %d chromosome(s) : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)" %(total_chromosomes, | |
171 all_L, | |
172 all_unmappable_length, | |
173 all_mappable_length, | |
174 float(all_mappable_length)/all_L) ) | |
175 logm("# eligible fragments : %d" % no_mappable_region ) | |
176 | |
177 serialize(refd, ref_p("refname")) | |
178 | |
179 mappable_regions_output_file.close() | |
180 elapsed('Storing mappable regions and genome') | |
181 | |
182 #---------------- bowtie-build ------------------------------------------- | |
183 | |
184 # append ref_path to all elements of to_bowtie | |
185 to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A']) | |
186 | |
187 run_in_parallel([(build_command % { 'fname' : fname }, fname + '.log') for fname in to_bowtie]) | |
188 | |
189 elapsed('Index building') | |
190 # delete all fasta files | |
191 delete_files( f +'.fa' for f in to_bowtie) | |
192 | |
193 elapsed('END') | |
194 |