comparison BSseeker2/bs_index/rrbs_build.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e6df770c0e58
1 import os
2
3 from bs_utils.utils import *
4
5
6 FWD_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.fwd_mappable_regions'
7 REV_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.rev_mappable_regions'
8
9
10 # example: T-CGA
11
12 def rrbs_build(fasta_file, build_command, ref_path, low_bound, up_bound, aligner, cut_format="C-CGG"):
13 # ref_path is a string that contains the directory where the reference genomes are stored with
14 # the input fasta filename appended
15
16 cut_format = cut_format.upper() # Ex. "C-CGG,C-TAG"; MspI&ApekI:"G^CWGC"
17 if cut_format == "C-CGG" :
18 ref_path = os.path.join(ref_path,
19 os.path.split(fasta_file)[1] + '_rrbs_%d_%d' % (low_bound, up_bound) +'_' + aligner)
20 else :
21 ref_path = os.path.join(ref_path,
22 os.path.split(fasta_file)[1] + '_rrbs_%s_%d_%d' % ( cut_format.replace("-","").replace(",","-"), low_bound, up_bound) +'_' + aligner)
23
24 ref_p = lambda filename: os.path.join(ref_path, filename)
25
26 clear_dir(ref_path)
27 open_log(ref_p('log'))
28
29 refd = {}
30 w_c2t = open(ref_p('W_C2T.fa'),'w')
31 c_c2t = open(ref_p('C_C2T.fa'),'w')
32
33 w_g2a = open(ref_p('W_G2A.fa'),'w')
34 c_g2a = open(ref_p('C_G2A.fa'),'w')
35
36 mappable_regions_output_file = open(ref_p("RRBS_mappable_regions.txt"),"w")
37
38 all_L = 0
39 all_mappable_length = 0
40 all_unmappable_length = 0
41
42 no_mappable_region = 0
43 total_chromosomes = 0
44
45 # cut_context = re.sub("-", "", cut_format).split(",")
46 cut_context = EnumerateIUPAC(cut_format.replace("-","").split(","))
47 cut_len = [len(i) for i in cut_context]
48 cut_len_max = max(cut_len)
49
50
51 for chrom_id, chrom_seq in read_fasta(fasta_file):
52 total_chromosomes += 1
53 refd[chrom_id] = len(chrom_seq)
54
55 fwd_chr_regions = {}
56 rev_chr_regions = {}
57
58 L = len(chrom_seq)
59 XXXX_sites = []
60 XXXX_XXXX = []
61
62 #-- collect all "XXXX sites ---------------------------------
63 i = 1
64 while i <= L - cut_len_max:
65 j = 0
66 while j < len(cut_len) :
67 if chrom_seq[i : i + cut_len[j]] == cut_context[j]:
68 XXXX_sites.append( (i, i + cut_len[j] - 1) ) # add (1st position, last position)
69 j += 1
70 i += 1
71
72 #-- find "XXXX" pairs that are within the length of fragment ----
73 for j in xrange(len(XXXX_sites) - 1):
74 DD = (XXXX_sites[j+1][0] - XXXX_sites[j][1]) - 1 # NOT including both XXXX; DD: fragment length
75 if low_bound <= DD <= up_bound:
76 XXXX_XXXX.append([XXXX_sites[j][0], XXXX_sites[j+1][1]]) # leftmost <--> rightmost
77 mappable_seq = chrom_seq[XXXX_sites[j][0] : XXXX_sites[j+1][1] + 1]
78 no_mappable_region += 1
79
80 fwd_chr_regions[str(XXXX_sites[j][0])] = [XXXX_sites[j+1][1], no_mappable_region]
81 rev_chr_regions[str(XXXX_sites[j+1][1])] = [XXXX_sites[j][0], no_mappable_region]
82
83 # start_position, end_position, serial, sequence
84 mappable_regions_output_file.write("%s\t%d\t%d\t%d\t%s\n"%(chrom_id, no_mappable_region,
85 XXXX_sites[j][0], XXXX_sites[j+1][1], mappable_seq))
86 # storing region information to file
87 # format: A[left_CCGG_pos]=[right_CCGG_pos, number_of_mappable_region]
88 serialize(fwd_chr_regions, ref_p(FWD_MAPPABLE_REGIONS(chrom_id)))
89 serialize(rev_chr_regions, ref_p(REV_MAPPABLE_REGIONS(chrom_id)))
90
91 #-----------------------------------
92 # mask the genome
93 _map_seq = []
94 mappable_length = 0
95 unmappable_length = 0
96 m = 0
97 mark = False
98 while m < L: # for every nucleotide in chr
99 if len(XXXX_XXXX) > 0:
100 pair = XXXX_XXXX[0]
101 p1 = pair[0] # left end of fragment
102 p2 = pair[1] # right end of fragment
103 if p1 <= m < p2 + 1 :
104 _map_seq.append(chrom_seq[m])
105 mappable_length+=1
106 mark = True
107 else :
108 if not mark:
109 _map_seq.append("-")
110 unmappable_length += 1
111 else: # the last eligible base
112 _ = XXXX_XXXX.pop(0)
113 if len(XXXX_XXXX)>0:
114 pair = XXXX_XXXX[0]
115 p1 = pair[0]
116 p2 = pair[1]
117 if p1 <= m < p2 + 1:
118 _map_seq.append(chrom_seq[m])
119 mappable_length += 1
120 mark = True
121 else:
122 _map_seq.append("-")
123 unmappable_length += 1
124 mark = False
125 else:
126 _map_seq.append("-")
127 unmappable_length+=1
128 mark = False
129 else:
130 if not mark:
131 _map_seq.append("-")
132 unmappable_length+=1
133 else:
134 _map_seq.append("-")
135 unmappable_length+=1
136 mark = False
137
138 m+=1
139
140 #-----------------------------------
141
142 chrom_seq = ''.join(_map_seq)
143 serialize(chrom_seq, ref_p(chrom_id))
144
145 w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
146 w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
147
148 chrom_seq = reverse_compl_seq(chrom_seq)
149
150 c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
151 c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
152
153 #-----------------------------------
154 logm("# %s : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)"%(chrom_id,
155 L,
156 unmappable_length,
157 mappable_length,
158 float(mappable_length)/L) )
159 all_L += L
160 all_mappable_length += mappable_length
161 all_unmappable_length += unmappable_length
162
163 elapsed('Finished initial pre-processing of ' + chrom_id)
164
165
166 for outf in [w_c2t, c_c2t, w_g2a, c_g2a]:
167 outf.close()
168
169
170 logm("# Total %d chromosome(s) : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)" %(total_chromosomes,
171 all_L,
172 all_unmappable_length,
173 all_mappable_length,
174 float(all_mappable_length)/all_L) )
175 logm("# eligible fragments : %d" % no_mappable_region )
176
177 serialize(refd, ref_p("refname"))
178
179 mappable_regions_output_file.close()
180 elapsed('Storing mappable regions and genome')
181
182 #---------------- bowtie-build -------------------------------------------
183
184 # append ref_path to all elements of to_bowtie
185 to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A'])
186
187 run_in_parallel([(build_command % { 'fname' : fname }, fname + '.log') for fname in to_bowtie])
188
189 elapsed('Index building')
190 # delete all fasta files
191 delete_files( f +'.fa' for f in to_bowtie)
192
193 elapsed('END')
194