0
|
1 import os
|
|
2
|
|
3 from bs_utils.utils import *
|
|
4
|
|
5
|
|
6 FWD_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.fwd_mappable_regions'
|
|
7 REV_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.rev_mappable_regions'
|
|
8
|
|
9
|
|
10 # example: T-CGA
|
|
11
|
|
12 def rrbs_build(fasta_file, build_command, ref_path, low_bound, up_bound, aligner, cut_format="C-CGG"):
|
|
13 # ref_path is a string that contains the directory where the reference genomes are stored with
|
|
14 # the input fasta filename appended
|
|
15
|
|
16 cut_format = cut_format.upper() # Ex. "C-CGG,C-TAG"; MspI&ApekI:"G^CWGC"
|
|
17 if cut_format == "C-CGG" :
|
|
18 ref_path = os.path.join(ref_path,
|
|
19 os.path.split(fasta_file)[1] + '_rrbs_%d_%d' % (low_bound, up_bound) +'_' + aligner)
|
|
20 else :
|
|
21 ref_path = os.path.join(ref_path,
|
|
22 os.path.split(fasta_file)[1] + '_rrbs_%s_%d_%d' % ( cut_format.replace("-","").replace(",","-"), low_bound, up_bound) +'_' + aligner)
|
|
23
|
|
24 ref_p = lambda filename: os.path.join(ref_path, filename)
|
|
25
|
|
26 clear_dir(ref_path)
|
|
27 open_log(ref_p('log'))
|
|
28
|
|
29 refd = {}
|
|
30 w_c2t = open(ref_p('W_C2T.fa'),'w')
|
|
31 c_c2t = open(ref_p('C_C2T.fa'),'w')
|
|
32
|
|
33 w_g2a = open(ref_p('W_G2A.fa'),'w')
|
|
34 c_g2a = open(ref_p('C_G2A.fa'),'w')
|
|
35
|
|
36 mappable_regions_output_file = open(ref_p("RRBS_mappable_regions.txt"),"w")
|
|
37
|
|
38 all_L = 0
|
|
39 all_mappable_length = 0
|
|
40 all_unmappable_length = 0
|
|
41
|
|
42 no_mappable_region = 0
|
|
43 total_chromosomes = 0
|
|
44
|
|
45 # cut_context = re.sub("-", "", cut_format).split(",")
|
|
46 cut_context = EnumerateIUPAC(cut_format.replace("-","").split(","))
|
|
47 cut_len = [len(i) for i in cut_context]
|
|
48 cut_len_max = max(cut_len)
|
|
49
|
|
50
|
|
51 for chrom_id, chrom_seq in read_fasta(fasta_file):
|
|
52 total_chromosomes += 1
|
|
53 refd[chrom_id] = len(chrom_seq)
|
|
54
|
|
55 fwd_chr_regions = {}
|
|
56 rev_chr_regions = {}
|
|
57
|
|
58 L = len(chrom_seq)
|
|
59 XXXX_sites = []
|
|
60 XXXX_XXXX = []
|
|
61
|
|
62 #-- collect all "XXXX sites ---------------------------------
|
|
63 i = 1
|
|
64 while i <= L - cut_len_max:
|
|
65 j = 0
|
|
66 while j < len(cut_len) :
|
|
67 if chrom_seq[i : i + cut_len[j]] == cut_context[j]:
|
|
68 XXXX_sites.append( (i, i + cut_len[j] - 1) ) # add (1st position, last position)
|
|
69 j += 1
|
|
70 i += 1
|
|
71
|
|
72 #-- find "XXXX" pairs that are within the length of fragment ----
|
|
73 for j in xrange(len(XXXX_sites) - 1):
|
|
74 DD = (XXXX_sites[j+1][0] - XXXX_sites[j][1]) - 1 # NOT including both XXXX; DD: fragment length
|
|
75 if low_bound <= DD <= up_bound:
|
|
76 XXXX_XXXX.append([XXXX_sites[j][0], XXXX_sites[j+1][1]]) # leftmost <--> rightmost
|
|
77 mappable_seq = chrom_seq[XXXX_sites[j][0] : XXXX_sites[j+1][1] + 1]
|
|
78 no_mappable_region += 1
|
|
79
|
|
80 fwd_chr_regions[str(XXXX_sites[j][0])] = [XXXX_sites[j+1][1], no_mappable_region]
|
|
81 rev_chr_regions[str(XXXX_sites[j+1][1])] = [XXXX_sites[j][0], no_mappable_region]
|
|
82
|
|
83 # start_position, end_position, serial, sequence
|
|
84 mappable_regions_output_file.write("%s\t%d\t%d\t%d\t%s\n"%(chrom_id, no_mappable_region,
|
|
85 XXXX_sites[j][0], XXXX_sites[j+1][1], mappable_seq))
|
|
86 # storing region information to file
|
|
87 # format: A[left_CCGG_pos]=[right_CCGG_pos, number_of_mappable_region]
|
|
88 serialize(fwd_chr_regions, ref_p(FWD_MAPPABLE_REGIONS(chrom_id)))
|
|
89 serialize(rev_chr_regions, ref_p(REV_MAPPABLE_REGIONS(chrom_id)))
|
|
90
|
|
91 #-----------------------------------
|
|
92 # mask the genome
|
|
93 _map_seq = []
|
|
94 mappable_length = 0
|
|
95 unmappable_length = 0
|
|
96 m = 0
|
|
97 mark = False
|
|
98 while m < L: # for every nucleotide in chr
|
|
99 if len(XXXX_XXXX) > 0:
|
|
100 pair = XXXX_XXXX[0]
|
|
101 p1 = pair[0] # left end of fragment
|
|
102 p2 = pair[1] # right end of fragment
|
|
103 if p1 <= m < p2 + 1 :
|
|
104 _map_seq.append(chrom_seq[m])
|
|
105 mappable_length+=1
|
|
106 mark = True
|
|
107 else :
|
|
108 if not mark:
|
|
109 _map_seq.append("-")
|
|
110 unmappable_length += 1
|
|
111 else: # the last eligible base
|
|
112 _ = XXXX_XXXX.pop(0)
|
|
113 if len(XXXX_XXXX)>0:
|
|
114 pair = XXXX_XXXX[0]
|
|
115 p1 = pair[0]
|
|
116 p2 = pair[1]
|
|
117 if p1 <= m < p2 + 1:
|
|
118 _map_seq.append(chrom_seq[m])
|
|
119 mappable_length += 1
|
|
120 mark = True
|
|
121 else:
|
|
122 _map_seq.append("-")
|
|
123 unmappable_length += 1
|
|
124 mark = False
|
|
125 else:
|
|
126 _map_seq.append("-")
|
|
127 unmappable_length+=1
|
|
128 mark = False
|
|
129 else:
|
|
130 if not mark:
|
|
131 _map_seq.append("-")
|
|
132 unmappable_length+=1
|
|
133 else:
|
|
134 _map_seq.append("-")
|
|
135 unmappable_length+=1
|
|
136 mark = False
|
|
137
|
|
138 m+=1
|
|
139
|
|
140 #-----------------------------------
|
|
141
|
|
142 chrom_seq = ''.join(_map_seq)
|
|
143 serialize(chrom_seq, ref_p(chrom_id))
|
|
144
|
|
145 w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
|
|
146 w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
|
|
147
|
|
148 chrom_seq = reverse_compl_seq(chrom_seq)
|
|
149
|
|
150 c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T")))
|
|
151 c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A")))
|
|
152
|
|
153 #-----------------------------------
|
|
154 logm("# %s : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)"%(chrom_id,
|
|
155 L,
|
|
156 unmappable_length,
|
|
157 mappable_length,
|
|
158 float(mappable_length)/L) )
|
|
159 all_L += L
|
|
160 all_mappable_length += mappable_length
|
|
161 all_unmappable_length += unmappable_length
|
|
162
|
|
163 elapsed('Finished initial pre-processing of ' + chrom_id)
|
|
164
|
|
165
|
|
166 for outf in [w_c2t, c_c2t, w_g2a, c_g2a]:
|
|
167 outf.close()
|
|
168
|
|
169
|
|
170 logm("# Total %d chromosome(s) : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)" %(total_chromosomes,
|
|
171 all_L,
|
|
172 all_unmappable_length,
|
|
173 all_mappable_length,
|
|
174 float(all_mappable_length)/all_L) )
|
|
175 logm("# eligible fragments : %d" % no_mappable_region )
|
|
176
|
|
177 serialize(refd, ref_p("refname"))
|
|
178
|
|
179 mappable_regions_output_file.close()
|
|
180 elapsed('Storing mappable regions and genome')
|
|
181
|
|
182 #---------------- bowtie-build -------------------------------------------
|
|
183
|
|
184 # append ref_path to all elements of to_bowtie
|
|
185 to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A'])
|
|
186
|
|
187 run_in_parallel([(build_command % { 'fname' : fname }, fname + '.log') for fname in to_bowtie])
|
|
188
|
|
189 elapsed('Index building')
|
|
190 # delete all fasta files
|
|
191 delete_files( f +'.fa' for f in to_bowtie)
|
|
192
|
|
193 elapsed('END')
|
|
194
|