Mercurial > repos > weilong-guo > bs_seeker2
diff BSseeker2/bs_index/rrbs_build.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_index/rrbs_build.py Fri Jul 12 18:47:28 2013 -0400 @@ -0,0 +1,194 @@ +import os + +from bs_utils.utils import * + + +FWD_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.fwd_mappable_regions' +REV_MAPPABLE_REGIONS = lambda chrom_id: chrom_id+'.rev_mappable_regions' + + +# example: T-CGA + +def rrbs_build(fasta_file, build_command, ref_path, low_bound, up_bound, aligner, cut_format="C-CGG"): + # ref_path is a string that contains the directory where the reference genomes are stored with + # the input fasta filename appended + + cut_format = cut_format.upper() # Ex. "C-CGG,C-TAG"; MspI&ApekI:"G^CWGC" + if cut_format == "C-CGG" : + ref_path = os.path.join(ref_path, + os.path.split(fasta_file)[1] + '_rrbs_%d_%d' % (low_bound, up_bound) +'_' + aligner) + else : + ref_path = os.path.join(ref_path, + os.path.split(fasta_file)[1] + '_rrbs_%s_%d_%d' % ( cut_format.replace("-","").replace(",","-"), low_bound, up_bound) +'_' + aligner) + + ref_p = lambda filename: os.path.join(ref_path, filename) + + clear_dir(ref_path) + open_log(ref_p('log')) + + refd = {} + w_c2t = open(ref_p('W_C2T.fa'),'w') + c_c2t = open(ref_p('C_C2T.fa'),'w') + + w_g2a = open(ref_p('W_G2A.fa'),'w') + c_g2a = open(ref_p('C_G2A.fa'),'w') + + mappable_regions_output_file = open(ref_p("RRBS_mappable_regions.txt"),"w") + + all_L = 0 + all_mappable_length = 0 + all_unmappable_length = 0 + + no_mappable_region = 0 + total_chromosomes = 0 + +# cut_context = re.sub("-", "", cut_format).split(",") + cut_context = EnumerateIUPAC(cut_format.replace("-","").split(",")) + cut_len = [len(i) for i in cut_context] + cut_len_max = max(cut_len) + + + for chrom_id, chrom_seq in read_fasta(fasta_file): + total_chromosomes += 1 + refd[chrom_id] = len(chrom_seq) + + fwd_chr_regions = {} + rev_chr_regions = {} + + L = len(chrom_seq) + XXXX_sites = [] + XXXX_XXXX = [] + + #-- collect all "XXXX sites --------------------------------- + i = 1 + while i <= L - cut_len_max: + j = 0 + while j < len(cut_len) : + if chrom_seq[i : i + cut_len[j]] == cut_context[j]: + XXXX_sites.append( (i, i + cut_len[j] - 1) ) # add (1st position, last position) + j += 1 + i += 1 + + #-- find "XXXX" pairs that are within the length of fragment ---- + for j in xrange(len(XXXX_sites) - 1): + DD = (XXXX_sites[j+1][0] - XXXX_sites[j][1]) - 1 # NOT including both XXXX; DD: fragment length + if low_bound <= DD <= up_bound: + XXXX_XXXX.append([XXXX_sites[j][0], XXXX_sites[j+1][1]]) # leftmost <--> rightmost + mappable_seq = chrom_seq[XXXX_sites[j][0] : XXXX_sites[j+1][1] + 1] + no_mappable_region += 1 + + fwd_chr_regions[str(XXXX_sites[j][0])] = [XXXX_sites[j+1][1], no_mappable_region] + rev_chr_regions[str(XXXX_sites[j+1][1])] = [XXXX_sites[j][0], no_mappable_region] + + # start_position, end_position, serial, sequence + mappable_regions_output_file.write("%s\t%d\t%d\t%d\t%s\n"%(chrom_id, no_mappable_region, + XXXX_sites[j][0], XXXX_sites[j+1][1], mappable_seq)) + # storing region information to file + # format: A[left_CCGG_pos]=[right_CCGG_pos, number_of_mappable_region] + serialize(fwd_chr_regions, ref_p(FWD_MAPPABLE_REGIONS(chrom_id))) + serialize(rev_chr_regions, ref_p(REV_MAPPABLE_REGIONS(chrom_id))) + + #----------------------------------- + # mask the genome + _map_seq = [] + mappable_length = 0 + unmappable_length = 0 + m = 0 + mark = False + while m < L: # for every nucleotide in chr + if len(XXXX_XXXX) > 0: + pair = XXXX_XXXX[0] + p1 = pair[0] # left end of fragment + p2 = pair[1] # right end of fragment + if p1 <= m < p2 + 1 : + _map_seq.append(chrom_seq[m]) + mappable_length+=1 + mark = True + else : + if not mark: + _map_seq.append("-") + unmappable_length += 1 + else: # the last eligible base + _ = XXXX_XXXX.pop(0) + if len(XXXX_XXXX)>0: + pair = XXXX_XXXX[0] + p1 = pair[0] + p2 = pair[1] + if p1 <= m < p2 + 1: + _map_seq.append(chrom_seq[m]) + mappable_length += 1 + mark = True + else: + _map_seq.append("-") + unmappable_length += 1 + mark = False + else: + _map_seq.append("-") + unmappable_length+=1 + mark = False + else: + if not mark: + _map_seq.append("-") + unmappable_length+=1 + else: + _map_seq.append("-") + unmappable_length+=1 + mark = False + + m+=1 + + #----------------------------------- + + chrom_seq = ''.join(_map_seq) + serialize(chrom_seq, ref_p(chrom_id)) + + w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T"))) + w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A"))) + + chrom_seq = reverse_compl_seq(chrom_seq) + + c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C","T"))) + c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G","A"))) + + #----------------------------------- + logm("# %s : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)"%(chrom_id, + L, + unmappable_length, + mappable_length, + float(mappable_length)/L) ) + all_L += L + all_mappable_length += mappable_length + all_unmappable_length += unmappable_length + + elapsed('Finished initial pre-processing of ' + chrom_id) + + + for outf in [w_c2t, c_c2t, w_g2a, c_g2a]: + outf.close() + + + logm("# Total %d chromosome(s) : all (%d) : unmappable (%d) : mappable (%d) : ratio (%1.5f)" %(total_chromosomes, + all_L, + all_unmappable_length, + all_mappable_length, + float(all_mappable_length)/all_L) ) + logm("# eligible fragments : %d" % no_mappable_region ) + + serialize(refd, ref_p("refname")) + + mappable_regions_output_file.close() + elapsed('Storing mappable regions and genome') + + #---------------- bowtie-build ------------------------------------------- + + # append ref_path to all elements of to_bowtie + to_bowtie = map(lambda f: os.path.join(ref_path, f), ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A']) + + run_in_parallel([(build_command % { 'fname' : fname }, fname + '.log') for fname in to_bowtie]) + + elapsed('Index building') + # delete all fasta files + delete_files( f +'.fa' for f in to_bowtie) + + elapsed('END') +