Mercurial > repos > weilong-guo > bs_seeker2
diff BSseeker2/bs_align/bs_align_utils.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/bs_align_utils.py Fri Jul 12 18:47:28 2013 -0400 @@ -0,0 +1,385 @@ +from bs_utils.utils import * +import re + + +BAM_MATCH = 0 +BAM_INS = 1 +BAM_DEL = 2 +BAM_SOFTCLIP = 4 + +CIGAR_OPS = {'M' : BAM_MATCH, 'I' : BAM_INS, 'D' : BAM_DEL, 'S' : BAM_SOFTCLIP} + + +def N_MIS(r,g): + mismatches = 0 + if len(r)==len(g): + for i in xrange(len(r)): + if r[i] != g[i] and r[i] != "N" and g[i] != "N" and not(r[i] == 'T' and g[i] == 'C'): + mismatches += 1 + return mismatches + + +#---------------------------------------------------------------- + +""" +Exmaple: +======== +Read : ACCGCGTTGATCGAGTACGTACGTGGGTC +Adapter : ....................ACGTGGGTCCCG +======== + +no_mismatch : the maximum number allowed for mismatches + +Algorithm: (allowing 1 mismatch) +======== +-Step 1: + ACCGCGTTGATCGAGTACGTACGTGGGTC + ||XX + ACGTGGGTCCCG +-Step 2: + ACCGCGTTGATCGAGTACGTACGTGGGTC + X||X + .ACGTGGGTCCCG +-Step 3: + ACCGCGTTGATCGAGTACGTACGTGGGTC + XX + ..ACGTGGGTCCCG +-Step ... +-Step N: + ACCGCGTTGATCGAGTACGTACGTGGGTC + ||||||||| + ....................ACGTGGGTCCCG +Success & return! +======== + +""" + +def RemoveAdapter ( read, adapter, no_mismatch ) : + lr = len(read) + la = len(adapter) + for i in xrange( lr - no_mismatch ) : + read_pos = i + adapter_pos = 0 + count_no_mis = 0 + while (adapter_pos < la) and (read_pos < lr) : + if (read[read_pos] == adapter[adapter_pos]) : + read_pos = read_pos + 1 + adapter_pos = adapter_pos + 1 + else : + count_no_mis = count_no_mis + 1 + if count_no_mis > no_mismatch : + break + else : + read_pos = read_pos + 1 + adapter_pos = adapter_pos + 1 + # while_end + + if adapter_pos == la or read_pos == lr : + return read[:i] + # for_end + return read + + +def Remove_5end_Adapter ( read, adapter, no_mismatch) : + lr = len(read) + la = len(adapter) + for i in xrange (la - no_mismatch) : + read_pos = 0 + adapter_pos = i + count_no_mis = 0 + while (adapter_pos < la) and (read_pos < lr) : + if (read[read_pos] == adapter[adapter_pos]) : + adapter_pos = adapter_pos + 1 + read_pos = read_pos + 1 + else : + count_no_mis = count_no_mis + 1 + if count_no_mis > no_mismatch : + break + else : + read_pos = read_pos + 1 + adapter_pos = adapter_pos + 1 + # while_end + if adapter_pos == la : + return read[(la-i):] + + +def next_nuc(seq, pos, n): + """ Returns the nucleotide that is n places from pos in seq. Skips gap symbols. + """ + i = pos + 1 + while i < len(seq): + if seq[i] != '-': + n -= 1 + if n == 0: break + i += 1 + if i < len(seq) : + return seq[i] + else : + return 'N' + + + +def methy_seq(read, genome): + H = ['A', 'C', 'T'] + m_seq = [] + xx = "-" + for i in xrange(len(read)): + + if genome[i] == '-': + continue + + elif read[i] != 'C' and read[i] != 'T': + xx = "-" + + elif read[i] == "T" and genome[i] == "C": #(unmethylated): + nn1 = next_nuc(genome, i, 1) + if nn1 == "G": + xx = "x" + elif nn1 in H : + nn2 = next_nuc(genome, i, 2) + if nn2 == "G": + xx = "y" + elif nn2 in H : + xx = "z" + + elif read[i] == "C" and genome[i] == "C": #(methylated): + nn1 = next_nuc(genome, i, 1) + + if nn1 == "G": + xx = "X" + elif nn1 in H : + nn2 = next_nuc(genome, i, 2) + + if nn2 == "G": + xx = "Y" + elif nn2 in H: + xx = "Z" + else: + xx = "-" + m_seq.append(xx) + + return ''.join(m_seq) + +def mcounts(mseq, mlst, ulst): + out_mlst=[mlst[0]+mseq.count("X"), mlst[1]+mseq.count("Y"), mlst[2]+mseq.count("Z")] + out_ulst=[ulst[0]+mseq.count("x"), ulst[1]+mseq.count("y"), ulst[2]+mseq.count("z")] + return out_mlst, out_ulst + + + +def process_aligner_output(filename, pair_end = False): + + #m = re.search(r'-('+'|'.join(supported_aligners) +')-TMP', filename) + m = re.search(r'-('+'|'.join(supported_aligners) +')-.*TMP', filename) + if m is None: + error('The temporary folder path should contain the name of one of the supported aligners: ' + filename) + + format = m.group(1) + try : + input = open(filename) + except IOError: + print "[Error] Cannot open file %s" % filename + exit(-1) + + QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL = range(11) + def parse_SAM(line): + buf = line.split() + # print buf + flag = int(buf[FLAG]) + + # skip reads that are not mapped + # skip reads that have probability of being non-unique higher than 1/10 + if flag & 0x4 : # or int(buf[MAPQ]) < 10: + return None, None, None, None, None, None + # print "format = ", format + if format == BOWTIE: + mismatches = int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'NM:i:'][0]) # get the edit distance + # --- bug fixed ------ + elif format == BOWTIE2: + if re.search(r'(.)*-e2e-TMP(.*)', filename) is None : # local model + mismatches = 1-int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'AS:i:'][0]) + # print "====local=====\n" + ## bowtie2 use AS tag (score) to evaluate the mapping. The higher, the better. + else : # end-to-end model + # print "end-to-end\n" + mismatches = int([buf[i][5:] for i in xrange(11, len(buf)) if buf[i][:5] == 'XM:i:'][0]) + # --- Weilong --------- + elif format == SOAP: + mismatches = 1-buf[MAPQ] + # mismatches = 1/float(buf[MAPQ]) + ## downstream might round (0,1) to 0, so use integer instead + ## fixed by Weilong + elif format == RMAP: + # chr16 75728107 75728147 read45 9 - + # chr16 67934919 67934959 read45 9 - + mismatches = buf[4] + + return (buf[QNAME], # read ID + buf[RNAME], # reference ID + int(buf[POS]) - 1, # position, 0 based (SAM is 1 based) + mismatches, # number of mismatches + parse_cigar(buf[CIGAR]), # the parsed cigar string + flag & 0x40 # true if it is the first mate in a pair, false if it is the second mate + ) + + SOAP_QNAME, SOAP_SEQ, SOAP_QUAL, SOAP_NHITS, SOAP_AB, SOAP_LEN, SOAP_STRAND, SOAP_CHR, SOAP_LOCATION, SOAP_MISMATCHES = range(10) + def parse_SOAP(line): + buf = line.split() + return (buf[SOAP_QNAME], + buf[SOAP_CHR], + int(buf[SOAP_LOCATION]) - 1, + int(buf[SOAP_MISMATCHES]), + buf[SOAP_AB], + buf[SOAP_STRAND], + parse_cigar(buf[SOAP_LEN]+'M') + ) + + # chr16 75728107 75728147 read45 9 - + RMAP_CHR, RMAP_START, RMAP_END, RMAP_QNAME, RMAP_MISMATCH, RMAP_STRAND = range(6) + def parse_RMAP(line): + buf = line.split() + return ( buf[RMAP_QNAME], + buf[RMAP_CHR], + int(buf[RMAP_START]), # to check -1 or not + int(buf[RMAP_END]) - int(buf[RMAP_START]) + 1, + int(buf[RMAP_MISMATCH]), + buf[RMAP_STRAND] + ) + + if format == BOWTIE or format == BOWTIE2: + if pair_end: + for line in input: + header1, chr1, location1, no_mismatch1, cigar1, _ = parse_SAM(line) + header2, _, location2, no_mismatch2, cigar2, mate_no2 = parse_SAM(input.next()) + + + if header1 and header2: + # flip the location info if the second mate comes first in the alignment file + if mate_no2: + location1, location2 = location2, location1 + cigar1, cigar2 = cigar2, cigar1 + + + yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2 + else: + for line in input: + header, chr, location, no_mismatch, cigar, _ = parse_SAM(line) + if header is not None: + yield header, chr, location, no_mismatch, cigar + elif format == SOAP: + if pair_end: + for line in input: + header1, chr1, location1, no_mismatch1, mate1, strand1, cigar1 = parse_SOAP(line) + header2, _ , location2, no_mismatch2, _, strand2, cigar2 = parse_SOAP(input.next()) + + if mate1 == 'b': + location1, location2 = location2, location1 + strand1, strand2 = strand2, strand1 + ciga1, cigar2 = cigar2, cigar1 + + + if header1 and header2 and strand1 == '+' and strand2 == '-': + yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2 + + else: + for line in input: + header, chr, location, no_mismatch, _, strand, cigar = parse_SOAP(line) + if header and strand == '+': + yield header, chr, location, no_mismatch, cigar + elif format == RMAP : + if pair_end : + todo = 0 + # to do + else : + for line in input: + header, chr, location, read_len, no_mismatch, strand = parse_RMAP(line) + cigar = str(read_len) + "M" + yield header, chr, location, no_mismatch, cigar + + input.close() + + +def parse_cigar(cigar_string): + i = 0 + prev_i = 0 + cigar = [] + while i < len(cigar_string): + if cigar_string[i] in CIGAR_OPS: + cigar.append((CIGAR_OPS[cigar_string[i]], int(cigar_string[prev_i:i]))) + prev_i = i + 1 + i += 1 + return cigar + +def get_read_start_end_and_genome_length(cigar): + r_start = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0 + r_end = r_start + g_len = 0 + for edit_op, count in cigar: + if edit_op == BAM_MATCH: + r_end += count + g_len += count + elif edit_op == BAM_INS: + r_end += count + elif edit_op == BAM_DEL: + g_len += count + return r_start, r_end, g_len # return the start and end in the read and the length of the genomic sequence + # r_start : start position on the read + # r_end : end position on the read + # g_len : length of the mapped region on genome + + +def cigar_to_alignment(cigar, read_seq, genome_seq): + """ Reconstruct the pairwise alignment based on the CIGAR string and the two sequences + """ + + # reconstruct the alignment + r_pos = cigar[0][1] if cigar[0][0] == BAM_SOFTCLIP else 0 + g_pos = 0 + r_aln = '' + g_aln = '' + for edit_op, count in cigar: + if edit_op == BAM_MATCH: + r_aln += read_seq[r_pos : r_pos + count] + g_aln += genome_seq[g_pos : g_pos + count] + r_pos += count + g_pos += count + elif edit_op == BAM_DEL: + r_aln += '-'*count + g_aln += genome_seq[g_pos : g_pos + count] + g_pos += count + elif edit_op == BAM_INS: + r_aln += read_seq[r_pos : r_pos + count] + g_aln += '-'*count + r_pos += count + + return r_aln, g_aln + + + +# return sequence is [start, end), not include 'end' +def get_genomic_sequence(genome, start, end, strand = '+'): + if strand != '+' and strand != '-' : + print "[Bug] get_genomic_sequence input should be \'+\' or \'-\'." + exit(-1) + if start > 1: + prev = genome[start-2:start] + elif start == 1: + prev = 'N'+genome[0] + else: + prev = 'NN' + + if end < len(genome) - 1: + next = genome[end: end + 2] + elif end == len(genome) - 1: + next = genome[end] + 'N' + else: + next = 'NN' + origin_genome = genome[start:end] + + if strand == '-': + # reverse complement everything if strand is '-' + revc = reverse_compl_seq('%s%s%s' % (prev, origin_genome, next)) + prev, origin_genome, next = revc[:2], revc[2:-2], revc[-2:] + + return origin_genome, next, '%s_%s_%s' % (prev, origin_genome, next) + # next : next two nucleotides \ No newline at end of file