view BSseeker2/bs_seeker2-call_methylation.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
line wrap: on
line source

#!/usr/bin/env python

from optparse import OptionParser, OptionGroup
from bs_utils.utils import *

try :
    import pysam
except ImportError :
    print "[Error] Cannot import \"pysam\" package. Have you installed it?"
    exit(-1)

import gzip

def context_calling(seq, position):

    word=seq[position]
    word=word.upper()

    context="--"
    context_CH="--"
    if position + 2 < len(seq) and position - 2 >= 0:

        if word == "C":
            word2 = seq[position+1]
            context_CH = word + word2
            if word2 == "G":
                context = "CG"
            elif word2 in ['A','C','T']:
                word3 = seq[position+2]
                if word3 == "G":
                    context = "CHG"
                elif word3 in ['A','C','T']:
                    context="CHH"

        elif word == "G":
            word2 = seq[position-1]
            context_CH = word + word2
            context_CH = context_CH.translate(string.maketrans("ATCG", "TAGC"))
            if word2 == "C":
                context = "CG"
            elif word2 in ['A','G','T']:
                word3 = seq[position-2]
                if word3 == "C":
                    context = "CHG"
                elif word3 in ['A','G','T']:
                    context = "CHH"

    return word, context, context_CH



if __name__ == '__main__':

    parser = OptionParser()
    parser.add_option("-i", "--input", type="string", dest="infilename",help="BAM output from bs_seeker2-align.py", metavar="INFILE")
    parser.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path)
    parser.add_option("-o", "--output-prefix", type="string", dest="output_prefix",help="The output prefix to create ATCGmap and wiggle files [INFILE]", metavar="OUTFILE")

    parser.add_option("--wig", type="string", dest="wig_file",help="The output .wig file [INFILE.wig]", metavar="OUTFILE")
    parser.add_option("--CGmap", type="string", dest="CGmap_file",help="The output .CGmap file [INFILE.CGmap]", metavar="OUTFILE")
    parser.add_option("--ATCGmap", type="string", dest="ATCGmap_file",help="The output .ATCGmap file [INFILE.ATCGmap]", metavar="OUTFILE")

    parser.add_option("-x", "--rm-SX", action="store_true", dest="RM_SX",help="Removed reads with tag \'XS:i:1\', which would be considered as not fully converted by bisulfite treatment [Default: %default]", default = False)
    parser.add_option("--txt", action="store_true", dest="text",help="Show CGmap and ATCGmap in .gz [Default: %default]", default = False)

    parser.add_option("-r", "--read-no",type = "int", dest="read_no",help="The least number of reads covering one site to be shown in wig file [Default: %default]", default = 1)
    parser.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False)

    (options, args) = parser.parse_args()


    # if no options were given by the user, print help and exit
    if len(sys.argv) == 1:
        parser.print_help()
        exit(0)

    if options.version :
        show_version()
        exit (-1)
    else :
        show_version()


    if options.infilename is None:
        error('-i option is required')
    if not os.path.isfile(options.infilename):
        error('Cannot find input file: %s' % options.infilename)

    open_log(options.infilename+'.call_methylation_log')
    db_d = lambda fname:  os.path.join( os.path.expanduser(options.dbpath), fname) # bug fixed, weilong

    logm('sorting BS-Seeker alignments')
    sorted_input_filename = options.infilename+'_sorted'
    pysam.sort(options.infilename, sorted_input_filename)
    sorted_input_filename += '.bam'
    logm('indexing sorted alignments')
    pysam.index(sorted_input_filename)

    logm('calculating methylation levels')
    if options.text :
        ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap')
        ATCGmap = open(ATCGmap_fname, 'w')

        CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap')
        CGmap = open(CGmap_fname, 'w')
    else :
        ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap.gz')
        ATCGmap = gzip.open(ATCGmap_fname, 'wb')

        CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap.gz')
        CGmap = gzip.open(CGmap_fname, 'wb')

    wiggle_fname = options.wig_file or ((options.output_prefix or options.infilename) + '.wig')
    wiggle = open(wiggle_fname, 'w')

    sorted_input = pysam.Samfile(sorted_input_filename, 'rb')
    
    chrom = None
    nucs = ['A', 'T', 'C', 'G', 'N']
    ATCG_fwd = dict((n, 0) for n in nucs)
    ATCG_rev = dict((n, 0) for n in nucs)
    for col in sorted_input.pileup():
        col_chrom = sorted_input.getrname(col.tid)
        if chrom != col_chrom:
            chrom = col_chrom
            chrom_seq = deserialize(db_d(chrom))
            wiggle.write('variableStep chrom=%s\n' % chrom)

        for n in nucs:
            ATCG_fwd[n] = 0
            ATCG_rev[n] = 0

        nuc, context, subcontext = context_calling(chrom_seq, col.pos)
        total_reads = 0

        for pr in col.pileups:
        #     print pr
             if (not pr.indel) : # skip indels
                #if ( (options.RM_SX) and (pr.alignment.tags[1][1] == 1) ):
                ##=== Fixed error reported by Roberto
                #print options.RM_SX,  dict(pr.alignment.tags)["XS"]
                #if ( (options.RM_SX) and (dict(pr.alignment.tags)["XS"] == 1) ):

                if ( (options.RM_SX) and (dict(pr.alignment.tags).get("XS",0) == 1) ):
                    # print "Debug: ", options.RM_SX, pr.alignment.tags[1]
                    # when need to filter and read with tag (XS==1), then remove the reads
                    continue

                if pr.qpos >= len(pr.alignment.seq):
                    print 'WARNING: read %s has an invalid alignment. Discarding.. ' % pr.alignment.qname
                    continue
                read_nuc = pr.alignment.seq[pr.qpos]
                if pr.alignment.is_reverse:
                    ATCG_rev[read_nuc] += 1
                else:
                    ATCG_fwd[read_nuc] += 1

                if read_nuc != 'N':
                    total_reads += 1

        cnts = lambda d: '\t'.join(str(d[n]) for n in nucs)
        fwd_counts = cnts(ATCG_fwd)
        rev_counts = cnts(ATCG_rev)

        meth_level = None
        meth_cytosines = 0
        unmeth_cytosines = 0

        if nuc == 'C':
            # plus strand: take the ratio of C's to T's from reads that come from the forward strand
            meth_cytosines = ATCG_fwd['C']
            unmeth_cytosines = ATCG_fwd['T']

        elif nuc == 'G':
            # minus strand: take the ratio of G's to A's from reads that come from the reverse strand
            meth_cytosines = ATCG_rev['G']
            unmeth_cytosines = ATCG_rev['A']

        if meth_cytosines + unmeth_cytosines > 0:
            meth_level = float(meth_cytosines)/(meth_cytosines + unmeth_cytosines)

        pos = col.pos + 1

        meth_level_string = str(meth_level) if meth_level is not None else 'na'
        ATCGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(fwd_counts)s\t%(rev_counts)s\t%(meth_level_string)s\n' % locals())
#
        all_cytosines = meth_cytosines + unmeth_cytosines 
        if (meth_level is not None) and (all_cytosines >= options.read_no):
        #    print all_cytosines
            if nuc == 'C':
                wiggle.write('%d\t%f\n' % (pos, meth_level))
            else :
                wiggle.write('%d\t-%f\n' % (pos, meth_level))

        CGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(meth_level_string)s\t%(meth_cytosines)s\t%(all_cytosines)s\n' % locals())
    ATCGmap.close()
    CGmap.close()
    wiggle.close()

    logm('Wiggle: %s'% wiggle_fname)
    logm('ATCGMap: %s' % ATCGmap_fname)
    logm('CGmap: %s' % CGmap_fname)