diff BSseeker2/bs_seeker2-call_methylation.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_seeker2-call_methylation.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,205 @@
+#!/usr/bin/python
+
+from optparse import OptionParser, OptionGroup
+from bs_utils.utils import *
+
+try :
+    import pysam
+except ImportError :
+    print "[Error] Cannot import \"pysam\" package. Have you installed it?"
+    exit(-1)
+
+import gzip
+
+def context_calling(seq, position):
+
+    word=seq[position]
+    word=word.upper()
+
+    context="--"
+    context_CH="--"
+    if position + 2 < len(seq) and position - 2 >= 0:
+
+        if word == "C":
+            word2 = seq[position+1]
+            context_CH = word + word2
+            if word2 == "G":
+                context = "CG"
+            elif word2 in ['A','C','T']:
+                word3 = seq[position+2]
+                if word3 == "G":
+                    context = "CHG"
+                elif word3 in ['A','C','T']:
+                    context="CHH"
+
+        elif word == "G":
+            word2 = seq[position-1]
+            context_CH = word + word2
+            context_CH = context_CH.translate(string.maketrans("ATCG", "TAGC"))
+            if word2 == "C":
+                context = "CG"
+            elif word2 in ['A','G','T']:
+                word3 = seq[position-2]
+                if word3 == "C":
+                    context = "CHG"
+                elif word3 in ['A','G','T']:
+                    context = "CHH"
+
+    return word, context, context_CH
+
+
+
+if __name__ == '__main__':
+
+    parser = OptionParser()
+    parser.add_option("-i", "--input", type="string", dest="infilename",help="BAM output from bs_seeker2-align.py", metavar="INFILE")
+    parser.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path)
+    parser.add_option("-o", "--output-prefix", type="string", dest="output_prefix",help="The output prefix to create ATCGmap and wiggle files [INFILE]", metavar="OUTFILE")
+
+    parser.add_option("--wig", type="string", dest="wig_file",help="The output .wig file [INFILE.wig]", metavar="OUTFILE")
+    parser.add_option("--CGmap", type="string", dest="CGmap_file",help="The output .CGmap file [INFILE.CGmap]", metavar="OUTFILE")
+    parser.add_option("--ATCGmap", type="string", dest="ATCGmap_file",help="The output .ATCGmap file [INFILE.ATCGmap]", metavar="OUTFILE")
+
+    parser.add_option("-x", "--rm-SX", action="store_true", dest="RM_SX",help="Removed reads with tag \'XS:i:1\', which would be considered as not fully converted by bisulfite treatment [Default: %default]", default = False)
+    parser.add_option("--txt", action="store_true", dest="text",help="Show CGmap and ATCGmap in .gz [Default: %default]", default = False)
+
+    parser.add_option("-r", "--read-no",type = "int", dest="read_no",help="The least number of reads covering one site to be shown in wig file [Default: %default]", default = 1)
+    parser.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False)
+
+    (options, args) = parser.parse_args()
+
+
+    # if no options were given by the user, print help and exit
+    if len(sys.argv) == 1:
+        print parser.print_help()
+        exit(0)
+
+    if options.version :
+        show_version()
+        exit (-1)
+    else :
+        show_version()
+
+
+    if options.infilename is None:
+        error('-i option is required')
+    if not os.path.isfile(options.infilename):
+        error('Cannot find input file: %s' % options.infilename)
+
+    open_log(options.infilename+'.call_methylation_log')
+    db_d = lambda fname:  os.path.join( os.path.expanduser(options.dbpath), fname) # bug fixed, weilong
+
+    logm('sorting BS-Seeker alignments')
+    sorted_input_filename = options.infilename+'_sorted'
+    pysam.sort(options.infilename, sorted_input_filename)
+    sorted_input_filename += '.bam'
+    logm('indexing sorted alignments')
+    pysam.index(sorted_input_filename)
+
+    logm('calculating methylation levels')
+    if options.text :
+        ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap')
+        ATCGmap = open(ATCGmap_fname, 'w')
+
+        CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap')
+        CGmap = open(CGmap_fname, 'w')
+    else :
+        ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap.gz')
+        ATCGmap = gzip.open(ATCGmap_fname, 'wb')
+
+        CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap.gz')
+        CGmap = gzip.open(CGmap_fname, 'wb')
+
+    wiggle_fname = options.wig_file or ((options.output_prefix or options.infilename) + '.wig')
+    wiggle = open(wiggle_fname, 'w')
+
+    sorted_input = pysam.Samfile(sorted_input_filename, 'rb')
+    
+    chrom = None
+    nucs = ['A', 'T', 'C', 'G', 'N']
+    ATCG_fwd = dict((n, 0) for n in nucs)
+    ATCG_rev = dict((n, 0) for n in nucs)
+    for col in sorted_input.pileup():
+        col_chrom = sorted_input.getrname(col.tid)
+        if chrom != col_chrom:
+            chrom = col_chrom
+            chrom_seq = deserialize(db_d(chrom))
+            wiggle.write('variableStep chrom=%s\n' % chrom)
+
+        for n in nucs:
+            ATCG_fwd[n] = 0
+            ATCG_rev[n] = 0
+
+        nuc, context, subcontext = context_calling(chrom_seq, col.pos)
+        total_reads = 0
+
+
+
+        for pr in col.pileups:
+        #     print pr
+             if (not pr.indel) : # skip indels
+                #if ( (options.RM_SX) and (pr.alignment.tags[1][1] == 1) ):
+                ##=== Fixed error reported by Roberto
+                #print options.RM_SX,  dict(pr.alignment.tags)["XS"]
+                #if ( (options.RM_SX) and (dict(pr.alignment.tags)["XS"] == 1) ):
+
+                if ( (options.RM_SX) and (dict(pr.alignment.tags).get("XS",0) == 1) ):
+                    # print "Debug: ", options.RM_SX, pr.alignment.tags[1]
+                    # when need to filter and read with tag (XS==1), then remove the reads
+                    continue
+
+                if pr.qpos >= len(pr.alignment.seq):
+                    print 'WARNING: read %s has an invalid alignment. Discarding.. ' % pr.alignment.qname
+                    continue
+                read_nuc = pr.alignment.seq[pr.qpos]
+         #       print "read_nuc=", read_nuc
+                if pr.alignment.is_reverse:
+                    ATCG_rev[read_nuc] += 1
+                else:
+                    ATCG_fwd[read_nuc] += 1
+
+                if read_nuc != 'N':
+                    total_reads += 1
+
+        cnts = lambda d: '\t'.join(str(d[n]) for n in nucs)
+        fwd_counts = cnts(ATCG_fwd)
+        rev_counts = cnts(ATCG_rev)
+
+        meth_level = None
+        meth_cytosines = 0
+        unmeth_cytosines = 0
+
+        if nuc == 'C':
+            # plus strand: take the ratio of C's to T's from reads that come from the forward strand
+            meth_cytosines = ATCG_fwd['C']
+            unmeth_cytosines = ATCG_fwd['T']
+
+        elif nuc == 'G':
+            # minus strand: take the ratio of G's to A's from reads that come from the reverse strand
+            meth_cytosines = ATCG_rev['G']
+            unmeth_cytosines = ATCG_rev['A']
+
+        if meth_cytosines + unmeth_cytosines > 0:
+            meth_level = float(meth_cytosines)/(meth_cytosines + unmeth_cytosines)
+
+        pos = col.pos + 1
+
+        meth_level_string = str(meth_level) if meth_level is not None else 'na'
+        ATCGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(fwd_counts)s\t%(rev_counts)s\t%(meth_level_string)s\n' % locals())
+#
+        all_cytosines = meth_cytosines + unmeth_cytosines 
+        if (meth_level is not None) and (all_cytosines >= options.read_no):
+        #    print all_cytosines
+            if nuc == 'C':
+                wiggle.write('%d\t%f\n' % (pos, meth_level))
+            else :
+                wiggle.write('%d\t-%f\n' % (pos, meth_level))
+            CGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(meth_level_string)s\t%(meth_cytosines)s\t%(all_cytosines)s\n' % locals())
+    ATCGmap.close()
+    CGmap.close()
+    wiggle.close()
+
+    logm('Wiggle: %s'% wiggle_fname)
+    logm('ATCGMap: %s' % ATCGmap_fname)
+    logm('CGmap: %s' % CGmap_fname)
+