| 
1
 | 
     1 #!/usr/bin/env python
 | 
| 
0
 | 
     2 
 | 
| 
 | 
     3 from optparse import OptionParser, OptionGroup
 | 
| 
 | 
     4 from bs_utils.utils import *
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 try :
 | 
| 
 | 
     7     import pysam
 | 
| 
 | 
     8 except ImportError :
 | 
| 
 | 
     9     print "[Error] Cannot import \"pysam\" package. Have you installed it?"
 | 
| 
 | 
    10     exit(-1)
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 import gzip
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 def context_calling(seq, position):
 | 
| 
 | 
    15 
 | 
| 
 | 
    16     word=seq[position]
 | 
| 
 | 
    17     word=word.upper()
 | 
| 
 | 
    18 
 | 
| 
 | 
    19     context="--"
 | 
| 
 | 
    20     context_CH="--"
 | 
| 
 | 
    21     if position + 2 < len(seq) and position - 2 >= 0:
 | 
| 
 | 
    22 
 | 
| 
 | 
    23         if word == "C":
 | 
| 
 | 
    24             word2 = seq[position+1]
 | 
| 
 | 
    25             context_CH = word + word2
 | 
| 
 | 
    26             if word2 == "G":
 | 
| 
 | 
    27                 context = "CG"
 | 
| 
 | 
    28             elif word2 in ['A','C','T']:
 | 
| 
 | 
    29                 word3 = seq[position+2]
 | 
| 
 | 
    30                 if word3 == "G":
 | 
| 
 | 
    31                     context = "CHG"
 | 
| 
 | 
    32                 elif word3 in ['A','C','T']:
 | 
| 
 | 
    33                     context="CHH"
 | 
| 
 | 
    34 
 | 
| 
 | 
    35         elif word == "G":
 | 
| 
 | 
    36             word2 = seq[position-1]
 | 
| 
 | 
    37             context_CH = word + word2
 | 
| 
 | 
    38             context_CH = context_CH.translate(string.maketrans("ATCG", "TAGC"))
 | 
| 
 | 
    39             if word2 == "C":
 | 
| 
 | 
    40                 context = "CG"
 | 
| 
 | 
    41             elif word2 in ['A','G','T']:
 | 
| 
 | 
    42                 word3 = seq[position-2]
 | 
| 
 | 
    43                 if word3 == "C":
 | 
| 
 | 
    44                     context = "CHG"
 | 
| 
 | 
    45                 elif word3 in ['A','G','T']:
 | 
| 
 | 
    46                     context = "CHH"
 | 
| 
 | 
    47 
 | 
| 
 | 
    48     return word, context, context_CH
 | 
| 
 | 
    49 
 | 
| 
 | 
    50 
 | 
| 
 | 
    51 
 | 
| 
 | 
    52 if __name__ == '__main__':
 | 
| 
 | 
    53 
 | 
| 
 | 
    54     parser = OptionParser()
 | 
| 
 | 
    55     parser.add_option("-i", "--input", type="string", dest="infilename",help="BAM output from bs_seeker2-align.py", metavar="INFILE")
 | 
| 
 | 
    56     parser.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path)
 | 
| 
 | 
    57     parser.add_option("-o", "--output-prefix", type="string", dest="output_prefix",help="The output prefix to create ATCGmap and wiggle files [INFILE]", metavar="OUTFILE")
 | 
| 
 | 
    58 
 | 
| 
 | 
    59     parser.add_option("--wig", type="string", dest="wig_file",help="The output .wig file [INFILE.wig]", metavar="OUTFILE")
 | 
| 
 | 
    60     parser.add_option("--CGmap", type="string", dest="CGmap_file",help="The output .CGmap file [INFILE.CGmap]", metavar="OUTFILE")
 | 
| 
 | 
    61     parser.add_option("--ATCGmap", type="string", dest="ATCGmap_file",help="The output .ATCGmap file [INFILE.ATCGmap]", metavar="OUTFILE")
 | 
| 
 | 
    62 
 | 
| 
 | 
    63     parser.add_option("-x", "--rm-SX", action="store_true", dest="RM_SX",help="Removed reads with tag \'XS:i:1\', which would be considered as not fully converted by bisulfite treatment [Default: %default]", default = False)
 | 
| 
 | 
    64     parser.add_option("--txt", action="store_true", dest="text",help="Show CGmap and ATCGmap in .gz [Default: %default]", default = False)
 | 
| 
 | 
    65 
 | 
| 
 | 
    66     parser.add_option("-r", "--read-no",type = "int", dest="read_no",help="The least number of reads covering one site to be shown in wig file [Default: %default]", default = 1)
 | 
| 
 | 
    67     parser.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False)
 | 
| 
 | 
    68 
 | 
| 
 | 
    69     (options, args) = parser.parse_args()
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 
 | 
| 
 | 
    72     # if no options were given by the user, print help and exit
 | 
| 
 | 
    73     if len(sys.argv) == 1:
 | 
| 
1
 | 
    74         parser.print_help()
 | 
| 
0
 | 
    75         exit(0)
 | 
| 
 | 
    76 
 | 
| 
 | 
    77     if options.version :
 | 
| 
 | 
    78         show_version()
 | 
| 
 | 
    79         exit (-1)
 | 
| 
 | 
    80     else :
 | 
| 
 | 
    81         show_version()
 | 
| 
 | 
    82 
 | 
| 
 | 
    83 
 | 
| 
 | 
    84     if options.infilename is None:
 | 
| 
 | 
    85         error('-i option is required')
 | 
| 
 | 
    86     if not os.path.isfile(options.infilename):
 | 
| 
 | 
    87         error('Cannot find input file: %s' % options.infilename)
 | 
| 
 | 
    88 
 | 
| 
 | 
    89     open_log(options.infilename+'.call_methylation_log')
 | 
| 
 | 
    90     db_d = lambda fname:  os.path.join( os.path.expanduser(options.dbpath), fname) # bug fixed, weilong
 | 
| 
 | 
    91 
 | 
| 
 | 
    92     logm('sorting BS-Seeker alignments')
 | 
| 
 | 
    93     sorted_input_filename = options.infilename+'_sorted'
 | 
| 
 | 
    94     pysam.sort(options.infilename, sorted_input_filename)
 | 
| 
 | 
    95     sorted_input_filename += '.bam'
 | 
| 
 | 
    96     logm('indexing sorted alignments')
 | 
| 
 | 
    97     pysam.index(sorted_input_filename)
 | 
| 
 | 
    98 
 | 
| 
 | 
    99     logm('calculating methylation levels')
 | 
| 
 | 
   100     if options.text :
 | 
| 
 | 
   101         ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap')
 | 
| 
 | 
   102         ATCGmap = open(ATCGmap_fname, 'w')
 | 
| 
 | 
   103 
 | 
| 
 | 
   104         CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap')
 | 
| 
 | 
   105         CGmap = open(CGmap_fname, 'w')
 | 
| 
 | 
   106     else :
 | 
| 
 | 
   107         ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap.gz')
 | 
| 
 | 
   108         ATCGmap = gzip.open(ATCGmap_fname, 'wb')
 | 
| 
 | 
   109 
 | 
| 
 | 
   110         CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap.gz')
 | 
| 
 | 
   111         CGmap = gzip.open(CGmap_fname, 'wb')
 | 
| 
 | 
   112 
 | 
| 
 | 
   113     wiggle_fname = options.wig_file or ((options.output_prefix or options.infilename) + '.wig')
 | 
| 
 | 
   114     wiggle = open(wiggle_fname, 'w')
 | 
| 
 | 
   115 
 | 
| 
 | 
   116     sorted_input = pysam.Samfile(sorted_input_filename, 'rb')
 | 
| 
 | 
   117     
 | 
| 
 | 
   118     chrom = None
 | 
| 
 | 
   119     nucs = ['A', 'T', 'C', 'G', 'N']
 | 
| 
 | 
   120     ATCG_fwd = dict((n, 0) for n in nucs)
 | 
| 
 | 
   121     ATCG_rev = dict((n, 0) for n in nucs)
 | 
| 
 | 
   122     for col in sorted_input.pileup():
 | 
| 
 | 
   123         col_chrom = sorted_input.getrname(col.tid)
 | 
| 
 | 
   124         if chrom != col_chrom:
 | 
| 
 | 
   125             chrom = col_chrom
 | 
| 
 | 
   126             chrom_seq = deserialize(db_d(chrom))
 | 
| 
 | 
   127             wiggle.write('variableStep chrom=%s\n' % chrom)
 | 
| 
 | 
   128 
 | 
| 
 | 
   129         for n in nucs:
 | 
| 
 | 
   130             ATCG_fwd[n] = 0
 | 
| 
 | 
   131             ATCG_rev[n] = 0
 | 
| 
 | 
   132 
 | 
| 
 | 
   133         nuc, context, subcontext = context_calling(chrom_seq, col.pos)
 | 
| 
 | 
   134         total_reads = 0
 | 
| 
 | 
   135 
 | 
| 
 | 
   136         for pr in col.pileups:
 | 
| 
 | 
   137         #     print pr
 | 
| 
 | 
   138              if (not pr.indel) : # skip indels
 | 
| 
 | 
   139                 #if ( (options.RM_SX) and (pr.alignment.tags[1][1] == 1) ):
 | 
| 
 | 
   140                 ##=== Fixed error reported by Roberto
 | 
| 
 | 
   141                 #print options.RM_SX,  dict(pr.alignment.tags)["XS"]
 | 
| 
 | 
   142                 #if ( (options.RM_SX) and (dict(pr.alignment.tags)["XS"] == 1) ):
 | 
| 
 | 
   143 
 | 
| 
 | 
   144                 if ( (options.RM_SX) and (dict(pr.alignment.tags).get("XS",0) == 1) ):
 | 
| 
 | 
   145                     # print "Debug: ", options.RM_SX, pr.alignment.tags[1]
 | 
| 
 | 
   146                     # when need to filter and read with tag (XS==1), then remove the reads
 | 
| 
 | 
   147                     continue
 | 
| 
 | 
   148 
 | 
| 
 | 
   149                 if pr.qpos >= len(pr.alignment.seq):
 | 
| 
 | 
   150                     print 'WARNING: read %s has an invalid alignment. Discarding.. ' % pr.alignment.qname
 | 
| 
 | 
   151                     continue
 | 
| 
 | 
   152                 read_nuc = pr.alignment.seq[pr.qpos]
 | 
| 
 | 
   153                 if pr.alignment.is_reverse:
 | 
| 
 | 
   154                     ATCG_rev[read_nuc] += 1
 | 
| 
 | 
   155                 else:
 | 
| 
 | 
   156                     ATCG_fwd[read_nuc] += 1
 | 
| 
 | 
   157 
 | 
| 
 | 
   158                 if read_nuc != 'N':
 | 
| 
 | 
   159                     total_reads += 1
 | 
| 
 | 
   160 
 | 
| 
 | 
   161         cnts = lambda d: '\t'.join(str(d[n]) for n in nucs)
 | 
| 
 | 
   162         fwd_counts = cnts(ATCG_fwd)
 | 
| 
 | 
   163         rev_counts = cnts(ATCG_rev)
 | 
| 
 | 
   164 
 | 
| 
 | 
   165         meth_level = None
 | 
| 
 | 
   166         meth_cytosines = 0
 | 
| 
 | 
   167         unmeth_cytosines = 0
 | 
| 
 | 
   168 
 | 
| 
 | 
   169         if nuc == 'C':
 | 
| 
 | 
   170             # plus strand: take the ratio of C's to T's from reads that come from the forward strand
 | 
| 
 | 
   171             meth_cytosines = ATCG_fwd['C']
 | 
| 
 | 
   172             unmeth_cytosines = ATCG_fwd['T']
 | 
| 
 | 
   173 
 | 
| 
 | 
   174         elif nuc == 'G':
 | 
| 
 | 
   175             # minus strand: take the ratio of G's to A's from reads that come from the reverse strand
 | 
| 
 | 
   176             meth_cytosines = ATCG_rev['G']
 | 
| 
 | 
   177             unmeth_cytosines = ATCG_rev['A']
 | 
| 
 | 
   178 
 | 
| 
 | 
   179         if meth_cytosines + unmeth_cytosines > 0:
 | 
| 
 | 
   180             meth_level = float(meth_cytosines)/(meth_cytosines + unmeth_cytosines)
 | 
| 
 | 
   181 
 | 
| 
 | 
   182         pos = col.pos + 1
 | 
| 
 | 
   183 
 | 
| 
 | 
   184         meth_level_string = str(meth_level) if meth_level is not None else 'na'
 | 
| 
 | 
   185         ATCGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(fwd_counts)s\t%(rev_counts)s\t%(meth_level_string)s\n' % locals())
 | 
| 
 | 
   186 #
 | 
| 
 | 
   187         all_cytosines = meth_cytosines + unmeth_cytosines 
 | 
| 
 | 
   188         if (meth_level is not None) and (all_cytosines >= options.read_no):
 | 
| 
 | 
   189         #    print all_cytosines
 | 
| 
 | 
   190             if nuc == 'C':
 | 
| 
 | 
   191                 wiggle.write('%d\t%f\n' % (pos, meth_level))
 | 
| 
 | 
   192             else :
 | 
| 
 | 
   193                 wiggle.write('%d\t-%f\n' % (pos, meth_level))
 | 
| 
1
 | 
   194 
 | 
| 
 | 
   195         CGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(meth_level_string)s\t%(meth_cytosines)s\t%(all_cytosines)s\n' % locals())
 | 
| 
0
 | 
   196     ATCGmap.close()
 | 
| 
 | 
   197     CGmap.close()
 | 
| 
 | 
   198     wiggle.close()
 | 
| 
 | 
   199 
 | 
| 
 | 
   200     logm('Wiggle: %s'% wiggle_fname)
 | 
| 
 | 
   201     logm('ATCGMap: %s' % ATCGmap_fname)
 | 
| 
 | 
   202     logm('CGmap: %s' % CGmap_fname)
 | 
| 
 | 
   203 
 |