1
|
1 #!/usr/bin/env python
|
0
|
2
|
|
3 from optparse import OptionParser, OptionGroup
|
|
4 from bs_utils.utils import *
|
|
5
|
|
6 try :
|
|
7 import pysam
|
|
8 except ImportError :
|
|
9 print "[Error] Cannot import \"pysam\" package. Have you installed it?"
|
|
10 exit(-1)
|
|
11
|
|
12 import gzip
|
|
13
|
|
14 def context_calling(seq, position):
|
|
15
|
|
16 word=seq[position]
|
|
17 word=word.upper()
|
|
18
|
|
19 context="--"
|
|
20 context_CH="--"
|
|
21 if position + 2 < len(seq) and position - 2 >= 0:
|
|
22
|
|
23 if word == "C":
|
|
24 word2 = seq[position+1]
|
|
25 context_CH = word + word2
|
|
26 if word2 == "G":
|
|
27 context = "CG"
|
|
28 elif word2 in ['A','C','T']:
|
|
29 word3 = seq[position+2]
|
|
30 if word3 == "G":
|
|
31 context = "CHG"
|
|
32 elif word3 in ['A','C','T']:
|
|
33 context="CHH"
|
|
34
|
|
35 elif word == "G":
|
|
36 word2 = seq[position-1]
|
|
37 context_CH = word + word2
|
|
38 context_CH = context_CH.translate(string.maketrans("ATCG", "TAGC"))
|
|
39 if word2 == "C":
|
|
40 context = "CG"
|
|
41 elif word2 in ['A','G','T']:
|
|
42 word3 = seq[position-2]
|
|
43 if word3 == "C":
|
|
44 context = "CHG"
|
|
45 elif word3 in ['A','G','T']:
|
|
46 context = "CHH"
|
|
47
|
|
48 return word, context, context_CH
|
|
49
|
|
50
|
|
51
|
|
52 if __name__ == '__main__':
|
|
53
|
|
54 parser = OptionParser()
|
|
55 parser.add_option("-i", "--input", type="string", dest="infilename",help="BAM output from bs_seeker2-align.py", metavar="INFILE")
|
|
56 parser.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path)
|
|
57 parser.add_option("-o", "--output-prefix", type="string", dest="output_prefix",help="The output prefix to create ATCGmap and wiggle files [INFILE]", metavar="OUTFILE")
|
|
58
|
|
59 parser.add_option("--wig", type="string", dest="wig_file",help="The output .wig file [INFILE.wig]", metavar="OUTFILE")
|
|
60 parser.add_option("--CGmap", type="string", dest="CGmap_file",help="The output .CGmap file [INFILE.CGmap]", metavar="OUTFILE")
|
|
61 parser.add_option("--ATCGmap", type="string", dest="ATCGmap_file",help="The output .ATCGmap file [INFILE.ATCGmap]", metavar="OUTFILE")
|
|
62
|
|
63 parser.add_option("-x", "--rm-SX", action="store_true", dest="RM_SX",help="Removed reads with tag \'XS:i:1\', which would be considered as not fully converted by bisulfite treatment [Default: %default]", default = False)
|
|
64 parser.add_option("--txt", action="store_true", dest="text",help="Show CGmap and ATCGmap in .gz [Default: %default]", default = False)
|
|
65
|
|
66 parser.add_option("-r", "--read-no",type = "int", dest="read_no",help="The least number of reads covering one site to be shown in wig file [Default: %default]", default = 1)
|
|
67 parser.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False)
|
|
68
|
|
69 (options, args) = parser.parse_args()
|
|
70
|
|
71
|
|
72 # if no options were given by the user, print help and exit
|
|
73 if len(sys.argv) == 1:
|
1
|
74 parser.print_help()
|
0
|
75 exit(0)
|
|
76
|
|
77 if options.version :
|
|
78 show_version()
|
|
79 exit (-1)
|
|
80 else :
|
|
81 show_version()
|
|
82
|
|
83
|
|
84 if options.infilename is None:
|
|
85 error('-i option is required')
|
|
86 if not os.path.isfile(options.infilename):
|
|
87 error('Cannot find input file: %s' % options.infilename)
|
|
88
|
|
89 open_log(options.infilename+'.call_methylation_log')
|
|
90 db_d = lambda fname: os.path.join( os.path.expanduser(options.dbpath), fname) # bug fixed, weilong
|
|
91
|
|
92 logm('sorting BS-Seeker alignments')
|
|
93 sorted_input_filename = options.infilename+'_sorted'
|
|
94 pysam.sort(options.infilename, sorted_input_filename)
|
|
95 sorted_input_filename += '.bam'
|
|
96 logm('indexing sorted alignments')
|
|
97 pysam.index(sorted_input_filename)
|
|
98
|
|
99 logm('calculating methylation levels')
|
|
100 if options.text :
|
|
101 ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap')
|
|
102 ATCGmap = open(ATCGmap_fname, 'w')
|
|
103
|
|
104 CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap')
|
|
105 CGmap = open(CGmap_fname, 'w')
|
|
106 else :
|
|
107 ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap.gz')
|
|
108 ATCGmap = gzip.open(ATCGmap_fname, 'wb')
|
|
109
|
|
110 CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap.gz')
|
|
111 CGmap = gzip.open(CGmap_fname, 'wb')
|
|
112
|
|
113 wiggle_fname = options.wig_file or ((options.output_prefix or options.infilename) + '.wig')
|
|
114 wiggle = open(wiggle_fname, 'w')
|
|
115
|
|
116 sorted_input = pysam.Samfile(sorted_input_filename, 'rb')
|
|
117
|
|
118 chrom = None
|
|
119 nucs = ['A', 'T', 'C', 'G', 'N']
|
|
120 ATCG_fwd = dict((n, 0) for n in nucs)
|
|
121 ATCG_rev = dict((n, 0) for n in nucs)
|
|
122 for col in sorted_input.pileup():
|
|
123 col_chrom = sorted_input.getrname(col.tid)
|
|
124 if chrom != col_chrom:
|
|
125 chrom = col_chrom
|
|
126 chrom_seq = deserialize(db_d(chrom))
|
|
127 wiggle.write('variableStep chrom=%s\n' % chrom)
|
|
128
|
|
129 for n in nucs:
|
|
130 ATCG_fwd[n] = 0
|
|
131 ATCG_rev[n] = 0
|
|
132
|
|
133 nuc, context, subcontext = context_calling(chrom_seq, col.pos)
|
|
134 total_reads = 0
|
|
135
|
|
136 for pr in col.pileups:
|
|
137 # print pr
|
|
138 if (not pr.indel) : # skip indels
|
|
139 #if ( (options.RM_SX) and (pr.alignment.tags[1][1] == 1) ):
|
|
140 ##=== Fixed error reported by Roberto
|
|
141 #print options.RM_SX, dict(pr.alignment.tags)["XS"]
|
|
142 #if ( (options.RM_SX) and (dict(pr.alignment.tags)["XS"] == 1) ):
|
|
143
|
|
144 if ( (options.RM_SX) and (dict(pr.alignment.tags).get("XS",0) == 1) ):
|
|
145 # print "Debug: ", options.RM_SX, pr.alignment.tags[1]
|
|
146 # when need to filter and read with tag (XS==1), then remove the reads
|
|
147 continue
|
|
148
|
|
149 if pr.qpos >= len(pr.alignment.seq):
|
|
150 print 'WARNING: read %s has an invalid alignment. Discarding.. ' % pr.alignment.qname
|
|
151 continue
|
|
152 read_nuc = pr.alignment.seq[pr.qpos]
|
|
153 if pr.alignment.is_reverse:
|
|
154 ATCG_rev[read_nuc] += 1
|
|
155 else:
|
|
156 ATCG_fwd[read_nuc] += 1
|
|
157
|
|
158 if read_nuc != 'N':
|
|
159 total_reads += 1
|
|
160
|
|
161 cnts = lambda d: '\t'.join(str(d[n]) for n in nucs)
|
|
162 fwd_counts = cnts(ATCG_fwd)
|
|
163 rev_counts = cnts(ATCG_rev)
|
|
164
|
|
165 meth_level = None
|
|
166 meth_cytosines = 0
|
|
167 unmeth_cytosines = 0
|
|
168
|
|
169 if nuc == 'C':
|
|
170 # plus strand: take the ratio of C's to T's from reads that come from the forward strand
|
|
171 meth_cytosines = ATCG_fwd['C']
|
|
172 unmeth_cytosines = ATCG_fwd['T']
|
|
173
|
|
174 elif nuc == 'G':
|
|
175 # minus strand: take the ratio of G's to A's from reads that come from the reverse strand
|
|
176 meth_cytosines = ATCG_rev['G']
|
|
177 unmeth_cytosines = ATCG_rev['A']
|
|
178
|
|
179 if meth_cytosines + unmeth_cytosines > 0:
|
|
180 meth_level = float(meth_cytosines)/(meth_cytosines + unmeth_cytosines)
|
|
181
|
|
182 pos = col.pos + 1
|
|
183
|
|
184 meth_level_string = str(meth_level) if meth_level is not None else 'na'
|
|
185 ATCGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(fwd_counts)s\t%(rev_counts)s\t%(meth_level_string)s\n' % locals())
|
|
186 #
|
|
187 all_cytosines = meth_cytosines + unmeth_cytosines
|
|
188 if (meth_level is not None) and (all_cytosines >= options.read_no):
|
|
189 # print all_cytosines
|
|
190 if nuc == 'C':
|
|
191 wiggle.write('%d\t%f\n' % (pos, meth_level))
|
|
192 else :
|
|
193 wiggle.write('%d\t-%f\n' % (pos, meth_level))
|
1
|
194
|
|
195 CGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(meth_level_string)s\t%(meth_cytosines)s\t%(all_cytosines)s\n' % locals())
|
0
|
196 ATCGmap.close()
|
|
197 CGmap.close()
|
|
198 wiggle.close()
|
|
199
|
|
200 logm('Wiggle: %s'% wiggle_fname)
|
|
201 logm('ATCGMap: %s' % ATCGmap_fname)
|
|
202 logm('CGmap: %s' % CGmap_fname)
|
|
203
|