Mercurial > repos > weilong-guo > bs_seeker2
comparison BSseeker2/bs_seeker2-call_methylation.py @ 0:e6df770c0e58 draft
Initial upload
author | weilong-guo |
---|---|
date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e6df770c0e58 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 from optparse import OptionParser, OptionGroup | |
4 from bs_utils.utils import * | |
5 | |
6 try : | |
7 import pysam | |
8 except ImportError : | |
9 print "[Error] Cannot import \"pysam\" package. Have you installed it?" | |
10 exit(-1) | |
11 | |
12 import gzip | |
13 | |
14 def context_calling(seq, position): | |
15 | |
16 word=seq[position] | |
17 word=word.upper() | |
18 | |
19 context="--" | |
20 context_CH="--" | |
21 if position + 2 < len(seq) and position - 2 >= 0: | |
22 | |
23 if word == "C": | |
24 word2 = seq[position+1] | |
25 context_CH = word + word2 | |
26 if word2 == "G": | |
27 context = "CG" | |
28 elif word2 in ['A','C','T']: | |
29 word3 = seq[position+2] | |
30 if word3 == "G": | |
31 context = "CHG" | |
32 elif word3 in ['A','C','T']: | |
33 context="CHH" | |
34 | |
35 elif word == "G": | |
36 word2 = seq[position-1] | |
37 context_CH = word + word2 | |
38 context_CH = context_CH.translate(string.maketrans("ATCG", "TAGC")) | |
39 if word2 == "C": | |
40 context = "CG" | |
41 elif word2 in ['A','G','T']: | |
42 word3 = seq[position-2] | |
43 if word3 == "C": | |
44 context = "CHG" | |
45 elif word3 in ['A','G','T']: | |
46 context = "CHH" | |
47 | |
48 return word, context, context_CH | |
49 | |
50 | |
51 | |
52 if __name__ == '__main__': | |
53 | |
54 parser = OptionParser() | |
55 parser.add_option("-i", "--input", type="string", dest="infilename",help="BAM output from bs_seeker2-align.py", metavar="INFILE") | |
56 parser.add_option("-d", "--db", type="string", dest="dbpath",help="Path to the reference genome library (generated in preprocessing genome) [Default: %default]" , metavar="DBPATH", default = reference_genome_path) | |
57 parser.add_option("-o", "--output-prefix", type="string", dest="output_prefix",help="The output prefix to create ATCGmap and wiggle files [INFILE]", metavar="OUTFILE") | |
58 | |
59 parser.add_option("--wig", type="string", dest="wig_file",help="The output .wig file [INFILE.wig]", metavar="OUTFILE") | |
60 parser.add_option("--CGmap", type="string", dest="CGmap_file",help="The output .CGmap file [INFILE.CGmap]", metavar="OUTFILE") | |
61 parser.add_option("--ATCGmap", type="string", dest="ATCGmap_file",help="The output .ATCGmap file [INFILE.ATCGmap]", metavar="OUTFILE") | |
62 | |
63 parser.add_option("-x", "--rm-SX", action="store_true", dest="RM_SX",help="Removed reads with tag \'XS:i:1\', which would be considered as not fully converted by bisulfite treatment [Default: %default]", default = False) | |
64 parser.add_option("--txt", action="store_true", dest="text",help="Show CGmap and ATCGmap in .gz [Default: %default]", default = False) | |
65 | |
66 parser.add_option("-r", "--read-no",type = "int", dest="read_no",help="The least number of reads covering one site to be shown in wig file [Default: %default]", default = 1) | |
67 parser.add_option("-v", "--version", action="store_true", dest="version",help="show version of BS-Seeker2", metavar="version", default = False) | |
68 | |
69 (options, args) = parser.parse_args() | |
70 | |
71 | |
72 # if no options were given by the user, print help and exit | |
73 if len(sys.argv) == 1: | |
74 print parser.print_help() | |
75 exit(0) | |
76 | |
77 if options.version : | |
78 show_version() | |
79 exit (-1) | |
80 else : | |
81 show_version() | |
82 | |
83 | |
84 if options.infilename is None: | |
85 error('-i option is required') | |
86 if not os.path.isfile(options.infilename): | |
87 error('Cannot find input file: %s' % options.infilename) | |
88 | |
89 open_log(options.infilename+'.call_methylation_log') | |
90 db_d = lambda fname: os.path.join( os.path.expanduser(options.dbpath), fname) # bug fixed, weilong | |
91 | |
92 logm('sorting BS-Seeker alignments') | |
93 sorted_input_filename = options.infilename+'_sorted' | |
94 pysam.sort(options.infilename, sorted_input_filename) | |
95 sorted_input_filename += '.bam' | |
96 logm('indexing sorted alignments') | |
97 pysam.index(sorted_input_filename) | |
98 | |
99 logm('calculating methylation levels') | |
100 if options.text : | |
101 ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap') | |
102 ATCGmap = open(ATCGmap_fname, 'w') | |
103 | |
104 CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap') | |
105 CGmap = open(CGmap_fname, 'w') | |
106 else : | |
107 ATCGmap_fname = options.ATCGmap_file or ((options.output_prefix or options.infilename) + '.ATCGmap.gz') | |
108 ATCGmap = gzip.open(ATCGmap_fname, 'wb') | |
109 | |
110 CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap.gz') | |
111 CGmap = gzip.open(CGmap_fname, 'wb') | |
112 | |
113 wiggle_fname = options.wig_file or ((options.output_prefix or options.infilename) + '.wig') | |
114 wiggle = open(wiggle_fname, 'w') | |
115 | |
116 sorted_input = pysam.Samfile(sorted_input_filename, 'rb') | |
117 | |
118 chrom = None | |
119 nucs = ['A', 'T', 'C', 'G', 'N'] | |
120 ATCG_fwd = dict((n, 0) for n in nucs) | |
121 ATCG_rev = dict((n, 0) for n in nucs) | |
122 for col in sorted_input.pileup(): | |
123 col_chrom = sorted_input.getrname(col.tid) | |
124 if chrom != col_chrom: | |
125 chrom = col_chrom | |
126 chrom_seq = deserialize(db_d(chrom)) | |
127 wiggle.write('variableStep chrom=%s\n' % chrom) | |
128 | |
129 for n in nucs: | |
130 ATCG_fwd[n] = 0 | |
131 ATCG_rev[n] = 0 | |
132 | |
133 nuc, context, subcontext = context_calling(chrom_seq, col.pos) | |
134 total_reads = 0 | |
135 | |
136 | |
137 | |
138 for pr in col.pileups: | |
139 # print pr | |
140 if (not pr.indel) : # skip indels | |
141 #if ( (options.RM_SX) and (pr.alignment.tags[1][1] == 1) ): | |
142 ##=== Fixed error reported by Roberto | |
143 #print options.RM_SX, dict(pr.alignment.tags)["XS"] | |
144 #if ( (options.RM_SX) and (dict(pr.alignment.tags)["XS"] == 1) ): | |
145 | |
146 if ( (options.RM_SX) and (dict(pr.alignment.tags).get("XS",0) == 1) ): | |
147 # print "Debug: ", options.RM_SX, pr.alignment.tags[1] | |
148 # when need to filter and read with tag (XS==1), then remove the reads | |
149 continue | |
150 | |
151 if pr.qpos >= len(pr.alignment.seq): | |
152 print 'WARNING: read %s has an invalid alignment. Discarding.. ' % pr.alignment.qname | |
153 continue | |
154 read_nuc = pr.alignment.seq[pr.qpos] | |
155 # print "read_nuc=", read_nuc | |
156 if pr.alignment.is_reverse: | |
157 ATCG_rev[read_nuc] += 1 | |
158 else: | |
159 ATCG_fwd[read_nuc] += 1 | |
160 | |
161 if read_nuc != 'N': | |
162 total_reads += 1 | |
163 | |
164 cnts = lambda d: '\t'.join(str(d[n]) for n in nucs) | |
165 fwd_counts = cnts(ATCG_fwd) | |
166 rev_counts = cnts(ATCG_rev) | |
167 | |
168 meth_level = None | |
169 meth_cytosines = 0 | |
170 unmeth_cytosines = 0 | |
171 | |
172 if nuc == 'C': | |
173 # plus strand: take the ratio of C's to T's from reads that come from the forward strand | |
174 meth_cytosines = ATCG_fwd['C'] | |
175 unmeth_cytosines = ATCG_fwd['T'] | |
176 | |
177 elif nuc == 'G': | |
178 # minus strand: take the ratio of G's to A's from reads that come from the reverse strand | |
179 meth_cytosines = ATCG_rev['G'] | |
180 unmeth_cytosines = ATCG_rev['A'] | |
181 | |
182 if meth_cytosines + unmeth_cytosines > 0: | |
183 meth_level = float(meth_cytosines)/(meth_cytosines + unmeth_cytosines) | |
184 | |
185 pos = col.pos + 1 | |
186 | |
187 meth_level_string = str(meth_level) if meth_level is not None else 'na' | |
188 ATCGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(fwd_counts)s\t%(rev_counts)s\t%(meth_level_string)s\n' % locals()) | |
189 # | |
190 all_cytosines = meth_cytosines + unmeth_cytosines | |
191 if (meth_level is not None) and (all_cytosines >= options.read_no): | |
192 # print all_cytosines | |
193 if nuc == 'C': | |
194 wiggle.write('%d\t%f\n' % (pos, meth_level)) | |
195 else : | |
196 wiggle.write('%d\t-%f\n' % (pos, meth_level)) | |
197 CGmap.write('%(chrom)s\t%(nuc)s\t%(pos)d\t%(context)s\t%(subcontext)s\t%(meth_level_string)s\t%(meth_cytosines)s\t%(all_cytosines)s\n' % locals()) | |
198 ATCGmap.close() | |
199 CGmap.close() | |
200 wiggle.close() | |
201 | |
202 logm('Wiggle: %s'% wiggle_fname) | |
203 logm('ATCGMap: %s' % ATCGmap_fname) | |
204 logm('CGmap: %s' % CGmap_fname) | |
205 |