annotate BSseeker2/bs_align/bs_single_end.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 import fileinput, os, time, random, math
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2 from bs_utils.utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3 from bs_align_utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
4
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
5 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
6 # Read from the mapped results, return lists of unique / multiple-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
7 # The function suppose at most 2 hits will be reported in single file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
8 def extract_mapping(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
9 unique_hits = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
10 non_unique_hits = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
11
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12 header0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
13 lst = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
14
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
15 for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
16 #------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
17 if header != header0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
18 #---------- output -----------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
19 if len(lst) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20 unique_hits[header0] = lst[0] # [no_mismatch, chr, location]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21 elif len(lst) > 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22 min_lst = min(lst, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23 max_lst = max(lst, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
24
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
25 if min_lst[0] < max_lst[0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
26 unique_hits[header0] = min_lst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
27 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28 non_unique_hits[header0] = min_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29 #print "multiple hit", header, chr, location, no_mismatch, cigar # test
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30 header0 = header
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 lst = [(no_mismatch, chr, location, cigar)]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
32 else: # header == header0, same header (read id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 lst.append((no_mismatch, chr, location, cigar))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
34
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
35 if len(lst) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 unique_hits[header0] = lst[0] # [no_mismatch, chr, location]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37 elif len(lst) > 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 min_lst = min(lst, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 max_lst = max(lst, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
41 if min_lst[0] < max_lst[0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
42 unique_hits[header0] = min_lst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
43 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
44 non_unique_hits[header0] = min_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
45
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
46
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
47 return unique_hits, non_unique_hits
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
48
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
49
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
50 def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51 max_mismatch_no, aligner_command, db_path, tmp_path, outfile,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54 # adapter : strand-specific or not
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 adapter=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56 adapter_fw=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
57 adapter_rc=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
58 if adapter_file !="":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
59 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
60 adapter_inf=open(adapter_file,"r")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
61 except IOError:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
62 print "[Error] Cannot open adapter file : %s" % adapter_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
63 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
64 if asktag == "N": #<--- directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
65 adapter=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
66 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
67 adapter=adapter.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
68 elif asktag == "Y":#<--- un-directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
69 adapter_fw=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
70 adapter_rc=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
71 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
72 adapter_fw=adapter_fw.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
73 adapter_rc=adapter_rc.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
74 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
75 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
76
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
77
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
78
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
79 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
80 logm("Read filename: %s"% main_read_file )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
81 logm("Un-directional library: %s" % asktag )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
82 logm("The first base (for mapping): %d" % cut1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
83 logm("The last base (for mapping): %d" % cut2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
84 logm("Max. lines per mapping: %d"% no_small_lines)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
85 logm("Aligner: %s" % aligner_command)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
86 logm("Reference genome library path: %s" % db_path )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
87 logm("Number of mismatches allowed: %s" % max_mismatch_no )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
88 if adapter_file !="":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
89 if asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
90 logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
91 elif asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
92 logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
93 logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
94 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
95
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
96 # helper method to join fname with tmp_path
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
97 tmp_d = lambda fname: os.path.join(tmp_path, fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
98
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
99 db_d = lambda fname: os.path.join(db_path, fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
100
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
101 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
102 # splitting the big read file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
103
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104 input_fname = os.path.split(main_read_file)[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
106 # split_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
107 # my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
108 # if splitted_file.startswith("%s-s-" % input_fname))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
109
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
110 #---- Stats ------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
111 all_raw_reads=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
112 all_trimed=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
113 all_mapped=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
114 all_mapped_passed=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
115
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
116 numbers_premapped_lst=[0,0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
117 numbers_mapped_lst=[0,0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
118
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
119 mC_lst=[0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
120 uC_lst=[0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
121
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
122
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
123 no_my_files=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
124
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
125 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
126 logm("== Start mapping ==")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
127
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
128 for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
129 # for read_file in my_files:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
130 original_bs_reads = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
131 no_my_files+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
132 random_id = ".tmp-"+str(random.randint(1000000,9999999))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
133
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
134 #-------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
135 # undirectional sequencing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
136 #-------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
137 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
138
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
139 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
140 outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
141 outfile3=tmp_d('Trimmed_G2A.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
142
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
143 outf2=open(outfile2,'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
144 outf3=open(outfile3,'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
145
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
146 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
147 # detect format of input file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
148 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
149 read_inf=open(read_file,"r")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
150 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
151 print "[Error] Cannot open input file : %s" % read_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
152 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
153
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
154 oneline=read_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
155 l=oneline.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
156 input_format=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
157 if oneline[0]=="@": # fastq
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
158 input_format="fastq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
159 n_fastq=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
160 elif len(l)==1 and oneline[0]!=">": # pure sequences
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
161 input_format="seq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162 elif len(l)==11: # qseq
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
163 input_format="qseq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
164 elif oneline[0]==">": # fasta
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
165 input_format="fasta"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
166 n_fasta=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
167 read_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
168
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
169 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
170 # read sequence, remove adapter and convert
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
171 read_id=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
172 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
173 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
174 for line in fileinput.input(read_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
175 l=line.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
176
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
177 if input_format=="seq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
178 all_raw_reads+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
179 read_id=str(all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
180 read_id=read_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
181 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
182 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
183 elif input_format=="fastq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
184 m_fastq=math.fmod(n_fastq,4)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
185 n_fastq+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
186 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
187 if m_fastq==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
188 all_raw_reads+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
189 read_id=str(all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
190 read_id=read_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
191 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
192 elif m_fastq==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
193 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
194 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
195 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
196 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
197 elif input_format=="qseq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
198 all_raw_reads+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
199 read_id=str(all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
200 read_id=read_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
201 seq=l[8]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
202 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
203 elif input_format=="fasta":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
204 m_fasta=math.fmod(n_fasta,2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
205 n_fasta+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
206 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
207 if m_fasta==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
208 all_raw_reads+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
209 #read_id=str(all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
210 read_id=l[0][1:]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
211 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
212 elif m_fasta==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
213 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
214 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
215 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
216 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
217
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
218 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
219 if seq_ready=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
220 seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
221 seq=seq.upper()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
222 seq=seq.replace(".","N")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
223
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
224 # striping BS adapter from 3' read
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
225 if (adapter_fw !="") and (adapter_rc !="") :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
226 new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
227 new_read = Remove_5end_Adapter(new_read, adapter_rc)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
228 if len(new_read) < len(seq) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
229 all_trimed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
230 seq = new_read
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
231
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
232 if len(seq)<=4:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
233 seq=''.join(["N" for x in xrange(cut2-cut1+1)])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
234
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
235 #--------- trimmed_raw_BS_read ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
236 original_bs_reads[read_id] = seq
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
237
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
238 #--------- FW_C2T ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
239 outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
240 #--------- RC_G2A ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
241 outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
242
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
243 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
244
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
245 outf2.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
246 outf3.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
247
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
248 delete_files(read_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
249
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
250 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
251 # Bowtie mapping
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
252 #-------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
253 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
254 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
255 WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
256 CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
257
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
258 # print aligner_command % {'int_no_mismatches' : int_no_mismatches,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
259 # 'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
260 # 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
261 # 'output_file' : WC2T}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
262
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
263 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
264 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
265 'output_file' : WC2T},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
266
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
267 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
268 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
269 'output_file' : CC2T},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
270
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
271 aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
272 'input_file' : outfile3,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
273 'output_file' : WG2A},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
274
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
275 aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
276 'input_file' : outfile3,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
277 'output_file' : CG2A} ])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
278
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
279
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
280 delete_files(outfile2, outfile3)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
281
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
282
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
283 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
284 # Post processing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
285 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
286
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
287 FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
288 RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
289
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
290 FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
291 RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
292
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
293 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
294 # get unique-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
295 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
296 Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
297
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
298 Unique_FW_C2T=set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
299 Unique_RC_G2A=set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
300 Unique_FW_G2A=set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
301 Unique_RC_C2T=set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
302 Multiple_hits=set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
303
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
304
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
305 for x in Union_set:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
306 _list=[]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
307 for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
308 mis_lst=d.get(x,[99])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
309 mis=int(mis_lst[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
310 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
311 for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
312 mis=d.get(x,99)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
313 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
314 mini=min(_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
315 if _list.count(mini) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
316 mini_index=_list.index(mini)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
317 if mini_index == 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
318 Unique_FW_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
319 elif mini_index == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
320 Unique_RC_G2A.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
321 elif mini_index == 2:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
322 Unique_FW_G2A.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
323 elif mini_index == 3:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
324 Unique_RC_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
325 # if mini_index = 4,5,6,7, indicating multiple hits
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
326 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
327 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
328 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
329 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
330 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
331 if show_multiple_hit :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
332 outf_MH=open("Multiple_hit.fa",'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
333 for i in Multiple_hits :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
334 outf_MH.write(">%s\n" % i)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
335 outf_MH.write("%s\n" % original_bs_reads[i])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
336 outf_MH.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
337
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
338 del Union_set
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
339 del FW_C2T_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
340 del FW_G2A_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
341 del RC_C2T_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
342 del RC_G2A_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
343
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
344 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
345 FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
346 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
347 RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
348 FW_C2T_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
349 RC_C2T_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
350 FW_G2A_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
351 RC_G2A_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
352 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
353 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
354 FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
355 RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
356
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
357 del Unique_FW_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
358 del Unique_FW_G2A
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
359 del Unique_RC_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
360 del Unique_RC_G2A
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
361
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
362 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
363 numbers_premapped_lst[0] += len(Unique_FW_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
364 numbers_premapped_lst[1] += len(Unique_RC_G2A)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
365 numbers_premapped_lst[2] += len(Unique_FW_G2A)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
366 numbers_premapped_lst[3] += len(Unique_RC_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
367
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
368
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
369 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
370
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
371 nn=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
372 gseq = dict()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
373 chr_length = dict()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
374 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
375 (RC_G2A_uniq_lst,RC_G2A_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
376 (FW_G2A_uniq_lst,FW_G2A_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
377 (RC_C2T_uniq_lst,RC_C2T_U)]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
378 nn += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
379 mapped_chr0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
380
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
381 for header in ali_unique_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
382
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
383 _, mapped_chr, mapped_location, cigar = ali_dic[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
384
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
385 original_BS = original_bs_reads[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
386 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
387 if mapped_chr not in gseq:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
388 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
389 chr_length[mapped_chr] = len(gseq[mapped_chr])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
390
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
391 if nn == 2 or nn == 3:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
392 cigar = list(reversed(cigar))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
393 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
394
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
395
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
396 all_mapped += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
397
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
398 if nn == 1: # +FW mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
399 FR = "+FW"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
400 mapped_strand="+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
401
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
402 elif nn == 2: # +RC mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
403 FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
404 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
405 mapped_strand = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
406 original_BS = reverse_compl_seq(original_BS) # for RC reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
407
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
408 elif nn == 3: # -RC mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
409 mapped_strand = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
410 FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
411 original_BS = reverse_compl_seq(original_BS) # for RC reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
412
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
413 elif nn == 4: # -FW mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
414 mapped_strand = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
415 FR = "-FW"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
416 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
417
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
418 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
419
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
420 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
421
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
422
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
423 if len(r_aln)==len(g_aln):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
424 N_mismatch = N_MIS(r_aln, g_aln)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
425 if N_mismatch <= int(max_mismatch_no):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
426 numbers_mapped_lst[nn-1] += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
427 all_mapped_passed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
428 methy = methy_seq(r_aln, g_aln + next)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
429 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
430
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
431 #---XS FILTER----------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
432 XS = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
433 nCH = methy.count('y') + methy.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
434 nmCH = methy.count('Y') + methy.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
435 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
436 XS = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
437
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
438 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
439
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
440 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
441 logm("--> %s (%d) "%(read_file, no_my_files))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
442 delete_files(WC2T, WG2A, CC2T, CG2A)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
443
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
444
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
445
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
446 #--------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
447 # directional sequencing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
448 #--------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
449
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
450 if asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
451 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
452 outfile2=tmp_d('Trimed_C2T.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
453 outf2=open(outfile2,'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
454
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
455 n=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
456 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
457 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
458 read_inf=open(read_file,"r")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
459 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
460 print "[Error] Cannot open input file : %s" % read_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
461 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
462
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
463 oneline=read_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
464 l=oneline.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
465 input_format=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
466 if oneline[0]=="@": # FastQ
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
467 input_format="fastq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
468 n_fastq=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
469 elif len(l)==1 and oneline[0]!=">": # pure sequences
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
470 input_format="seq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
471 elif len(l)==11: # Illumina GAII qseq file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
472 input_format="qseq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
473 elif oneline[0]==">": # fasta
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
474 input_format="fasta"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
475 n_fasta=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
476 read_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
477 #print "detected data format: %s"%(input_format)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
478 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
479 read_id=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
480 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
481 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
482 for line in fileinput.input(read_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
483 l=line.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
484 if input_format=="seq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
485 all_raw_reads+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
486 read_id=str(all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
487 read_id=read_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
488 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
489 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
490 elif input_format=="fastq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
491 m_fastq=math.fmod(n_fastq,4)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
492 n_fastq+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
493 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
494 if m_fastq==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
495 all_raw_reads+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
496 read_id=str(all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
497 read_id=read_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
498 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
499 elif m_fastq==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
500 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
501 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
502 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
503 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
504 elif input_format=="qseq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
505 all_raw_reads+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
506 read_id=str(all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
507 read_id=read_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
508 seq=l[8]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
509 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
510 elif input_format=="fasta":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
511 m_fasta=math.fmod(n_fasta,2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
512 n_fasta+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
513 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
514 if m_fasta==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
515 all_raw_reads+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
516 read_id=l[0][1:]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
517 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
518 elif m_fasta==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
519 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
520 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
521 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
522 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
523
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
524 #--------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
525 if seq_ready=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
526 seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
527 seq=seq.upper()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
528 seq=seq.replace(".","N")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
529
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
530 #--striping adapter from 3' read -------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
531 if adapter != "":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
532 new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
533 if len(new_read) < len(seq) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
534 all_trimed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
535 seq = new_read
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
536
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
537 if len(seq)<=4:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
538 seq = "N" * (cut2-cut1+1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
539
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
540 #--------- trimmed_raw_BS_read ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
541 original_bs_reads[read_id] = seq
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
542
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
543
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
544 #--------- FW_C2T ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
545 outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
546
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
547 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
548
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
549 outf2.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
550 delete_files(read_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
551
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
552 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
553 # Bowtie mapping
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
554 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
555 WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
556 CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
557
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
558 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
559 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
560 'output_file' : WC2T},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
561 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
562 'input_file' : outfile2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
563 'output_file' : CC2T} ])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
564
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
565 delete_files(outfile2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
566
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
567 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
568 # Post processing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
569 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
570
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
571
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
572 FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
573 RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
574
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
575 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
576 # get uniq-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
577 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
578 Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
579
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
580 Unique_FW_C2T = set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
581 Unique_RC_C2T = set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
582 Multiple_hits=set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
583 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
584
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
585 for x in Union_set:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
586 _list=[]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
587 for d in [FW_C2T_U,RC_C2T_U]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
588 mis_lst=d.get(x,[99])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
589 mis=int(mis_lst[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
590 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
591 for d in [FW_C2T_R,RC_C2T_R]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
592 mis=d.get(x,99)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
593 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
594 mini=min(_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
595 #print _list
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
596 if _list.count(mini)==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
597 mini_index=_list.index(mini)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
598 if mini_index==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
599 Unique_FW_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
600 elif mini_index==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
601 Unique_RC_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
602 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
603 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
604 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
605 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
606 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
607 if show_multiple_hit :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
608 outf_MH=open("Multiple_hit.fa",'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
609 for i in Multiple_hits :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
610 outf_MH.write(">%s\n" % i)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
611 outf_MH.write("%s\n" % original_bs_reads[i])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
612 outf_MH.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
613
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
614
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
615
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
616 FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
617 RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
618 FW_C2T_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
619 RC_C2T_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
620 FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
621 RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
622
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
623
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
624 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
625
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
626 numbers_premapped_lst[0] += len(Unique_FW_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
627 numbers_premapped_lst[1] += len(Unique_RC_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
628
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
629 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
630
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
631 nn = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
632 gseq = dict()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
633 chr_length = dict()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
634 for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
635 nn += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
636 mapped_chr0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
637 for header in ali_unique_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
638 _, mapped_chr, mapped_location, cigar = ali_dic[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
639 original_BS = original_bs_reads[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
640 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
641 if mapped_chr not in gseq :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
642 gseq[mapped_chr] = deserialize(db_d(mapped_chr))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
643 chr_length[mapped_chr] = len(gseq[mapped_chr])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
644 #if mapped_chr != mapped_chr0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
645 # my_gseq = deserialize(db_d(mapped_chr))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
646 # chr_length = len(my_gseq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
647 # mapped_chr0 = mapped_chr
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
648 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
649
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
650 r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
651
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
652 all_mapped+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
653 if nn == 1: # +FW mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
654 FR = "+FW"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
655 mapped_strand = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
656 elif nn == 2: # -FW mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
657 mapped_strand = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
658 FR = "-FW"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
659 mapped_location = chr_length[mapped_chr] - mapped_location - g_len
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
660
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
661
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
662 origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
663 r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
664
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
665 if len(r_aln) == len(g_aln):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
666 N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
667 if N_mismatch <= int(max_mismatch_no):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
668 numbers_mapped_lst[nn-1] += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
669 all_mapped_passed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
670 methy = methy_seq(r_aln, g_aln+next)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
671 mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
672
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
673 #---XS FILTER----------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
674 XS = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
675 nCH = methy.count('y') + methy.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
676 nmCH = methy.count('Y') + methy.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
677 if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
678 XS = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
679
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
680 outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
681
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
682 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
683 logm("--> %s (%d) "%(read_file,no_my_files))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
684 delete_files(WC2T, CC2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
685
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
686
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
687 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
688
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
689 # outf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
690 delete_files(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
691
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
692 logm("Number of raw reads: %d \n"% all_raw_reads)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
693 if all_raw_reads >0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
694 logm("Number of reads having adapter removed: %d \n" % all_trimed )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
695 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
696 logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
697 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
698 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
699 logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
700 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
701 logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
702 elif asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
703 logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
704 logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
705
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
706 logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
707 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
708 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
709 logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
710 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
711 logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
712 elif asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
713 logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
714 logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
715 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
716
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
717 n_CG=mC_lst[0]+uC_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
718 n_CHG=mC_lst[1]+uC_lst[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
719 n_CHH=mC_lst[2]+uC_lst[2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
720
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
721 logm("----------------------------------------------" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
722 logm("Methylated C in mapped reads ")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
723
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
724 logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
725 logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
726 logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
727
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
728 logm("------------------- END --------------------" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
729 elapsed("=== END %s ===" % main_read_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
730
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
731
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
732