annotate BSseeker2/bs_align/bs_pair_end.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 import fileinput, os, random, math
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2 from bs_utils.utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3 from bs_align_utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
4 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
5 def extract_mapping(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
6 unique_hits = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
7 non_unique_hits = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
8 header0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
9 family = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
10 for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
11 #------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12 if header != header0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
13
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
14 # --- output ----
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
15 if len(family) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
16 unique_hits[header0] = family[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
17 elif len(family) > 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
18 min_lst = min(family, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
19 max_lst = max(family, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21 if min_lst[0] < max_lst[0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22 unique_hits[header0] = min_lst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
24 non_unique_hits[header0] = min_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
25
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
26 header0 = header
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
27 family = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28 family.append((no_mismatch, chr, location1, cigar1, location2, cigar2))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30 if len(family) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 unique_hits[header0] = family[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
32 elif len(family) > 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 min_lst = min(family, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
34 max_lst = max(family, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
35
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 if min_lst[0] < max_lst[0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37 unique_hits[header0] = min_lst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 non_unique_hits[header0] = min_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
41 return unique_hits, non_unique_hits
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
42
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
43 def _extract_mapping(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
44 U = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
45 R = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
46 header0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
47 family = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
48 for line in fileinput.input(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
49 l = line.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
50 header = l[0][:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51 chr = str(l[1])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 location = int(l[2])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 #no_hits=int(l[4])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54 #-------- mismatches -----------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 if len(l) == 4:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56 no_mismatch = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
57 elif len(l) == 5:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
58 no_mismatch = l[4].count(":")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
59 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
60 print l
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
61 #------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
62 if header != header0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
63 #--------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
64 if header0 != "":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
65 # --- output ----
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
66 if len(family) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
67 U[header0] = family[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
68 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
69 if family[0][0] < family[1][0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
70 U[header0] = family[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
71 elif family[1][0] < family[0][0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
72 U[header0] = family[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
73 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
74 R[header0] = family[0][0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
75 family=[]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
76 # ---------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
77 header0 = header
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
78 family = [[no_mismatch, chr, location]]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
79 member = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
80 elif header == header0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
81 if member == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
82 family[-1][0] += no_mismatch
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
83 family[-1].append(location)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
84 member = 2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
85 elif member == 2:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
86 family.append([no_mismatch, chr, location])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
87 member = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
88 #------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
89
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
90 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
91 return U, R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
92
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
93
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
94 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
95
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
96 def bs_pair_end(main_read_file_1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
97 main_read_file_2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
98 asktag,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
99 adapter_file,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
100 cut1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
101 cut2, # add cut3 and cut4?
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
102 no_small_lines,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
103 max_mismatch_no,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104 aligner_command,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105 db_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
106 tmp_path,
1
weilong-guo
parents: 0
diff changeset
107 outfile, XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
108
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
109
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
110 #----------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
111 adapter = ""
weilong-guo
parents: 0
diff changeset
112 adapterA = ""
weilong-guo
parents: 0
diff changeset
113 adapterB = ""
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
114 if adapter_file !="":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
115 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
116 adapter_inf=open(adapter_file,"r")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
117 if asktag=="N": #<--- directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
118 adapter=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
119 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
120 adapter=adapter.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
121 elif asktag=="Y":#<--- un-directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
122 adapterA=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
123 adapterB=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
124 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
125 adapterA=adapterA.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
126 adapterB=adapterB.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
127 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
128 print "[Error] Cannot find adapter file : %s" % adapter_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
129 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
130
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
131
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
132 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
133
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
134 logm("End 1 filename: %s"% main_read_file_1 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
135 logm("End 2 filename: %s"% main_read_file_2 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
136 logm("The first base (for mapping): %d"% cut1 )
1
weilong-guo
parents: 0
diff changeset
137 logm("The last base (for mapping): %d"% cut2 )
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
138
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
139 logm("Path for short reads aligner: %s"% aligner_command + '\n')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
140 logm("Reference genome library path: %s"% db_path )
1
weilong-guo
parents: 0
diff changeset
141
weilong-guo
parents: 0
diff changeset
142 if asktag == "Y" :
weilong-guo
parents: 0
diff changeset
143 logm("Un-directional library" )
weilong-guo
parents: 0
diff changeset
144 else :
weilong-guo
parents: 0
diff changeset
145 logm("Directional library")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
146 logm("Number of mismatches allowed: %s"% max_mismatch_no )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
147
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
148 if adapter_file !="":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
149 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
150 logm("Adapters to be removed from 3' of the reads:" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
151 logm("-- A: %s" % adapterA )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
152 logm("-- B: %s" % adapterB )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
153 elif asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
154 logm("Adapter to be removed from 3' of the reads:" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
155 logm("-- %s" % adapter )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
156
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
157 logm("-------------------------------- " )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
158
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
159 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
160
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
161 # helper method to join fname with tmp_path
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162 tmp_d = lambda fname: os.path.join(tmp_path, fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
163
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
164 db_d = lambda fname: os.path.join(db_path, fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
165
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
166
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
167 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
168 # splitting the 2 big read files
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
169
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
170 input_fname1 = os.path.split(main_read_file_1)[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
171 input_fname2 = os.path.split(main_read_file_2)[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
172
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
173 # TODO: run these in parallel with a subprocess
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
174 split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
175 split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
176
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
177 dirList=os.listdir(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
178 my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
179 sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList)))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
180
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
181
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
182
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
183 #---- Stats ------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
184 all_raw_reads=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
185 all_trimmed=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
186 all_mapped=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
187 all_mapped_passed=0
1
weilong-guo
parents: 0
diff changeset
188 all_base_before_trim=0
weilong-guo
parents: 0
diff changeset
189 all_base_after_trim=0
weilong-guo
parents: 0
diff changeset
190 all_base_mapped=0
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
191
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
192 all_unmapped=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
193
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
194 numbers_premapped_lst=[0,0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
195 numbers_mapped_lst=[0,0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
196
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
197 mC_lst=[0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
198 uC_lst=[0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
199
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
200 no_my_files=0
1
weilong-guo
parents: 0
diff changeset
201 read_id_lst_1 = dict()
weilong-guo
parents: 0
diff changeset
202 read_id_lst_2 = dict()
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
203
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
204 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
205 print "== Start mapping =="
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
206
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
207 for read_file_1, read_file_2 in my_files:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
208
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
209 no_my_files+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
210
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
211 random_id=".tmp-"+str(random.randint(1000000,9999999))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
212
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
213 original_bs_reads_1 = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
214 original_bs_reads_2 = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
215 original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
216
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
217
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
218 if asktag == "Y" :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
219
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
220 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
221 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
222 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id)
1
weilong-guo
parents: 0
diff changeset
223 outfile_2RGA = tmp_d('Trimmed_FCT_2.fa'+random_id)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
224 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
225
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
226 try :
1
weilong-guo
parents: 0
diff changeset
227 if read_file_1.endswith(".gz") : # support input file ending with ".gz"
weilong-guo
parents: 0
diff changeset
228 read_inf = gzip.open(tmp_d(read_file_1), "rb")
weilong-guo
parents: 0
diff changeset
229 else :
weilong-guo
parents: 0
diff changeset
230 read_inf = open(tmp_d(read_file_1),"r")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
231 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
232 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
233 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
234 oneline = read_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
235 l = oneline.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
236 input_format = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
237
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
238 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
239 input_format="fastq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
240 elif len(l)==1 and oneline[0]!=">": # pure sequences
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
241 input_format="seq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
242 elif len(l)==11: # Illumina GAII qseq file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
243 input_format="qseq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
244 elif oneline[0]==">": # fasta
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
245 input_format="fasta"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
246 read_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
247
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
248 print "Detected data format: %s" % input_format
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
249
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
250 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
251 read_file_list = [read_file_1, read_file_2]
1
weilong-guo
parents: 0
diff changeset
252 outfile_FCT_list = [outfile_1FCT, outfile_2RGA]
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
253 outfile_RCT_list = [outfile_1RCT, outfile_2RCT]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
254 n_list = [0, 0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
255 for f in range(2):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
256 read_file = read_file_list[f]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
257 outf_FCT = open(outfile_FCT_list[f], 'w')
1
weilong-guo
parents: 0
diff changeset
258 outf_RGA = open(outfile_RCT_list[f], 'w')
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
259 original_bs_reads = original_bs_reads_lst[f]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
260 n = n_list[f]
1
weilong-guo
parents: 0
diff changeset
261 read_id = ""
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
262 seq = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
263 seq_ready = "N"
1
weilong-guo
parents: 0
diff changeset
264 line_no = 0
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
265 for line in fileinput.input(tmp_d(read_file)):
1
weilong-guo
parents: 0
diff changeset
266 l = line.split()
weilong-guo
parents: 0
diff changeset
267 line_no += 1
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
268 if input_format=="seq":
1
weilong-guo
parents: 0
diff changeset
269 n += 1
weilong-guo
parents: 0
diff changeset
270 read_id = str(n)
weilong-guo
parents: 0
diff changeset
271 read_id = read_id.zfill(12)
weilong-guo
parents: 0
diff changeset
272 if f == 1 :
weilong-guo
parents: 0
diff changeset
273 read_id_lst_1[read_id] = read_id
weilong-guo
parents: 0
diff changeset
274 else :
weilong-guo
parents: 0
diff changeset
275 read_id_lst_2[read_id] = read_id
weilong-guo
parents: 0
diff changeset
276 seq = l[0]
weilong-guo
parents: 0
diff changeset
277 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
278 elif input_format=="fastq":
1
weilong-guo
parents: 0
diff changeset
279 l_fastq = math.fmod(line_no, 4)
weilong-guo
parents: 0
diff changeset
280 if l_fastq==1 :
weilong-guo
parents: 0
diff changeset
281 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
282 read_id = str(line_no/4+1).zfill(12)
weilong-guo
parents: 0
diff changeset
283 if f==1 :
weilong-guo
parents: 0
diff changeset
284 read_id_lst_1[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
285 else :
weilong-guo
parents: 0
diff changeset
286 read_id_lst_2[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
287 seq_ready="N"
weilong-guo
parents: 0
diff changeset
288 elif l_fastq==2 :
weilong-guo
parents: 0
diff changeset
289 seq = l[0]
weilong-guo
parents: 0
diff changeset
290 seq_ready = "Y"
weilong-guo
parents: 0
diff changeset
291 else :
weilong-guo
parents: 0
diff changeset
292 seq = ""
weilong-guo
parents: 0
diff changeset
293 seq_ready = "N"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
294 elif input_format=="qseq":
1
weilong-guo
parents: 0
diff changeset
295 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
296 read_id = str(line_no)
weilong-guo
parents: 0
diff changeset
297 read_id = read_id.zfill(12)
weilong-guo
parents: 0
diff changeset
298 if f == 1 :
weilong-guo
parents: 0
diff changeset
299 read_id_lst_1[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
300 else :
weilong-guo
parents: 0
diff changeset
301 read_id_lst_2[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
302 seq = l[8]
weilong-guo
parents: 0
diff changeset
303 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
304 elif input_format=="fasta":
1
weilong-guo
parents: 0
diff changeset
305 l_fasta = math.fmod(line_no,2)
weilong-guo
parents: 0
diff changeset
306 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
307 if l_fasta==1:
weilong-guo
parents: 0
diff changeset
308 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
309 read_id = str(line_no/2+1).zfill(12)
weilong-guo
parents: 0
diff changeset
310 if f==1 :
weilong-guo
parents: 0
diff changeset
311 read_id_lst_1[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
312 else :
weilong-guo
parents: 0
diff changeset
313 read_id_lst_2[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
314 seq = ""
weilong-guo
parents: 0
diff changeset
315 elif l_fasta==0 :
weilong-guo
parents: 0
diff changeset
316 seq = l[0]
weilong-guo
parents: 0
diff changeset
317 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
318 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
319 if seq_ready=="Y":
1
weilong-guo
parents: 0
diff changeset
320 seq = seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52
weilong-guo
parents: 0
diff changeset
321 seq = seq.upper()
weilong-guo
parents: 0
diff changeset
322 seq = seq.replace(".","N")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
323
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
324 #--striping BS adapter from 3' read --------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
325 all_base_before_trim += len(seq)
weilong-guo
parents: 0
diff changeset
326 if f==0 and adapterA!="" :
weilong-guo
parents: 0
diff changeset
327 new_read = RemoveAdapter(seq, adapterA, adapter_mismatch)
weilong-guo
parents: 0
diff changeset
328 if len(new_read)<len(seq) :
weilong-guo
parents: 0
diff changeset
329 all_trimmed += 1
weilong-guo
parents: 0
diff changeset
330 seq = new_read
weilong-guo
parents: 0
diff changeset
331 if f==1 and adapterB!="" :
weilong-guo
parents: 0
diff changeset
332 new_read = RemoveAdapter(seq, adapterB, adapter_mismatch)
weilong-guo
parents: 0
diff changeset
333 if len(new_read)<len(seq) :
weilong-guo
parents: 0
diff changeset
334 all_trimmed += 1
weilong-guo
parents: 0
diff changeset
335 seq = new_read
weilong-guo
parents: 0
diff changeset
336 all_base_after_trim += len(seq)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
337
1
weilong-guo
parents: 0
diff changeset
338 if len(seq)<=4:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
339 seq = "N" * (cut2-cut1+1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
340 #--------- trimmed_raw_BS_read ------------------
1
weilong-guo
parents: 0
diff changeset
341 original_bs_reads[read_id] = seq
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
342
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
343 #--------- FW_C2T ------------------
1
weilong-guo
parents: 0
diff changeset
344 outf_FCT.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
345 #--------- RC_G2A ------------------
1
weilong-guo
parents: 0
diff changeset
346 outf_RGA.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
347
1
weilong-guo
parents: 0
diff changeset
348 n_list[f] = n
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
349
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
350 outf_FCT.close()
1
weilong-guo
parents: 0
diff changeset
351 outf_RGA.close()
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
352
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
353 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
354
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
355 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
1
weilong-guo
parents: 0
diff changeset
356 all_raw_reads += n
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
357
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
358 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
359 # Bowtie mapping
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
360 #--------------------------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
361 WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
weilong-guo
parents: 0
diff changeset
362 WC2T_rf = tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
weilong-guo
parents: 0
diff changeset
363 CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
weilong-guo
parents: 0
diff changeset
364 CC2T_rf = tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
365
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
366 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
367 'input_file_1' : outfile_1FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
368 'input_file_2' : outfile_2RCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
369 'output_file' : WC2T_fr},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
370
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
371 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
372 'input_file_1' : outfile_1FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
373 'input_file_2' : outfile_2RCT,
1
weilong-guo
parents: 0
diff changeset
374 'output_file' : CG2A_fr},
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
375
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
376 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
1
weilong-guo
parents: 0
diff changeset
377 'input_file_1' : outfile_2RGA,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
378 'input_file_2' : outfile_1RCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
379 'output_file' : WC2T_rf},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
380
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
381 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
1
weilong-guo
parents: 0
diff changeset
382 'input_file_1' : outfile_2RGA,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
383 'input_file_2' : outfile_1RCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
384 'output_file' : CC2T_rf}])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
385
1
weilong-guo
parents: 0
diff changeset
386 delete_files(outfile_1FCT, outfile_2RGA, outfile_1RCT, outfile_2RCT)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
387
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
388
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
389 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
390 # Post processing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
391 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
392
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
393 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
394 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf)
1
weilong-guo
parents: 0
diff changeset
395 RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
396 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
397
1
weilong-guo
parents: 0
diff changeset
398 delete_files(WC2T_fr, WC2T_rf, CG2A_fr, CC2T_rf)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
399
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
400 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
401 # get uniq-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
402 #----------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
403 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys())
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
404
1
weilong-guo
parents: 0
diff changeset
405 Unique_FW_fr_C2T = set() # +
weilong-guo
parents: 0
diff changeset
406 Unique_FW_rf_C2T = set() # +
weilong-guo
parents: 0
diff changeset
407 Unique_RC_fr_G2A = set() # -
weilong-guo
parents: 0
diff changeset
408 Unique_RC_rf_C2T = set() # -
weilong-guo
parents: 0
diff changeset
409 Multiple_hits = set()
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
410
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
411
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
412 for x in Union_set:
1
weilong-guo
parents: 0
diff changeset
413 _list = []
weilong-guo
parents: 0
diff changeset
414 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_G2A_fr_U, RC_C2T_rf_U]:
weilong-guo
parents: 0
diff changeset
415 mis_lst = d.get(x,[99])
weilong-guo
parents: 0
diff changeset
416 mis = int(mis_lst[0])
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
417 _list.append(mis)
1
weilong-guo
parents: 0
diff changeset
418 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_G2A_fr_R, RC_C2T_rf_R]:
weilong-guo
parents: 0
diff changeset
419 mis = d.get(x,99)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
420 _list.append(mis)
1
weilong-guo
parents: 0
diff changeset
421 mini = min(_list)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
422 if _list.count(mini)==1:
1
weilong-guo
parents: 0
diff changeset
423 mini_index = _list.index(mini)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
424 if mini_index==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
425 Unique_FW_fr_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
426 elif mini_index==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
427 Unique_FW_rf_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
428 elif mini_index==2:
1
weilong-guo
parents: 0
diff changeset
429 Unique_RC_fr_G2A.add(x)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
430 elif mini_index==3:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
431 Unique_RC_rf_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
432 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
433 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
434 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
435 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
436
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
437 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
438 if show_multiple_hit :
1
weilong-guo
parents: 0
diff changeset
439 outf_MH = open("Multiple_hit.fa",'w')
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
440 for i in Multiple_hits :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
441 outf_MH.write(">%s\n" % i)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
442 outf_MH.write("%s\n" % original_bs_reads[i])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
443 outf_MH.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
444
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
445 del Union_set
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
446 del FW_C2T_fr_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
447 del FW_C2T_rf_R
1
weilong-guo
parents: 0
diff changeset
448 del RC_G2A_fr_R
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
449 del RC_C2T_rf_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
450
1
weilong-guo
parents: 0
diff changeset
451 FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
weilong-guo
parents: 0
diff changeset
452 FW_C2T_rf_uniq_lst = [[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T]
weilong-guo
parents: 0
diff changeset
453 RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A]
weilong-guo
parents: 0
diff changeset
454 RC_C2T_rf_uniq_lst = [[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T]
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
455
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
456 FW_C2T_fr_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
457 FW_C2T_rf_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
458 RC_C2T_fr_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
459 RC_C2T_rf_uniq_lst.sort()
1
weilong-guo
parents: 0
diff changeset
460 FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst]
weilong-guo
parents: 0
diff changeset
461 FW_C2T_rf_uniq_lst = [x[1] for x in FW_C2T_rf_uniq_lst]
weilong-guo
parents: 0
diff changeset
462 RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst]
weilong-guo
parents: 0
diff changeset
463 RC_C2T_rf_uniq_lst = [x[1] for x in RC_C2T_rf_uniq_lst]
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
464 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
465
1
weilong-guo
parents: 0
diff changeset
466 numbers_premapped_lst[0] += len(Unique_FW_fr_C2T)
weilong-guo
parents: 0
diff changeset
467 numbers_premapped_lst[1] += len(Unique_FW_rf_C2T)
weilong-guo
parents: 0
diff changeset
468 numbers_premapped_lst[2] += len(Unique_RC_fr_G2A)
weilong-guo
parents: 0
diff changeset
469 numbers_premapped_lst[3] += len(Unique_RC_rf_C2T)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
470
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
471 del Unique_FW_fr_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
472 del Unique_FW_rf_C2T
1
weilong-guo
parents: 0
diff changeset
473 del Unique_RC_fr_G2A
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
474 del Unique_RC_rf_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
475
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
476 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
477
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
478 nn = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
479 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
480 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U),
1
weilong-guo
parents: 0
diff changeset
481 (RC_C2T_fr_uniq_lst,RC_G2A_fr_U),
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
482 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
483 nn += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
484
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
485 mapped_chr0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
486 for header in ali_unique_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
487
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
488 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
489
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
490 #-------------------------------------
1
weilong-guo
parents: 0
diff changeset
491 if mapped_chr!=mapped_chr0:
weilong-guo
parents: 0
diff changeset
492 my_gseq = deserialize(db_d(mapped_chr))
weilong-guo
parents: 0
diff changeset
493 chr_length = len(my_gseq)
weilong-guo
parents: 0
diff changeset
494 mapped_chr0 = mapped_chr
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
495 #-------------------------------------
1
weilong-guo
parents: 0
diff changeset
496 if nn==1 or nn==3 :
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
497 original_BS_1 = original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
498 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
1
weilong-guo
parents: 0
diff changeset
499 else :
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
500 original_BS_1 = original_bs_reads_2[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
501 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
502
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
503 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
504 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
505
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
506 all_mapped += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
507
1
weilong-guo
parents: 0
diff changeset
508 if nn==1 : # FW-RC mapped to + strand:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
509 FR = "+FR"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
510 mapped_strand_1 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
511 mapped_strand_2 = "+"
1
weilong-guo
parents: 0
diff changeset
512 elif nn==2 : # RC-FW mapped to + strand:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
513 FR = "+RF"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
514 mapped_strand_1 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
515 mapped_strand_2 = "+"
1
weilong-guo
parents: 0
diff changeset
516 elif nn==3 : # FW-RC mapped to - strand:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
517 FR = "-FR"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
518 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
519 mapped_strand_1 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
520 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
521 mapped_strand_2 = "-"
1
weilong-guo
parents: 0
diff changeset
522 elif nn==4 : # RC-FW mapped to - strand:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
523 FR = "-RF"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
524 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
525 mapped_strand_1 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
526 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
527 mapped_strand_2 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
528
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
529 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
530 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
531
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
532 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
533 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
534
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
535 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
536 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
537
1
weilong-guo
parents: 0
diff changeset
538 N_mismatch = max(N_mismatch_1, N_mismatch_2)
weilong-guo
parents: 0
diff changeset
539 mm_no=float(max_mismatch_no)
weilong-guo
parents: 0
diff changeset
540 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1)
weilong-guo
parents: 0
diff changeset
541 and N_mismatch_2<=mm_no*len(original_BS_2)):
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
542
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
543 all_mapped_passed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
544 numbers_mapped_lst[nn-1] += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
545 #---- unmapped -------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
546 del original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
547 del original_bs_reads_2[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
548 #---------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
549
1
weilong-guo
parents: 0
diff changeset
550 methy_1 = methy_seq(r_aln_1, g_aln_1 + next_1)
weilong-guo
parents: 0
diff changeset
551 methy_2 = methy_seq(r_aln_2, g_aln_2 + next_2)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
552
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
553 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
554 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
555
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
556 #---XS FILTER----------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
557 XS_1 = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
558 nCH_1 = methy_1.count('y') + methy_1.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
559 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
560 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
561 XS_1 = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
562 XS_2 = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
563 nCH_2 = methy_2.count('y') + methy_2.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
564 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
565 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
566 XS_2 = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
567
1
weilong-guo
parents: 0
diff changeset
568 if mapped_strand_1=='+' :
weilong-guo
parents: 0
diff changeset
569 flag_1 = 67 # 1000011 : 1st read, + strand
weilong-guo
parents: 0
diff changeset
570 flag_2 = 131 #10000011 : 2nd read, + strand
weilong-guo
parents: 0
diff changeset
571 else :
weilong-guo
parents: 0
diff changeset
572 flag_1 = 115 # 1110011 : 1st read, - strand
weilong-guo
parents: 0
diff changeset
573 flag_2 = 179 # 10110011 : 2nd read, - strand
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
574
1
weilong-guo
parents: 0
diff changeset
575 outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
576 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
1
weilong-guo
parents: 0
diff changeset
577 all_base_mapped += len(original_BS_1)
weilong-guo
parents: 0
diff changeset
578
weilong-guo
parents: 0
diff changeset
579 outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
580 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
1
weilong-guo
parents: 0
diff changeset
581 all_base_mapped += len(original_BS_2)
weilong-guo
parents: 0
diff changeset
582
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
583
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
584 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
585 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
586 # output unmapped pairs
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
587 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
588
1
weilong-guo
parents: 0
diff changeset
589 unmapped_lst = original_bs_reads_1.keys()
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
590 unmapped_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
591
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
592 # for u in unmapped_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
593 # outf_u1.write("%s\n"%original_bs_reads_1[u])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
594 # outf_u2.write("%s\n"%original_bs_reads_2[u])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
595
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
596 all_unmapped += len(unmapped_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
597
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
598
1
weilong-guo
parents: 0
diff changeset
599 # Directional library
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
600 if asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
601
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
602 #----------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
603 outfile_1FCT = tmp_d('Trimed_FCT_1.fa'+random_id)
weilong-guo
parents: 0
diff changeset
604 outfile_2RGA = tmp_d('Trimed_RGA_2.fa'+random_id)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
605
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
606 try :
1
weilong-guo
parents: 0
diff changeset
607 if read_file_1.endswith(".gz") : # support input file ending with ".gz"
weilong-guo
parents: 0
diff changeset
608 read_inf = gzip.open(tmp_d(read_file_1), "rb")
weilong-guo
parents: 0
diff changeset
609 else :
weilong-guo
parents: 0
diff changeset
610 read_inf = open(tmp_d(read_file_1),"r")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
611 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
612 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
613 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
614
1
weilong-guo
parents: 0
diff changeset
615 oneline = read_inf.readline()
weilong-guo
parents: 0
diff changeset
616 l = oneline.split()
weilong-guo
parents: 0
diff changeset
617 input_format = ""
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
618
1
weilong-guo
parents: 0
diff changeset
619 if oneline[0]=="@" :
weilong-guo
parents: 0
diff changeset
620 input_format = "fastq"
weilong-guo
parents: 0
diff changeset
621 elif len(l)==1 and oneline[0]!=">" :
weilong-guo
parents: 0
diff changeset
622 input_format = "seq"
weilong-guo
parents: 0
diff changeset
623 elif len(l)==11 :
weilong-guo
parents: 0
diff changeset
624 input_format = "qseq"
weilong-guo
parents: 0
diff changeset
625 elif oneline[0]==">" :
weilong-guo
parents: 0
diff changeset
626 input_format = "fasta"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
627
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
628 read_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
629
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
630 print "Detected data format: %s" % input_format
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
631
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
632 #----------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
633 read_file_list = [read_file_1,read_file_2]
weilong-guo
parents: 0
diff changeset
634 outfile_FCT_list = [outfile_1FCT,outfile_2RGA]
weilong-guo
parents: 0
diff changeset
635 n_list = [0,0]
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
636
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
637 for f in range(2):
1
weilong-guo
parents: 0
diff changeset
638 read_file = read_file_list[f]
weilong-guo
parents: 0
diff changeset
639 outf_FCT = open(outfile_FCT_list[f],'w')
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
640 original_bs_reads = original_bs_reads_lst[f]
1
weilong-guo
parents: 0
diff changeset
641 n = n_list[f]
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
642
1
weilong-guo
parents: 0
diff changeset
643 read_id = ""
weilong-guo
parents: 0
diff changeset
644 seq = ""
weilong-guo
parents: 0
diff changeset
645 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
646 line_no = 0
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
647 for line in fileinput.input(tmp_d(read_file)):
1
weilong-guo
parents: 0
diff changeset
648 l = line.split()
weilong-guo
parents: 0
diff changeset
649 line_no += 1
weilong-guo
parents: 0
diff changeset
650 if input_format=="seq":
weilong-guo
parents: 0
diff changeset
651 n += 1
weilong-guo
parents: 0
diff changeset
652 read_id = str(n)
weilong-guo
parents: 0
diff changeset
653 read_id = read_id.zfill(12)
weilong-guo
parents: 0
diff changeset
654 if f == 1 :
weilong-guo
parents: 0
diff changeset
655 read_id_lst_1[read_id] = read_id
weilong-guo
parents: 0
diff changeset
656 else :
weilong-guo
parents: 0
diff changeset
657 read_id_lst_2[read_id] = read_id
weilong-guo
parents: 0
diff changeset
658 seq = l[0]
weilong-guo
parents: 0
diff changeset
659 seq_ready = "Y"
weilong-guo
parents: 0
diff changeset
660 elif input_format=="fastq":
weilong-guo
parents: 0
diff changeset
661 l_fastq = math.fmod(line_no, 4)
weilong-guo
parents: 0
diff changeset
662 if l_fastq==1 :
weilong-guo
parents: 0
diff changeset
663 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
664 read_id = str(line_no/4+1).zfill(12)
weilong-guo
parents: 0
diff changeset
665 if f==1 :
weilong-guo
parents: 0
diff changeset
666 read_id_lst_1[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
667 else :
weilong-guo
parents: 0
diff changeset
668 read_id_lst_2[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
669 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
670 elif l_fastq==2 :
weilong-guo
parents: 0
diff changeset
671 seq = l[0]
weilong-guo
parents: 0
diff changeset
672 seq_ready = "Y"
weilong-guo
parents: 0
diff changeset
673 else :
weilong-guo
parents: 0
diff changeset
674 seq = ""
weilong-guo
parents: 0
diff changeset
675 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
676 elif input_format=="qseq":
weilong-guo
parents: 0
diff changeset
677 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
678 read_id = str(line_no)
weilong-guo
parents: 0
diff changeset
679 read_id = read_id.zfill(12)
weilong-guo
parents: 0
diff changeset
680 if f == 1 :
weilong-guo
parents: 0
diff changeset
681 read_id_lst_1[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
682 else :
weilong-guo
parents: 0
diff changeset
683 read_id_lst_2[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
684 seq = l[8]
weilong-guo
parents: 0
diff changeset
685 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
686 elif input_format=="fasta":
1
weilong-guo
parents: 0
diff changeset
687 l_fasta = math.fmod(line_no,2)
weilong-guo
parents: 0
diff changeset
688 if l_fasta==1:
weilong-guo
parents: 0
diff changeset
689 seq_ready = "N"
weilong-guo
parents: 0
diff changeset
690 all_raw_reads += 1
weilong-guo
parents: 0
diff changeset
691 read_id = str(line_no/2+1).zfill(12)
weilong-guo
parents: 0
diff changeset
692 if f == 1 :
weilong-guo
parents: 0
diff changeset
693 read_id_lst_1[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
694 else :
weilong-guo
parents: 0
diff changeset
695 read_id_lst_2[read_id] = l[0][1:]
weilong-guo
parents: 0
diff changeset
696 seq = ""
weilong-guo
parents: 0
diff changeset
697 elif l_fasta==0 :
weilong-guo
parents: 0
diff changeset
698 seq = l[0]
weilong-guo
parents: 0
diff changeset
699 seq_ready = "Y"
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
700 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
701 if seq_ready=="Y":
1
weilong-guo
parents: 0
diff changeset
702 seq = seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52
weilong-guo
parents: 0
diff changeset
703 seq = seq.upper()
weilong-guo
parents: 0
diff changeset
704 seq = seq.replace(".","N")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
705
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
706 #--striping BS adapter from 3' read --------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
707 all_base_before_trim += len(seq)
weilong-guo
parents: 0
diff changeset
708 if f==0 and adapterA!="" :
weilong-guo
parents: 0
diff changeset
709 new_read = RemoveAdapter(seq, adapterA, adapter_mismatch)
weilong-guo
parents: 0
diff changeset
710 if len(new_read) < len(seq) :
weilong-guo
parents: 0
diff changeset
711 all_trimmed += 1
weilong-guo
parents: 0
diff changeset
712 seq = new_read
weilong-guo
parents: 0
diff changeset
713 if f==1 and adapterB!="" :
weilong-guo
parents: 0
diff changeset
714 new_read = RemoveAdapter(seq, adapterB, adapter_mismatch)
weilong-guo
parents: 0
diff changeset
715 if len(new_read)<len(seq) :
weilong-guo
parents: 0
diff changeset
716 all_trimmed += 1
weilong-guo
parents: 0
diff changeset
717 seq = new_read
weilong-guo
parents: 0
diff changeset
718 all_base_after_trim += len(seq)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
719
1
weilong-guo
parents: 0
diff changeset
720 if len(seq)<=4:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
721 seq = "N" * (cut2-cut1+1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
722 #--------- trimmed_raw_BS_read ------------------
1
weilong-guo
parents: 0
diff changeset
723 original_bs_reads[read_id] = seq
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
724
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
725 #--------- FW_C2T ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
726 if f==0:
1
weilong-guo
parents: 0
diff changeset
727 outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("C","T")))
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
728 elif f==1:
1
weilong-guo
parents: 0
diff changeset
729 outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("G","A")))
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
730
1
weilong-guo
parents: 0
diff changeset
731 n_list[f] = n
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
732 outf_FCT.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
733 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
734
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
735 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
1
weilong-guo
parents: 0
diff changeset
736 all_raw_reads += n
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
737
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
738 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
739 # Bowtie mapping
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
740 #--------------------------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
741 WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
weilong-guo
parents: 0
diff changeset
742 CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
743
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
744 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
745 'input_file_1' : outfile_1FCT,
1
weilong-guo
parents: 0
diff changeset
746 'input_file_2' : outfile_2RGA,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
747 'output_file' : WC2T_fr},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
748
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
749 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
750 'input_file_1' : outfile_1FCT,
1
weilong-guo
parents: 0
diff changeset
751 'input_file_2' : outfile_2RGA,
weilong-guo
parents: 0
diff changeset
752 'output_file' : CG2A_fr} ])
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
753
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
754
1
weilong-guo
parents: 0
diff changeset
755 delete_files(outfile_1FCT, outfile_2RGA)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
756
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
757 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
758 # Post processing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
759 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
760
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
761 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
1
weilong-guo
parents: 0
diff changeset
762 RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
763
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
764 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
765 # get uniq-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
766 #----------------------------------------------------------------
1
weilong-guo
parents: 0
diff changeset
767 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys())
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
768
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
769 Unique_FW_fr_C2T = set() # +
1
weilong-guo
parents: 0
diff changeset
770 Unique_RC_fr_G2A = set() # -
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
771 Multiple_hits=set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
772
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
773
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
774 for x in Union_set:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
775 _list = []
1
weilong-guo
parents: 0
diff changeset
776 for d in [FW_C2T_fr_U, RC_G2A_fr_U]:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
777 mis_lst = d.get(x,[99])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
778 mis = int(mis_lst[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
779 _list.append(mis)
1
weilong-guo
parents: 0
diff changeset
780 for d in [FW_C2T_fr_R, RC_G2A_fr_R]:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
781 mis = d.get(x,99)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
782 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
783 mini = min(_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
784 if _list.count(mini) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
785 mini_index = _list.index(mini)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
786 if mini_index == 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
787 Unique_FW_fr_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
788 elif mini_index == 1:
1
weilong-guo
parents: 0
diff changeset
789 Unique_RC_fr_G2A.add(x)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
790 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
791 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
792 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
793 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
794
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
795 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
796 if show_multiple_hit :
1
weilong-guo
parents: 0
diff changeset
797 outf_MH = open("Multiple_hit.fa",'w')
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
798 for i in Multiple_hits :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
799 outf_MH.write(">%s\n" % i)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
800 outf_MH.write("%s\n" % original_bs_reads[i])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
801 outf_MH.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
802
1
weilong-guo
parents: 0
diff changeset
803 FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
weilong-guo
parents: 0
diff changeset
804 RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A]
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
805
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
806 FW_C2T_fr_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
807 RC_C2T_fr_uniq_lst.sort()
1
weilong-guo
parents: 0
diff changeset
808 FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst]
weilong-guo
parents: 0
diff changeset
809 RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst]
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
810 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
811
1
weilong-guo
parents: 0
diff changeset
812 numbers_premapped_lst[0] += len(Unique_FW_fr_C2T)
weilong-guo
parents: 0
diff changeset
813 numbers_premapped_lst[1] += len(Unique_RC_fr_G2A)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
814
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
815
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
816 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
817
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
818 nn = 0
1
weilong-guo
parents: 0
diff changeset
819 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_G2A_fr_U)]:
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
820 nn += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
821 mapped_chr0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
822 for header in ali_unique_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
823 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
824
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
825 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
826 if mapped_chr != mapped_chr0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
827 my_gseq = deserialize(db_d(mapped_chr))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
828 chr_length = len(my_gseq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
829 mapped_chr0 = mapped_chr
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
830 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
831
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
832 original_BS_1 = original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
833 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
1
weilong-guo
parents: 0
diff changeset
834 #original_BS_2 = original_bs_reads_2[header]
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
835
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
836 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
837 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
838
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
839 all_mapped += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
840
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
841 if nn == 1: # FW-RC mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
842 FR = "+FR"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
843 mapped_strand_1 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
844 mapped_strand_2 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
845 elif nn == 2: # FW-RC mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
846 FR="-FR"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
847 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
848 mapped_strand_1 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
849 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
850 mapped_strand_2 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
851
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
852 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
853 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
854
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
855 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
856 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
857
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
858 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
859 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
860
1
weilong-guo
parents: 0
diff changeset
861 # if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no):
weilong-guo
parents: 0
diff changeset
862 # if N_mismatch <= int(max_mismatch_no) :
weilong-guo
parents: 0
diff changeset
863 N_mismatch = max(N_mismatch_1, N_mismatch_2)
weilong-guo
parents: 0
diff changeset
864 mm_no=float(max_mismatch_no)
weilong-guo
parents: 0
diff changeset
865 if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1)
weilong-guo
parents: 0
diff changeset
866 and N_mismatch_2<=mm_no*len(original_BS_2)):
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
867
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
868 numbers_mapped_lst[nn-1] += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
869 all_mapped_passed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
870
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
871 #---- unmapped -------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
872 del original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
873 del original_bs_reads_2[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
874
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
875 #---------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
876 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
877 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
878 mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
879 mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
880
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
881 #
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
882 #---XS FILTER----------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
883 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
884 XS_1 = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
885 nCH_1 = methy_1.count('y') + methy_1.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
886 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
887 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
888 XS_1 = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
889 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
890 XS_2 = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
891 nCH_2 = methy_2.count('y') + methy_2.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
892 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
893 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
894 XS_2 = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
895
1
weilong-guo
parents: 0
diff changeset
896 if mapped_strand_1 == '+' :
weilong-guo
parents: 0
diff changeset
897 flag_1 = 67 # 1000011 : 1st read, + strand
weilong-guo
parents: 0
diff changeset
898 flag_2 = 131 # 10000011 : 2nd read, + strand
weilong-guo
parents: 0
diff changeset
899 else :
weilong-guo
parents: 0
diff changeset
900 flag_1 = 115 # 1110011 : 1st read, - strand
weilong-guo
parents: 0
diff changeset
901 flag_2 = 179 # 10110011 : 2nd read, - strand
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
902
1
weilong-guo
parents: 0
diff changeset
903 outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
904 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
1
weilong-guo
parents: 0
diff changeset
905 all_base_mapped += len(original_BS_1)
weilong-guo
parents: 0
diff changeset
906
weilong-guo
parents: 0
diff changeset
907 outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
908 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
1
weilong-guo
parents: 0
diff changeset
909 all_base_mapped += len(original_BS_2)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
910
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
911 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
912 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
913 # output unmapped pairs
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
914 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
915
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
916 unmapped_lst=original_bs_reads_1.keys()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
917 unmapped_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
918
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
919 # for u in unmapped_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
920 # outf_u1.write("%s\n"%(original_bs_reads_1[u]))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
921 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
922
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
923 all_unmapped+=len(unmapped_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
924
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
925 #==================================================================================================
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
926 delete_files(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
927
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
928 logm("-------------------------------- " )
1
weilong-guo
parents: 0
diff changeset
929 logm("Number of raw BS-read pairs: %d " %(all_raw_reads/2) )
weilong-guo
parents: 0
diff changeset
930 if adapterA != "" or adapterB != "" :
weilong-guo
parents: 0
diff changeset
931 logm("Number of reads having adapter removed: %d \n"% all_trimmed)
weilong-guo
parents: 0
diff changeset
932 logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
933
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
934 if all_raw_reads >0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
935
1
weilong-guo
parents: 0
diff changeset
936 logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
weilong-guo
parents: 0
diff changeset
937 logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped + "\n")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
938 if asktag=="Y":
1
weilong-guo
parents: 0
diff changeset
939 logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
weilong-guo
parents: 0
diff changeset
940 logm("-- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
weilong-guo
parents: 0
diff changeset
941 logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
weilong-guo
parents: 0
diff changeset
942 logm("-- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
943 elif asktag=="N":
1
weilong-guo
parents: 0
diff changeset
944 logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
weilong-guo
parents: 0
diff changeset
945 logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
weilong-guo
parents: 0
diff changeset
946 logm("--- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
weilong-guo
parents: 0
diff changeset
947 if asktag=="Y":
weilong-guo
parents: 0
diff changeset
948 logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
weilong-guo
parents: 0
diff changeset
949 logm("----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) )
weilong-guo
parents: 0
diff changeset
950 logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) )
weilong-guo
parents: 0
diff changeset
951 logm("----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) )
weilong-guo
parents: 0
diff changeset
952 elif asktag=="N":
weilong-guo
parents: 0
diff changeset
953 logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
weilong-guo
parents: 0
diff changeset
954 logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) )
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
955
1
weilong-guo
parents: 0
diff changeset
956 logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)*2/all_raw_reads) )
weilong-guo
parents: 0
diff changeset
957 logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
weilong-guo
parents: 0
diff changeset
958 logm("Unmapped read pairs: %d"% all_unmapped+"\n")
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
959
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
960 n_CG=mC_lst[0]+uC_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
961 n_CHG=mC_lst[1]+uC_lst[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
962 n_CHH=mC_lst[2]+uC_lst[2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
963
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
964 logm("-------------------------------- " )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
965 logm("Methylated C in mapped reads " )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
966 logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
967 logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
968 logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
969
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
970 logm("----------------------------------------------" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
971 logm("------------------- END ----------------------" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
972 elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))