annotate BSseeker2/bs_align/bs_pair_end.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 import fileinput, os, random, math
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2 from bs_utils.utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3 from bs_align_utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
4 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
5 def extract_mapping(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
6 unique_hits = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
7 non_unique_hits = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
8 header0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
9 family = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
10 for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
11 #------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12 if header != header0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
13
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
14 # --- output ----
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
15 if len(family) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
16 unique_hits[header0] = family[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
17 elif len(family) > 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
18 min_lst = min(family, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
19 max_lst = max(family, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21 if min_lst[0] < max_lst[0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22 unique_hits[header0] = min_lst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
24 non_unique_hits[header0] = min_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
25
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
26 header0 = header
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
27 family = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28 family.append((no_mismatch, chr, location1, cigar1, location2, cigar2))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30 if len(family) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 unique_hits[header0] = family[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
32 elif len(family) > 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 min_lst = min(family, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
34 max_lst = max(family, key = lambda x: x[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
35
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 if min_lst[0] < max_lst[0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37 unique_hits[header0] = min_lst
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 non_unique_hits[header0] = min_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
41 return unique_hits, non_unique_hits
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
42
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
43 def _extract_mapping(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
44 U = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
45 R = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
46 header0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
47 family = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
48 for line in fileinput.input(ali_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
49 l = line.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
50 header = l[0][:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51 chr = str(l[1])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 location = int(l[2])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 #no_hits=int(l[4])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54 #-------- mismatches -----------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 if len(l) == 4:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56 no_mismatch = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
57 elif len(l) == 5:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
58 no_mismatch = l[4].count(":")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
59 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
60 print l
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
61 #------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
62 if header != header0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
63 #--------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
64 if header0 != "":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
65 # --- output ----
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
66 if len(family) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
67 U[header0] = family[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
68 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
69 if family[0][0] < family[1][0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
70 U[header0] = family[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
71 elif family[1][0] < family[0][0]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
72 U[header0] = family[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
73 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
74 R[header0] = family[0][0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
75 family=[]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
76 # ---------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
77 header0 = header
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
78 family = [[no_mismatch, chr, location]]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
79 member = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
80 elif header == header0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
81 if member == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
82 family[-1][0] += no_mismatch
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
83 family[-1].append(location)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
84 member = 2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
85 elif member == 2:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
86 family.append([no_mismatch, chr, location])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
87 member = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
88 #------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
89
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
90 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
91 return U, R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
92
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
93
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
94 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
95
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
96 def bs_pair_end(main_read_file_1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
97 main_read_file_2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
98 asktag,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
99 adapter_file,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
100 cut1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
101 cut2, # add cut3 and cut4?
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
102 no_small_lines,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
103 max_mismatch_no,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104 aligner_command,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105 db_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
106 tmp_path,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
107 outfile, XS_pct, XS_count, show_multiple_hit=False):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
108
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
109
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
110 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
111 adapter=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
112 adapterA=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
113 adapterB=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
114 if adapter_file !="":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
115 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
116 adapter_inf=open(adapter_file,"r")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
117 if asktag=="N": #<--- directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
118 adapter=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
119 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
120 adapter=adapter.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
121 elif asktag=="Y":#<--- un-directional library
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
122 adapterA=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
123 adapterB=adapter_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
124 adapter_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
125 adapterA=adapterA.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
126 adapterB=adapterB.rstrip("\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
127 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
128 print "[Error] Cannot find adapter file : %s" % adapter_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
129 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
130
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
131
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
132 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
133
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
134 logm("End 1 filename: %s"% main_read_file_1 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
135 logm("End 2 filename: %s"% main_read_file_2 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
136 logm("The first base (for mapping): %d"% cut1 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
137 logm("The last base (for mapping): %d"% cut2 )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
138
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
139 logm("-------------------------------- " )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
140 logm("Un-directional library: %s" % asktag )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
141 logm("Path for short reads aligner: %s"% aligner_command + '\n')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
142 logm("Reference genome library path: %s"% db_path )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
143 logm("Number of mismatches allowed: %s"% max_mismatch_no )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
144
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
145 if adapter_file !="":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
146 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
147 logm("Adapters to be removed from 3' of the reads:" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
148 logm("-- A: %s" % adapterA )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
149 logm("-- B: %s" % adapterB )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
150 elif asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
151 logm("Adapter to be removed from 3' of the reads:" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
152 logm("-- %s" % adapter )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
153
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
154 logm("-------------------------------- " )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
155
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
156 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
157
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
158 # helper method to join fname with tmp_path
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
159 tmp_d = lambda fname: os.path.join(tmp_path, fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
160
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
161 db_d = lambda fname: os.path.join(db_path, fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
163
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
164 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
165 # splitting the 2 big read files
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
166
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
167 input_fname1 = os.path.split(main_read_file_1)[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
168 input_fname2 = os.path.split(main_read_file_2)[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
169
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
170 # TODO: run these in parallel with a subprocess
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
171 split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
172 split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
173
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
174 dirList=os.listdir(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
175 my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
176 sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList)))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
177
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
178
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
179
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
180 #---- Stats ------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
181 all_raw_reads=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
182 all_trimmed=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
183 all_mapped=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
184 all_mapped_passed=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
185
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
186 all_unmapped=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
187
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
188 numbers_premapped_lst=[0,0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
189 numbers_mapped_lst=[0,0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
190
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
191 mC_lst=[0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
192 uC_lst=[0,0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
193
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
194 no_my_files=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
195
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
196 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
197 print "== Start mapping =="
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
198
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
199 for read_file_1, read_file_2 in my_files:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
200
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
201 no_my_files+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
202
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
203 random_id=".tmp-"+str(random.randint(1000000,9999999))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
204
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
205 original_bs_reads_1 = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
206 original_bs_reads_2 = {}
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
207 original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
208
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
209
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
210 if asktag == "Y" :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
211
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
212 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
213 outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
214 outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
215 outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
216 outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
217
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
218 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
219 read_inf = open(tmp_d(read_file_1),"r")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
220 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
221 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
222 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
223 oneline = read_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
224 l = oneline.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
225 input_format = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
226
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
227 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
228 input_format="fastq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
229 n_fastq=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
230 elif len(l)==1 and oneline[0]!=">": # pure sequences
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
231 input_format="seq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
232 elif len(l)==11: # Illumina GAII qseq file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
233 input_format="qseq"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
234 elif oneline[0]==">": # fasta
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
235 input_format="fasta"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
236 n_fasta=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
237 read_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
238
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
239 print "Detected data format: %s" % input_format
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
240
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
241 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
242 read_file_list = [read_file_1, read_file_2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
243 outfile_FCT_list = [outfile_1FCT, outfile_2FCT]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
244 outfile_RCT_list = [outfile_1RCT, outfile_2RCT]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
245 n_list = [0, 0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
246
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
247
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
248 for f in range(2):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
249 read_file = read_file_list[f]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
250 outf_FCT = open(outfile_FCT_list[f], 'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
251 outf_RCT = open(outfile_RCT_list[f], 'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
252 original_bs_reads = original_bs_reads_lst[f]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
253 n = n_list[f]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
254
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
255 seq_id = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
256 seq = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
257 seq_ready = "N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
258 for line in fileinput.input(tmp_d(read_file)):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
259 l=line.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
260 if input_format=="seq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
261 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
262 seq_id=str(n)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
263 seq_id=seq_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
264 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
265 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
266 elif input_format=="fastq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
267 m_fastq=math.fmod(n_fastq,4)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
268 n_fastq+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
269 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
270 if m_fastq==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
271 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
272 seq_id=str(n)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
273 seq_id=seq_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
274 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
275 elif m_fastq==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
276 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
277 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
278 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
279 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
280 elif input_format=="qseq":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
281 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
282 seq_id=str(n)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
283 seq_id=seq_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
284 seq=l[8]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
285 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
286 elif input_format=="fasta":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
287 m_fasta=math.fmod(n_fasta,2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
288 n_fasta+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
289 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
290 if m_fasta==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
291 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
292 seq_id=l[0][1:]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
293 seq_id=seq_id.zfill(17)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
294 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
295 elif m_fasta==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
296 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
297 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
298 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
299 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
300 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
301 if seq_ready=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
302 seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
303 seq=seq.upper()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
304 seq=seq.replace(".","N")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
305
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
306 #--striping BS adapter from 3' read --------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
307 if (adapterA !="") and (adapterB !=""):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
308 signature=adapterA[:6]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
309 if signature in seq:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
310 signature_pos=seq.index(signature)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
311 if seq[signature_pos:] in adapterA:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
312 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
313 all_trimmed+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
314 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
315 signature=adapterB[:6]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
316 if signature in seq:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
317 #print seq_id,seq,signature;
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
318 signature_pos=seq.index(signature)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
319 if seq[signature_pos:] in adapterB:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
320 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
321 all_trimmed+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
322
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
323 if len(seq) <= 4:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
324 seq = "N" * (cut2-cut1+1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
325 #--------- trimmed_raw_BS_read ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
326 original_bs_reads[seq_id] = seq
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
327
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
328 #--------- FW_C2T ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
329 outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
330 #--------- RC_G2A ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
331 outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
332
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
333 n_list[f]=n
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
334
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
335 outf_FCT.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
336 outf_RCT.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
337
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
338 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
339
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
340
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
341 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
342 all_raw_reads+=n
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
343
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
344 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
345 # Bowtie mapping
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
346 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
347 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
348 WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
349 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
350 CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
351
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
352 run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
353 'input_file_1' : outfile_1FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
354 'input_file_2' : outfile_2RCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
355 'output_file' : WC2T_fr},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
356
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
357 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
358 'input_file_1' : outfile_1FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
359 'input_file_2' : outfile_2RCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
360 'output_file' : CC2T_fr},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
361
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
362 aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
363 'input_file_1' : outfile_2FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
364 'input_file_2' : outfile_1RCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
365 'output_file' : WC2T_rf},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
366
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
367 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
368 'input_file_1' : outfile_2FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
369 'input_file_2' : outfile_1RCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
370 'output_file' : CC2T_rf}])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
371
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
372
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
373 delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
374
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
375
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
376 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
377 # Post processing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
378 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
379
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
380
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
381 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
382 FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
383 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
384 RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
385
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
386 delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
387
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
388 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
389 # get uniq-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
390 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
391 Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys())
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
392
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
393 Unique_FW_fr_C2T=set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
394 Unique_FW_rf_C2T=set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
395 Unique_RC_fr_C2T=set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
396 Unique_RC_rf_C2T=set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
397 Multiple_hits=set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
398
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
399
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
400 for x in Union_set:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
401 _list=[]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
402 for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
403 mis_lst=d.get(x,[99])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
404 mis=int(mis_lst[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
405 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
406 for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
407 mis=d.get(x,99)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
408 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
409 mini=min(_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
410 if _list.count(mini)==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
411 mini_index=_list.index(mini)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
412 if mini_index==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
413 Unique_FW_fr_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
414 elif mini_index==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
415 Unique_FW_rf_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
416 elif mini_index==2:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
417 Unique_RC_fr_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
418 elif mini_index==3:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
419 Unique_RC_rf_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
420 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
421 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
422 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
423 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
424
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
425 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
426 if show_multiple_hit :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
427 outf_MH=open("Multiple_hit.fa",'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
428 for i in Multiple_hits :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
429 outf_MH.write(">%s\n" % i)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
430 outf_MH.write("%s\n" % original_bs_reads[i])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
431 outf_MH.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
432
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
433 del Union_set
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
434 del FW_C2T_fr_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
435 del FW_C2T_rf_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
436 del RC_C2T_fr_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
437 del RC_C2T_rf_R
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
438
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
439 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
440 FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
441 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
442 RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
443
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
444 FW_C2T_fr_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
445 FW_C2T_rf_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
446 RC_C2T_fr_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
447 RC_C2T_rf_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
448 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
449 FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
450 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
451 RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
452 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
453
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
454 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
455 numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
456 numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
457 numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
458
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
459 del Unique_FW_fr_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
460 del Unique_FW_rf_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
461 del Unique_RC_fr_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
462 del Unique_RC_rf_C2T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
463
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
464 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
465
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
466 nn = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
467 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
468 (FW_C2T_rf_uniq_lst,FW_C2T_rf_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
469 (RC_C2T_fr_uniq_lst,RC_C2T_fr_U),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
470 (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
471 nn += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
472
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
473 mapped_chr0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
474 for header in ali_unique_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
475
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
476 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
477
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
478 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
479 if mapped_chr != mapped_chr0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
480 my_gseq=deserialize(db_d(mapped_chr))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
481 chr_length=len(my_gseq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
482 mapped_chr0=mapped_chr
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
483 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
484 if nn == 1 or nn == 3:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
485 original_BS_1 = original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
486 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
487 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
488 original_BS_1 = original_bs_reads_2[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
489 original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
490
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
491 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
492 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
493
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
494 all_mapped += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
495
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
496 if nn == 1: # FW-RC mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
497
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
498 FR = "+FR"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
499
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
500 # mapped_location_1 += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
501 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
502 # origin_genome_1 = origin_genome_long_1[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
503 mapped_strand_1 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
504
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
505 # mapped_location_2 += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
506 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
507 # origin_genome_2 = origin_genome_long_2[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
508 mapped_strand_2 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
509
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
510 elif nn==2: # RC-FW mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
511
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
512 # original_BS_1 = original_bs_reads_2[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
513 # original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
514 FR = "+RF"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
515 # mapped_location_1 += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
516 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
517 # origin_genome_1 = origin_genome_long_1[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
518 mapped_strand_1 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
519
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
520 # mapped_location_2 += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
521 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
522 # origin_genome_2 = origin_genome_long_2[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
523 mapped_strand_2 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
524
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
525
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
526 elif nn==3: # FW-RC mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
527 # original_BS_1=original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
528 # original_BS_2=reverse_compl_seq(original_bs_reads_2[header])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
529
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
530 FR = "-FR"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
531 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
532 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
533 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
534 # origin_genome_1 = origin_genome_long_1[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
535 mapped_strand_1 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
536
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
537 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
538 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
539 # origin_genome_2 = origin_genome_long_2[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
540 mapped_strand_2 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
541
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
542 elif nn==4: # RC-FW mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
543 # original_BS_1=original_bs_reads_2[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
544 # original_BS_2=reverse_compl_seq(original_bs_reads_1[header])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
545
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
546 FR = "-RF"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
547 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
548 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
549 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
550 # origin_genome_1 = origin_genome_long_1[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
551 mapped_strand_1 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
552
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
553 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
554 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
555 # origin_genome_2 = origin_genome_long_2[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
556 mapped_strand_2 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
557
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
558
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
559
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
560
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
561 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
562 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
563
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
564 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
565 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
566
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
567
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
568 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
569 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
570
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
571
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
572 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
573 all_mapped_passed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
574 numbers_mapped_lst[nn-1] += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
575 #---- unmapped -------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
576 del original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
577 del original_bs_reads_2[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
578 #---------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
579
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
580 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
581 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
582
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
583 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
584 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
585
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
586 mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
587 mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
588
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
589
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
590 #
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
591 #---XS FILTER----------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
592 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
593 XS_1 = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
594 nCH_1 = methy_1.count('y') + methy_1.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
595 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
596 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
597 XS_1 = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
598 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
599 XS_2 = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
600 nCH_2 = methy_2.count('y') + methy_2.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
601 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
602 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
603 XS_2 = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
604
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
605
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
606 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
607 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
608 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
609 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
610
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
611 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
612 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
613 # output unmapped pairs
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
614 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
615
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
616 unmapped_lst=original_bs_reads_1.keys()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
617 unmapped_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
618
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
619 # for u in unmapped_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
620 # outf_u1.write("%s\n"%original_bs_reads_1[u])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
621 # outf_u2.write("%s\n"%original_bs_reads_2[u])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
622
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
623 all_unmapped += len(unmapped_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
624
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
625
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
626 if asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
627
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
628 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
629 outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
630 outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
631
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
632 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
633 read_inf=open(tmp_d(read_file_1),"r")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
634 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
635 print "[Error] Cannot open file : %s", tmp_d(read_file_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
636 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
637
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
638 oneline=read_inf.readline()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
639 l=oneline.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
640 input_format=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
641
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
642 #if len(l)==5: # old solexa format
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
643 # input_format="old Solexa Seq file"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
644
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
645 if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
646 input_format="FastQ"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
647 n_fastq=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
648 elif len(l)==1 and oneline[0]!=">": # pure sequences
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
649 input_format="_list of sequences"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
650 elif len(l)==11: # Illumina GAII qseq file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
651 input_format="Illumina GAII qseq file"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
652 elif oneline[0]==">": # fasta
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
653 input_format="fasta"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
654 n_fasta=0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
655
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
656 read_inf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
657
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
658 print "Detected data format: %s" % input_format
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
659
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
660 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
661 read_file_list=[read_file_1,read_file_2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
662 outfile_FCT_list=[outfile_1FCT,outfile_2FCT]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
663 n_list=[0,0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
664
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
665 for f in range(2):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
666 read_file=read_file_list[f]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
667 outf_FCT=open(outfile_FCT_list[f],'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
668 original_bs_reads = original_bs_reads_lst[f]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
669 n=n_list[f]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
670
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
671 seq_id=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
672 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
673 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
674 for line in fileinput.input(tmp_d(read_file)):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
675 l=line.split()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
676 if input_format=="old Solexa Seq file":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
677 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
678 seq_id=str(n)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
679 seq_id=seq_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
680 seq=l[4]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
681 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
682 elif input_format=="_list of sequences":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
683 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
684 seq_id=str(n)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
685 seq_id=seq_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
686 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
687 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
688 elif input_format=="FastQ":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
689 m_fastq=math.fmod(n_fastq,4)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
690 n_fastq+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
691 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
692 if m_fastq==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
693 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
694 seq_id=str(n)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
695 seq_id=seq_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
696 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
697 elif m_fastq==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
698 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
699 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
700 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
701 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
702 elif input_format=="Illumina GAII qseq file":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
703 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
704 seq_id=str(n)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
705 seq_id=seq_id.zfill(12)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
706 seq=l[8]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
707 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
708 elif input_format=="fasta":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
709 m_fasta=math.fmod(n_fasta,2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
710 n_fasta+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
711 seq_ready="N"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
712 if m_fasta==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
713 n+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
714 seq_id=l[0][1:]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
715 seq_id=seq_id.zfill(17)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
716 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
717 elif m_fasta==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
718 seq=l[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
719 seq_ready="Y"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
720 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
721 seq=""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
722 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
723 if seq_ready=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
724 seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
725 seq=seq.upper()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
726 seq=seq.replace(".","N")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
727
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
728 #--striping BS adapter from 3' read --------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
729 if (adapterA !="") and (adapterB !=""):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
730 signature=adapterA[:6]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
731 if signature in seq:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
732 signature_pos=seq.index(signature)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
733 if seq[signature_pos:] in adapterA:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
734 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
735 all_trimmed+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
736 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
737 signature=adapterB[:6]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
738 if signature in seq:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
739 #print seq_id,seq,signature;
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
740 signature_pos=seq.index(signature)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
741 if seq[signature_pos:] in adapterB:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
742 seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
743 all_trimmed+=1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
744
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
745 if len(seq) <= 4:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
746 seq = "N" * (cut2-cut1+1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
747 #--------- trimmed_raw_BS_read ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
748 original_bs_reads[seq_id] = seq
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
749
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
750 #--------- FW_C2T ------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
751 if f==0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
752 outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
753 elif f==1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
754 outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T")))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
755
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
756
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
757 n_list[f]=n
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
758 outf_FCT.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
759
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
760 fileinput.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
761
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
762
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
763 #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
764 all_raw_reads+=n
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
765
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
766 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
767 # Bowtie mapping
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
768 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
769 WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
770 CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
771
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
772 run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
773 'input_file_1' : outfile_1FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
774 'input_file_2' : outfile_2FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
775 'output_file' : WC2T_fr},
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
776
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
777 aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
778 'input_file_1' : outfile_1FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
779 'input_file_2' : outfile_2FCT,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
780 'output_file' : CC2T_fr} ])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
781
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
782
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
783 delete_files(outfile_1FCT, outfile_2FCT)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
784
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
785 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
786 # Post processing
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
787 #--------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
788
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
789 FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
790 RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
791
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
792 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
793 # get uniq-hit reads
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
794 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
795 Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys())
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
796
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
797 Unique_FW_fr_C2T = set() # +
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
798 Unique_RC_fr_C2T = set() # -
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
799 Multiple_hits=set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
800
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
801
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
802 for x in Union_set:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
803 _list = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
804 for d in [FW_C2T_fr_U, RC_C2T_fr_U]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
805 mis_lst = d.get(x,[99])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
806 mis = int(mis_lst[0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
807 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
808 for d in [FW_C2T_fr_R, RC_C2T_fr_R]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
809 mis = d.get(x,99)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
810 _list.append(mis)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
811 mini = min(_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
812 if _list.count(mini) == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
813 mini_index = _list.index(mini)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
814 if mini_index == 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
815 Unique_FW_fr_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
816 elif mini_index == 1:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
817 Unique_RC_fr_C2T.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
818 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
819 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
820 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
821 Multiple_hits.add(x)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
822
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
823 # write reads rejected by Multiple Hits to file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
824 if show_multiple_hit :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
825 outf_MH=open("Multiple_hit.fa",'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
826 for i in Multiple_hits :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
827 outf_MH.write(">%s\n" % i)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
828 outf_MH.write("%s\n" % original_bs_reads[i])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
829 outf_MH.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
830
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
831 FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
832 RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
833
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
834 FW_C2T_fr_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
835 RC_C2T_fr_uniq_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
836 FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
837 RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
838 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
839
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
840 numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
841 numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
842
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
843
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
844 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
845
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
846 nn = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
847 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
848 nn += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
849 mapped_chr0 = ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
850 for header in ali_unique_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
851 _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
852
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
853
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
854 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
855 if mapped_chr != mapped_chr0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
856 my_gseq = deserialize(db_d(mapped_chr))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
857 chr_length = len(my_gseq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
858 mapped_chr0 = mapped_chr
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
859 #-------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
860
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
861 original_BS_1 = original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
862 original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
863
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
864 r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
865 r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
866
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
867 all_mapped += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
868
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
869 if nn == 1: # FW-RC mapped to + strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
870 FR = "+FR"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
871 # mapped_location_1 += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
872 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
873 # origin_genome_1 = origin_genome_long_1[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
874 mapped_strand_1 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
875
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
876 # mapped_location_2 += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
877 # origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
878 # origin_genome_2 = origin_genome_long_2[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
879 mapped_strand_2 = "+"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
880
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
881 elif nn == 2: # FW-RC mapped to - strand:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
882
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
883 FR="-FR"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
884 mapped_location_1 = chr_length - mapped_location_1 - g_len_1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
885 # origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
886 # origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
887 # origin_genome_1 = origin_genome_long_1[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
888 mapped_strand_1 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
889
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
890 mapped_location_2 = chr_length - mapped_location_2 - g_len_2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
891 # origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
892 # origin_genome_2 = origin_genome_long_2[2:-2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
893 mapped_strand_2 = "-"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
894
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
895
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
896
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
897 origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
898 origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
899
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
900
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
901 r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
902 r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
903
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
904 N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
905 N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
906
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
907 if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
908
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
909 numbers_mapped_lst[nn-1] += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
910 all_mapped_passed += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
911
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
912 #---- unmapped -------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
913 del original_bs_reads_1[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
914 del original_bs_reads_2[header]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
915
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
916 #---------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
917 # output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
918 # output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
919 methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
920 methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
921 mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
922 mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
923
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
924 #
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
925 #---XS FILTER----------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
926 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
927 XS_1 = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
928 nCH_1 = methy_1.count('y') + methy_1.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
929 nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
930 if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
931 XS_1 = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
932 #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
933 XS_2 = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
934 nCH_2 = methy_2.count('y') + methy_2.count('z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
935 nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
936 if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
937 XS_2 = 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
938
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
939
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
940 outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
941 output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
942 outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
943 output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
944
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
945 print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
946 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
947 # output unmapped pairs
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
948 #----------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
949
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
950 unmapped_lst=original_bs_reads_1.keys()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
951 unmapped_lst.sort()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
952
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
953 # for u in unmapped_lst:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
954 # outf_u1.write("%s\n"%(original_bs_reads_1[u]))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
955 # outf_u2.write("%s\n"%(original_bs_reads_2[u]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
956
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
957 all_unmapped+=len(unmapped_lst)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
958
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
959
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
960 #==================================================================================================
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
961 # outf.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
962 #
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
963 # outf_u1.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
964 # outf_u2.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
965 delete_files(tmp_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
966
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
967 logm("-------------------------------- " )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
968 logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
969 logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
970
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
971 if all_raw_reads >0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
972
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
973 logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
974 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
975 logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
976 logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
977 logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
978 logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
979 elif asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
980 logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
981 logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
982
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
983
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
984 logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
985 logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
986 if asktag=="Y":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
987 logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
988 logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
989 logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
990 logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
991 elif asktag=="N":
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
992 logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
993 logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
994
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
995 logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
996 logm("O Unmapped read pairs: %d"% all_unmapped+"\n")
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
997
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
998
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
999 n_CG=mC_lst[0]+uC_lst[0]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1000 n_CHG=mC_lst[1]+uC_lst[1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1001 n_CHH=mC_lst[2]+uC_lst[2]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1002
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1003 logm("-------------------------------- " )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1004 logm("Methylated C in mapped reads " )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1005 logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1006 logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1007 logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1008
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1009 logm("----------------------------------------------" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1010 logm("------------------- END ----------------------" )
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1011 elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))