Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/GetTrustedSplice.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <stdio.h> | |
2 #include <math.h> | |
3 #include <vector> | |
4 #include <algorithm> | |
5 | |
6 #include "alignments.hpp" | |
7 | |
8 #define MAX(x, y) (((x)<(y))?(y):(x)) | |
9 #define MIN(x, y) (((x)<(y))?(x):(y)) | |
10 | |
11 char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n" | |
12 "Options:\n" | |
13 "\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ; | |
14 | |
15 struct _intron | |
16 { | |
17 int chrId ; | |
18 int start, end ; | |
19 double support ; | |
20 int uniqSupport ; | |
21 int secSupport ; | |
22 char strand ; | |
23 int editDist ; | |
24 | |
25 int sampleSupport ; | |
26 } ; | |
27 | |
28 struct _site | |
29 { | |
30 int chrId ; | |
31 int pos ; | |
32 double support ; | |
33 | |
34 char strand ; | |
35 int associatedIntronCnt ; | |
36 } ; | |
37 | |
38 bool CompIntrons( struct _intron a, struct _intron b ) | |
39 { | |
40 if ( a.chrId != b.chrId ) | |
41 return a.chrId < b.chrId ; | |
42 else if ( a.start != b.start ) | |
43 return a.start < b.start ; | |
44 else if ( a.end != b.end ) | |
45 return a.end < b.end ; | |
46 return false ; | |
47 } | |
48 | |
49 bool CompSites( struct _site a, struct _site b ) | |
50 { | |
51 if ( a.chrId != b.chrId ) | |
52 return a.chrId < b.chrId ; | |
53 else if ( a.pos != b.pos ) | |
54 return a.pos < b.pos ; | |
55 return false ; | |
56 } | |
57 | |
58 void CoalesceIntrons( std::vector<struct _intron> &introns ) | |
59 { | |
60 std::sort( introns.begin(), introns.end(), CompIntrons ) ; | |
61 int size = introns.size() ; | |
62 int i, k ; | |
63 k = 0 ; | |
64 for ( i = 1 ; i < size ; ++i ) | |
65 { | |
66 if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start | |
67 && introns[i].end == introns[k].end ) | |
68 { | |
69 introns[k].support += introns[i].support ; | |
70 introns[k].uniqSupport += introns[i].uniqSupport ; | |
71 introns[k].secSupport += introns[i].secSupport ; | |
72 introns[k].sampleSupport += introns[i].sampleSupport ; | |
73 | |
74 if ( introns[k].strand == '?' ) | |
75 introns[k].strand = introns[i].strand ; | |
76 } | |
77 else | |
78 { | |
79 ++k ; | |
80 introns[k] = introns[i] ; | |
81 } | |
82 } | |
83 introns.resize( k + 1 ) ; | |
84 } | |
85 | |
86 void CoalesceSites( std::vector<struct _site> &sites ) | |
87 { | |
88 std::sort( sites.begin(), sites.end(), CompSites ) ; | |
89 int size = sites.size() ; | |
90 int i, k ; | |
91 k = 0 ; | |
92 for ( i = 1 ; i < size ; ++i ) | |
93 { | |
94 if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos ) | |
95 { | |
96 sites[k].support += sites[i].support ; | |
97 sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ; | |
98 } | |
99 else | |
100 { | |
101 ++k ; | |
102 sites[k] = sites[i] ; | |
103 } | |
104 } | |
105 sites.resize( k + 1 ) ; | |
106 } | |
107 | |
108 int main( int argc, char *argv[] ) | |
109 { | |
110 int i, j, k ; | |
111 FILE *fpList ; | |
112 FILE *fp ; | |
113 char spliceFile[2048] ; | |
114 char chrName[1024], strand[3] ; | |
115 int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ; | |
116 double support ; | |
117 Alignments alignments ; | |
118 std::vector<struct _intron> introns ; | |
119 std::vector<struct _site> sites ; | |
120 double averageSupportThreshold = 0.5 ; | |
121 | |
122 if ( argc <= 1 ) | |
123 { | |
124 printf( "%s", usage ) ; | |
125 exit( 1 ) ; | |
126 } | |
127 | |
128 for ( i = 3 ; i < argc ; ++i ) | |
129 { | |
130 if ( !strcmp( argv[ i ], "-a" ) ) | |
131 { | |
132 averageSupportThreshold = atof( argv[i + 1] ) ; | |
133 ++i ; | |
134 } | |
135 else | |
136 { | |
137 printf( "Unknown option: %s", argv[i] ) ; | |
138 exit( 1 ) ; | |
139 } | |
140 } | |
141 | |
142 alignments.Open( argv[2] ) ; | |
143 | |
144 // Get the number of samples. | |
145 fpList = fopen( argv[1], "r" ) ; | |
146 int sampleCnt = 0 ; | |
147 while ( fscanf( fpList, "%s", spliceFile ) != EOF ) | |
148 { | |
149 ++sampleCnt ; | |
150 } | |
151 fclose( fpList ) ; | |
152 | |
153 // Get all the introns. | |
154 fpList = fopen( argv[1], "r" ) ; | |
155 while ( fscanf( fpList, "%s", spliceFile ) != EOF ) | |
156 { | |
157 fp = fopen( spliceFile, "r" ) ; | |
158 while ( fscanf( fp, "%s %d %d %lf %s %d %d %d %d", chrName, &start, &end, &support, | |
159 strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF ) | |
160 { | |
161 | |
162 if ( support <= 0 ) | |
163 support = 0.1 ; | |
164 else if ( support == 1 && sampleCnt > 5 ) | |
165 support = 0.75 ; | |
166 | |
167 struct _intron ni ; | |
168 ni.chrId = alignments.GetChromIdFromName( chrName ) ; | |
169 ni.start = start ; | |
170 ni.end = end ; | |
171 ni.support = support ; | |
172 ni.strand = strand[0] ; | |
173 ni.uniqSupport = uniqSupport ; | |
174 ni.secSupport = secSupport ; | |
175 ni.sampleSupport = 1 ; | |
176 ni.editDist = uniqEditDistance + secEditDistance ; | |
177 introns.push_back( ni ) ; | |
178 } | |
179 fclose( fp ) ; | |
180 | |
181 CoalesceIntrons( introns ) ; | |
182 } | |
183 fclose( fpList ) ; | |
184 | |
185 // Obtain the split sites. | |
186 int intronCnt = introns.size() ; | |
187 for ( i = 0 ; i < intronCnt ; ++i ) | |
188 { | |
189 struct _site ns ; | |
190 ns.chrId = introns[i].chrId ; | |
191 ns.associatedIntronCnt = 1 ; | |
192 ns.support = introns[i].support ; | |
193 ns.strand = introns[i].strand ; | |
194 | |
195 ns.pos = introns[i].start ; | |
196 sites.push_back( ns ) ; | |
197 ns.pos = introns[i].end ; | |
198 sites.push_back( ns ) ; | |
199 } | |
200 CoalesceSites( sites ) ; | |
201 | |
202 // Get the chromsomes with too many split sites. | |
203 int siteCnt = sites.size() ; | |
204 std::vector<bool> badChrom ; | |
205 | |
206 badChrom.resize( alignments.GetChromCount() ) ; | |
207 int size = alignments.GetChromCount() ; | |
208 for ( i = 0 ; i < size ; ++i ) | |
209 badChrom[i] = false ; | |
210 | |
211 for ( i = 0 ; i < siteCnt ; ) | |
212 { | |
213 for ( j = i + 1 ; sites[j].chrId == sites[i].chrId ; ++j ) | |
214 ; | |
215 //printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ; | |
216 if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) ) | |
217 badChrom[ sites[i].chrId ] = true ; | |
218 i = j ; | |
219 } | |
220 | |
221 // Output the valid introns. | |
222 k = 0 ; | |
223 double unit = sampleCnt / 50 ; | |
224 if ( unit < 1 ) | |
225 unit = 1 ; | |
226 | |
227 int longIntronSize ; | |
228 std::vector<int> intronSizes ; | |
229 for (i = 0 ; i < intronCnt ; ++i) | |
230 intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ; | |
231 std::sort( intronSizes.begin(), intronSizes.end() ) ; | |
232 longIntronSize = intronSizes[ int(intronCnt * 0.99) ] ; | |
233 if ( longIntronSize > 100000 ) | |
234 longIntronSize = 100000 ; | |
235 for ( i = 0 ; i < intronCnt ; ++i ) | |
236 { | |
237 if ( introns[i].support / sampleCnt < averageSupportThreshold ) | |
238 continue ; | |
239 | |
240 if ( badChrom[ introns[i].chrId ] ) | |
241 { | |
242 if ( introns[i].sampleSupport <= sampleCnt / 2 ) | |
243 continue ; | |
244 } | |
245 | |
246 if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) ) | |
247 continue ; | |
248 | |
249 //Locate the two split sites. | |
250 while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) ) | |
251 { | |
252 ++k ; | |
253 } | |
254 int a, b ; | |
255 a = k ; | |
256 for ( b = a + 1 ; b < siteCnt ; ++b ) | |
257 { | |
258 if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end ) | |
259 break ; | |
260 } | |
261 double siteSupport = MAX( sites[a].support, sites[b].support ) ; | |
262 //if ( introns[i].start == 100233371 && introns[i].end == 100236850 ) | |
263 // printf( "test: %lf %lf\n", introns[i].support, siteSupport) ; | |
264 if ( introns[i].support < siteSupport / 10.0 ) | |
265 { | |
266 double needSample = MIN( ( -log( introns[i].support / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ; | |
267 if ( introns[i].sampleSupport < needSample ) | |
268 continue ; | |
269 } | |
270 | |
271 if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 ) | |
272 { | |
273 if ( introns[i].sampleSupport <= sampleCnt * 0.01 ) | |
274 continue ; | |
275 } | |
276 /*if ( sampleCnt >= 50 ) | |
277 { | |
278 // just some randomly intron. | |
279 if ( introns[i].sampleSupport == 1 | |
280 && ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) ) | |
281 continue ; | |
282 }*/ | |
283 | |
284 /*if ( introns[i].end - introns[i].start + 1 < 50 ) | |
285 { | |
286 int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ; | |
287 int flag = 0 ; | |
288 | |
289 if ( introns[i].sampleSupport < needSample ) | |
290 flag = 1 ; | |
291 if ( flag == 1 ) | |
292 continue ; | |
293 }*/ | |
294 if ( introns[i].strand == '?' && sampleCnt == 1 && | |
295 ( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) ) | |
296 continue ; | |
297 | |
298 // Since the strand is uncertain, the alinger may make different decision sample from sample. | |
299 // To keep this intron, one of its splice sites must be more supported than adjacent splice sites. | |
300 if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) | |
301 { | |
302 int s, e ; | |
303 int l ; | |
304 int cnt = 0 ; | |
305 for ( l = 0 ; l < 2 ; ++l ) | |
306 { | |
307 int ind = ( l == 0 ) ? a : b ; | |
308 double max = sites[ind].support ; | |
309 int maxTag = ind ; | |
310 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s ) | |
311 { | |
312 if ( sites[s].pos + 7 < sites[s + 1].pos ) | |
313 break ; | |
314 if ( sites[s].support >= max ) | |
315 { | |
316 max = sites[s].support ; | |
317 maxTag = s ; | |
318 } | |
319 } | |
320 | |
321 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e ) | |
322 { | |
323 if ( sites[e].pos - 7 > sites[e - 1].pos ) | |
324 break ; | |
325 if ( sites[e].support >= max ) | |
326 { | |
327 max = sites[e].support ; | |
328 maxTag = e ; | |
329 } | |
330 } | |
331 | |
332 if ( maxTag == ind ) | |
333 ++cnt ; | |
334 } | |
335 if ( cnt == 0 ) | |
336 continue ; | |
337 } | |
338 | |
339 // Test whether this a intron coming from a wrong strand | |
340 /*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) | |
341 { | |
342 int plusStrand = 0 ; | |
343 int minusStrand = 0 ; | |
344 int l ; | |
345 int s, e ; | |
346 | |
347 if ( introns[i].strand == '+' ) | |
348 plusStrand = 2 ; | |
349 else | |
350 minusStrand = 2 ; | |
351 | |
352 for ( l = 0 ; l < 2 ; ++l ) | |
353 { | |
354 int ind = ( l == 0 ) ? a : b ; | |
355 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s ) | |
356 { | |
357 if ( sites[s].pos + 10000 < sites[s + 1].pos ) | |
358 break ; | |
359 | |
360 if ( sites[s].strand == '+' ) | |
361 ++plusStrand ; | |
362 else if ( sites[s].strand == '-' ) | |
363 ++minusStrand ; | |
364 } | |
365 | |
366 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e ) | |
367 { | |
368 if ( sites[e].pos - 10000 > sites[e - 1].pos ) | |
369 break ; | |
370 | |
371 if ( sites[e].strand == '+' ) | |
372 ++plusStrand ; | |
373 else if ( sites[e].strand == '-' ) | |
374 ++minusStrand ; | |
375 } | |
376 } | |
377 | |
378 if ( introns[i].start == 161517978 ) | |
379 printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ; | |
380 | |
381 if ( introns[i].strand == '+' && minusStrand >= 20 && plusStrand == 2 ) | |
382 continue ; | |
383 else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 ) | |
384 continue ; | |
385 | |
386 }*/ | |
387 | |
388 // Filter a intron if one of its splice site associated with too many introns. | |
389 /*if ( sites[a].associatedIntronCnt >= 10 ) | |
390 { | |
391 int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ; | |
392 if ( introns[i].support < sites[a].support / 100 && | |
393 introns[i].sampleSupport < needSample ) | |
394 { | |
395 continue ; | |
396 } | |
397 } | |
398 if ( sites[b].associatedIntronCnt >= 10 ) | |
399 { | |
400 int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ; | |
401 if ( introns[i].support < sites[b].support / 100 && | |
402 introns[i].sampleSupport < needSample ) | |
403 { | |
404 continue ; | |
405 } | |
406 }*/ | |
407 | |
408 | |
409 // Test for long intron | |
410 if ( introns[i].end - introns[i].start + 1 >= longIntronSize ) | |
411 { | |
412 int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ; | |
413 int flag = 0 ; | |
414 if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1 | |
415 || introns[i].uniqSupport / sampleCnt < 1 | |
416 || introns[i].sampleSupport < needSample ) | |
417 flag = 1 ; | |
418 if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize ) | |
419 continue ; | |
420 if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes | |
421 continue ; | |
422 } | |
423 | |
424 | |
425 printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ; | |
426 } | |
427 | |
428 return 0 ; | |
429 } |