Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/GetTrustedSplice.cpp @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 #include <stdio.h> | |
| 2 #include <math.h> | |
| 3 #include <vector> | |
| 4 #include <algorithm> | |
| 5 | |
| 6 #include "alignments.hpp" | |
| 7 | |
| 8 #define MAX(x, y) (((x)<(y))?(y):(x)) | |
| 9 #define MIN(x, y) (((x)<(y))?(x):(y)) | |
| 10 | |
| 11 char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n" | |
| 12 "Options:\n" | |
| 13 "\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ; | |
| 14 | |
| 15 struct _intron | |
| 16 { | |
| 17 int chrId ; | |
| 18 int start, end ; | |
| 19 double support ; | |
| 20 int uniqSupport ; | |
| 21 int secSupport ; | |
| 22 char strand ; | |
| 23 int editDist ; | |
| 24 | |
| 25 int sampleSupport ; | |
| 26 } ; | |
| 27 | |
| 28 struct _site | |
| 29 { | |
| 30 int chrId ; | |
| 31 int pos ; | |
| 32 double support ; | |
| 33 | |
| 34 char strand ; | |
| 35 int associatedIntronCnt ; | |
| 36 } ; | |
| 37 | |
| 38 bool CompIntrons( struct _intron a, struct _intron b ) | |
| 39 { | |
| 40 if ( a.chrId != b.chrId ) | |
| 41 return a.chrId < b.chrId ; | |
| 42 else if ( a.start != b.start ) | |
| 43 return a.start < b.start ; | |
| 44 else if ( a.end != b.end ) | |
| 45 return a.end < b.end ; | |
| 46 return false ; | |
| 47 } | |
| 48 | |
| 49 bool CompSites( struct _site a, struct _site b ) | |
| 50 { | |
| 51 if ( a.chrId != b.chrId ) | |
| 52 return a.chrId < b.chrId ; | |
| 53 else if ( a.pos != b.pos ) | |
| 54 return a.pos < b.pos ; | |
| 55 return false ; | |
| 56 } | |
| 57 | |
| 58 void CoalesceIntrons( std::vector<struct _intron> &introns ) | |
| 59 { | |
| 60 std::sort( introns.begin(), introns.end(), CompIntrons ) ; | |
| 61 int size = introns.size() ; | |
| 62 int i, k ; | |
| 63 k = 0 ; | |
| 64 for ( i = 1 ; i < size ; ++i ) | |
| 65 { | |
| 66 if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start | |
| 67 && introns[i].end == introns[k].end ) | |
| 68 { | |
| 69 introns[k].support += introns[i].support ; | |
| 70 introns[k].uniqSupport += introns[i].uniqSupport ; | |
| 71 introns[k].secSupport += introns[i].secSupport ; | |
| 72 introns[k].sampleSupport += introns[i].sampleSupport ; | |
| 73 | |
| 74 if ( introns[k].strand == '?' ) | |
| 75 introns[k].strand = introns[i].strand ; | |
| 76 } | |
| 77 else | |
| 78 { | |
| 79 ++k ; | |
| 80 introns[k] = introns[i] ; | |
| 81 } | |
| 82 } | |
| 83 introns.resize( k + 1 ) ; | |
| 84 } | |
| 85 | |
| 86 void CoalesceSites( std::vector<struct _site> &sites ) | |
| 87 { | |
| 88 std::sort( sites.begin(), sites.end(), CompSites ) ; | |
| 89 int size = sites.size() ; | |
| 90 int i, k ; | |
| 91 k = 0 ; | |
| 92 for ( i = 1 ; i < size ; ++i ) | |
| 93 { | |
| 94 if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos ) | |
| 95 { | |
| 96 sites[k].support += sites[i].support ; | |
| 97 sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ; | |
| 98 } | |
| 99 else | |
| 100 { | |
| 101 ++k ; | |
| 102 sites[k] = sites[i] ; | |
| 103 } | |
| 104 } | |
| 105 sites.resize( k + 1 ) ; | |
| 106 } | |
| 107 | |
| 108 int main( int argc, char *argv[] ) | |
| 109 { | |
| 110 int i, j, k ; | |
| 111 FILE *fpList ; | |
| 112 FILE *fp ; | |
| 113 char spliceFile[2048] ; | |
| 114 char chrName[1024], strand[3] ; | |
| 115 int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ; | |
| 116 double support ; | |
| 117 Alignments alignments ; | |
| 118 std::vector<struct _intron> introns ; | |
| 119 std::vector<struct _site> sites ; | |
| 120 double averageSupportThreshold = 0.5 ; | |
| 121 | |
| 122 if ( argc <= 1 ) | |
| 123 { | |
| 124 printf( "%s", usage ) ; | |
| 125 exit( 1 ) ; | |
| 126 } | |
| 127 | |
| 128 for ( i = 3 ; i < argc ; ++i ) | |
| 129 { | |
| 130 if ( !strcmp( argv[ i ], "-a" ) ) | |
| 131 { | |
| 132 averageSupportThreshold = atof( argv[i + 1] ) ; | |
| 133 ++i ; | |
| 134 } | |
| 135 else | |
| 136 { | |
| 137 printf( "Unknown option: %s", argv[i] ) ; | |
| 138 exit( 1 ) ; | |
| 139 } | |
| 140 } | |
| 141 | |
| 142 alignments.Open( argv[2] ) ; | |
| 143 | |
| 144 // Get the number of samples. | |
| 145 fpList = fopen( argv[1], "r" ) ; | |
| 146 int sampleCnt = 0 ; | |
| 147 while ( fscanf( fpList, "%s", spliceFile ) != EOF ) | |
| 148 { | |
| 149 ++sampleCnt ; | |
| 150 } | |
| 151 fclose( fpList ) ; | |
| 152 | |
| 153 // Get all the introns. | |
| 154 fpList = fopen( argv[1], "r" ) ; | |
| 155 while ( fscanf( fpList, "%s", spliceFile ) != EOF ) | |
| 156 { | |
| 157 fp = fopen( spliceFile, "r" ) ; | |
| 158 while ( fscanf( fp, "%s %d %d %lf %s %d %d %d %d", chrName, &start, &end, &support, | |
| 159 strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF ) | |
| 160 { | |
| 161 | |
| 162 if ( support <= 0 ) | |
| 163 support = 0.1 ; | |
| 164 else if ( support == 1 && sampleCnt > 5 ) | |
| 165 support = 0.75 ; | |
| 166 | |
| 167 struct _intron ni ; | |
| 168 ni.chrId = alignments.GetChromIdFromName( chrName ) ; | |
| 169 ni.start = start ; | |
| 170 ni.end = end ; | |
| 171 ni.support = support ; | |
| 172 ni.strand = strand[0] ; | |
| 173 ni.uniqSupport = uniqSupport ; | |
| 174 ni.secSupport = secSupport ; | |
| 175 ni.sampleSupport = 1 ; | |
| 176 ni.editDist = uniqEditDistance + secEditDistance ; | |
| 177 introns.push_back( ni ) ; | |
| 178 } | |
| 179 fclose( fp ) ; | |
| 180 | |
| 181 CoalesceIntrons( introns ) ; | |
| 182 } | |
| 183 fclose( fpList ) ; | |
| 184 | |
| 185 // Obtain the split sites. | |
| 186 int intronCnt = introns.size() ; | |
| 187 for ( i = 0 ; i < intronCnt ; ++i ) | |
| 188 { | |
| 189 struct _site ns ; | |
| 190 ns.chrId = introns[i].chrId ; | |
| 191 ns.associatedIntronCnt = 1 ; | |
| 192 ns.support = introns[i].support ; | |
| 193 ns.strand = introns[i].strand ; | |
| 194 | |
| 195 ns.pos = introns[i].start ; | |
| 196 sites.push_back( ns ) ; | |
| 197 ns.pos = introns[i].end ; | |
| 198 sites.push_back( ns ) ; | |
| 199 } | |
| 200 CoalesceSites( sites ) ; | |
| 201 | |
| 202 // Get the chromsomes with too many split sites. | |
| 203 int siteCnt = sites.size() ; | |
| 204 std::vector<bool> badChrom ; | |
| 205 | |
| 206 badChrom.resize( alignments.GetChromCount() ) ; | |
| 207 int size = alignments.GetChromCount() ; | |
| 208 for ( i = 0 ; i < size ; ++i ) | |
| 209 badChrom[i] = false ; | |
| 210 | |
| 211 for ( i = 0 ; i < siteCnt ; ) | |
| 212 { | |
| 213 for ( j = i + 1 ; sites[j].chrId == sites[i].chrId ; ++j ) | |
| 214 ; | |
| 215 //printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ; | |
| 216 if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) ) | |
| 217 badChrom[ sites[i].chrId ] = true ; | |
| 218 i = j ; | |
| 219 } | |
| 220 | |
| 221 // Output the valid introns. | |
| 222 k = 0 ; | |
| 223 double unit = sampleCnt / 50 ; | |
| 224 if ( unit < 1 ) | |
| 225 unit = 1 ; | |
| 226 | |
| 227 int longIntronSize ; | |
| 228 std::vector<int> intronSizes ; | |
| 229 for (i = 0 ; i < intronCnt ; ++i) | |
| 230 intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ; | |
| 231 std::sort( intronSizes.begin(), intronSizes.end() ) ; | |
| 232 longIntronSize = intronSizes[ int(intronCnt * 0.99) ] ; | |
| 233 if ( longIntronSize > 100000 ) | |
| 234 longIntronSize = 100000 ; | |
| 235 for ( i = 0 ; i < intronCnt ; ++i ) | |
| 236 { | |
| 237 if ( introns[i].support / sampleCnt < averageSupportThreshold ) | |
| 238 continue ; | |
| 239 | |
| 240 if ( badChrom[ introns[i].chrId ] ) | |
| 241 { | |
| 242 if ( introns[i].sampleSupport <= sampleCnt / 2 ) | |
| 243 continue ; | |
| 244 } | |
| 245 | |
| 246 if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) ) | |
| 247 continue ; | |
| 248 | |
| 249 //Locate the two split sites. | |
| 250 while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) ) | |
| 251 { | |
| 252 ++k ; | |
| 253 } | |
| 254 int a, b ; | |
| 255 a = k ; | |
| 256 for ( b = a + 1 ; b < siteCnt ; ++b ) | |
| 257 { | |
| 258 if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end ) | |
| 259 break ; | |
| 260 } | |
| 261 double siteSupport = MAX( sites[a].support, sites[b].support ) ; | |
| 262 //if ( introns[i].start == 100233371 && introns[i].end == 100236850 ) | |
| 263 // printf( "test: %lf %lf\n", introns[i].support, siteSupport) ; | |
| 264 if ( introns[i].support < siteSupport / 10.0 ) | |
| 265 { | |
| 266 double needSample = MIN( ( -log( introns[i].support / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ; | |
| 267 if ( introns[i].sampleSupport < needSample ) | |
| 268 continue ; | |
| 269 } | |
| 270 | |
| 271 if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 ) | |
| 272 { | |
| 273 if ( introns[i].sampleSupport <= sampleCnt * 0.01 ) | |
| 274 continue ; | |
| 275 } | |
| 276 /*if ( sampleCnt >= 50 ) | |
| 277 { | |
| 278 // just some randomly intron. | |
| 279 if ( introns[i].sampleSupport == 1 | |
| 280 && ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) ) | |
| 281 continue ; | |
| 282 }*/ | |
| 283 | |
| 284 /*if ( introns[i].end - introns[i].start + 1 < 50 ) | |
| 285 { | |
| 286 int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ; | |
| 287 int flag = 0 ; | |
| 288 | |
| 289 if ( introns[i].sampleSupport < needSample ) | |
| 290 flag = 1 ; | |
| 291 if ( flag == 1 ) | |
| 292 continue ; | |
| 293 }*/ | |
| 294 if ( introns[i].strand == '?' && sampleCnt == 1 && | |
| 295 ( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) ) | |
| 296 continue ; | |
| 297 | |
| 298 // Since the strand is uncertain, the alinger may make different decision sample from sample. | |
| 299 // To keep this intron, one of its splice sites must be more supported than adjacent splice sites. | |
| 300 if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) | |
| 301 { | |
| 302 int s, e ; | |
| 303 int l ; | |
| 304 int cnt = 0 ; | |
| 305 for ( l = 0 ; l < 2 ; ++l ) | |
| 306 { | |
| 307 int ind = ( l == 0 ) ? a : b ; | |
| 308 double max = sites[ind].support ; | |
| 309 int maxTag = ind ; | |
| 310 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s ) | |
| 311 { | |
| 312 if ( sites[s].pos + 7 < sites[s + 1].pos ) | |
| 313 break ; | |
| 314 if ( sites[s].support >= max ) | |
| 315 { | |
| 316 max = sites[s].support ; | |
| 317 maxTag = s ; | |
| 318 } | |
| 319 } | |
| 320 | |
| 321 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e ) | |
| 322 { | |
| 323 if ( sites[e].pos - 7 > sites[e - 1].pos ) | |
| 324 break ; | |
| 325 if ( sites[e].support >= max ) | |
| 326 { | |
| 327 max = sites[e].support ; | |
| 328 maxTag = e ; | |
| 329 } | |
| 330 } | |
| 331 | |
| 332 if ( maxTag == ind ) | |
| 333 ++cnt ; | |
| 334 } | |
| 335 if ( cnt == 0 ) | |
| 336 continue ; | |
| 337 } | |
| 338 | |
| 339 // Test whether this a intron coming from a wrong strand | |
| 340 /*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) | |
| 341 { | |
| 342 int plusStrand = 0 ; | |
| 343 int minusStrand = 0 ; | |
| 344 int l ; | |
| 345 int s, e ; | |
| 346 | |
| 347 if ( introns[i].strand == '+' ) | |
| 348 plusStrand = 2 ; | |
| 349 else | |
| 350 minusStrand = 2 ; | |
| 351 | |
| 352 for ( l = 0 ; l < 2 ; ++l ) | |
| 353 { | |
| 354 int ind = ( l == 0 ) ? a : b ; | |
| 355 for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s ) | |
| 356 { | |
| 357 if ( sites[s].pos + 10000 < sites[s + 1].pos ) | |
| 358 break ; | |
| 359 | |
| 360 if ( sites[s].strand == '+' ) | |
| 361 ++plusStrand ; | |
| 362 else if ( sites[s].strand == '-' ) | |
| 363 ++minusStrand ; | |
| 364 } | |
| 365 | |
| 366 for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e ) | |
| 367 { | |
| 368 if ( sites[e].pos - 10000 > sites[e - 1].pos ) | |
| 369 break ; | |
| 370 | |
| 371 if ( sites[e].strand == '+' ) | |
| 372 ++plusStrand ; | |
| 373 else if ( sites[e].strand == '-' ) | |
| 374 ++minusStrand ; | |
| 375 } | |
| 376 } | |
| 377 | |
| 378 if ( introns[i].start == 161517978 ) | |
| 379 printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ; | |
| 380 | |
| 381 if ( introns[i].strand == '+' && minusStrand >= 20 && plusStrand == 2 ) | |
| 382 continue ; | |
| 383 else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 ) | |
| 384 continue ; | |
| 385 | |
| 386 }*/ | |
| 387 | |
| 388 // Filter a intron if one of its splice site associated with too many introns. | |
| 389 /*if ( sites[a].associatedIntronCnt >= 10 ) | |
| 390 { | |
| 391 int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ; | |
| 392 if ( introns[i].support < sites[a].support / 100 && | |
| 393 introns[i].sampleSupport < needSample ) | |
| 394 { | |
| 395 continue ; | |
| 396 } | |
| 397 } | |
| 398 if ( sites[b].associatedIntronCnt >= 10 ) | |
| 399 { | |
| 400 int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ; | |
| 401 if ( introns[i].support < sites[b].support / 100 && | |
| 402 introns[i].sampleSupport < needSample ) | |
| 403 { | |
| 404 continue ; | |
| 405 } | |
| 406 }*/ | |
| 407 | |
| 408 | |
| 409 // Test for long intron | |
| 410 if ( introns[i].end - introns[i].start + 1 >= longIntronSize ) | |
| 411 { | |
| 412 int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ; | |
| 413 int flag = 0 ; | |
| 414 if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1 | |
| 415 || introns[i].uniqSupport / sampleCnt < 1 | |
| 416 || introns[i].sampleSupport < needSample ) | |
| 417 flag = 1 ; | |
| 418 if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize ) | |
| 419 continue ; | |
| 420 if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes | |
| 421 continue ; | |
| 422 } | |
| 423 | |
| 424 | |
| 425 printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ; | |
| 426 } | |
| 427 | |
| 428 return 0 ; | |
| 429 } |
