0
|
1 #include <stdio.h>
|
|
2 #include <string.h>
|
|
3 #include <stdlib.h>
|
|
4
|
|
5 #include <string>
|
|
6 #include <map>
|
|
7 #include <vector>
|
|
8 #include <algorithm>
|
|
9
|
|
10 char usage[] = "Usage: ./add-genename annotation.gtf gtflist [OPTIONS]\n"
|
|
11 "\t-o: the directory for output gtf files (default: ./)\n" ;
|
|
12
|
|
13 struct _interval
|
|
14 {
|
|
15 char chrom[97] ;
|
|
16 char geneName[97] ;
|
|
17
|
|
18 int start, end ;
|
|
19 } ;
|
|
20
|
|
21 char line[10000] ;
|
|
22 char buffer[10000], buffer2[10000], buffer3[10000] ;
|
|
23
|
|
24 int CompInterval( const struct _interval &a, const struct _interval &b )
|
|
25 {
|
|
26 int chromCmp = strcmp( a.chrom, b.chrom ) ;
|
|
27 if ( chromCmp )
|
|
28 return chromCmp ;
|
|
29 else if ( a.start != b.start )
|
|
30 return a.start - b.start ;
|
|
31 else
|
|
32 return a.end - b.end ;
|
|
33 }
|
|
34
|
|
35 bool Comp( const struct _interval &a, const struct _interval &b )
|
|
36 {
|
|
37 int cmp = CompInterval( a, b ) ;
|
|
38 if ( cmp < 0 )
|
|
39 return true ;
|
|
40 else
|
|
41 return false ;
|
|
42
|
|
43 return false ;
|
|
44 }
|
|
45
|
|
46 void SortAndCleanIntervals( std::vector<struct _interval> &it )
|
|
47 {
|
|
48 std::sort( it.begin(), it.end(), Comp ) ;
|
|
49
|
|
50 int i, k ;
|
|
51 int size = it.size() ;
|
|
52 if ( size == 0 )
|
|
53 return ;
|
|
54
|
|
55 k = 1 ;
|
|
56 for ( i = 1 ; i < size ; ++i )
|
|
57 {
|
|
58 if ( CompInterval( it[k - 1], it[i] ) )
|
|
59 {
|
|
60 it[k] = it[i] ;
|
|
61 ++k ;
|
|
62 }
|
|
63 }
|
|
64 it.resize( k ) ;
|
|
65 }
|
|
66
|
|
67 int GetGTFField( char *ret, char *line, const char *fid )
|
|
68 {
|
|
69 char *p = strstr( line, fid ) ;
|
|
70 if ( p == NULL )
|
|
71 return 0 ;
|
|
72 //printf( "%s %s\n", line, fid ) ;
|
|
73 for ( ; *p != ' ' ; ++p )
|
|
74 ;
|
|
75 p += 2 ;
|
|
76
|
|
77 sscanf( p, "%s", ret ) ;
|
|
78 p = ret + strlen( ret ) ;
|
|
79 while ( p != ret && *p != '\"' )
|
|
80 --p ;
|
|
81 *p = '\0' ;
|
|
82 return 1 ;
|
|
83 }
|
|
84
|
|
85 void UpdateIdToAnnoId( std::vector<struct _interval> &transcript, char *tid, int exonTag, int intronTag, std::vector<struct _interval> &exons, std::vector<struct _interval> &introns,
|
|
86 std::map<std::string, std::string> &txptIdToAnnoId, std::map<std::string, std::string> &geneIdToAnnoId )
|
|
87 {
|
|
88 int i, k ;
|
|
89 bool flag = false ;
|
|
90 // First, try whether intron works.
|
|
91 int ecnt = transcript.size() ;
|
|
92 int intronSize = introns.size() ;
|
|
93 int exonSize = exons.size() ;
|
|
94
|
|
95 struct _interval itron ;
|
|
96 strcpy( itron.chrom, transcript[0].chrom ) ;
|
|
97 k = intronTag ;
|
|
98 for ( i = 0 ; i < ecnt - 1 && !flag ; ++i )
|
|
99 {
|
|
100 itron.start = transcript[i].end ;
|
|
101 itron.end = transcript[i + 1].start ;
|
|
102 for ( ; k < intronSize ; ++k )
|
|
103 {
|
|
104 //printf( "hi %d: %s %d %d; %d %s %d %d\n", 0, itron.chrom, itron.start, itron.end, k, introns[k].chrom, introns[k].start, introns[k].end ) ;
|
|
105 if ( strcmp( introns[k].chrom, itron.chrom ) )
|
|
106 break ;
|
|
107
|
|
108 int cmp = CompInterval( introns[k], itron ) ;
|
|
109 if ( cmp == 0 )
|
|
110 {
|
|
111 txptIdToAnnoId[ std::string( tid ) ] = introns[k].geneName ;
|
|
112 geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = introns[k].geneName ;
|
|
113 flag = true ;
|
|
114 break ;
|
|
115 }
|
|
116 else if ( cmp > 0 )
|
|
117 break ;
|
|
118 }
|
|
119 }
|
|
120
|
|
121 // Next, try whether exon works
|
|
122 k = exonTag ;
|
|
123 for ( i = 0 ; i < ecnt && !flag ; ++i )
|
|
124 {
|
|
125 for ( ; k < exonSize ; ++k )
|
|
126 {
|
|
127 if ( strcmp( exons[k].chrom, transcript[i].chrom ) )
|
|
128 break ;
|
|
129
|
|
130 if ( exons[k].end < transcript[i].start )
|
|
131 continue ;
|
|
132 else if ( exons[k].start > transcript[i].end )
|
|
133 break ;
|
|
134 else
|
|
135 {
|
|
136 txptIdToAnnoId[ std::string( tid ) ] = exons[k].geneName ;
|
|
137 geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = exons[k].geneName ;
|
|
138 flag = true ;
|
|
139 break ;
|
|
140 }
|
|
141 }
|
|
142 }
|
|
143 }
|
|
144
|
|
145 int main( int argc, char *argv[] )
|
|
146 {
|
|
147 int i, j, k ;
|
|
148 FILE *fp ;
|
|
149 FILE *fpList ;
|
|
150 char outputPath[1024] = "./" ;
|
|
151 char prevTid[97] = "" ;
|
|
152
|
|
153 std::vector<struct _interval> introns, exons ;
|
|
154 std::map<std::string, std::string> txptIdToAnnoId, geneIdToAnnoId ;
|
|
155 std::map<std::string, int> chromIdMapIntrons, chromIdMapExons ; // the index for the starting of a chrom.
|
|
156
|
|
157 if ( argc < 3 )
|
|
158 {
|
|
159 fprintf( stderr, "%s", usage ) ;
|
|
160 exit( 1 ) ;
|
|
161 }
|
|
162
|
|
163 for ( i = 3 ; i < argc ; ++i )
|
|
164 {
|
|
165 if ( !strcmp( argv[i], "-o" ) )
|
|
166 {
|
|
167 strcpy( outputPath, argv[i + 1 ] ) ;
|
|
168 ++i ;
|
|
169 }
|
|
170 else
|
|
171 {
|
|
172 fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ;
|
|
173 exit( 1 ) ;
|
|
174 }
|
|
175 }
|
|
176
|
|
177 // Get the exons, introns from the annotation.
|
|
178 fp = fopen( argv[1], "r" ) ;
|
|
179 while ( fgets( line, sizeof( line ), fp ) != NULL )
|
|
180 {
|
|
181 if ( line[0] == '#' )
|
|
182 continue ;
|
|
183 char tid[97] ;
|
|
184 char gname[97] ;
|
|
185 char chrom[97] ;
|
|
186 char type[50] ;
|
|
187 int start, end ;
|
|
188
|
|
189 sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ;
|
|
190 if ( strcmp( type, "exon" ) )
|
|
191 continue ;
|
|
192 if ( GetGTFField( gname, line, "gene_name" ) )
|
|
193 {
|
|
194 struct _interval ne ;
|
|
195 strcpy( ne.chrom, chrom ) ;
|
|
196 strcpy( ne.geneName, gname ) ;
|
|
197 ne.start = start ; ne.end = end ;
|
|
198
|
|
199 exons.push_back( ne ) ;
|
|
200 }
|
|
201 else
|
|
202 {
|
|
203 fprintf( stderr, "%s has no field of gene_name.\n", line ) ;
|
|
204 exit( 1 ) ;
|
|
205 }
|
|
206
|
|
207 if ( GetGTFField( tid, line, "transcript_id" ) )
|
|
208 {
|
|
209 if ( !strcmp( tid, prevTid ) )
|
|
210 {
|
|
211 struct _interval ni ;
|
|
212
|
|
213 strcpy( ni.chrom, chrom ) ;
|
|
214 strcpy( ni.geneName, gname ) ;
|
|
215
|
|
216 int size = exons.size() ;
|
|
217 if ( size < 2 )
|
|
218 continue ;
|
|
219
|
|
220 struct _interval &e1 = exons[ size - 2 ] ;
|
|
221 struct _interval &e2 = exons[ size - 1 ] ;
|
|
222 if ( e1.start < e2.start )
|
|
223 {
|
|
224 ni.start = e1.end ;
|
|
225 ni.end = e2.start ;
|
|
226 }
|
|
227 else
|
|
228 {
|
|
229 ni.start = e2.end ;
|
|
230 ni.end = e1.start ;
|
|
231 }
|
|
232
|
|
233 introns.push_back( ni ) ;
|
|
234 }
|
|
235 else
|
|
236 strcpy( prevTid, tid ) ;
|
|
237 }
|
|
238 else
|
|
239 {
|
|
240 fprintf( stderr, "%s has no field of transcript_id.\n", line ) ;
|
|
241 exit( 1 ) ;
|
|
242 }
|
|
243 }
|
|
244 fclose( fp ) ;
|
|
245 SortAndCleanIntervals( exons ) ;
|
|
246 SortAndCleanIntervals( introns ) ;
|
|
247 int exonSize = exons.size() ;
|
|
248 int intronSize = introns.size() ;
|
|
249 // Determine the offset for each chrom on the list of features.
|
|
250 if ( exonSize )
|
|
251 {
|
|
252 chromIdMapExons[ std::string( exons[0].chrom ) ] = 0 ;
|
|
253 for ( i = 1 ; i < exonSize ; ++i )
|
|
254 {
|
|
255 if ( strcmp( exons[i].chrom, exons[i - 1].chrom ) )
|
|
256 chromIdMapExons[ std::string( exons[i].chrom ) ] = i ;
|
|
257 }
|
|
258 }
|
|
259
|
|
260 if ( intronSize )
|
|
261 {
|
|
262 chromIdMapIntrons[ std::string( introns[0].chrom ) ] = 0 ;
|
|
263 for ( i = 1 ; i < intronSize ; ++i )
|
|
264 {
|
|
265 if ( strcmp( introns[i].chrom, introns[i - 1].chrom ) )
|
|
266 {
|
|
267 //printf( "%s %d\n", introns[i].chrom, i ) ;
|
|
268 chromIdMapIntrons[ std::string( introns[i].chrom ) ] = i ;
|
|
269 }
|
|
270 }
|
|
271
|
|
272 //for ( i = 0 ; i < intronSize ; ++i )
|
|
273 // printf( "%s\t%d\t%d\n", introns[i].chrom, introns[i].start, introns[i].end ) ;
|
|
274 }
|
|
275 //printf( "%d %d\n", exonSize, intronSize ) ;
|
|
276
|
|
277 // Go through all the GTF files to find the map between PsiCLASS's gene_id to annotation's gene name
|
|
278 fpList = fopen( argv[2], "r ") ;
|
|
279 std::vector<struct _interval> transcript ;
|
|
280 int exonTag = 0 ;
|
|
281 int intronTag = 0 ;
|
|
282
|
|
283 while ( fscanf( fpList, "%s", line ) != EOF )
|
|
284 {
|
|
285 // Set the output file
|
|
286 //printf( "hi: %s\n", line ) ;
|
|
287 char *p ;
|
|
288 p = line + strlen( line ) ;
|
|
289 while ( p != line && *p != '/' )
|
|
290 {
|
|
291 if ( *p == '\n' )
|
|
292 *p = '\0' ;
|
|
293 --p ;
|
|
294 }
|
|
295 sprintf( buffer, "%s/%s", outputPath, p ) ;
|
|
296
|
|
297 // Test whether this will overwrite the input gtf file.
|
|
298 if ( realpath( buffer, buffer2 ) != NULL )
|
|
299 {
|
|
300 if ( realpath( line, buffer3 ) && !strcmp( buffer2, buffer3 ) )
|
|
301 {
|
|
302 fprintf( stderr, "Output will overwrite the input files. Please use -o to specify a different output directory.\n" ) ;
|
|
303 exit( 1 ) ;
|
|
304 }
|
|
305 }
|
|
306
|
|
307 fp = fopen( line, "r" ) ;
|
|
308 transcript.resize( 0 ) ; // hold the exons in the transcript.
|
|
309 prevTid[0] = '\0' ;
|
|
310 exonTag = 0 ;
|
|
311 intronTag = 0 ;
|
|
312 int farthest = 0 ;
|
|
313
|
|
314 while ( fgets( line, sizeof( line ), fp ) )
|
|
315 {
|
|
316 char tid[97] ;
|
|
317 //char gname[97] ;
|
|
318 char chrom[97] ;
|
|
319 char type[50] ;
|
|
320 int start, end ;
|
|
321
|
|
322 if ( line[0] == '#' )
|
|
323 continue ;
|
|
324
|
|
325 sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ;
|
|
326 if ( strcmp( type, "exon" ) )
|
|
327 continue ;
|
|
328
|
|
329 if ( GetGTFField( tid, line, "transcript_id" ) )
|
|
330 {
|
|
331 struct _interval ne ;
|
|
332 strcpy( ne.chrom, chrom ) ;
|
|
333 ne.start = start ;
|
|
334 ne.end = end ;
|
|
335 GetGTFField( ne.geneName, line, "gene_id" ) ;
|
|
336
|
|
337 if ( !strcmp( tid, prevTid ) )
|
|
338 {
|
|
339 transcript.push_back( ne ) ;
|
|
340 if ( end > farthest )
|
|
341 farthest = end ;
|
|
342 }
|
|
343 else if ( transcript.size() > 0 )
|
|
344 {
|
|
345 // the non-existed chrom will be put to offset 0, and that's fine
|
|
346 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
|
|
347 intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;
|
|
348 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
|
|
349 exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;
|
|
350
|
|
351 UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ;
|
|
352
|
|
353 // Adjust the offset if we are done with a gene cluster
|
|
354 // We don't need to worry about the case of changing chrom here.
|
|
355 if ( !strcmp( ne.geneName, transcript[0].geneName ) &&
|
|
356 start > farthest ) // Use farthest to avoid inteleaved gene.
|
|
357 {
|
|
358 while ( intronTag < intronSize && !strcmp( introns[intronTag ].chrom, ne.chrom )
|
|
359 && introns[intronTag].end < ne.start )
|
|
360 ++intronTag ;
|
|
361
|
|
362 while ( exonTag < exonSize && !strcmp( exons[ exonTag ].chrom, ne.chrom )
|
|
363 && exons[ exonTag ].end < ne.start )
|
|
364 ++exonTag ;
|
|
365
|
|
366 farthest = end ;
|
|
367 }
|
|
368
|
|
369 transcript.resize( 0 ) ;
|
|
370 transcript.push_back( ne ) ;
|
|
371
|
|
372 // Find the overlaps.
|
|
373 strcpy( prevTid, tid ) ;
|
|
374 }
|
|
375 else
|
|
376 strcpy( prevTid, tid ) ;
|
|
377 }
|
|
378 else
|
|
379 {
|
|
380 fprintf( stderr, "Could not find transcript_id field in GTF file: %s", line ) ;
|
|
381 exit( 1 ) ;
|
|
382 }
|
|
383 }
|
|
384
|
|
385 if ( transcript.size() > 0 )
|
|
386 {
|
|
387 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
|
|
388 intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;
|
|
389 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
|
|
390 exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;
|
|
391
|
|
392 UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ;
|
|
393 }
|
|
394 fclose( fp ) ;
|
|
395 }
|
|
396 fclose( fpList ) ;
|
|
397
|
|
398 // Add the gene_name field.
|
|
399 fpList = fopen( argv[2], "r" ) ;
|
|
400 int novelCnt = 0 ;
|
|
401 while ( fscanf( fpList, "%s", line ) != EOF )
|
|
402 {
|
|
403 FILE *fpOut ;
|
|
404 // Set the output file
|
|
405 char *p ;
|
|
406 p = line + strlen( line ) ;
|
|
407 while ( p != line && *p != '/' )
|
|
408 {
|
|
409 if ( *p == '\n' )
|
|
410 *p = '\0' ;
|
|
411 --p ;
|
|
412 }
|
|
413 sprintf( buffer, "%s/%s", outputPath, p ) ;
|
|
414 fpOut = fopen( buffer, "w" ) ;
|
|
415
|
|
416 // Process the input file
|
|
417 fp = fopen( line, "r" ) ;
|
|
418
|
|
419 transcript.resize( 0 ) ;
|
|
420 prevTid[0] = '\0' ;
|
|
421 while ( fgets( line, sizeof( line ), fp ) )
|
|
422 {
|
|
423 char gname[97] ;
|
|
424 if ( line[0] == '#' )
|
|
425 {
|
|
426 fprintf( fpOut, "%s", line ) ;
|
|
427 continue ;
|
|
428 }
|
|
429
|
|
430 if ( GetGTFField( buffer, line, "transcript_id" ) )
|
|
431 {
|
|
432 if ( txptIdToAnnoId.find( std::string( buffer ) ) != txptIdToAnnoId.end() )
|
|
433 {
|
|
434 int len = strlen( line ) ;
|
|
435 if ( line[len - 1] == '\n' )
|
|
436 {
|
|
437 line[len - 1] = '\0' ;
|
|
438 --len ;
|
|
439 }
|
|
440 fprintf( fpOut, "%s gene_name \"%s\";\n", line, txptIdToAnnoId[std::string( buffer )].c_str() ) ;
|
|
441 continue ;
|
|
442 }
|
|
443 }
|
|
444
|
|
445 if ( GetGTFField( gname, line, "gene_id" ) )
|
|
446 {
|
|
447 int len = strlen( line ) ;
|
|
448 if ( line[len - 1] == '\n' )
|
|
449 {
|
|
450 line[len - 1] = '\0' ;
|
|
451 --len ;
|
|
452 }
|
|
453 if ( geneIdToAnnoId.find( std::string( gname ) ) != geneIdToAnnoId.end() )
|
|
454 {
|
|
455 fprintf( fpOut, "%s gene_name \"%s\";\n", line, geneIdToAnnoId[std::string(gname)].c_str() ) ;
|
|
456 }
|
|
457 else
|
|
458 {
|
|
459 sprintf( buffer, "novel_%d", novelCnt ) ;
|
|
460 geneIdToAnnoId[ std::string( gname ) ] = std::string( buffer ) ;
|
|
461 fprintf( fpOut, "%s gene_name \"%s\";\n", line, buffer ) ;
|
|
462 ++novelCnt ;
|
|
463 }
|
|
464
|
|
465 }
|
|
466 else
|
|
467 fprintf( fpOut, "%s", line ) ;
|
|
468
|
|
469 }
|
|
470
|
|
471 fclose( fp ) ;
|
|
472 fclose( fpOut ) ;
|
|
473 }
|
|
474
|
|
475 return 0 ;
|
|
476 }
|