comparison PsiCLASS-1.0.2/AddGeneName.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #include <stdio.h>
2 #include <string.h>
3 #include <stdlib.h>
4
5 #include <string>
6 #include <map>
7 #include <vector>
8 #include <algorithm>
9
10 char usage[] = "Usage: ./add-genename annotation.gtf gtflist [OPTIONS]\n"
11 "\t-o: the directory for output gtf files (default: ./)\n" ;
12
13 struct _interval
14 {
15 char chrom[97] ;
16 char geneName[97] ;
17
18 int start, end ;
19 } ;
20
21 char line[10000] ;
22 char buffer[10000], buffer2[10000], buffer3[10000] ;
23
24 int CompInterval( const struct _interval &a, const struct _interval &b )
25 {
26 int chromCmp = strcmp( a.chrom, b.chrom ) ;
27 if ( chromCmp )
28 return chromCmp ;
29 else if ( a.start != b.start )
30 return a.start - b.start ;
31 else
32 return a.end - b.end ;
33 }
34
35 bool Comp( const struct _interval &a, const struct _interval &b )
36 {
37 int cmp = CompInterval( a, b ) ;
38 if ( cmp < 0 )
39 return true ;
40 else
41 return false ;
42
43 return false ;
44 }
45
46 void SortAndCleanIntervals( std::vector<struct _interval> &it )
47 {
48 std::sort( it.begin(), it.end(), Comp ) ;
49
50 int i, k ;
51 int size = it.size() ;
52 if ( size == 0 )
53 return ;
54
55 k = 1 ;
56 for ( i = 1 ; i < size ; ++i )
57 {
58 if ( CompInterval( it[k - 1], it[i] ) )
59 {
60 it[k] = it[i] ;
61 ++k ;
62 }
63 }
64 it.resize( k ) ;
65 }
66
67 int GetGTFField( char *ret, char *line, const char *fid )
68 {
69 char *p = strstr( line, fid ) ;
70 if ( p == NULL )
71 return 0 ;
72 //printf( "%s %s\n", line, fid ) ;
73 for ( ; *p != ' ' ; ++p )
74 ;
75 p += 2 ;
76
77 sscanf( p, "%s", ret ) ;
78 p = ret + strlen( ret ) ;
79 while ( p != ret && *p != '\"' )
80 --p ;
81 *p = '\0' ;
82 return 1 ;
83 }
84
85 void UpdateIdToAnnoId( std::vector<struct _interval> &transcript, char *tid, int exonTag, int intronTag, std::vector<struct _interval> &exons, std::vector<struct _interval> &introns,
86 std::map<std::string, std::string> &txptIdToAnnoId, std::map<std::string, std::string> &geneIdToAnnoId )
87 {
88 int i, k ;
89 bool flag = false ;
90 // First, try whether intron works.
91 int ecnt = transcript.size() ;
92 int intronSize = introns.size() ;
93 int exonSize = exons.size() ;
94
95 struct _interval itron ;
96 strcpy( itron.chrom, transcript[0].chrom ) ;
97 k = intronTag ;
98 for ( i = 0 ; i < ecnt - 1 && !flag ; ++i )
99 {
100 itron.start = transcript[i].end ;
101 itron.end = transcript[i + 1].start ;
102 for ( ; k < intronSize ; ++k )
103 {
104 //printf( "hi %d: %s %d %d; %d %s %d %d\n", 0, itron.chrom, itron.start, itron.end, k, introns[k].chrom, introns[k].start, introns[k].end ) ;
105 if ( strcmp( introns[k].chrom, itron.chrom ) )
106 break ;
107
108 int cmp = CompInterval( introns[k], itron ) ;
109 if ( cmp == 0 )
110 {
111 txptIdToAnnoId[ std::string( tid ) ] = introns[k].geneName ;
112 geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = introns[k].geneName ;
113 flag = true ;
114 break ;
115 }
116 else if ( cmp > 0 )
117 break ;
118 }
119 }
120
121 // Next, try whether exon works
122 k = exonTag ;
123 for ( i = 0 ; i < ecnt && !flag ; ++i )
124 {
125 for ( ; k < exonSize ; ++k )
126 {
127 if ( strcmp( exons[k].chrom, transcript[i].chrom ) )
128 break ;
129
130 if ( exons[k].end < transcript[i].start )
131 continue ;
132 else if ( exons[k].start > transcript[i].end )
133 break ;
134 else
135 {
136 txptIdToAnnoId[ std::string( tid ) ] = exons[k].geneName ;
137 geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = exons[k].geneName ;
138 flag = true ;
139 break ;
140 }
141 }
142 }
143 }
144
145 int main( int argc, char *argv[] )
146 {
147 int i, j, k ;
148 FILE *fp ;
149 FILE *fpList ;
150 char outputPath[1024] = "./" ;
151 char prevTid[97] = "" ;
152
153 std::vector<struct _interval> introns, exons ;
154 std::map<std::string, std::string> txptIdToAnnoId, geneIdToAnnoId ;
155 std::map<std::string, int> chromIdMapIntrons, chromIdMapExons ; // the index for the starting of a chrom.
156
157 if ( argc < 3 )
158 {
159 fprintf( stderr, "%s", usage ) ;
160 exit( 1 ) ;
161 }
162
163 for ( i = 3 ; i < argc ; ++i )
164 {
165 if ( !strcmp( argv[i], "-o" ) )
166 {
167 strcpy( outputPath, argv[i + 1 ] ) ;
168 ++i ;
169 }
170 else
171 {
172 fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ;
173 exit( 1 ) ;
174 }
175 }
176
177 // Get the exons, introns from the annotation.
178 fp = fopen( argv[1], "r" ) ;
179 while ( fgets( line, sizeof( line ), fp ) != NULL )
180 {
181 if ( line[0] == '#' )
182 continue ;
183 char tid[97] ;
184 char gname[97] ;
185 char chrom[97] ;
186 char type[50] ;
187 int start, end ;
188
189 sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ;
190 if ( strcmp( type, "exon" ) )
191 continue ;
192 if ( GetGTFField( gname, line, "gene_name" ) )
193 {
194 struct _interval ne ;
195 strcpy( ne.chrom, chrom ) ;
196 strcpy( ne.geneName, gname ) ;
197 ne.start = start ; ne.end = end ;
198
199 exons.push_back( ne ) ;
200 }
201 else
202 {
203 fprintf( stderr, "%s has no field of gene_name.\n", line ) ;
204 exit( 1 ) ;
205 }
206
207 if ( GetGTFField( tid, line, "transcript_id" ) )
208 {
209 if ( !strcmp( tid, prevTid ) )
210 {
211 struct _interval ni ;
212
213 strcpy( ni.chrom, chrom ) ;
214 strcpy( ni.geneName, gname ) ;
215
216 int size = exons.size() ;
217 if ( size < 2 )
218 continue ;
219
220 struct _interval &e1 = exons[ size - 2 ] ;
221 struct _interval &e2 = exons[ size - 1 ] ;
222 if ( e1.start < e2.start )
223 {
224 ni.start = e1.end ;
225 ni.end = e2.start ;
226 }
227 else
228 {
229 ni.start = e2.end ;
230 ni.end = e1.start ;
231 }
232
233 introns.push_back( ni ) ;
234 }
235 else
236 strcpy( prevTid, tid ) ;
237 }
238 else
239 {
240 fprintf( stderr, "%s has no field of transcript_id.\n", line ) ;
241 exit( 1 ) ;
242 }
243 }
244 fclose( fp ) ;
245 SortAndCleanIntervals( exons ) ;
246 SortAndCleanIntervals( introns ) ;
247 int exonSize = exons.size() ;
248 int intronSize = introns.size() ;
249 // Determine the offset for each chrom on the list of features.
250 if ( exonSize )
251 {
252 chromIdMapExons[ std::string( exons[0].chrom ) ] = 0 ;
253 for ( i = 1 ; i < exonSize ; ++i )
254 {
255 if ( strcmp( exons[i].chrom, exons[i - 1].chrom ) )
256 chromIdMapExons[ std::string( exons[i].chrom ) ] = i ;
257 }
258 }
259
260 if ( intronSize )
261 {
262 chromIdMapIntrons[ std::string( introns[0].chrom ) ] = 0 ;
263 for ( i = 1 ; i < intronSize ; ++i )
264 {
265 if ( strcmp( introns[i].chrom, introns[i - 1].chrom ) )
266 {
267 //printf( "%s %d\n", introns[i].chrom, i ) ;
268 chromIdMapIntrons[ std::string( introns[i].chrom ) ] = i ;
269 }
270 }
271
272 //for ( i = 0 ; i < intronSize ; ++i )
273 // printf( "%s\t%d\t%d\n", introns[i].chrom, introns[i].start, introns[i].end ) ;
274 }
275 //printf( "%d %d\n", exonSize, intronSize ) ;
276
277 // Go through all the GTF files to find the map between PsiCLASS's gene_id to annotation's gene name
278 fpList = fopen( argv[2], "r ") ;
279 std::vector<struct _interval> transcript ;
280 int exonTag = 0 ;
281 int intronTag = 0 ;
282
283 while ( fscanf( fpList, "%s", line ) != EOF )
284 {
285 // Set the output file
286 //printf( "hi: %s\n", line ) ;
287 char *p ;
288 p = line + strlen( line ) ;
289 while ( p != line && *p != '/' )
290 {
291 if ( *p == '\n' )
292 *p = '\0' ;
293 --p ;
294 }
295 sprintf( buffer, "%s/%s", outputPath, p ) ;
296
297 // Test whether this will overwrite the input gtf file.
298 if ( realpath( buffer, buffer2 ) != NULL )
299 {
300 if ( realpath( line, buffer3 ) && !strcmp( buffer2, buffer3 ) )
301 {
302 fprintf( stderr, "Output will overwrite the input files. Please use -o to specify a different output directory.\n" ) ;
303 exit( 1 ) ;
304 }
305 }
306
307 fp = fopen( line, "r" ) ;
308 transcript.resize( 0 ) ; // hold the exons in the transcript.
309 prevTid[0] = '\0' ;
310 exonTag = 0 ;
311 intronTag = 0 ;
312 int farthest = 0 ;
313
314 while ( fgets( line, sizeof( line ), fp ) )
315 {
316 char tid[97] ;
317 //char gname[97] ;
318 char chrom[97] ;
319 char type[50] ;
320 int start, end ;
321
322 if ( line[0] == '#' )
323 continue ;
324
325 sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ;
326 if ( strcmp( type, "exon" ) )
327 continue ;
328
329 if ( GetGTFField( tid, line, "transcript_id" ) )
330 {
331 struct _interval ne ;
332 strcpy( ne.chrom, chrom ) ;
333 ne.start = start ;
334 ne.end = end ;
335 GetGTFField( ne.geneName, line, "gene_id" ) ;
336
337 if ( !strcmp( tid, prevTid ) )
338 {
339 transcript.push_back( ne ) ;
340 if ( end > farthest )
341 farthest = end ;
342 }
343 else if ( transcript.size() > 0 )
344 {
345 // the non-existed chrom will be put to offset 0, and that's fine
346 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
347 intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;
348 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
349 exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;
350
351 UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ;
352
353 // Adjust the offset if we are done with a gene cluster
354 // We don't need to worry about the case of changing chrom here.
355 if ( !strcmp( ne.geneName, transcript[0].geneName ) &&
356 start > farthest ) // Use farthest to avoid inteleaved gene.
357 {
358 while ( intronTag < intronSize && !strcmp( introns[intronTag ].chrom, ne.chrom )
359 && introns[intronTag].end < ne.start )
360 ++intronTag ;
361
362 while ( exonTag < exonSize && !strcmp( exons[ exonTag ].chrom, ne.chrom )
363 && exons[ exonTag ].end < ne.start )
364 ++exonTag ;
365
366 farthest = end ;
367 }
368
369 transcript.resize( 0 ) ;
370 transcript.push_back( ne ) ;
371
372 // Find the overlaps.
373 strcpy( prevTid, tid ) ;
374 }
375 else
376 strcpy( prevTid, tid ) ;
377 }
378 else
379 {
380 fprintf( stderr, "Could not find transcript_id field in GTF file: %s", line ) ;
381 exit( 1 ) ;
382 }
383 }
384
385 if ( transcript.size() > 0 )
386 {
387 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
388 intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;
389 if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
390 exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;
391
392 UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ;
393 }
394 fclose( fp ) ;
395 }
396 fclose( fpList ) ;
397
398 // Add the gene_name field.
399 fpList = fopen( argv[2], "r" ) ;
400 int novelCnt = 0 ;
401 while ( fscanf( fpList, "%s", line ) != EOF )
402 {
403 FILE *fpOut ;
404 // Set the output file
405 char *p ;
406 p = line + strlen( line ) ;
407 while ( p != line && *p != '/' )
408 {
409 if ( *p == '\n' )
410 *p = '\0' ;
411 --p ;
412 }
413 sprintf( buffer, "%s/%s", outputPath, p ) ;
414 fpOut = fopen( buffer, "w" ) ;
415
416 // Process the input file
417 fp = fopen( line, "r" ) ;
418
419 transcript.resize( 0 ) ;
420 prevTid[0] = '\0' ;
421 while ( fgets( line, sizeof( line ), fp ) )
422 {
423 char gname[97] ;
424 if ( line[0] == '#' )
425 {
426 fprintf( fpOut, "%s", line ) ;
427 continue ;
428 }
429
430 if ( GetGTFField( buffer, line, "transcript_id" ) )
431 {
432 if ( txptIdToAnnoId.find( std::string( buffer ) ) != txptIdToAnnoId.end() )
433 {
434 int len = strlen( line ) ;
435 if ( line[len - 1] == '\n' )
436 {
437 line[len - 1] = '\0' ;
438 --len ;
439 }
440 fprintf( fpOut, "%s gene_name \"%s\";\n", line, txptIdToAnnoId[std::string( buffer )].c_str() ) ;
441 continue ;
442 }
443 }
444
445 if ( GetGTFField( gname, line, "gene_id" ) )
446 {
447 int len = strlen( line ) ;
448 if ( line[len - 1] == '\n' )
449 {
450 line[len - 1] = '\0' ;
451 --len ;
452 }
453 if ( geneIdToAnnoId.find( std::string( gname ) ) != geneIdToAnnoId.end() )
454 {
455 fprintf( fpOut, "%s gene_name \"%s\";\n", line, geneIdToAnnoId[std::string(gname)].c_str() ) ;
456 }
457 else
458 {
459 sprintf( buffer, "novel_%d", novelCnt ) ;
460 geneIdToAnnoId[ std::string( gname ) ] = std::string( buffer ) ;
461 fprintf( fpOut, "%s gene_name \"%s\";\n", line, buffer ) ;
462 ++novelCnt ;
463 }
464
465 }
466 else
467 fprintf( fpOut, "%s", line ) ;
468
469 }
470
471 fclose( fp ) ;
472 fclose( fpOut ) ;
473 }
474
475 return 0 ;
476 }