annotate PsiCLASS-1.0.2/Vote.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 // The program that vote to pick the trusted transcripts.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 #include <stdlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4 #include <map>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 #include <vector>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 #include <algorithm>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7 #include <string.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 #include <string>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10 #include "defs.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 #include "TranscriptDecider.hpp"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13 char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14 "Required:\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15 "\t--lg: path to the list of GTF files.\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16 "Optional:\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17 "\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 //"\t-n INT: the number of samples a transcript showed up. (default: 3)\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22 /*struct _transcript
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24 int chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25 int geneId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26 char strand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27 struct _pair32 *exons ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 int ecnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29 int sampleId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30 } ;*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 void GetGTFField( char *s, const char *field, char *ret )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34 char *p = strstr( s, field ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35 if ( p == NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 return ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37 for ( ; *p != ' ' ; ++p )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 p += 2 ; // add extra 1 to skip \"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 sscanf( p, "%s", ret ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41 //printf( "+%s %d\n", tid, strlen( tid ) ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 p = ret + strlen( ret ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43 while ( *p != '\"' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44 --p ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45 *p = '\0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48 int GetTailNumber( char *s )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 int len = strlen( s ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 int ret = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52 int i ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 int factor = 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54 for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 ret += factor * ( s[i] - '0' ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 return ret ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61 int CompDouble( const void *p1, const void *p2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 double a = *(double *)p1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64 double b = *(double *)p2 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 if ( a > b )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 return 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 else if ( a < b )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68 return -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 return 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73 int main( int argc, char *argv[] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 int i, j, k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 double minAvgDepth = 1.0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77 double fraction = 1.0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78 int minSampleCnt = 3 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 std::map<std::string, int> chrNameToId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 std::map<int, std::string> chrIdToName ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 FILE *fpGTFlist = NULL ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83 FILE *fp = NULL ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 for ( i = 1 ; i < argc ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 if ( !strcmp( argv[i], "--lg" ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88 fpGTFlist = fopen( argv[i + 1], "r" ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89 ++i ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 else if ( !strcmp( argv[i], "-d" ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 minAvgDepth = atof( argv[i + 1] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94 ++i ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 /*else if ( !strcmp( argv[i], "-n" ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98 minSampleCnt = atoi( argv[i + 1] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 ++i ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 }*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103 printf( "%s", usage ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 exit( 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107 if ( fpGTFlist == NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110 exit( 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113 std::vector<struct _outputTranscript> transcripts ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 std::vector<struct _outputTranscript> outputTranscripts ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 char buffer[4096] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117 char line[10000] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 char chrom[50], tool[20], type[40], strand[3] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 //char tid[50] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 int start, end ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121 std::vector<struct _pair32> tmpExons ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 int sampleCnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 int gid = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 int tid = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 int chrId = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 int chrIdUsed = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127 char cStrand = '.' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 char prefix[50] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 double FPKM = 0, TPM = 0, cov = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131 while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 int len = strlen( buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 if ( buffer[len - 1] == '\n' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136 buffer[len - 1] = '\0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 --len ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139 fp = fopen( buffer, "r" ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
142 while ( fgets( line, sizeof( line ), fp ) != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
143 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
144 if ( line[0] == '#' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
145 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
146
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
147 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
148
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
149 if ( strcmp( type, "exon" ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
150 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
151 if ( tmpExons.size() > 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
152 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
153 struct _outputTranscript nt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
154 nt.sampleId = sampleCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
155 nt.chrId = chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
156 nt.geneId = gid ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
157 nt.transcriptId = tid ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
158 nt.strand = cStrand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
159 nt.ecnt = tmpExons.size() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
160 nt.exons = new struct _pair32[nt.ecnt] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
161 nt.FPKM = FPKM ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
162 nt.TPM = TPM ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
163 nt.cov = cov ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
164 for ( i = 0 ; i < nt.ecnt ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
165 nt.exons[i] = tmpExons[i] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
166 tmpExons.clear() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
167 transcripts.push_back( nt ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
168 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
169 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
170 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
171
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
172 if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
173 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
174 chrId = chrIdUsed ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
175
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
176 std::string s( chrom ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
177 chrNameToId[s] = chrIdUsed ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
178 chrIdToName[ chrIdUsed ] = s ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
179 ++chrIdUsed ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
180 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
181 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
182 chrId = chrNameToId[ std::string( chrom ) ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
183
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
184 GetGTFField( line, "gene_id", buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
185 gid = GetTailNumber( buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
186
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
187 GetGTFField( line, "transcript_id", buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
188 tid = GetTailNumber( buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
189
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
190 GetGTFField( line, "FPKM", buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
191 FPKM = atof( buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
192
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
193 GetGTFField( line, "TPM", buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
194 TPM = atof( buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
195
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
196 GetGTFField( line, "cov", buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
197 cov = atof( buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
198
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
199 cStrand = strand[0] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
200
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
201 // Look for the user-defined prefix.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
202 int len = strlen( buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
203 j = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
204 for ( i = len - 1 ; i >= 0 ; --i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
205 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
206 if ( buffer[i] == '.' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
207 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
208 ++j ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
209 if ( j >= 2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
210 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
211 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
212 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
213
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
214 for ( j = 0 ; j < i ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
215 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
216 prefix[j] = buffer[j] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
217 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
218 if ( i > 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
219 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
220 prefix[j] = '.' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
221 prefix[j + 1] = '\0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
222 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
223 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
224 prefix[0] = '\0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
225
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
226 struct _pair32 ne ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
227 ne.a = start ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
228 ne.b = end ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
229 tmpExons.push_back( ne ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
230 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
231 if ( tmpExons.size() > 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
232 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
233 struct _outputTranscript nt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
234 nt.sampleId = sampleCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
235 nt.chrId = chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
236 nt.geneId = gid ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
237 nt.transcriptId = tid ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
238 nt.strand = cStrand ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
239 nt.ecnt = tmpExons.size() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
240 nt.exons = new struct _pair32[nt.ecnt] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
241 nt.FPKM = FPKM ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
242 nt.TPM = TPM ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
243 nt.cov = cov ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
244 for ( i = 0 ; i < nt.ecnt ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
245 nt.exons[i] = tmpExons[i] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
246 tmpExons.clear() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
247 transcripts.push_back( nt ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
248 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
249
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
250 ++sampleCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
251 fclose( fp ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
252 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
253 fclose( fpGTFlist ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
254
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
255 // Coalesce same transcripts
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
256 std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
257 std::vector<int> sampleSupport ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
258 int size = transcripts.size() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
259 if ( minSampleCnt > sampleCnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
260 minSampleCnt = sampleCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
261
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
262 for ( i = 0 ; i < size ; )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
263 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
264 int l ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
265 for ( j = i + 1 ; j < size ; ++j )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
266 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
267 if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
268 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
269 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
270 // [i,j) are the same transcripts.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
271 /*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
272 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
273 outputTranscripts.push_back( transcripts[i] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
274 }*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
275
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
276 double sumFPKM = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
277 double sumTPM = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
278 double sumCov = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
279 for ( l = i ; l < j ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
280 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
281 sumFPKM += transcripts[l].FPKM ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
282 sumTPM += transcripts[l].TPM ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
283 sumCov += transcripts[l].cov ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
284 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
285
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
286 transcripts[k] = transcripts[i] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
287 for ( l = i + 1 ; l < j ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
288 delete[] transcripts[l].exons ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
289
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
290 transcripts[k].FPKM = sumFPKM / ( j - i ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
291 transcripts[k].TPM = sumTPM / ( j - i ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
292 transcripts[k].cov = sumCov / ( j - i ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
293
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
294 if ( ( j - i < int( fraction * sampleCnt ) || j - i < minSampleCnt ) && sumCov < minAvgDepth * sampleCnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
295 //if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
296 //if ( sumTPM < sampleCnt || sumFPKM < sampleCnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
297 //if ( sumCov < minAvgDepthn * sampleCnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
298 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
299 transcripts[k].FPKM = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
300 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
301 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
302 sampleSupport.push_back( j - i ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
303 ++k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
304 i = j ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
305 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
306 transcripts.resize( k ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
307
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
308 // The motivation to separate the coalscing and voting is to allow
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
309 // a gene with no txpt passing the vote to pick a representative.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
310 size = k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
311 for ( i = 0 ; i < size ; )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
312 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
313 int cnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
314 if ( transcripts[i].FPKM != -1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
315 ++cnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
316
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
317 for ( j = i + 1 ; j < size ; ++j )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
318 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
319 if ( transcripts[i].geneId != transcripts[j].geneId )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
320 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
321 if ( transcripts[j].FPKM != -1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
322 ++cnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
323 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
324 int l ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
325 /*double *freq = new double[cnt] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
326 for ( l = 0 ; l < cnt ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
327 freq[l] = transcripts[l].FPKM / sampleCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
328 qsort( freq, cnt, sizeof( double ), CompDouble ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
329
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
330 for ( l = 0 ; l < cnt ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
331 printf( "%lf\n", freq[l] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
332 printf( "===========\n" ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
333 delete[] freq ;*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
334 /*if ( cnt == 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
335 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
336 int maxEcnt = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
337 int maxCnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
338 int maxtag ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
339 for ( l = i ; l < j ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
340 if ( transcripts[l].ecnt > maxEcnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
341 maxEcnt = transcripts[l].ecnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
342
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
343 if ( maxEcnt >= 2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
344 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
345 int maxSampleSupport = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
346 maxtag = i ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
347 for ( l = i ; l < j ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
348 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
349 if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
350 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
351 if ( sampleSupport[l] > maxSampleSupport )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
352 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
353 maxSampleSupport = sampleSupport[l] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
354 maxtag = l ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
355 maxCnt = 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
356 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
357 else if ( sampleSupport[l] == maxSampleSupport )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
358 ++maxCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
359 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
360 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
361
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
362 if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
363 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
364 if ( maxCnt > 1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
365 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
366 int maxTPM = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
367 for ( l = i ; l < j ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
368 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
369 if ( sampleSupport[l] != maxSampleSupport )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
370 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
371 if ( transcripts[l].TPM > maxTPM )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
372 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
373 maxTPM = transcripts[l].TPM ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
374 maxtag = l ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
375 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
376 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
377
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
378 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
379 transcripts[maxtag].FPKM = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
380 //fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
381 outputTranscripts.push_back( transcripts[maxtag] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
382 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
383 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
384 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
385 else*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
386 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
387 for ( l = i ; l < j ; ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
388 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
389 //if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
390 if ( transcripts[l].FPKM != -1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
391 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
392 outputTranscripts.push_back( transcripts[l] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
393 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
394 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
395 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
396
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
397 i = j ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
398 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
399
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
400 // Output
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
401 size = outputTranscripts.size() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
402 //printf( "%d\n", size ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
403 int transcriptId = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
404 int prevGid = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
405 for ( i = 0 ; i < size ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
406 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
407 struct _outputTranscript &t = outputTranscripts[i] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
408 const char *chrom = chrIdToName[t.chrId].c_str() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
409 /*if ( t.geneId != prevGid )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
410 transcriptId = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
411 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
412 ++transcriptId ;*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
413 transcriptId = outputTranscripts[i].transcriptId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
414 fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n",
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
415 chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
416 prefix, chrom, t.geneId,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
417 prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
418 for ( j = 0 ; j < t.ecnt ; ++j )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
419 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
420 fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; "
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
421 "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n",
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
422 chrom, t.exons[j].a, t.exons[j].b, t.strand,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
423 prefix, chrom, t.geneId,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
424 prefix, chrom, t.geneId, transcriptId,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
425 j + 1, t.FPKM, t.TPM, t.cov,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
426 sampleSupport[i] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
427 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
428
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
429 prevGid = t.geneId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
430 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
431 return 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
432 }