0
|
1 #include "prep.h"
|
|
2
|
|
3 void prep_genome(void){
|
|
4 FILE *fasta;
|
|
5 FILE *gaps;
|
|
6 FILE *config;
|
|
7
|
|
8 int num_gaps;
|
|
9
|
|
10 char chrom[MAX_STR];
|
|
11 int start, end;
|
|
12 int max_len;
|
|
13
|
|
14 int i;
|
|
15
|
|
16 if (GENOME_FASTA == NULL)
|
|
17 print_error("Select genome fasta file (input) through -fasta parameter.\n");
|
|
18 if (GENOME_GAPS == NULL)
|
|
19 print_error("Select genome gaps file (input) through -gaps parameter.\n");
|
|
20 if (GENOME_CONF == NULL)
|
|
21 print_error("Select genome fasta file (output) through -conf parameter.\n");
|
|
22
|
|
23
|
|
24 if (LW_SIZE <= 0)
|
|
25 print_error ("Large window size (-lw_size) must be a poitive integer.\n");
|
|
26 if (SW_SIZE <= 0)
|
|
27 print_error ("Small window size (-sw_size) must be a poitive integer.\n");
|
|
28 if (CW_SIZE <= 0)
|
|
29 print_error ("Copy number window size (-cw_size) must be a poitive integer.\n");
|
|
30 if (LW_SLIDE <= 0)
|
|
31 print_error ("Large window slide (-lw_slide) must be a poitive integer.\n");
|
|
32 if (SW_SLIDE <= 0)
|
|
33 print_error ("Small window slide (-sw_slide) must be a poitive integer.\n");
|
|
34
|
|
35 if (SW_SIZE >= LW_SIZE)
|
|
36 print_error ("Small window size (-sw_size) should be smaller than large window size (-lw_size).\n");
|
|
37 if (SW_SLIDE > LW_SIZE)
|
|
38 print_error ("Small window slide (-sw_slide) should be smaller than or equal to large window slide (-lw_slide).\n");
|
|
39
|
|
40
|
|
41 fasta = my_fopen(GENOME_FASTA, "r", 0);
|
|
42 gaps = my_fopen(GENOME_GAPS, "r", 0);
|
|
43
|
|
44 num_gaps = 0;
|
|
45
|
|
46
|
|
47 /* initialize gap table */
|
|
48 while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0)
|
|
49 num_gaps ++;
|
|
50
|
|
51
|
|
52 gaptable = (struct gapcell *) malloc(sizeof(struct gapcell) * num_gaps);
|
|
53
|
|
54 rewind(gaps);
|
|
55
|
|
56
|
|
57 /* read gap table */
|
|
58
|
|
59 i = 0;
|
|
60 while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0){
|
|
61 trimspace(chrom);
|
|
62 gaptable[i].chrom = NULL;
|
|
63 set_str(&(gaptable[i].chrom), chrom);
|
|
64 gaptable[i].start = start;
|
|
65 gaptable[i].end = end;
|
|
66 i++;
|
|
67 }
|
|
68 fclose(gaps);
|
|
69
|
|
70 fprintf (stdout, "Scanning the reference genome.\n");
|
|
71 /* count basic numbers from the reference genome */
|
|
72 max_len = 0;
|
|
73 num_chrom = count_chrom(fasta, &max_len);
|
|
74 rewind(fasta);
|
|
75
|
|
76 chromosomes = (struct chrom **) malloc (sizeof (struct chrom *) * num_chrom);
|
|
77
|
|
78 for (i=0; i<num_chrom; i++)
|
|
79 chromosomes[i] = (struct chrom *) malloc (sizeof (struct chrom) * num_chrom);
|
|
80
|
|
81 fprintf(stdout, "Total of %d chromosomes in the reference genome, with %d gaps.\nLongest chromosome is %d bp\n", num_chrom, num_gaps, max_len);
|
|
82
|
|
83 fprintf (stdout, "Reading the reference genome.\n");
|
|
84
|
|
85 read_ref(fasta, max_len, num_gaps);
|
|
86
|
|
87 fprintf (stdout, "Saving the reference configuration.\n");
|
|
88 saveRefConfig(GENOME_CONF);
|
|
89
|
|
90 }
|
|
91
|
|
92
|
|
93
|
|
94 int count_chrom(FILE *fasta, int *max_len){
|
|
95 int length=0; int maxlength=0;
|
|
96 char ch;
|
|
97 int cnt = 0;
|
|
98 char skip[MAX_STR];
|
|
99 char *retStr;
|
|
100
|
|
101 //while (fscanf(fasta, "%c", &ch) > 0){
|
|
102 while (1){
|
|
103
|
|
104 ch = fgetc(fasta);
|
|
105 if (feof(fasta)) break;
|
|
106
|
|
107 if (ch == '>') {
|
|
108 cnt ++;
|
|
109 retStr = fgets(skip, MAX_STR, fasta);
|
|
110 fprintf(stdout, ".");
|
|
111 fflush(stdout);
|
|
112 if (length > maxlength) maxlength = length;
|
|
113 length = 0;
|
|
114 }
|
|
115 else if (!isspace(ch)) length++;
|
|
116 }
|
|
117 fprintf(stdout, "\n");
|
|
118
|
|
119 if (length > maxlength)
|
|
120 maxlength = length;
|
|
121
|
|
122 *max_len = maxlength;
|
|
123 return cnt;
|
|
124 }
|
|
125
|
|
126
|
|
127
|
|
128 void read_ref(FILE *fasta, int max_len, int num_gaps){
|
|
129
|
|
130 char ch;
|
|
131 int cnt = 0;
|
|
132 char chrom_name[MAX_STR];
|
|
133 int i;
|
|
134 int length;
|
|
135 char *chrom_seq;
|
|
136 char *retStr;
|
|
137
|
|
138 chrom_seq = (char *) malloc (sizeof(char) * (max_len+1));
|
|
139 cnt = -1;
|
|
140 i = 0;
|
|
141 length = 0;
|
|
142
|
|
143 // while (fscanf(fasta, "%c", &ch) > 0){
|
|
144 while (1){
|
|
145
|
|
146 ch = fgetc(fasta);
|
|
147 if (feof(fasta)) break;
|
|
148
|
|
149 if (ch == '>') {
|
|
150
|
|
151 if (cnt != -1){
|
|
152 /* insert the previous chromosome */
|
|
153 chrom_seq[length]=0;
|
|
154 insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps);
|
|
155 }
|
|
156
|
|
157 cnt ++;
|
|
158 retStr = fgets(chrom_name, MAX_STR, fasta);
|
|
159 chrom_name[strlen(chrom_name)-1] = 0;
|
|
160 trimspace(chrom_name);
|
|
161 length = 0;
|
|
162 }
|
|
163 else if (!isspace(ch)){
|
|
164 chrom_seq[length++] = ch;
|
|
165 }
|
|
166 }
|
|
167
|
|
168 /* insert the last chromosome */
|
|
169 chrom_seq[length]=0;
|
|
170 trimspace(chrom_name);
|
|
171 insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps);
|
|
172
|
|
173 free(chrom_seq);
|
|
174 }
|
|
175
|
|
176
|
|
177 void insert_chrom(int cnt, char *chrom_name, int length, char *chrom_seq, int num_gaps){
|
|
178
|
|
179 fprintf(stdout, "Chromosome %s (%d bp). Counting windows ... ", chrom_name, length);
|
|
180 fflush(stdout);
|
|
181
|
|
182 chromosomes[cnt]->name = NULL;
|
|
183 set_str(&(chromosomes[cnt]->name), chrom_name);
|
|
184 chromosomes[cnt]->length = length;
|
|
185
|
|
186 windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 0);
|
|
187 fprintf(stdout, "\t\tRecalculating windows ... ");
|
|
188 fflush(stdout);
|
|
189 windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 1);
|
|
190
|
|
191 fprintf(stdout, "\n");
|
|
192 }
|
|
193
|
|
194
|
|
195 void windowmaker(int num_gaps, int chrom_id, char *chrom_name, char *chrom_seq, int length, int flag){
|
|
196 int i;
|
|
197 int j;
|
|
198 int nchar;
|
|
199 int s, e;
|
|
200
|
|
201 int gc;
|
|
202 float gc_p;
|
|
203
|
|
204 int lw_cnt = 0; // large window (5Kb)
|
|
205 int sw_cnt = 0; // small window (1Kb)
|
|
206 int cw_cnt = 0; // copy number window (1kb non-ovp);
|
|
207
|
|
208
|
|
209 int win_cnt;
|
|
210
|
|
211 /* if flag = 0; don't record it */
|
|
212
|
|
213 if (!flag){
|
|
214 for (i=0;i<num_gaps;i++){
|
|
215 if (!strcmp(gaptable[i].chrom, chrom_name)){
|
|
216 if (gaptable[i].start == gaptable[i].end)
|
|
217 chrom_seq[gaptable[i].start] = 'X';
|
|
218 else{
|
|
219 for (j=gaptable[i].start; (j<gaptable[i].end && j<length); j++)
|
|
220 chrom_seq[j] = 'X';
|
|
221 }
|
|
222 }
|
|
223 }
|
|
224 }
|
|
225
|
|
226
|
|
227 /* count cw_cnt */
|
|
228
|
|
229 nchar = 0;
|
|
230 s = 0 ; e = 0;
|
|
231 gc = 0;
|
|
232
|
|
233 for (i=0; i<length; i++){
|
|
234
|
|
235 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X')
|
|
236 nchar++;
|
|
237 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
|
|
238 gc++;
|
|
239
|
|
240 if ((nchar == CW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
|
|
241 e = i;
|
|
242 if (chrom_seq[i] == 'X'){
|
|
243 nchar = 0;
|
|
244 gc = 0;
|
|
245 }
|
|
246 else if (flag == 1 && nchar == CW_SIZE){
|
|
247 /* save */
|
|
248 chromosomes[chrom_id]->cw[cw_cnt].start = s;
|
|
249 chromosomes[chrom_id]->cw[cw_cnt].end = e+1;
|
|
250 chromosomes[chrom_id]->cw[cw_cnt].gc = (float)gc / (float)nchar;
|
|
251 chromosomes[chrom_id]->cw[cw_cnt].depth = 0; // for readability/completeness
|
|
252 cw_cnt++;
|
|
253 gc = 0;
|
|
254 }
|
|
255 else if (flag == 0 && nchar == CW_SIZE){
|
|
256 cw_cnt ++;
|
|
257 }
|
|
258
|
|
259 s = i + 1;
|
|
260 nchar = 0;
|
|
261 }
|
|
262
|
|
263 if (chrom_seq[i] == 'X'){
|
|
264 s = i + 1;
|
|
265 nchar = 0;
|
|
266 gc = 0;
|
|
267 }
|
|
268
|
|
269 }
|
|
270
|
|
271
|
|
272 /* count sw_cnt */
|
|
273
|
|
274 i = 0;
|
|
275 nchar = 0;
|
|
276 s = 0 ; e = 0;
|
|
277 gc = 0;
|
|
278
|
|
279 while (i < length){
|
|
280
|
|
281 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X')
|
|
282 nchar++;
|
|
283 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
|
|
284 gc++;
|
|
285
|
|
286 if ((nchar == SW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
|
|
287 e = i;
|
|
288 if (chrom_seq[i] == 'X'){
|
|
289 nchar = 0;
|
|
290 gc = 0;
|
|
291 }
|
|
292 else if (flag == 1 && nchar == SW_SIZE){
|
|
293 /* save */
|
|
294 chromosomes[chrom_id]->sw[sw_cnt].start = s;
|
|
295 chromosomes[chrom_id]->sw[sw_cnt].end = e+1;
|
|
296 chromosomes[chrom_id]->sw[sw_cnt].gc = (float)gc / (float)nchar;
|
|
297 chromosomes[chrom_id]->sw[sw_cnt].depth = 0; // for readability/completeness
|
|
298 sw_cnt++;
|
|
299 gc = 0;
|
|
300 }
|
|
301 else if (flag == 0 && nchar == SW_SIZE)
|
|
302 sw_cnt++;
|
|
303
|
|
304 s = s + SW_SLIDE;
|
|
305 i = s - 1;
|
|
306 nchar = 0;
|
|
307 }
|
|
308 if (chrom_seq[i]=='X'){
|
|
309 s = i + 1;
|
|
310 i = s - 1;
|
|
311 gc = 0;
|
|
312 }
|
|
313
|
|
314 i++;
|
|
315 }
|
|
316
|
|
317
|
|
318 nchar = 0;
|
|
319 i = 0;
|
|
320 s = 0 ; e = 0;
|
|
321 gc = 0;
|
|
322 /* count lw_cnt */
|
|
323
|
|
324 while (i<length){
|
|
325
|
|
326 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X')
|
|
327 nchar++;
|
|
328 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
|
|
329 gc++;
|
|
330
|
|
331 if ((nchar == LW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
|
|
332 e = i;
|
|
333 if (chrom_seq[i] == 'X'){
|
|
334 nchar = 0;
|
|
335 gc = 0;
|
|
336 }
|
|
337 else if (flag == 1 && nchar == LW_SIZE){
|
|
338 /* save */
|
|
339 chromosomes[chrom_id]->lw[lw_cnt].start = s;
|
|
340 chromosomes[chrom_id]->lw[lw_cnt].end = e+1;
|
|
341 chromosomes[chrom_id]->lw[lw_cnt].gc = (float)gc / (float)nchar;
|
|
342 chromosomes[chrom_id]->lw[lw_cnt].depth = 0; // for readability/completeness
|
|
343 lw_cnt++;
|
|
344 gc = 0;
|
|
345 }
|
|
346 else if (flag == 0 && nchar == LW_SIZE)
|
|
347 lw_cnt++;
|
|
348
|
|
349 s = s + LW_SLIDE;
|
|
350 i = s - 1;
|
|
351 nchar = 0;
|
|
352 }
|
|
353
|
|
354 if (chrom_seq[i] == 'X') {
|
|
355 s = i + 1;
|
|
356 i = s - 1;
|
|
357 gc = 0;
|
|
358 }
|
|
359
|
|
360 i++;
|
|
361 }
|
|
362
|
|
363
|
|
364
|
|
365 if (flag == 0){
|
|
366
|
|
367 printf("\n\t\tLW: %d\tSW: %d\tCW: %d\n", lw_cnt, sw_cnt, cw_cnt);
|
|
368
|
|
369 chromosomes[chrom_id]->lw_cnt = lw_cnt;
|
|
370 chromosomes[chrom_id]->sw_cnt = sw_cnt;
|
|
371 chromosomes[chrom_id]->cw_cnt = cw_cnt;
|
|
372
|
|
373 chromosomes[chrom_id]->cw = (struct window *) malloc (sizeof(struct window) * cw_cnt);
|
|
374 chromosomes[chrom_id]->lw = (struct window *) malloc (sizeof(struct window) * lw_cnt);
|
|
375 chromosomes[chrom_id]->sw = (struct window *) malloc (sizeof(struct window) * sw_cnt);
|
|
376
|
|
377
|
|
378 }
|
|
379
|
|
380
|
|
381 }
|