comparison mrcanavar-0.34/prep.c @ 0:86522a0b5f59 default tip

Uploaded source code for mrCaNaVaR
author calkan
date Tue, 21 Feb 2012 10:44:13 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:86522a0b5f59
1 #include "prep.h"
2
3 void prep_genome(void){
4 FILE *fasta;
5 FILE *gaps;
6 FILE *config;
7
8 int num_gaps;
9
10 char chrom[MAX_STR];
11 int start, end;
12 int max_len;
13
14 int i;
15
16 if (GENOME_FASTA == NULL)
17 print_error("Select genome fasta file (input) through -fasta parameter.\n");
18 if (GENOME_GAPS == NULL)
19 print_error("Select genome gaps file (input) through -gaps parameter.\n");
20 if (GENOME_CONF == NULL)
21 print_error("Select genome fasta file (output) through -conf parameter.\n");
22
23
24 if (LW_SIZE <= 0)
25 print_error ("Large window size (-lw_size) must be a poitive integer.\n");
26 if (SW_SIZE <= 0)
27 print_error ("Small window size (-sw_size) must be a poitive integer.\n");
28 if (CW_SIZE <= 0)
29 print_error ("Copy number window size (-cw_size) must be a poitive integer.\n");
30 if (LW_SLIDE <= 0)
31 print_error ("Large window slide (-lw_slide) must be a poitive integer.\n");
32 if (SW_SLIDE <= 0)
33 print_error ("Small window slide (-sw_slide) must be a poitive integer.\n");
34
35 if (SW_SIZE >= LW_SIZE)
36 print_error ("Small window size (-sw_size) should be smaller than large window size (-lw_size).\n");
37 if (SW_SLIDE > LW_SIZE)
38 print_error ("Small window slide (-sw_slide) should be smaller than or equal to large window slide (-lw_slide).\n");
39
40
41 fasta = my_fopen(GENOME_FASTA, "r", 0);
42 gaps = my_fopen(GENOME_GAPS, "r", 0);
43
44 num_gaps = 0;
45
46
47 /* initialize gap table */
48 while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0)
49 num_gaps ++;
50
51
52 gaptable = (struct gapcell *) malloc(sizeof(struct gapcell) * num_gaps);
53
54 rewind(gaps);
55
56
57 /* read gap table */
58
59 i = 0;
60 while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0){
61 trimspace(chrom);
62 gaptable[i].chrom = NULL;
63 set_str(&(gaptable[i].chrom), chrom);
64 gaptable[i].start = start;
65 gaptable[i].end = end;
66 i++;
67 }
68 fclose(gaps);
69
70 fprintf (stdout, "Scanning the reference genome.\n");
71 /* count basic numbers from the reference genome */
72 max_len = 0;
73 num_chrom = count_chrom(fasta, &max_len);
74 rewind(fasta);
75
76 chromosomes = (struct chrom **) malloc (sizeof (struct chrom *) * num_chrom);
77
78 for (i=0; i<num_chrom; i++)
79 chromosomes[i] = (struct chrom *) malloc (sizeof (struct chrom) * num_chrom);
80
81 fprintf(stdout, "Total of %d chromosomes in the reference genome, with %d gaps.\nLongest chromosome is %d bp\n", num_chrom, num_gaps, max_len);
82
83 fprintf (stdout, "Reading the reference genome.\n");
84
85 read_ref(fasta, max_len, num_gaps);
86
87 fprintf (stdout, "Saving the reference configuration.\n");
88 saveRefConfig(GENOME_CONF);
89
90 }
91
92
93
94 int count_chrom(FILE *fasta, int *max_len){
95 int length=0; int maxlength=0;
96 char ch;
97 int cnt = 0;
98 char skip[MAX_STR];
99 char *retStr;
100
101 //while (fscanf(fasta, "%c", &ch) > 0){
102 while (1){
103
104 ch = fgetc(fasta);
105 if (feof(fasta)) break;
106
107 if (ch == '>') {
108 cnt ++;
109 retStr = fgets(skip, MAX_STR, fasta);
110 fprintf(stdout, ".");
111 fflush(stdout);
112 if (length > maxlength) maxlength = length;
113 length = 0;
114 }
115 else if (!isspace(ch)) length++;
116 }
117 fprintf(stdout, "\n");
118
119 if (length > maxlength)
120 maxlength = length;
121
122 *max_len = maxlength;
123 return cnt;
124 }
125
126
127
128 void read_ref(FILE *fasta, int max_len, int num_gaps){
129
130 char ch;
131 int cnt = 0;
132 char chrom_name[MAX_STR];
133 int i;
134 int length;
135 char *chrom_seq;
136 char *retStr;
137
138 chrom_seq = (char *) malloc (sizeof(char) * (max_len+1));
139 cnt = -1;
140 i = 0;
141 length = 0;
142
143 // while (fscanf(fasta, "%c", &ch) > 0){
144 while (1){
145
146 ch = fgetc(fasta);
147 if (feof(fasta)) break;
148
149 if (ch == '>') {
150
151 if (cnt != -1){
152 /* insert the previous chromosome */
153 chrom_seq[length]=0;
154 insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps);
155 }
156
157 cnt ++;
158 retStr = fgets(chrom_name, MAX_STR, fasta);
159 chrom_name[strlen(chrom_name)-1] = 0;
160 trimspace(chrom_name);
161 length = 0;
162 }
163 else if (!isspace(ch)){
164 chrom_seq[length++] = ch;
165 }
166 }
167
168 /* insert the last chromosome */
169 chrom_seq[length]=0;
170 trimspace(chrom_name);
171 insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps);
172
173 free(chrom_seq);
174 }
175
176
177 void insert_chrom(int cnt, char *chrom_name, int length, char *chrom_seq, int num_gaps){
178
179 fprintf(stdout, "Chromosome %s (%d bp). Counting windows ... ", chrom_name, length);
180 fflush(stdout);
181
182 chromosomes[cnt]->name = NULL;
183 set_str(&(chromosomes[cnt]->name), chrom_name);
184 chromosomes[cnt]->length = length;
185
186 windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 0);
187 fprintf(stdout, "\t\tRecalculating windows ... ");
188 fflush(stdout);
189 windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 1);
190
191 fprintf(stdout, "\n");
192 }
193
194
195 void windowmaker(int num_gaps, int chrom_id, char *chrom_name, char *chrom_seq, int length, int flag){
196 int i;
197 int j;
198 int nchar;
199 int s, e;
200
201 int gc;
202 float gc_p;
203
204 int lw_cnt = 0; // large window (5Kb)
205 int sw_cnt = 0; // small window (1Kb)
206 int cw_cnt = 0; // copy number window (1kb non-ovp);
207
208
209 int win_cnt;
210
211 /* if flag = 0; don't record it */
212
213 if (!flag){
214 for (i=0;i<num_gaps;i++){
215 if (!strcmp(gaptable[i].chrom, chrom_name)){
216 if (gaptable[i].start == gaptable[i].end)
217 chrom_seq[gaptable[i].start] = 'X';
218 else{
219 for (j=gaptable[i].start; (j<gaptable[i].end && j<length); j++)
220 chrom_seq[j] = 'X';
221 }
222 }
223 }
224 }
225
226
227 /* count cw_cnt */
228
229 nchar = 0;
230 s = 0 ; e = 0;
231 gc = 0;
232
233 for (i=0; i<length; i++){
234
235 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X')
236 nchar++;
237 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
238 gc++;
239
240 if ((nchar == CW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
241 e = i;
242 if (chrom_seq[i] == 'X'){
243 nchar = 0;
244 gc = 0;
245 }
246 else if (flag == 1 && nchar == CW_SIZE){
247 /* save */
248 chromosomes[chrom_id]->cw[cw_cnt].start = s;
249 chromosomes[chrom_id]->cw[cw_cnt].end = e+1;
250 chromosomes[chrom_id]->cw[cw_cnt].gc = (float)gc / (float)nchar;
251 chromosomes[chrom_id]->cw[cw_cnt].depth = 0; // for readability/completeness
252 cw_cnt++;
253 gc = 0;
254 }
255 else if (flag == 0 && nchar == CW_SIZE){
256 cw_cnt ++;
257 }
258
259 s = i + 1;
260 nchar = 0;
261 }
262
263 if (chrom_seq[i] == 'X'){
264 s = i + 1;
265 nchar = 0;
266 gc = 0;
267 }
268
269 }
270
271
272 /* count sw_cnt */
273
274 i = 0;
275 nchar = 0;
276 s = 0 ; e = 0;
277 gc = 0;
278
279 while (i < length){
280
281 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X')
282 nchar++;
283 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
284 gc++;
285
286 if ((nchar == SW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
287 e = i;
288 if (chrom_seq[i] == 'X'){
289 nchar = 0;
290 gc = 0;
291 }
292 else if (flag == 1 && nchar == SW_SIZE){
293 /* save */
294 chromosomes[chrom_id]->sw[sw_cnt].start = s;
295 chromosomes[chrom_id]->sw[sw_cnt].end = e+1;
296 chromosomes[chrom_id]->sw[sw_cnt].gc = (float)gc / (float)nchar;
297 chromosomes[chrom_id]->sw[sw_cnt].depth = 0; // for readability/completeness
298 sw_cnt++;
299 gc = 0;
300 }
301 else if (flag == 0 && nchar == SW_SIZE)
302 sw_cnt++;
303
304 s = s + SW_SLIDE;
305 i = s - 1;
306 nchar = 0;
307 }
308 if (chrom_seq[i]=='X'){
309 s = i + 1;
310 i = s - 1;
311 gc = 0;
312 }
313
314 i++;
315 }
316
317
318 nchar = 0;
319 i = 0;
320 s = 0 ; e = 0;
321 gc = 0;
322 /* count lw_cnt */
323
324 while (i<length){
325
326 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X')
327 nchar++;
328 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C')
329 gc++;
330
331 if ((nchar == LW_SIZE || chrom_seq[i] == 'X') && nchar != 0){
332 e = i;
333 if (chrom_seq[i] == 'X'){
334 nchar = 0;
335 gc = 0;
336 }
337 else if (flag == 1 && nchar == LW_SIZE){
338 /* save */
339 chromosomes[chrom_id]->lw[lw_cnt].start = s;
340 chromosomes[chrom_id]->lw[lw_cnt].end = e+1;
341 chromosomes[chrom_id]->lw[lw_cnt].gc = (float)gc / (float)nchar;
342 chromosomes[chrom_id]->lw[lw_cnt].depth = 0; // for readability/completeness
343 lw_cnt++;
344 gc = 0;
345 }
346 else if (flag == 0 && nchar == LW_SIZE)
347 lw_cnt++;
348
349 s = s + LW_SLIDE;
350 i = s - 1;
351 nchar = 0;
352 }
353
354 if (chrom_seq[i] == 'X') {
355 s = i + 1;
356 i = s - 1;
357 gc = 0;
358 }
359
360 i++;
361 }
362
363
364
365 if (flag == 0){
366
367 printf("\n\t\tLW: %d\tSW: %d\tCW: %d\n", lw_cnt, sw_cnt, cw_cnt);
368
369 chromosomes[chrom_id]->lw_cnt = lw_cnt;
370 chromosomes[chrom_id]->sw_cnt = sw_cnt;
371 chromosomes[chrom_id]->cw_cnt = cw_cnt;
372
373 chromosomes[chrom_id]->cw = (struct window *) malloc (sizeof(struct window) * cw_cnt);
374 chromosomes[chrom_id]->lw = (struct window *) malloc (sizeof(struct window) * lw_cnt);
375 chromosomes[chrom_id]->sw = (struct window *) malloc (sizeof(struct window) * sw_cnt);
376
377
378 }
379
380
381 }