Mercurial > repos > calkan > mrcanavar
comparison mrcanavar-0.34/prep.c @ 0:86522a0b5f59 default tip
Uploaded source code for mrCaNaVaR
author | calkan |
---|---|
date | Tue, 21 Feb 2012 10:44:13 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:86522a0b5f59 |
---|---|
1 #include "prep.h" | |
2 | |
3 void prep_genome(void){ | |
4 FILE *fasta; | |
5 FILE *gaps; | |
6 FILE *config; | |
7 | |
8 int num_gaps; | |
9 | |
10 char chrom[MAX_STR]; | |
11 int start, end; | |
12 int max_len; | |
13 | |
14 int i; | |
15 | |
16 if (GENOME_FASTA == NULL) | |
17 print_error("Select genome fasta file (input) through -fasta parameter.\n"); | |
18 if (GENOME_GAPS == NULL) | |
19 print_error("Select genome gaps file (input) through -gaps parameter.\n"); | |
20 if (GENOME_CONF == NULL) | |
21 print_error("Select genome fasta file (output) through -conf parameter.\n"); | |
22 | |
23 | |
24 if (LW_SIZE <= 0) | |
25 print_error ("Large window size (-lw_size) must be a poitive integer.\n"); | |
26 if (SW_SIZE <= 0) | |
27 print_error ("Small window size (-sw_size) must be a poitive integer.\n"); | |
28 if (CW_SIZE <= 0) | |
29 print_error ("Copy number window size (-cw_size) must be a poitive integer.\n"); | |
30 if (LW_SLIDE <= 0) | |
31 print_error ("Large window slide (-lw_slide) must be a poitive integer.\n"); | |
32 if (SW_SLIDE <= 0) | |
33 print_error ("Small window slide (-sw_slide) must be a poitive integer.\n"); | |
34 | |
35 if (SW_SIZE >= LW_SIZE) | |
36 print_error ("Small window size (-sw_size) should be smaller than large window size (-lw_size).\n"); | |
37 if (SW_SLIDE > LW_SIZE) | |
38 print_error ("Small window slide (-sw_slide) should be smaller than or equal to large window slide (-lw_slide).\n"); | |
39 | |
40 | |
41 fasta = my_fopen(GENOME_FASTA, "r", 0); | |
42 gaps = my_fopen(GENOME_GAPS, "r", 0); | |
43 | |
44 num_gaps = 0; | |
45 | |
46 | |
47 /* initialize gap table */ | |
48 while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0) | |
49 num_gaps ++; | |
50 | |
51 | |
52 gaptable = (struct gapcell *) malloc(sizeof(struct gapcell) * num_gaps); | |
53 | |
54 rewind(gaps); | |
55 | |
56 | |
57 /* read gap table */ | |
58 | |
59 i = 0; | |
60 while (fscanf (gaps, "%s\t%d\t%d\n", chrom, &start, &end) > 0){ | |
61 trimspace(chrom); | |
62 gaptable[i].chrom = NULL; | |
63 set_str(&(gaptable[i].chrom), chrom); | |
64 gaptable[i].start = start; | |
65 gaptable[i].end = end; | |
66 i++; | |
67 } | |
68 fclose(gaps); | |
69 | |
70 fprintf (stdout, "Scanning the reference genome.\n"); | |
71 /* count basic numbers from the reference genome */ | |
72 max_len = 0; | |
73 num_chrom = count_chrom(fasta, &max_len); | |
74 rewind(fasta); | |
75 | |
76 chromosomes = (struct chrom **) malloc (sizeof (struct chrom *) * num_chrom); | |
77 | |
78 for (i=0; i<num_chrom; i++) | |
79 chromosomes[i] = (struct chrom *) malloc (sizeof (struct chrom) * num_chrom); | |
80 | |
81 fprintf(stdout, "Total of %d chromosomes in the reference genome, with %d gaps.\nLongest chromosome is %d bp\n", num_chrom, num_gaps, max_len); | |
82 | |
83 fprintf (stdout, "Reading the reference genome.\n"); | |
84 | |
85 read_ref(fasta, max_len, num_gaps); | |
86 | |
87 fprintf (stdout, "Saving the reference configuration.\n"); | |
88 saveRefConfig(GENOME_CONF); | |
89 | |
90 } | |
91 | |
92 | |
93 | |
94 int count_chrom(FILE *fasta, int *max_len){ | |
95 int length=0; int maxlength=0; | |
96 char ch; | |
97 int cnt = 0; | |
98 char skip[MAX_STR]; | |
99 char *retStr; | |
100 | |
101 //while (fscanf(fasta, "%c", &ch) > 0){ | |
102 while (1){ | |
103 | |
104 ch = fgetc(fasta); | |
105 if (feof(fasta)) break; | |
106 | |
107 if (ch == '>') { | |
108 cnt ++; | |
109 retStr = fgets(skip, MAX_STR, fasta); | |
110 fprintf(stdout, "."); | |
111 fflush(stdout); | |
112 if (length > maxlength) maxlength = length; | |
113 length = 0; | |
114 } | |
115 else if (!isspace(ch)) length++; | |
116 } | |
117 fprintf(stdout, "\n"); | |
118 | |
119 if (length > maxlength) | |
120 maxlength = length; | |
121 | |
122 *max_len = maxlength; | |
123 return cnt; | |
124 } | |
125 | |
126 | |
127 | |
128 void read_ref(FILE *fasta, int max_len, int num_gaps){ | |
129 | |
130 char ch; | |
131 int cnt = 0; | |
132 char chrom_name[MAX_STR]; | |
133 int i; | |
134 int length; | |
135 char *chrom_seq; | |
136 char *retStr; | |
137 | |
138 chrom_seq = (char *) malloc (sizeof(char) * (max_len+1)); | |
139 cnt = -1; | |
140 i = 0; | |
141 length = 0; | |
142 | |
143 // while (fscanf(fasta, "%c", &ch) > 0){ | |
144 while (1){ | |
145 | |
146 ch = fgetc(fasta); | |
147 if (feof(fasta)) break; | |
148 | |
149 if (ch == '>') { | |
150 | |
151 if (cnt != -1){ | |
152 /* insert the previous chromosome */ | |
153 chrom_seq[length]=0; | |
154 insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps); | |
155 } | |
156 | |
157 cnt ++; | |
158 retStr = fgets(chrom_name, MAX_STR, fasta); | |
159 chrom_name[strlen(chrom_name)-1] = 0; | |
160 trimspace(chrom_name); | |
161 length = 0; | |
162 } | |
163 else if (!isspace(ch)){ | |
164 chrom_seq[length++] = ch; | |
165 } | |
166 } | |
167 | |
168 /* insert the last chromosome */ | |
169 chrom_seq[length]=0; | |
170 trimspace(chrom_name); | |
171 insert_chrom(cnt, chrom_name, length, chrom_seq, num_gaps); | |
172 | |
173 free(chrom_seq); | |
174 } | |
175 | |
176 | |
177 void insert_chrom(int cnt, char *chrom_name, int length, char *chrom_seq, int num_gaps){ | |
178 | |
179 fprintf(stdout, "Chromosome %s (%d bp). Counting windows ... ", chrom_name, length); | |
180 fflush(stdout); | |
181 | |
182 chromosomes[cnt]->name = NULL; | |
183 set_str(&(chromosomes[cnt]->name), chrom_name); | |
184 chromosomes[cnt]->length = length; | |
185 | |
186 windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 0); | |
187 fprintf(stdout, "\t\tRecalculating windows ... "); | |
188 fflush(stdout); | |
189 windowmaker(num_gaps, cnt, chrom_name, chrom_seq, length, 1); | |
190 | |
191 fprintf(stdout, "\n"); | |
192 } | |
193 | |
194 | |
195 void windowmaker(int num_gaps, int chrom_id, char *chrom_name, char *chrom_seq, int length, int flag){ | |
196 int i; | |
197 int j; | |
198 int nchar; | |
199 int s, e; | |
200 | |
201 int gc; | |
202 float gc_p; | |
203 | |
204 int lw_cnt = 0; // large window (5Kb) | |
205 int sw_cnt = 0; // small window (1Kb) | |
206 int cw_cnt = 0; // copy number window (1kb non-ovp); | |
207 | |
208 | |
209 int win_cnt; | |
210 | |
211 /* if flag = 0; don't record it */ | |
212 | |
213 if (!flag){ | |
214 for (i=0;i<num_gaps;i++){ | |
215 if (!strcmp(gaptable[i].chrom, chrom_name)){ | |
216 if (gaptable[i].start == gaptable[i].end) | |
217 chrom_seq[gaptable[i].start] = 'X'; | |
218 else{ | |
219 for (j=gaptable[i].start; (j<gaptable[i].end && j<length); j++) | |
220 chrom_seq[j] = 'X'; | |
221 } | |
222 } | |
223 } | |
224 } | |
225 | |
226 | |
227 /* count cw_cnt */ | |
228 | |
229 nchar = 0; | |
230 s = 0 ; e = 0; | |
231 gc = 0; | |
232 | |
233 for (i=0; i<length; i++){ | |
234 | |
235 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') | |
236 nchar++; | |
237 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C') | |
238 gc++; | |
239 | |
240 if ((nchar == CW_SIZE || chrom_seq[i] == 'X') && nchar != 0){ | |
241 e = i; | |
242 if (chrom_seq[i] == 'X'){ | |
243 nchar = 0; | |
244 gc = 0; | |
245 } | |
246 else if (flag == 1 && nchar == CW_SIZE){ | |
247 /* save */ | |
248 chromosomes[chrom_id]->cw[cw_cnt].start = s; | |
249 chromosomes[chrom_id]->cw[cw_cnt].end = e+1; | |
250 chromosomes[chrom_id]->cw[cw_cnt].gc = (float)gc / (float)nchar; | |
251 chromosomes[chrom_id]->cw[cw_cnt].depth = 0; // for readability/completeness | |
252 cw_cnt++; | |
253 gc = 0; | |
254 } | |
255 else if (flag == 0 && nchar == CW_SIZE){ | |
256 cw_cnt ++; | |
257 } | |
258 | |
259 s = i + 1; | |
260 nchar = 0; | |
261 } | |
262 | |
263 if (chrom_seq[i] == 'X'){ | |
264 s = i + 1; | |
265 nchar = 0; | |
266 gc = 0; | |
267 } | |
268 | |
269 } | |
270 | |
271 | |
272 /* count sw_cnt */ | |
273 | |
274 i = 0; | |
275 nchar = 0; | |
276 s = 0 ; e = 0; | |
277 gc = 0; | |
278 | |
279 while (i < length){ | |
280 | |
281 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') | |
282 nchar++; | |
283 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C') | |
284 gc++; | |
285 | |
286 if ((nchar == SW_SIZE || chrom_seq[i] == 'X') && nchar != 0){ | |
287 e = i; | |
288 if (chrom_seq[i] == 'X'){ | |
289 nchar = 0; | |
290 gc = 0; | |
291 } | |
292 else if (flag == 1 && nchar == SW_SIZE){ | |
293 /* save */ | |
294 chromosomes[chrom_id]->sw[sw_cnt].start = s; | |
295 chromosomes[chrom_id]->sw[sw_cnt].end = e+1; | |
296 chromosomes[chrom_id]->sw[sw_cnt].gc = (float)gc / (float)nchar; | |
297 chromosomes[chrom_id]->sw[sw_cnt].depth = 0; // for readability/completeness | |
298 sw_cnt++; | |
299 gc = 0; | |
300 } | |
301 else if (flag == 0 && nchar == SW_SIZE) | |
302 sw_cnt++; | |
303 | |
304 s = s + SW_SLIDE; | |
305 i = s - 1; | |
306 nchar = 0; | |
307 } | |
308 if (chrom_seq[i]=='X'){ | |
309 s = i + 1; | |
310 i = s - 1; | |
311 gc = 0; | |
312 } | |
313 | |
314 i++; | |
315 } | |
316 | |
317 | |
318 nchar = 0; | |
319 i = 0; | |
320 s = 0 ; e = 0; | |
321 gc = 0; | |
322 /* count lw_cnt */ | |
323 | |
324 while (i<length){ | |
325 | |
326 if (chrom_seq[i] != 'N' && chrom_seq[i] != 'X') | |
327 nchar++; | |
328 if (chrom_seq[i] == 'G' || chrom_seq[i] == 'C') | |
329 gc++; | |
330 | |
331 if ((nchar == LW_SIZE || chrom_seq[i] == 'X') && nchar != 0){ | |
332 e = i; | |
333 if (chrom_seq[i] == 'X'){ | |
334 nchar = 0; | |
335 gc = 0; | |
336 } | |
337 else if (flag == 1 && nchar == LW_SIZE){ | |
338 /* save */ | |
339 chromosomes[chrom_id]->lw[lw_cnt].start = s; | |
340 chromosomes[chrom_id]->lw[lw_cnt].end = e+1; | |
341 chromosomes[chrom_id]->lw[lw_cnt].gc = (float)gc / (float)nchar; | |
342 chromosomes[chrom_id]->lw[lw_cnt].depth = 0; // for readability/completeness | |
343 lw_cnt++; | |
344 gc = 0; | |
345 } | |
346 else if (flag == 0 && nchar == LW_SIZE) | |
347 lw_cnt++; | |
348 | |
349 s = s + LW_SLIDE; | |
350 i = s - 1; | |
351 nchar = 0; | |
352 } | |
353 | |
354 if (chrom_seq[i] == 'X') { | |
355 s = i + 1; | |
356 i = s - 1; | |
357 gc = 0; | |
358 } | |
359 | |
360 i++; | |
361 } | |
362 | |
363 | |
364 | |
365 if (flag == 0){ | |
366 | |
367 printf("\n\t\tLW: %d\tSW: %d\tCW: %d\n", lw_cnt, sw_cnt, cw_cnt); | |
368 | |
369 chromosomes[chrom_id]->lw_cnt = lw_cnt; | |
370 chromosomes[chrom_id]->sw_cnt = sw_cnt; | |
371 chromosomes[chrom_id]->cw_cnt = cw_cnt; | |
372 | |
373 chromosomes[chrom_id]->cw = (struct window *) malloc (sizeof(struct window) * cw_cnt); | |
374 chromosomes[chrom_id]->lw = (struct window *) malloc (sizeof(struct window) * lw_cnt); | |
375 chromosomes[chrom_id]->sw = (struct window *) malloc (sizeof(struct window) * sw_cnt); | |
376 | |
377 | |
378 } | |
379 | |
380 | |
381 } |