Mercurial > repos > calkan > mrcanavar
comparison mrcanavar-0.34/sam.c @ 0:86522a0b5f59 default tip
Uploaded source code for mrCaNaVaR
author | calkan |
---|---|
date | Tue, 21 Feb 2012 10:44:13 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:86522a0b5f59 |
---|---|
1 #include "sam.h" | |
2 | |
3 void read_depth(char *indirSAM, char *depthFile, char gzSAM){ | |
4 | |
5 | |
6 DIR *dp; | |
7 int fileCnt; | |
8 int totalFile; | |
9 | |
10 struct dirent *ep; | |
11 | |
12 int i; | |
13 | |
14 if (GENOME_CONF == NULL) | |
15 print_error("Select genom configuration file (input) through the -conf parameter.\n"); | |
16 if (indirSAM == NULL) | |
17 print_error("Select input directory that contains the SAM files through the -samdir parameter.\n"); | |
18 if (depthFile == NULL) | |
19 print_error("Select read depth output file through the -depth parameter.\n"); | |
20 | |
21 | |
22 loadRefConfig(GENOME_CONF); | |
23 | |
24 fprintf(stdout, "Scanning the SAM directory: %s\n", indirSAM); | |
25 | |
26 dp = opendir(indirSAM); | |
27 | |
28 if (dp == NULL) | |
29 print_error("SAM directory not found.\n"); | |
30 | |
31 fileCnt = 0; | |
32 totalFile = 0; | |
33 | |
34 while((ep=readdir(dp))){ | |
35 if (ep->d_name[0] == '.') | |
36 continue; | |
37 if (ep->d_type == DT_DIR) | |
38 continue; | |
39 | |
40 if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz")) | |
41 continue; | |
42 | |
43 /* | |
44 if (!strstr(ep->d_name, "sam")) | |
45 continue; | |
46 */ | |
47 | |
48 i = strlen(ep->d_name)-1; | |
49 | |
50 if ((ep->d_name[i] == 'z' && ep->d_name[i-1] == 'g' && ep->d_name[i-2] == '.') && gzSAM == 0){ | |
51 print_error("File name ends with .gz yet --gz option is not selected. Are you sure?\nIf the files are indeed uncompressed, rename the files.\n"); | |
52 } | |
53 totalFile++; | |
54 } | |
55 | |
56 rewinddir(dp); | |
57 | |
58 while((ep=readdir(dp))){ | |
59 if (ep->d_name[0] == '.') | |
60 continue; | |
61 if (ep->d_type == DT_DIR) | |
62 continue; | |
63 | |
64 if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz")) | |
65 continue; | |
66 | |
67 | |
68 /* | |
69 if (!strstr(ep->d_name, "sam")) | |
70 continue; | |
71 */ | |
72 | |
73 fprintf(stdout, "\r \rLoading file %d of %d: %s...", (fileCnt+1), totalFile, ep->d_name); | |
74 fflush(stdout); | |
75 | |
76 readSAM(indirSAM, ep->d_name, gzSAM); | |
77 | |
78 fileCnt++; | |
79 } | |
80 | |
81 closedir(dp); | |
82 | |
83 if (fileCnt == 0) | |
84 print_error("SAM directory does not contain any files with extensions \".sam\" or \".sam.gz\".\nIf you do have the correct files, please rename them to have either \".sam\" or \".sam.gz\" extensions.\n"); | |
85 else | |
86 fprintf(stdout, "\n\n%d file%s loaded.\n", fileCnt, fileCnt>1 ? "s" : ""); | |
87 | |
88 saveDepth(depthFile); | |
89 | |
90 } | |
91 | |
92 | |
93 void readSAM(char *indirSAM, char *fname, char gzSAM){ | |
94 FILE *sam; | |
95 | |
96 char *samfile; | |
97 | |
98 char samLine[4 * MAX_STR]; | |
99 char prevChrom[MAX_STR]; | |
100 char chrom[MAX_STR]; | |
101 char *token; | |
102 | |
103 | |
104 int pos; | |
105 int chrom_id; | |
106 int prev_chrom_id = -1; | |
107 int lineCnt; | |
108 | |
109 samfile = (char *) malloc (sizeof(char) * (strlen(indirSAM) + strlen(fname) + 2)); | |
110 | |
111 sprintf(samfile, "%s/%s", indirSAM, fname); | |
112 | |
113 sam = my_fopen(samfile, "r", gzSAM); | |
114 | |
115 prevChrom[0] = 0; | |
116 | |
117 while (1){ | |
118 | |
119 /* read entire line from the SAM file */ | |
120 if (gzSAM){ | |
121 gzgets(sam, samLine, (4 * MAX_STR)); | |
122 if (gzeof(sam)) | |
123 break; | |
124 } | |
125 else{ | |
126 fgets(samLine, (4 * MAX_STR), sam); | |
127 if (feof(sam)) | |
128 break; | |
129 } | |
130 | |
131 /* parse chrom */ | |
132 token = NULL; | |
133 | |
134 token = strtok(samLine, "\t"); // read name | |
135 token = strtok(NULL, "\t"); // flag | |
136 token = strtok(NULL, "\t"); //chrom | |
137 | |
138 strcpy(chrom, token); | |
139 trimspace(chrom); | |
140 | |
141 token = strtok(NULL, "\t"); // pos | |
142 | |
143 pos = atoi(token) - 1; // SAM file is 1-based; our format is 0-based | |
144 | |
145 if (pos < 0) | |
146 continue; | |
147 | |
148 /* | |
149 debug if needed | |
150 fprintf(stdout, "%s\t%d\n", chrom, pos); | |
151 */ | |
152 | |
153 | |
154 | |
155 chrom_id = insert_read_lw(chrom, pos, prevChrom, prev_chrom_id); | |
156 | |
157 /* | |
158 if (VERBOSE) | |
159 fprintf(stdout, "[READSAM]\t%s\t%d\tchrom_id: %d\n", chrom, pos, chrom_id); | |
160 */ | |
161 | |
162 if (chrom_id == -1) // this chromosome is not in the config file | |
163 continue; | |
164 | |
165 chrom_id = insert_read_sw(chrom, pos, prevChrom, chrom_id); | |
166 | |
167 if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this | |
168 continue; | |
169 | |
170 chrom_id = insert_read_cw(chrom, pos, prevChrom, chrom_id); | |
171 | |
172 | |
173 | |
174 if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this | |
175 continue; | |
176 | |
177 strcpy(prevChrom, chrom); | |
178 prev_chrom_id = chrom_id; | |
179 | |
180 | |
181 } | |
182 | |
183 if (gzSAM) | |
184 gzclose(sam); | |
185 else | |
186 fclose(sam); | |
187 | |
188 free(samfile); | |
189 } | |
190 | |
191 | |
192 | |
193 | |
194 | |
195 int insert_read_lw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ | |
196 int chrom_id; | |
197 int window_id; | |
198 | |
199 int flank; | |
200 | |
201 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); | |
202 | |
203 if (chrom_id != -1){ | |
204 | |
205 if (pos >= chromosomes[chrom_id]->length){ | |
206 return chrom_id; | |
207 } | |
208 | |
209 window_id = windowSearch(chromosomes[chrom_id]->lw, chromosomes[chrom_id]->lw_cnt, pos); | |
210 | |
211 | |
212 if (window_id != -1){ | |
213 | |
214 chromosomes[chrom_id]->lw[window_id].depth += 1; | |
215 | |
216 /* iterate left */ | |
217 flank = window_id - 1; | |
218 | |
219 while (flank >= 0 && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end)) | |
220 chromosomes[chrom_id]->lw[flank--].depth += 1; | |
221 | |
222 | |
223 /* iterate right */ | |
224 flank = window_id + 1; | |
225 | |
226 while (flank < chromosomes[chrom_id]->lw_cnt && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end)) | |
227 chromosomes[chrom_id]->lw[flank++].depth += 1; | |
228 | |
229 | |
230 } | |
231 | |
232 } | |
233 | |
234 return chrom_id; | |
235 } | |
236 | |
237 | |
238 | |
239 int insert_read_sw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ | |
240 int chrom_id; | |
241 int window_id; | |
242 | |
243 int flank; | |
244 | |
245 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); | |
246 | |
247 if (chrom_id != -1){ | |
248 | |
249 if (pos >= chromosomes[chrom_id]->length) | |
250 return chrom_id; | |
251 | |
252 window_id = windowSearch(chromosomes[chrom_id]->sw, chromosomes[chrom_id]->sw_cnt, pos); | |
253 | |
254 if (window_id != -1){ | |
255 | |
256 chromosomes[chrom_id]->sw[window_id].depth += 1; | |
257 | |
258 /* iterate left */ | |
259 flank = window_id - 1; | |
260 | |
261 while (flank >= 0 && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end)) | |
262 chromosomes[chrom_id]->sw[flank--].depth += 1; | |
263 | |
264 | |
265 /* iterate right */ | |
266 flank = window_id + 1; | |
267 | |
268 while (flank < chromosomes[chrom_id]->sw_cnt && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end)) | |
269 chromosomes[chrom_id]->sw[flank++].depth += 1; | |
270 | |
271 | |
272 } | |
273 | |
274 } | |
275 | |
276 return chrom_id; | |
277 } | |
278 | |
279 | |
280 int insert_read_cw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){ | |
281 int chrom_id; | |
282 int window_id; | |
283 | |
284 int flank; | |
285 | |
286 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id); | |
287 | |
288 if (chrom_id != -1){ | |
289 | |
290 if (pos >= chromosomes[chrom_id]->length) | |
291 return chrom_id; | |
292 | |
293 window_id = windowSearch(chromosomes[chrom_id]->cw, chromosomes[chrom_id]->cw_cnt, pos); | |
294 | |
295 if (window_id != -1){ | |
296 | |
297 chromosomes[chrom_id]->cw[window_id].depth += 1; | |
298 | |
299 } | |
300 | |
301 } | |
302 return chrom_id; | |
303 } | |
304 | |
305 | |
306 int findChrom(char *chrom, char *prevChrom, int prev_chrom_id){ | |
307 int chrom_id; | |
308 | |
309 | |
310 if (!strcmp(chrom, prevChrom)){ | |
311 chrom_id = prev_chrom_id; | |
312 } | |
313 | |
314 else if (prev_chrom_id == -1 && !strcmp(chrom, chromosomes[0]->name)){ // the first entry in the file | |
315 chrom_id = 0; | |
316 } | |
317 else if (prev_chrom_id != -1 && prev_chrom_id < num_chrom -1 && !strcmp(chrom, chromosomes[prev_chrom_id+1]->name)){ | |
318 chrom_id = prev_chrom_id + 1; | |
319 } | |
320 else{ | |
321 chrom_id = binSearch(chrom); | |
322 } | |
323 | |
324 return chrom_id; | |
325 | |
326 } | |
327 | |
328 int binSearch(char *chrom){ | |
329 int start; | |
330 int end; | |
331 int med; | |
332 int strtest; | |
333 | |
334 start = 0; | |
335 | |
336 end = num_chrom - 1; | |
337 | |
338 med = (start + end) / 2; | |
339 | |
340 | |
341 while (1){ | |
342 | |
343 if (start > end) | |
344 return -1; | |
345 | |
346 if (start == med || end == med){ | |
347 if (!strcmp(chrom, chromosomes[start]->name)) | |
348 return start; | |
349 else if (!strcmp(chrom, chromosomes[end]->name)) | |
350 return end; | |
351 else{ | |
352 return -1; | |
353 } | |
354 } | |
355 | |
356 strtest = strcmp(chrom, chromosomes[med]->name); | |
357 | |
358 if(strtest == 0) | |
359 return med; | |
360 | |
361 else if (strtest < 0){ | |
362 end = med; | |
363 med = (start + end) / 2; | |
364 } | |
365 | |
366 else { | |
367 start = med; | |
368 med = (start + end) / 2; | |
369 } | |
370 | |
371 } | |
372 } | |
373 | |
374 | |
375 | |
376 int windowSearch(struct window *searchWin, int winCnt, int pos){ | |
377 int start; | |
378 int end; | |
379 int med; | |
380 | |
381 start = 0; | |
382 end = winCnt - 1; | |
383 | |
384 med = (start + end) / 2; | |
385 | |
386 while (1){ | |
387 | |
388 if (start > end) | |
389 return -1; | |
390 | |
391 if (start == med || end == med){ | |
392 | |
393 if (pos >= searchWin[start].start && pos < searchWin[start].end) | |
394 return start; | |
395 | |
396 else if (pos >= searchWin[end].start && pos < searchWin[end].end) | |
397 return end; | |
398 | |
399 else return -1; | |
400 | |
401 } | |
402 | |
403 if (pos >= searchWin[med].start && pos < searchWin[med].end) | |
404 return med; | |
405 | |
406 else if (pos < searchWin[med].start){ | |
407 end = med; | |
408 med = (start + end) / 2; | |
409 } | |
410 | |
411 else { | |
412 start = med; | |
413 med = (start + end) / 2; | |
414 } | |
415 } | |
416 } | |
417 | |
418 | |
419 | |
420 void saveDepth(char *depthFile){ | |
421 int i; | |
422 int j; | |
423 int retVal; | |
424 | |
425 char *fname; | |
426 FILE *txtDepth; | |
427 FILE *binDepth; | |
428 | |
429 fname = (char *) malloc (sizeof(char) * (strlen(depthFile) + 10)); | |
430 | |
431 sprintf(fname, "%s.lw.txt", depthFile); | |
432 | |
433 txtDepth = my_fopen(fname, "w", 0); | |
434 binDepth = my_fopen(depthFile, "w", 0); | |
435 | |
436 /* start with the magicNum, I will use this as a format check when loading */ | |
437 retVal = fwrite(&magicNum, sizeof(magicNum), 1, binDepth); | |
438 | |
439 fprintf(stdout, "Saving depth file %s\n", fname); | |
440 | |
441 for (i = 0; i < num_chrom; i++){ | |
442 for (j = 0; j < chromosomes[i]->lw_cnt; j++){ | |
443 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, (int) (chromosomes[i]->lw[j].depth)); | |
444 retVal = fwrite(&(chromosomes[i]->lw[j].depth), sizeof(chromosomes[i]->lw[j].depth), 1, binDepth); | |
445 } | |
446 } | |
447 | |
448 fclose(txtDepth); | |
449 | |
450 sprintf(fname, "%s.sw.txt", depthFile); | |
451 txtDepth = my_fopen(fname, "w", 0); | |
452 fprintf(stdout, "Saving depth file %s\n", fname); | |
453 | |
454 for (i = 0; i < num_chrom; i++){ | |
455 | |
456 for (j = 0; j < chromosomes[i]->sw_cnt; j++){ | |
457 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end, chromosomes[i]->sw[j].gc, (int) (chromosomes[i]->sw[j].depth)); | |
458 retVal = fwrite(&(chromosomes[i]->sw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth); | |
459 } | |
460 } | |
461 | |
462 fclose(txtDepth); | |
463 | |
464 sprintf(fname, "%s.cw.txt", depthFile); | |
465 txtDepth = my_fopen(fname, "w", 0); | |
466 fprintf(stdout, "Saving depth file %s\n", fname); | |
467 | |
468 for (i = 0; i < num_chrom; i++){ | |
469 for (j = 0; j < chromosomes[i]->cw_cnt; j++){ | |
470 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end, chromosomes[i]->cw[j].gc, (int) (chromosomes[i]->cw[j].depth)); | |
471 retVal = fwrite(&(chromosomes[i]->cw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth); | |
472 } | |
473 } | |
474 | |
475 fclose(txtDepth); | |
476 fclose(binDepth); | |
477 | |
478 free(fname); | |
479 } |