comparison mrcanavar-0.34/sam.c @ 0:86522a0b5f59 default tip

Uploaded source code for mrCaNaVaR
author calkan
date Tue, 21 Feb 2012 10:44:13 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:86522a0b5f59
1 #include "sam.h"
2
3 void read_depth(char *indirSAM, char *depthFile, char gzSAM){
4
5
6 DIR *dp;
7 int fileCnt;
8 int totalFile;
9
10 struct dirent *ep;
11
12 int i;
13
14 if (GENOME_CONF == NULL)
15 print_error("Select genom configuration file (input) through the -conf parameter.\n");
16 if (indirSAM == NULL)
17 print_error("Select input directory that contains the SAM files through the -samdir parameter.\n");
18 if (depthFile == NULL)
19 print_error("Select read depth output file through the -depth parameter.\n");
20
21
22 loadRefConfig(GENOME_CONF);
23
24 fprintf(stdout, "Scanning the SAM directory: %s\n", indirSAM);
25
26 dp = opendir(indirSAM);
27
28 if (dp == NULL)
29 print_error("SAM directory not found.\n");
30
31 fileCnt = 0;
32 totalFile = 0;
33
34 while((ep=readdir(dp))){
35 if (ep->d_name[0] == '.')
36 continue;
37 if (ep->d_type == DT_DIR)
38 continue;
39
40 if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz"))
41 continue;
42
43 /*
44 if (!strstr(ep->d_name, "sam"))
45 continue;
46 */
47
48 i = strlen(ep->d_name)-1;
49
50 if ((ep->d_name[i] == 'z' && ep->d_name[i-1] == 'g' && ep->d_name[i-2] == '.') && gzSAM == 0){
51 print_error("File name ends with .gz yet --gz option is not selected. Are you sure?\nIf the files are indeed uncompressed, rename the files.\n");
52 }
53 totalFile++;
54 }
55
56 rewinddir(dp);
57
58 while((ep=readdir(dp))){
59 if (ep->d_name[0] == '.')
60 continue;
61 if (ep->d_type == DT_DIR)
62 continue;
63
64 if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz"))
65 continue;
66
67
68 /*
69 if (!strstr(ep->d_name, "sam"))
70 continue;
71 */
72
73 fprintf(stdout, "\r \rLoading file %d of %d: %s...", (fileCnt+1), totalFile, ep->d_name);
74 fflush(stdout);
75
76 readSAM(indirSAM, ep->d_name, gzSAM);
77
78 fileCnt++;
79 }
80
81 closedir(dp);
82
83 if (fileCnt == 0)
84 print_error("SAM directory does not contain any files with extensions \".sam\" or \".sam.gz\".\nIf you do have the correct files, please rename them to have either \".sam\" or \".sam.gz\" extensions.\n");
85 else
86 fprintf(stdout, "\n\n%d file%s loaded.\n", fileCnt, fileCnt>1 ? "s" : "");
87
88 saveDepth(depthFile);
89
90 }
91
92
93 void readSAM(char *indirSAM, char *fname, char gzSAM){
94 FILE *sam;
95
96 char *samfile;
97
98 char samLine[4 * MAX_STR];
99 char prevChrom[MAX_STR];
100 char chrom[MAX_STR];
101 char *token;
102
103
104 int pos;
105 int chrom_id;
106 int prev_chrom_id = -1;
107 int lineCnt;
108
109 samfile = (char *) malloc (sizeof(char) * (strlen(indirSAM) + strlen(fname) + 2));
110
111 sprintf(samfile, "%s/%s", indirSAM, fname);
112
113 sam = my_fopen(samfile, "r", gzSAM);
114
115 prevChrom[0] = 0;
116
117 while (1){
118
119 /* read entire line from the SAM file */
120 if (gzSAM){
121 gzgets(sam, samLine, (4 * MAX_STR));
122 if (gzeof(sam))
123 break;
124 }
125 else{
126 fgets(samLine, (4 * MAX_STR), sam);
127 if (feof(sam))
128 break;
129 }
130
131 /* parse chrom */
132 token = NULL;
133
134 token = strtok(samLine, "\t"); // read name
135 token = strtok(NULL, "\t"); // flag
136 token = strtok(NULL, "\t"); //chrom
137
138 strcpy(chrom, token);
139 trimspace(chrom);
140
141 token = strtok(NULL, "\t"); // pos
142
143 pos = atoi(token) - 1; // SAM file is 1-based; our format is 0-based
144
145 if (pos < 0)
146 continue;
147
148 /*
149 debug if needed
150 fprintf(stdout, "%s\t%d\n", chrom, pos);
151 */
152
153
154
155 chrom_id = insert_read_lw(chrom, pos, prevChrom, prev_chrom_id);
156
157 /*
158 if (VERBOSE)
159 fprintf(stdout, "[READSAM]\t%s\t%d\tchrom_id: %d\n", chrom, pos, chrom_id);
160 */
161
162 if (chrom_id == -1) // this chromosome is not in the config file
163 continue;
164
165 chrom_id = insert_read_sw(chrom, pos, prevChrom, chrom_id);
166
167 if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this
168 continue;
169
170 chrom_id = insert_read_cw(chrom, pos, prevChrom, chrom_id);
171
172
173
174 if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this
175 continue;
176
177 strcpy(prevChrom, chrom);
178 prev_chrom_id = chrom_id;
179
180
181 }
182
183 if (gzSAM)
184 gzclose(sam);
185 else
186 fclose(sam);
187
188 free(samfile);
189 }
190
191
192
193
194
195 int insert_read_lw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
196 int chrom_id;
197 int window_id;
198
199 int flank;
200
201 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
202
203 if (chrom_id != -1){
204
205 if (pos >= chromosomes[chrom_id]->length){
206 return chrom_id;
207 }
208
209 window_id = windowSearch(chromosomes[chrom_id]->lw, chromosomes[chrom_id]->lw_cnt, pos);
210
211
212 if (window_id != -1){
213
214 chromosomes[chrom_id]->lw[window_id].depth += 1;
215
216 /* iterate left */
217 flank = window_id - 1;
218
219 while (flank >= 0 && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end))
220 chromosomes[chrom_id]->lw[flank--].depth += 1;
221
222
223 /* iterate right */
224 flank = window_id + 1;
225
226 while (flank < chromosomes[chrom_id]->lw_cnt && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end))
227 chromosomes[chrom_id]->lw[flank++].depth += 1;
228
229
230 }
231
232 }
233
234 return chrom_id;
235 }
236
237
238
239 int insert_read_sw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
240 int chrom_id;
241 int window_id;
242
243 int flank;
244
245 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
246
247 if (chrom_id != -1){
248
249 if (pos >= chromosomes[chrom_id]->length)
250 return chrom_id;
251
252 window_id = windowSearch(chromosomes[chrom_id]->sw, chromosomes[chrom_id]->sw_cnt, pos);
253
254 if (window_id != -1){
255
256 chromosomes[chrom_id]->sw[window_id].depth += 1;
257
258 /* iterate left */
259 flank = window_id - 1;
260
261 while (flank >= 0 && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end))
262 chromosomes[chrom_id]->sw[flank--].depth += 1;
263
264
265 /* iterate right */
266 flank = window_id + 1;
267
268 while (flank < chromosomes[chrom_id]->sw_cnt && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end))
269 chromosomes[chrom_id]->sw[flank++].depth += 1;
270
271
272 }
273
274 }
275
276 return chrom_id;
277 }
278
279
280 int insert_read_cw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
281 int chrom_id;
282 int window_id;
283
284 int flank;
285
286 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
287
288 if (chrom_id != -1){
289
290 if (pos >= chromosomes[chrom_id]->length)
291 return chrom_id;
292
293 window_id = windowSearch(chromosomes[chrom_id]->cw, chromosomes[chrom_id]->cw_cnt, pos);
294
295 if (window_id != -1){
296
297 chromosomes[chrom_id]->cw[window_id].depth += 1;
298
299 }
300
301 }
302 return chrom_id;
303 }
304
305
306 int findChrom(char *chrom, char *prevChrom, int prev_chrom_id){
307 int chrom_id;
308
309
310 if (!strcmp(chrom, prevChrom)){
311 chrom_id = prev_chrom_id;
312 }
313
314 else if (prev_chrom_id == -1 && !strcmp(chrom, chromosomes[0]->name)){ // the first entry in the file
315 chrom_id = 0;
316 }
317 else if (prev_chrom_id != -1 && prev_chrom_id < num_chrom -1 && !strcmp(chrom, chromosomes[prev_chrom_id+1]->name)){
318 chrom_id = prev_chrom_id + 1;
319 }
320 else{
321 chrom_id = binSearch(chrom);
322 }
323
324 return chrom_id;
325
326 }
327
328 int binSearch(char *chrom){
329 int start;
330 int end;
331 int med;
332 int strtest;
333
334 start = 0;
335
336 end = num_chrom - 1;
337
338 med = (start + end) / 2;
339
340
341 while (1){
342
343 if (start > end)
344 return -1;
345
346 if (start == med || end == med){
347 if (!strcmp(chrom, chromosomes[start]->name))
348 return start;
349 else if (!strcmp(chrom, chromosomes[end]->name))
350 return end;
351 else{
352 return -1;
353 }
354 }
355
356 strtest = strcmp(chrom, chromosomes[med]->name);
357
358 if(strtest == 0)
359 return med;
360
361 else if (strtest < 0){
362 end = med;
363 med = (start + end) / 2;
364 }
365
366 else {
367 start = med;
368 med = (start + end) / 2;
369 }
370
371 }
372 }
373
374
375
376 int windowSearch(struct window *searchWin, int winCnt, int pos){
377 int start;
378 int end;
379 int med;
380
381 start = 0;
382 end = winCnt - 1;
383
384 med = (start + end) / 2;
385
386 while (1){
387
388 if (start > end)
389 return -1;
390
391 if (start == med || end == med){
392
393 if (pos >= searchWin[start].start && pos < searchWin[start].end)
394 return start;
395
396 else if (pos >= searchWin[end].start && pos < searchWin[end].end)
397 return end;
398
399 else return -1;
400
401 }
402
403 if (pos >= searchWin[med].start && pos < searchWin[med].end)
404 return med;
405
406 else if (pos < searchWin[med].start){
407 end = med;
408 med = (start + end) / 2;
409 }
410
411 else {
412 start = med;
413 med = (start + end) / 2;
414 }
415 }
416 }
417
418
419
420 void saveDepth(char *depthFile){
421 int i;
422 int j;
423 int retVal;
424
425 char *fname;
426 FILE *txtDepth;
427 FILE *binDepth;
428
429 fname = (char *) malloc (sizeof(char) * (strlen(depthFile) + 10));
430
431 sprintf(fname, "%s.lw.txt", depthFile);
432
433 txtDepth = my_fopen(fname, "w", 0);
434 binDepth = my_fopen(depthFile, "w", 0);
435
436 /* start with the magicNum, I will use this as a format check when loading */
437 retVal = fwrite(&magicNum, sizeof(magicNum), 1, binDepth);
438
439 fprintf(stdout, "Saving depth file %s\n", fname);
440
441 for (i = 0; i < num_chrom; i++){
442 for (j = 0; j < chromosomes[i]->lw_cnt; j++){
443 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, (int) (chromosomes[i]->lw[j].depth));
444 retVal = fwrite(&(chromosomes[i]->lw[j].depth), sizeof(chromosomes[i]->lw[j].depth), 1, binDepth);
445 }
446 }
447
448 fclose(txtDepth);
449
450 sprintf(fname, "%s.sw.txt", depthFile);
451 txtDepth = my_fopen(fname, "w", 0);
452 fprintf(stdout, "Saving depth file %s\n", fname);
453
454 for (i = 0; i < num_chrom; i++){
455
456 for (j = 0; j < chromosomes[i]->sw_cnt; j++){
457 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end, chromosomes[i]->sw[j].gc, (int) (chromosomes[i]->sw[j].depth));
458 retVal = fwrite(&(chromosomes[i]->sw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth);
459 }
460 }
461
462 fclose(txtDepth);
463
464 sprintf(fname, "%s.cw.txt", depthFile);
465 txtDepth = my_fopen(fname, "w", 0);
466 fprintf(stdout, "Saving depth file %s\n", fname);
467
468 for (i = 0; i < num_chrom; i++){
469 for (j = 0; j < chromosomes[i]->cw_cnt; j++){
470 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end, chromosomes[i]->cw[j].gc, (int) (chromosomes[i]->cw[j].depth));
471 retVal = fwrite(&(chromosomes[i]->cw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth);
472 }
473 }
474
475 fclose(txtDepth);
476 fclose(binDepth);
477
478 free(fname);
479 }