0
|
1 #include "sam.h"
|
|
2
|
|
3 void read_depth(char *indirSAM, char *depthFile, char gzSAM){
|
|
4
|
|
5
|
|
6 DIR *dp;
|
|
7 int fileCnt;
|
|
8 int totalFile;
|
|
9
|
|
10 struct dirent *ep;
|
|
11
|
|
12 int i;
|
|
13
|
|
14 if (GENOME_CONF == NULL)
|
|
15 print_error("Select genom configuration file (input) through the -conf parameter.\n");
|
|
16 if (indirSAM == NULL)
|
|
17 print_error("Select input directory that contains the SAM files through the -samdir parameter.\n");
|
|
18 if (depthFile == NULL)
|
|
19 print_error("Select read depth output file through the -depth parameter.\n");
|
|
20
|
|
21
|
|
22 loadRefConfig(GENOME_CONF);
|
|
23
|
|
24 fprintf(stdout, "Scanning the SAM directory: %s\n", indirSAM);
|
|
25
|
|
26 dp = opendir(indirSAM);
|
|
27
|
|
28 if (dp == NULL)
|
|
29 print_error("SAM directory not found.\n");
|
|
30
|
|
31 fileCnt = 0;
|
|
32 totalFile = 0;
|
|
33
|
|
34 while((ep=readdir(dp))){
|
|
35 if (ep->d_name[0] == '.')
|
|
36 continue;
|
|
37 if (ep->d_type == DT_DIR)
|
|
38 continue;
|
|
39
|
|
40 if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz"))
|
|
41 continue;
|
|
42
|
|
43 /*
|
|
44 if (!strstr(ep->d_name, "sam"))
|
|
45 continue;
|
|
46 */
|
|
47
|
|
48 i = strlen(ep->d_name)-1;
|
|
49
|
|
50 if ((ep->d_name[i] == 'z' && ep->d_name[i-1] == 'g' && ep->d_name[i-2] == '.') && gzSAM == 0){
|
|
51 print_error("File name ends with .gz yet --gz option is not selected. Are you sure?\nIf the files are indeed uncompressed, rename the files.\n");
|
|
52 }
|
|
53 totalFile++;
|
|
54 }
|
|
55
|
|
56 rewinddir(dp);
|
|
57
|
|
58 while((ep=readdir(dp))){
|
|
59 if (ep->d_name[0] == '.')
|
|
60 continue;
|
|
61 if (ep->d_type == DT_DIR)
|
|
62 continue;
|
|
63
|
|
64 if (CHECKSAM && !endswith(ep->d_name, ".sam") && !endswith(ep->d_name, ".sam.gz"))
|
|
65 continue;
|
|
66
|
|
67
|
|
68 /*
|
|
69 if (!strstr(ep->d_name, "sam"))
|
|
70 continue;
|
|
71 */
|
|
72
|
|
73 fprintf(stdout, "\r \rLoading file %d of %d: %s...", (fileCnt+1), totalFile, ep->d_name);
|
|
74 fflush(stdout);
|
|
75
|
|
76 readSAM(indirSAM, ep->d_name, gzSAM);
|
|
77
|
|
78 fileCnt++;
|
|
79 }
|
|
80
|
|
81 closedir(dp);
|
|
82
|
|
83 if (fileCnt == 0)
|
|
84 print_error("SAM directory does not contain any files with extensions \".sam\" or \".sam.gz\".\nIf you do have the correct files, please rename them to have either \".sam\" or \".sam.gz\" extensions.\n");
|
|
85 else
|
|
86 fprintf(stdout, "\n\n%d file%s loaded.\n", fileCnt, fileCnt>1 ? "s" : "");
|
|
87
|
|
88 saveDepth(depthFile);
|
|
89
|
|
90 }
|
|
91
|
|
92
|
|
93 void readSAM(char *indirSAM, char *fname, char gzSAM){
|
|
94 FILE *sam;
|
|
95
|
|
96 char *samfile;
|
|
97
|
|
98 char samLine[4 * MAX_STR];
|
|
99 char prevChrom[MAX_STR];
|
|
100 char chrom[MAX_STR];
|
|
101 char *token;
|
|
102
|
|
103
|
|
104 int pos;
|
|
105 int chrom_id;
|
|
106 int prev_chrom_id = -1;
|
|
107 int lineCnt;
|
|
108
|
|
109 samfile = (char *) malloc (sizeof(char) * (strlen(indirSAM) + strlen(fname) + 2));
|
|
110
|
|
111 sprintf(samfile, "%s/%s", indirSAM, fname);
|
|
112
|
|
113 sam = my_fopen(samfile, "r", gzSAM);
|
|
114
|
|
115 prevChrom[0] = 0;
|
|
116
|
|
117 while (1){
|
|
118
|
|
119 /* read entire line from the SAM file */
|
|
120 if (gzSAM){
|
|
121 gzgets(sam, samLine, (4 * MAX_STR));
|
|
122 if (gzeof(sam))
|
|
123 break;
|
|
124 }
|
|
125 else{
|
|
126 fgets(samLine, (4 * MAX_STR), sam);
|
|
127 if (feof(sam))
|
|
128 break;
|
|
129 }
|
|
130
|
|
131 /* parse chrom */
|
|
132 token = NULL;
|
|
133
|
|
134 token = strtok(samLine, "\t"); // read name
|
|
135 token = strtok(NULL, "\t"); // flag
|
|
136 token = strtok(NULL, "\t"); //chrom
|
|
137
|
|
138 strcpy(chrom, token);
|
|
139 trimspace(chrom);
|
|
140
|
|
141 token = strtok(NULL, "\t"); // pos
|
|
142
|
|
143 pos = atoi(token) - 1; // SAM file is 1-based; our format is 0-based
|
|
144
|
|
145 if (pos < 0)
|
|
146 continue;
|
|
147
|
|
148 /*
|
|
149 debug if needed
|
|
150 fprintf(stdout, "%s\t%d\n", chrom, pos);
|
|
151 */
|
|
152
|
|
153
|
|
154
|
|
155 chrom_id = insert_read_lw(chrom, pos, prevChrom, prev_chrom_id);
|
|
156
|
|
157 /*
|
|
158 if (VERBOSE)
|
|
159 fprintf(stdout, "[READSAM]\t%s\t%d\tchrom_id: %d\n", chrom, pos, chrom_id);
|
|
160 */
|
|
161
|
|
162 if (chrom_id == -1) // this chromosome is not in the config file
|
|
163 continue;
|
|
164
|
|
165 chrom_id = insert_read_sw(chrom, pos, prevChrom, chrom_id);
|
|
166
|
|
167 if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this
|
|
168 continue;
|
|
169
|
|
170 chrom_id = insert_read_cw(chrom, pos, prevChrom, chrom_id);
|
|
171
|
|
172
|
|
173
|
|
174 if (chrom_id == -1) // this chromosome is not in the config file; it shouldn't come to this
|
|
175 continue;
|
|
176
|
|
177 strcpy(prevChrom, chrom);
|
|
178 prev_chrom_id = chrom_id;
|
|
179
|
|
180
|
|
181 }
|
|
182
|
|
183 if (gzSAM)
|
|
184 gzclose(sam);
|
|
185 else
|
|
186 fclose(sam);
|
|
187
|
|
188 free(samfile);
|
|
189 }
|
|
190
|
|
191
|
|
192
|
|
193
|
|
194
|
|
195 int insert_read_lw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
|
|
196 int chrom_id;
|
|
197 int window_id;
|
|
198
|
|
199 int flank;
|
|
200
|
|
201 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
|
|
202
|
|
203 if (chrom_id != -1){
|
|
204
|
|
205 if (pos >= chromosomes[chrom_id]->length){
|
|
206 return chrom_id;
|
|
207 }
|
|
208
|
|
209 window_id = windowSearch(chromosomes[chrom_id]->lw, chromosomes[chrom_id]->lw_cnt, pos);
|
|
210
|
|
211
|
|
212 if (window_id != -1){
|
|
213
|
|
214 chromosomes[chrom_id]->lw[window_id].depth += 1;
|
|
215
|
|
216 /* iterate left */
|
|
217 flank = window_id - 1;
|
|
218
|
|
219 while (flank >= 0 && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end))
|
|
220 chromosomes[chrom_id]->lw[flank--].depth += 1;
|
|
221
|
|
222
|
|
223 /* iterate right */
|
|
224 flank = window_id + 1;
|
|
225
|
|
226 while (flank < chromosomes[chrom_id]->lw_cnt && (pos >= chromosomes[chrom_id]->lw[flank].start && pos <= chromosomes[chrom_id]->lw[flank].end))
|
|
227 chromosomes[chrom_id]->lw[flank++].depth += 1;
|
|
228
|
|
229
|
|
230 }
|
|
231
|
|
232 }
|
|
233
|
|
234 return chrom_id;
|
|
235 }
|
|
236
|
|
237
|
|
238
|
|
239 int insert_read_sw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
|
|
240 int chrom_id;
|
|
241 int window_id;
|
|
242
|
|
243 int flank;
|
|
244
|
|
245 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
|
|
246
|
|
247 if (chrom_id != -1){
|
|
248
|
|
249 if (pos >= chromosomes[chrom_id]->length)
|
|
250 return chrom_id;
|
|
251
|
|
252 window_id = windowSearch(chromosomes[chrom_id]->sw, chromosomes[chrom_id]->sw_cnt, pos);
|
|
253
|
|
254 if (window_id != -1){
|
|
255
|
|
256 chromosomes[chrom_id]->sw[window_id].depth += 1;
|
|
257
|
|
258 /* iterate left */
|
|
259 flank = window_id - 1;
|
|
260
|
|
261 while (flank >= 0 && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end))
|
|
262 chromosomes[chrom_id]->sw[flank--].depth += 1;
|
|
263
|
|
264
|
|
265 /* iterate right */
|
|
266 flank = window_id + 1;
|
|
267
|
|
268 while (flank < chromosomes[chrom_id]->sw_cnt && (pos >= chromosomes[chrom_id]->sw[flank].start && pos <= chromosomes[chrom_id]->sw[flank].end))
|
|
269 chromosomes[chrom_id]->sw[flank++].depth += 1;
|
|
270
|
|
271
|
|
272 }
|
|
273
|
|
274 }
|
|
275
|
|
276 return chrom_id;
|
|
277 }
|
|
278
|
|
279
|
|
280 int insert_read_cw(char *chrom, int pos, char *prevChrom, int prev_chrom_id){
|
|
281 int chrom_id;
|
|
282 int window_id;
|
|
283
|
|
284 int flank;
|
|
285
|
|
286 chrom_id = findChrom(chrom, prevChrom, prev_chrom_id);
|
|
287
|
|
288 if (chrom_id != -1){
|
|
289
|
|
290 if (pos >= chromosomes[chrom_id]->length)
|
|
291 return chrom_id;
|
|
292
|
|
293 window_id = windowSearch(chromosomes[chrom_id]->cw, chromosomes[chrom_id]->cw_cnt, pos);
|
|
294
|
|
295 if (window_id != -1){
|
|
296
|
|
297 chromosomes[chrom_id]->cw[window_id].depth += 1;
|
|
298
|
|
299 }
|
|
300
|
|
301 }
|
|
302 return chrom_id;
|
|
303 }
|
|
304
|
|
305
|
|
306 int findChrom(char *chrom, char *prevChrom, int prev_chrom_id){
|
|
307 int chrom_id;
|
|
308
|
|
309
|
|
310 if (!strcmp(chrom, prevChrom)){
|
|
311 chrom_id = prev_chrom_id;
|
|
312 }
|
|
313
|
|
314 else if (prev_chrom_id == -1 && !strcmp(chrom, chromosomes[0]->name)){ // the first entry in the file
|
|
315 chrom_id = 0;
|
|
316 }
|
|
317 else if (prev_chrom_id != -1 && prev_chrom_id < num_chrom -1 && !strcmp(chrom, chromosomes[prev_chrom_id+1]->name)){
|
|
318 chrom_id = prev_chrom_id + 1;
|
|
319 }
|
|
320 else{
|
|
321 chrom_id = binSearch(chrom);
|
|
322 }
|
|
323
|
|
324 return chrom_id;
|
|
325
|
|
326 }
|
|
327
|
|
328 int binSearch(char *chrom){
|
|
329 int start;
|
|
330 int end;
|
|
331 int med;
|
|
332 int strtest;
|
|
333
|
|
334 start = 0;
|
|
335
|
|
336 end = num_chrom - 1;
|
|
337
|
|
338 med = (start + end) / 2;
|
|
339
|
|
340
|
|
341 while (1){
|
|
342
|
|
343 if (start > end)
|
|
344 return -1;
|
|
345
|
|
346 if (start == med || end == med){
|
|
347 if (!strcmp(chrom, chromosomes[start]->name))
|
|
348 return start;
|
|
349 else if (!strcmp(chrom, chromosomes[end]->name))
|
|
350 return end;
|
|
351 else{
|
|
352 return -1;
|
|
353 }
|
|
354 }
|
|
355
|
|
356 strtest = strcmp(chrom, chromosomes[med]->name);
|
|
357
|
|
358 if(strtest == 0)
|
|
359 return med;
|
|
360
|
|
361 else if (strtest < 0){
|
|
362 end = med;
|
|
363 med = (start + end) / 2;
|
|
364 }
|
|
365
|
|
366 else {
|
|
367 start = med;
|
|
368 med = (start + end) / 2;
|
|
369 }
|
|
370
|
|
371 }
|
|
372 }
|
|
373
|
|
374
|
|
375
|
|
376 int windowSearch(struct window *searchWin, int winCnt, int pos){
|
|
377 int start;
|
|
378 int end;
|
|
379 int med;
|
|
380
|
|
381 start = 0;
|
|
382 end = winCnt - 1;
|
|
383
|
|
384 med = (start + end) / 2;
|
|
385
|
|
386 while (1){
|
|
387
|
|
388 if (start > end)
|
|
389 return -1;
|
|
390
|
|
391 if (start == med || end == med){
|
|
392
|
|
393 if (pos >= searchWin[start].start && pos < searchWin[start].end)
|
|
394 return start;
|
|
395
|
|
396 else if (pos >= searchWin[end].start && pos < searchWin[end].end)
|
|
397 return end;
|
|
398
|
|
399 else return -1;
|
|
400
|
|
401 }
|
|
402
|
|
403 if (pos >= searchWin[med].start && pos < searchWin[med].end)
|
|
404 return med;
|
|
405
|
|
406 else if (pos < searchWin[med].start){
|
|
407 end = med;
|
|
408 med = (start + end) / 2;
|
|
409 }
|
|
410
|
|
411 else {
|
|
412 start = med;
|
|
413 med = (start + end) / 2;
|
|
414 }
|
|
415 }
|
|
416 }
|
|
417
|
|
418
|
|
419
|
|
420 void saveDepth(char *depthFile){
|
|
421 int i;
|
|
422 int j;
|
|
423 int retVal;
|
|
424
|
|
425 char *fname;
|
|
426 FILE *txtDepth;
|
|
427 FILE *binDepth;
|
|
428
|
|
429 fname = (char *) malloc (sizeof(char) * (strlen(depthFile) + 10));
|
|
430
|
|
431 sprintf(fname, "%s.lw.txt", depthFile);
|
|
432
|
|
433 txtDepth = my_fopen(fname, "w", 0);
|
|
434 binDepth = my_fopen(depthFile, "w", 0);
|
|
435
|
|
436 /* start with the magicNum, I will use this as a format check when loading */
|
|
437 retVal = fwrite(&magicNum, sizeof(magicNum), 1, binDepth);
|
|
438
|
|
439 fprintf(stdout, "Saving depth file %s\n", fname);
|
|
440
|
|
441 for (i = 0; i < num_chrom; i++){
|
|
442 for (j = 0; j < chromosomes[i]->lw_cnt; j++){
|
|
443 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->lw[j].start, chromosomes[i]->lw[j].end, chromosomes[i]->lw[j].gc, (int) (chromosomes[i]->lw[j].depth));
|
|
444 retVal = fwrite(&(chromosomes[i]->lw[j].depth), sizeof(chromosomes[i]->lw[j].depth), 1, binDepth);
|
|
445 }
|
|
446 }
|
|
447
|
|
448 fclose(txtDepth);
|
|
449
|
|
450 sprintf(fname, "%s.sw.txt", depthFile);
|
|
451 txtDepth = my_fopen(fname, "w", 0);
|
|
452 fprintf(stdout, "Saving depth file %s\n", fname);
|
|
453
|
|
454 for (i = 0; i < num_chrom; i++){
|
|
455
|
|
456 for (j = 0; j < chromosomes[i]->sw_cnt; j++){
|
|
457 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->sw[j].start, chromosomes[i]->sw[j].end, chromosomes[i]->sw[j].gc, (int) (chromosomes[i]->sw[j].depth));
|
|
458 retVal = fwrite(&(chromosomes[i]->sw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth);
|
|
459 }
|
|
460 }
|
|
461
|
|
462 fclose(txtDepth);
|
|
463
|
|
464 sprintf(fname, "%s.cw.txt", depthFile);
|
|
465 txtDepth = my_fopen(fname, "w", 0);
|
|
466 fprintf(stdout, "Saving depth file %s\n", fname);
|
|
467
|
|
468 for (i = 0; i < num_chrom; i++){
|
|
469 for (j = 0; j < chromosomes[i]->cw_cnt; j++){
|
|
470 fprintf(txtDepth, "%s\t%d\t%d\t%f\t%d\n", chromosomes[i]->name, chromosomes[i]->cw[j].start, chromosomes[i]->cw[j].end, chromosomes[i]->cw[j].gc, (int) (chromosomes[i]->cw[j].depth));
|
|
471 retVal = fwrite(&(chromosomes[i]->cw[j].depth), sizeof(chromosomes[i]->cw[j].depth), 1, binDepth);
|
|
472 }
|
|
473 }
|
|
474
|
|
475 fclose(txtDepth);
|
|
476 fclose(binDepth);
|
|
477
|
|
478 free(fname);
|
|
479 }
|