1
|
1
|
|
2 /*********************
|
|
3 File indexmaker.c
|
|
4 Author Bitlab
|
|
5 Description Computes a series of statistics and stores them in natural order and sorted order
|
|
6
|
|
7 USAGE <sequences.fasta> The fasta sequences
|
|
8 <output.index> The output name for the two indexes
|
|
9 * ------------------------------------------*/
|
|
10
|
|
11 #include <stdio.h>
|
|
12 #include <stdlib.h>
|
|
13 #include <ctype.h>
|
|
14 #include <string.h>
|
|
15 #include <inttypes.h>
|
|
16 #include "structs.h"
|
|
17 #include "comparisonFunctions.h"
|
|
18 #include "commonFunctions.h"
|
|
19
|
|
20
|
|
21 void copyRead(struct rIndex2 *To, struct rIndex2 *From);
|
|
22 void QsortRead(struct rIndex2 *array,uint64_t x, uint64_t y);
|
|
23
|
|
24 int main(int ac, char **av) {
|
|
25
|
|
26 FILE *f, *fOut;
|
|
27 struct rIndex2 R;//, *RR;
|
|
28 //uint64_t *positions;
|
|
29 uint64_t c;
|
|
30 uint64_t tLen=0, tLmask=0, tN=0, nReads=0;
|
|
31 uint64_t maxRlen=0, maxRlenMasked=0, maxNonACGT=0;
|
|
32 uint64_t N=0, i;
|
|
33 //char tmp[MAXLID];
|
|
34
|
|
35 if (ac!=3) terror("USE: indexmaker <sequences.fasta> <output.index>");
|
|
36
|
|
37
|
|
38 if ((f=fopen(av[1],"rt"))==NULL) terror("Could not open sequences fasta file");
|
|
39
|
|
40 if ((fOut=fopen(av[2],"wb"))==NULL) terror("Could not open output file");
|
|
41
|
|
42 R.Lac=0;
|
|
43 R.rLen=0;
|
|
44 c=fgetc(f);
|
|
45 while (!feof(f)) {
|
|
46 while(c!='>') c=(char)fgetc(f); // only first time
|
|
47 R.pos=ftell(f) - 1;
|
|
48 i=0;
|
|
49 c=getc(f);
|
|
50 if(i==0 && c== ' ') c=getc(f); //Handle sequences with a ' ' after the '>'
|
|
51 while(i< MAXLID && c!='\n' && c!=' ') {
|
|
52 R.id[i++] = c;
|
|
53 c=getc(f);
|
|
54 }
|
|
55 R.id[i]=0x00;
|
|
56
|
|
57 while(c!='\n') c=getc(f);
|
|
58
|
|
59 i=0;
|
|
60 R.rLen=0;
|
|
61 R.rLmasked=0;
|
|
62 R.nonACGT=0;
|
|
63
|
|
64 while(c!='>' && !feof(f)) {
|
|
65 if (c=='N' || c=='n') R.nonACGT++;
|
|
66 else if (c>96 && c<123) R.rLmasked++;
|
|
67 c=toupper(c);
|
|
68 if (c>64 && c<91) R.rLen++;
|
|
69 if (c>47 && c<58) R.rLen++; //Handling color space reads
|
|
70
|
|
71 c=(char)fgetc(f);
|
|
72 }
|
|
73 R.rNumber = N;
|
|
74 if (R.rLen > maxRlen) maxRlen=R.rLen;
|
|
75 if (R.rLmasked > maxRlenMasked) maxRlenMasked = R.rLmasked;
|
|
76 if (R.nonACGT > maxNonACGT) maxNonACGT = R.nonACGT;
|
|
77
|
|
78 fwrite(&R,sizeof(struct rIndex2),1,fOut);
|
|
79
|
|
80 N++;
|
|
81
|
|
82 R.Lac +=R.rLen;
|
|
83 tLen +=R.rLen;
|
|
84 tN +=R.nonACGT;
|
|
85 tLmask+=R.rLmasked;
|
|
86
|
|
87 nReads++;
|
|
88
|
|
89 }
|
|
90
|
|
91 fclose(f);
|
|
92 fclose(fOut);
|
|
93
|
|
94 fprintf(stdout,"....done\n");
|
|
95 return 0;
|
|
96 }
|
|
97
|
|
98 void copyRead(struct rIndex2 *To, struct rIndex2 *From){
|
|
99
|
|
100 strcpy(To->id, From->id);
|
|
101 To->rNumber = From->rNumber;
|
|
102 To->rLen = From->rLen;
|
|
103 To->rLmasked= From->rLmasked;
|
|
104 To->nonACGT = From->nonACGT;
|
|
105 To->pos = From->pos; //Start position of read
|
|
106 To->Lac = From->Lac; // accumulated length
|
|
107 }
|
|
108
|
|
109
|
|
110 void QsortRead(struct rIndex2 *array,uint64_t x, uint64_t y) {
|
|
111
|
|
112 struct rIndex2 pivote, aux;
|
|
113 uint64_t x1, y1;
|
|
114
|
|
115 copyRead(&pivote, &array[(x+y)/2]);
|
|
116 x1 = x;
|
|
117 y1 = y;
|
|
118
|
|
119 do{
|
|
120 while (strcmp(pivote.id, array[x1].id)>0) x1++;
|
|
121 while (strcmp(pivote.id, array[y1].id)<0) y1--;
|
|
122 if (x1 < y1) {
|
|
123 copyRead(&aux,&array[x1]);
|
|
124 copyRead(&array[x1], &array[y1]);
|
|
125 copyRead(&array[y1], &aux);
|
|
126 x1++;
|
|
127 y1--;
|
|
128 }else if (x1 == y1) x1++;
|
|
129 } while (x1 <=y1);
|
|
130
|
|
131 if (x<y1) QsortRead(array,x,y1);
|
|
132 if (x1<y) QsortRead(array,x1,y);
|
|
133 }
|