comparison srf2fastq/io_lib-1.12.2/progs/hash_sff.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d901c9f41a6a
1 /*
2 * This adds a hash table index (".hsh" v1.00 format) to an SFF archive.
3 * It does this either inline on the file itself (provided it doesn't already
4 * have an index) or by producing a new indexed SFF archive.
5 *
6 * It has been coded to require only the memory needed to store the index
7 * and so does quite a lot of I/O but with minimised memory. For a 460,000
8 * SFF archive it took about 22 seconds real time on a 1.7GHz P4 when copying
9 * or 10 seconds when updating inline.
10 */
11
12 /* ---------------------------------------------------------------------- */
13
14 #include <stdio.h>
15 #include <string.h>
16 #include <errno.h>
17 #include <stdlib.h>
18 #include <io_lib/hash_table.h>
19 #include <io_lib/sff.h>
20 #include <io_lib/os.h>
21 #include <io_lib/mFILE.h>
22
23 /*
24 * Override the sff.c functions to use FILE pointers instead. This means
25 * we don't have to load the entire archive into memory, which is optimal when
26 * dealing with a single file (ie in sff/sff.c), but not when indexing it.
27 *
28 * Done with minimal error checking I'll admit...
29 */
30 static sff_read_header *fread_sff_read_header(FILE *fp) {
31 sff_read_header *h;
32 unsigned char rhdr[16];
33
34 if (16 != fread(rhdr, 1, 16, fp))
35 return NULL;
36 h = decode_sff_read_header(rhdr);
37
38 if (h->name_len != fread(h->name, 1, h->name_len, fp))
39 return free_sff_read_header(h), NULL;
40
41 /* Pad to 8 chars */
42 fseek(fp, (ftell(fp) + 7)& ~7, SEEK_SET);
43
44 return h;
45 }
46
47 static sff_common_header *fread_sff_common_header(FILE *fp) {
48 sff_common_header *h;
49 unsigned char chdr[31];
50
51 if (31 != fread(chdr, 1, 31, fp))
52 return NULL;
53 h = decode_sff_common_header(chdr);
54 if (h->flow_len != fread(h->flow, 1, h->flow_len, fp))
55 return free_sff_common_header(h), NULL;
56 if (h->key_len != fread(h->key , 1, h->key_len, fp))
57 return free_sff_common_header(h), NULL;
58
59 /* Pad to 8 chars */
60 fseek(fp, (ftell(fp) + 7)& ~7, SEEK_SET);
61
62 return h;
63 }
64
65 void usage(void) {
66 fprintf(stderr, "Usage: hash_sff [-o outfile] [-t] sff_file ...\n");
67 exit(1);
68 }
69
70 int main(int argc, char **argv) {
71 HashFile *hf;
72 sff_common_header *ch;
73 sff_read_header *rh;
74 int i, dot, arg;
75 char *sff;
76 char hdr[31];
77 uint64_t index_offset = 0;
78 uint32_t index_size, index_skipped;
79 FILE *fp, *fpout = NULL;
80 int copy_archive = 1;
81
82
83 /* process command line arguments of the form -arg */
84 for (argc--, argv++; argc > 0; argc--, argv++) {
85 if (**argv != '-' || strcmp(*argv, "--") == 0)
86 break;
87
88 if (strcmp(*argv, "-o") == 0 && argc > 1) {
89 if (NULL == (fpout = fopen(argv[1], "wb+"))) {
90 perror(argv[1]);
91 return 1;
92 }
93 argv++;
94 argc--;
95
96 } else if (strcmp(*argv, "-t") == 0) {
97 copy_archive = 0;
98
99 } else if (**argv == '-') {
100 usage();
101 }
102
103 }
104
105 if (argc < 1)
106 usage();
107
108 if (copy_archive == 0 && argc != 1) {
109 fprintf(stderr, "-t option only supported with a single sff argument\n");
110 return 1;
111 }
112
113 /* Create the hash table */
114 hf = HashFileCreate(0, HASH_DYNAMIC_SIZE);
115 hf->nheaders = 0;
116 hf->headers = NULL;
117
118 for (arg = 0; arg < argc; arg++) {
119 /* open (and read) the entire sff file */
120 sff = argv[arg];
121
122 printf("Indexing %s:\n", sff);
123 if (fpout) {
124 if (NULL == (fp = fopen(sff, "rb"))) {
125 perror(sff);
126 return 1;
127 }
128 } else {
129 if (NULL == (fp = fopen(sff, "rb+"))) {
130 perror(sff);
131 return 1;
132 }
133 }
134
135 /* Read the common header */
136 ch = fread_sff_common_header(fp);
137
138 if (ch->index_len && !fpout) {
139 fprintf(stderr, "Archive already contains index.\nReplacing the"
140 " index requires the \"-o outfile\" option.\n");
141 return 1;
142 }
143
144 /* Add the SFF common header as a hash file-header */
145 hf->nheaders++;
146 hf->headers = (HashFileSection *)realloc(hf->headers, hf->nheaders *
147 sizeof(*hf->headers));
148 hf->headers[hf->nheaders-1].pos = 0;
149 hf->headers[hf->nheaders-1].size = ch->header_len;
150 hf->headers[hf->nheaders-1].cached_data = NULL;
151
152 /* Read the index items, adding to the hash */
153 index_skipped = 0;
154 dot = 0;
155 printf(" |\r|");
156 for (i = 0; i < ch->nreads; i++) {
157 int dlen;
158 uint32_t offset;
159 HashData hd;
160 HashFileItem *hfi;
161
162 if (i >= dot * (ch->nreads/69)) {
163 putchar('.');
164 fflush(stdout);
165 dot++;
166 }
167
168 /* Skip old index if present */
169 offset = ftell(fp);
170 if (offset == ch->index_offset) {
171 fseek(fp, ch->index_len, SEEK_CUR);
172 index_skipped = ch->index_len;
173 continue;
174 }
175
176 hfi = (HashFileItem *)calloc(1, sizeof(*hfi));
177 rh = fread_sff_read_header(fp);
178 dlen = (2*ch->flow_len + 3*rh->nbases + 7) & ~7;
179 fseek(fp, dlen, SEEK_CUR);
180
181 hfi->header = hf->nheaders;
182 hfi->footer = 0;
183 hfi->pos = offset - index_skipped;
184 hfi->size = (ftell(fp) - index_skipped) - hfi->pos;
185 hd.p = hfi;
186
187 HashTableAdd(hf->h, rh->name, rh->name_len, hd, NULL);
188 }
189 printf("\n");
190 HashTableStats(hf->h, stdout);
191
192 index_offset = ftell(fp) - index_skipped;
193
194 /* Copy the archive if needed, minus the old index */
195 if (fpout && copy_archive) {
196 char block[8192];
197 size_t len;
198 uint64_t pos = 0;
199
200 printf("\nCopying archive\n");
201
202 fseek(fp, 0, SEEK_SET);
203 while (len = fread(block, 1, 8192, fp)) {
204 /* Skip previous index */
205 if (pos < ch->index_offset && pos+len > ch->index_offset) {
206 len = ch->index_offset - pos;
207 fseek(fp, ch->index_offset + ch->index_len, SEEK_SET);
208 }
209 if (len && len != fwrite(block, 1, len, fpout)) {
210 fprintf(stderr, "Failed to output new archive\n");
211 return 1;
212 }
213 pos += len;
214 }
215 }
216
217 if (!fpout) {
218 /* Save the hash */
219 printf("Saving index\n");
220 fseek(fp, 0, SEEK_END);
221 index_size = HashFileSave(hf, fp, 0);
222 HashFileDestroy(hf);
223
224 /* Update the common header */
225 fseek(fp, 0, SEEK_SET);
226 fread(hdr, 1, 31, fp);
227 *(uint64_t *)(hdr+8) = be_int8(index_offset);
228 *(uint32_t *)(hdr+16) = be_int4(index_size);
229 fseek(fp, 0, SEEK_SET);
230 fwrite(hdr, 1, 31, fp);
231 }
232
233 fclose(fp);
234 }
235
236 if (fpout) {
237 /* Save the hash */
238 printf("Saving index\n");
239
240 if (!copy_archive) {
241 hf->archive = strdup(argv[0]);
242 index_offset = 0;
243 }
244
245 fseek(fpout, 0, SEEK_END);
246 index_size = HashFileSave(hf, fpout, 0);
247 HashFileDestroy(hf);
248
249 /* Update the common header to indicate index location */
250 if (copy_archive) {
251 fseek(fpout, 0, SEEK_SET);
252 fread(hdr, 1, 31, fpout);
253 *(uint64_t *)(hdr+8) = be_int8(index_offset);
254 *(uint32_t *)(hdr+16) = be_int4(index_size);
255 fseek(fpout, 0, SEEK_SET);
256 fwrite(hdr, 1, 31, fpout);
257 }
258 fclose(fpout);
259 }
260
261 return 0;
262 }