Mercurial > repos > dawe > srf2fastq
comparison srf2fastq/io_lib-1.12.2/progs/hash_sff.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d901c9f41a6a |
---|---|
1 /* | |
2 * This adds a hash table index (".hsh" v1.00 format) to an SFF archive. | |
3 * It does this either inline on the file itself (provided it doesn't already | |
4 * have an index) or by producing a new indexed SFF archive. | |
5 * | |
6 * It has been coded to require only the memory needed to store the index | |
7 * and so does quite a lot of I/O but with minimised memory. For a 460,000 | |
8 * SFF archive it took about 22 seconds real time on a 1.7GHz P4 when copying | |
9 * or 10 seconds when updating inline. | |
10 */ | |
11 | |
12 /* ---------------------------------------------------------------------- */ | |
13 | |
14 #include <stdio.h> | |
15 #include <string.h> | |
16 #include <errno.h> | |
17 #include <stdlib.h> | |
18 #include <io_lib/hash_table.h> | |
19 #include <io_lib/sff.h> | |
20 #include <io_lib/os.h> | |
21 #include <io_lib/mFILE.h> | |
22 | |
23 /* | |
24 * Override the sff.c functions to use FILE pointers instead. This means | |
25 * we don't have to load the entire archive into memory, which is optimal when | |
26 * dealing with a single file (ie in sff/sff.c), but not when indexing it. | |
27 * | |
28 * Done with minimal error checking I'll admit... | |
29 */ | |
30 static sff_read_header *fread_sff_read_header(FILE *fp) { | |
31 sff_read_header *h; | |
32 unsigned char rhdr[16]; | |
33 | |
34 if (16 != fread(rhdr, 1, 16, fp)) | |
35 return NULL; | |
36 h = decode_sff_read_header(rhdr); | |
37 | |
38 if (h->name_len != fread(h->name, 1, h->name_len, fp)) | |
39 return free_sff_read_header(h), NULL; | |
40 | |
41 /* Pad to 8 chars */ | |
42 fseek(fp, (ftell(fp) + 7)& ~7, SEEK_SET); | |
43 | |
44 return h; | |
45 } | |
46 | |
47 static sff_common_header *fread_sff_common_header(FILE *fp) { | |
48 sff_common_header *h; | |
49 unsigned char chdr[31]; | |
50 | |
51 if (31 != fread(chdr, 1, 31, fp)) | |
52 return NULL; | |
53 h = decode_sff_common_header(chdr); | |
54 if (h->flow_len != fread(h->flow, 1, h->flow_len, fp)) | |
55 return free_sff_common_header(h), NULL; | |
56 if (h->key_len != fread(h->key , 1, h->key_len, fp)) | |
57 return free_sff_common_header(h), NULL; | |
58 | |
59 /* Pad to 8 chars */ | |
60 fseek(fp, (ftell(fp) + 7)& ~7, SEEK_SET); | |
61 | |
62 return h; | |
63 } | |
64 | |
65 void usage(void) { | |
66 fprintf(stderr, "Usage: hash_sff [-o outfile] [-t] sff_file ...\n"); | |
67 exit(1); | |
68 } | |
69 | |
70 int main(int argc, char **argv) { | |
71 HashFile *hf; | |
72 sff_common_header *ch; | |
73 sff_read_header *rh; | |
74 int i, dot, arg; | |
75 char *sff; | |
76 char hdr[31]; | |
77 uint64_t index_offset = 0; | |
78 uint32_t index_size, index_skipped; | |
79 FILE *fp, *fpout = NULL; | |
80 int copy_archive = 1; | |
81 | |
82 | |
83 /* process command line arguments of the form -arg */ | |
84 for (argc--, argv++; argc > 0; argc--, argv++) { | |
85 if (**argv != '-' || strcmp(*argv, "--") == 0) | |
86 break; | |
87 | |
88 if (strcmp(*argv, "-o") == 0 && argc > 1) { | |
89 if (NULL == (fpout = fopen(argv[1], "wb+"))) { | |
90 perror(argv[1]); | |
91 return 1; | |
92 } | |
93 argv++; | |
94 argc--; | |
95 | |
96 } else if (strcmp(*argv, "-t") == 0) { | |
97 copy_archive = 0; | |
98 | |
99 } else if (**argv == '-') { | |
100 usage(); | |
101 } | |
102 | |
103 } | |
104 | |
105 if (argc < 1) | |
106 usage(); | |
107 | |
108 if (copy_archive == 0 && argc != 1) { | |
109 fprintf(stderr, "-t option only supported with a single sff argument\n"); | |
110 return 1; | |
111 } | |
112 | |
113 /* Create the hash table */ | |
114 hf = HashFileCreate(0, HASH_DYNAMIC_SIZE); | |
115 hf->nheaders = 0; | |
116 hf->headers = NULL; | |
117 | |
118 for (arg = 0; arg < argc; arg++) { | |
119 /* open (and read) the entire sff file */ | |
120 sff = argv[arg]; | |
121 | |
122 printf("Indexing %s:\n", sff); | |
123 if (fpout) { | |
124 if (NULL == (fp = fopen(sff, "rb"))) { | |
125 perror(sff); | |
126 return 1; | |
127 } | |
128 } else { | |
129 if (NULL == (fp = fopen(sff, "rb+"))) { | |
130 perror(sff); | |
131 return 1; | |
132 } | |
133 } | |
134 | |
135 /* Read the common header */ | |
136 ch = fread_sff_common_header(fp); | |
137 | |
138 if (ch->index_len && !fpout) { | |
139 fprintf(stderr, "Archive already contains index.\nReplacing the" | |
140 " index requires the \"-o outfile\" option.\n"); | |
141 return 1; | |
142 } | |
143 | |
144 /* Add the SFF common header as a hash file-header */ | |
145 hf->nheaders++; | |
146 hf->headers = (HashFileSection *)realloc(hf->headers, hf->nheaders * | |
147 sizeof(*hf->headers)); | |
148 hf->headers[hf->nheaders-1].pos = 0; | |
149 hf->headers[hf->nheaders-1].size = ch->header_len; | |
150 hf->headers[hf->nheaders-1].cached_data = NULL; | |
151 | |
152 /* Read the index items, adding to the hash */ | |
153 index_skipped = 0; | |
154 dot = 0; | |
155 printf(" |\r|"); | |
156 for (i = 0; i < ch->nreads; i++) { | |
157 int dlen; | |
158 uint32_t offset; | |
159 HashData hd; | |
160 HashFileItem *hfi; | |
161 | |
162 if (i >= dot * (ch->nreads/69)) { | |
163 putchar('.'); | |
164 fflush(stdout); | |
165 dot++; | |
166 } | |
167 | |
168 /* Skip old index if present */ | |
169 offset = ftell(fp); | |
170 if (offset == ch->index_offset) { | |
171 fseek(fp, ch->index_len, SEEK_CUR); | |
172 index_skipped = ch->index_len; | |
173 continue; | |
174 } | |
175 | |
176 hfi = (HashFileItem *)calloc(1, sizeof(*hfi)); | |
177 rh = fread_sff_read_header(fp); | |
178 dlen = (2*ch->flow_len + 3*rh->nbases + 7) & ~7; | |
179 fseek(fp, dlen, SEEK_CUR); | |
180 | |
181 hfi->header = hf->nheaders; | |
182 hfi->footer = 0; | |
183 hfi->pos = offset - index_skipped; | |
184 hfi->size = (ftell(fp) - index_skipped) - hfi->pos; | |
185 hd.p = hfi; | |
186 | |
187 HashTableAdd(hf->h, rh->name, rh->name_len, hd, NULL); | |
188 } | |
189 printf("\n"); | |
190 HashTableStats(hf->h, stdout); | |
191 | |
192 index_offset = ftell(fp) - index_skipped; | |
193 | |
194 /* Copy the archive if needed, minus the old index */ | |
195 if (fpout && copy_archive) { | |
196 char block[8192]; | |
197 size_t len; | |
198 uint64_t pos = 0; | |
199 | |
200 printf("\nCopying archive\n"); | |
201 | |
202 fseek(fp, 0, SEEK_SET); | |
203 while (len = fread(block, 1, 8192, fp)) { | |
204 /* Skip previous index */ | |
205 if (pos < ch->index_offset && pos+len > ch->index_offset) { | |
206 len = ch->index_offset - pos; | |
207 fseek(fp, ch->index_offset + ch->index_len, SEEK_SET); | |
208 } | |
209 if (len && len != fwrite(block, 1, len, fpout)) { | |
210 fprintf(stderr, "Failed to output new archive\n"); | |
211 return 1; | |
212 } | |
213 pos += len; | |
214 } | |
215 } | |
216 | |
217 if (!fpout) { | |
218 /* Save the hash */ | |
219 printf("Saving index\n"); | |
220 fseek(fp, 0, SEEK_END); | |
221 index_size = HashFileSave(hf, fp, 0); | |
222 HashFileDestroy(hf); | |
223 | |
224 /* Update the common header */ | |
225 fseek(fp, 0, SEEK_SET); | |
226 fread(hdr, 1, 31, fp); | |
227 *(uint64_t *)(hdr+8) = be_int8(index_offset); | |
228 *(uint32_t *)(hdr+16) = be_int4(index_size); | |
229 fseek(fp, 0, SEEK_SET); | |
230 fwrite(hdr, 1, 31, fp); | |
231 } | |
232 | |
233 fclose(fp); | |
234 } | |
235 | |
236 if (fpout) { | |
237 /* Save the hash */ | |
238 printf("Saving index\n"); | |
239 | |
240 if (!copy_archive) { | |
241 hf->archive = strdup(argv[0]); | |
242 index_offset = 0; | |
243 } | |
244 | |
245 fseek(fpout, 0, SEEK_END); | |
246 index_size = HashFileSave(hf, fpout, 0); | |
247 HashFileDestroy(hf); | |
248 | |
249 /* Update the common header to indicate index location */ | |
250 if (copy_archive) { | |
251 fseek(fpout, 0, SEEK_SET); | |
252 fread(hdr, 1, 31, fpout); | |
253 *(uint64_t *)(hdr+8) = be_int8(index_offset); | |
254 *(uint32_t *)(hdr+16) = be_int4(index_size); | |
255 fseek(fpout, 0, SEEK_SET); | |
256 fwrite(hdr, 1, 31, fpout); | |
257 } | |
258 fclose(fpout); | |
259 } | |
260 | |
261 return 0; | |
262 } |