0
|
1 /* bgzip.c -- Block compression/decompression utility.
|
|
2
|
|
3 Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
|
|
4 Copyright (C) 2010, 2013, 2014 Genome Research Ltd.
|
|
5
|
|
6 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7 of this software and associated documentation files (the "Software"), to deal
|
|
8 in the Software without restriction, including without limitation the rights
|
|
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10 copies of the Software, and to permit persons to whom the Software is
|
|
11 furnished to do so, subject to the following conditions:
|
|
12
|
|
13 The above copyright notices and this permission notice shall be included in
|
|
14 all copies or substantial portions of the Software.
|
|
15
|
|
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22 THE SOFTWARE.
|
|
23 */
|
|
24
|
|
25 #include <stdlib.h>
|
|
26 #include <string.h>
|
|
27 #include <stdio.h>
|
|
28 #include <fcntl.h>
|
|
29 #include <unistd.h>
|
|
30 #include <errno.h>
|
|
31 #include <stdarg.h>
|
|
32 #include <getopt.h>
|
|
33 #include <sys/select.h>
|
|
34 #include <sys/stat.h>
|
|
35 #include "htslib/bgzf.h"
|
|
36 #include "htslib/hts.h"
|
|
37
|
|
/* Size in bytes (64 KiB) of the buffer used for each read/write pass. */
static const int WINDOW_SIZE = 1 << 16;
|
|
39
|
|
/*
 * Print a printf-style message to stderr, then terminate the process
 * with EXIT_FAILURE.  This function does not return.
 */
static void error(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);

    exit(EXIT_FAILURE);
}
|
|
48
|
|
/*
 * Open `fn` for writing (create/truncate, mode 0666) and return the
 * file descriptor.
 *
 * Unless `is_forced` is non-zero, the file is first opened exclusively;
 * if it already exists the user is asked on stderr whether to overwrite
 * it, and anything other than 'y'/'Y' exits with EXIT_FAILURE.
 * A failure of the final open also exits with EXIT_FAILURE, so the
 * returned descriptor is always valid.
 */
static int write_open(const char *fn, int is_forced)
{
    int fd = -1;

    if (!is_forced) {
        fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666);
        if (fd < 0 && errno == EEXIST) {
            char answer;

            fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
            /* No readable answer counts as a refusal. */
            if (scanf("%c", &answer) != 1)
                answer = 'n';
            if (!(answer == 'Y' || answer == 'y')) {
                fprintf(stderr, "[bgzip] not overwritten\n");
                exit(EXIT_FAILURE);
            }
        }
    }

    /* The exclusive open succeeded: nothing more to do. */
    if (fd >= 0)
        return fd;

    /* Forced, confirmed, or the exclusive open failed for another reason. */
    fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666);
    if (fd < 0) {
        fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
        exit(EXIT_FAILURE);
    }
    return fd;
}
|
|
71
|
|
/*
 * Print the version line and the option summary to stderr.
 * Always returns 1, which main() passes on as its exit status.
 */
static int bgzip_main_usage(void)
{
    /* Constant lines that follow the version line, in output order. */
    static const char *const help_lines[] = {
        "Usage: bgzip [OPTIONS] [FILE] ...\n",
        "Options:\n",
        " -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)\n",
        " -c, --stdout write on standard output, keep original files unchanged\n",
        " -d, --decompress decompress\n",
        " -f, --force overwrite files without asking\n",
        " -h, --help give this help\n",
        " -i, --index compress and create BGZF index\n",
        " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n",
        " -r, --reindex (re)index compressed file\n",
        " -s, --size INT decompress INT bytes (uncompressed size)\n",
        "\n",
    };
    size_t i;

    fputs("\n", stderr);
    fprintf(stderr, "Version: %s\n", hts_version());
    for (i = 0; i < sizeof help_lines / sizeof help_lines[0]; i++)
        fputs(help_lines[i], stderr);

    return 1;
}
|
|
90
|
|
91 int main(int argc, char **argv)
|
|
92 {
|
|
93 int c, compress, pstdout, is_forced, index = 0, reindex = 0;
|
|
94 BGZF *fp;
|
|
95 void *buffer;
|
|
96 long start, end, size;
|
|
97 char *index_fname = NULL;
|
|
98
|
|
99 static struct option loptions[] =
|
|
100 {
|
|
101 {"help",0,0,'h'},
|
|
102 {"offset",1,0,'b'},
|
|
103 {"stdout",0,0,'c'},
|
|
104 {"decompress",0,0,'d'},
|
|
105 {"force",0,0,'f'},
|
|
106 {"index",0,0,'i'},
|
|
107 {"index-name",1,0,'I'},
|
|
108 {"reindex",0,0,'r'},
|
|
109 {"size",1,0,'s'},
|
|
110 {0,0,0,0}
|
|
111 };
|
|
112
|
|
113 compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
|
|
114 while((c = getopt_long(argc, argv, "cdh?fb:s:iI:r",loptions,NULL)) >= 0){
|
|
115 switch(c){
|
|
116 case 'd': compress = 0; break;
|
|
117 case 'c': pstdout = 1; break;
|
|
118 case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
|
|
119 case 's': size = atol(optarg); pstdout = 1; break;
|
|
120 case 'f': is_forced = 1; break;
|
|
121 case 'i': index = 1; break;
|
|
122 case 'I': index_fname = optarg; break;
|
|
123 case 'r': reindex = 1; compress = 0; break;
|
|
124 case 'h':
|
|
125 case '?': return bgzip_main_usage();
|
|
126 }
|
|
127 }
|
|
128 if (size >= 0) end = start + size;
|
|
129 if (end >= 0 && end < start) {
|
|
130 fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
|
|
131 return 1;
|
|
132 }
|
|
133 if (compress == 1) {
|
|
134 struct stat sbuf;
|
|
135 int f_src = fileno(stdin);
|
|
136 int f_dst = fileno(stdout);
|
|
137
|
|
138 if ( argc>optind )
|
|
139 {
|
|
140 if ( stat(argv[optind],&sbuf)<0 )
|
|
141 {
|
|
142 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
|
|
143 return 1;
|
|
144 }
|
|
145
|
|
146 if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
|
|
147 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
|
|
148 return 1;
|
|
149 }
|
|
150
|
|
151 if (pstdout)
|
|
152 f_dst = fileno(stdout);
|
|
153 else
|
|
154 {
|
|
155 char *name = malloc(strlen(argv[optind]) + 5);
|
|
156 strcpy(name, argv[optind]);
|
|
157 strcat(name, ".gz");
|
|
158 f_dst = write_open(name, is_forced);
|
|
159 free(name);
|
|
160 if (f_dst < 0) return 1;
|
|
161 }
|
|
162 }
|
|
163 else if (!pstdout && isatty(fileno((FILE *)stdout)) )
|
|
164 return bgzip_main_usage();
|
|
165 else if ( index && !index_fname )
|
|
166 {
|
|
167 fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
|
|
168 return 1;
|
|
169 }
|
|
170
|
|
171 fp = bgzf_fdopen(f_dst, "w");
|
|
172 if ( index ) bgzf_index_build_init(fp);
|
|
173 buffer = malloc(WINDOW_SIZE);
|
|
174 while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
|
|
175 if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
|
|
176 // f_dst will be closed here
|
|
177 if ( index )
|
|
178 {
|
|
179 if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL);
|
|
180 else bgzf_index_dump(fp, argv[optind], ".gz.gzi");
|
|
181 }
|
|
182 if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
|
|
183 if (argc > optind && !pstdout) unlink(argv[optind]);
|
|
184 free(buffer);
|
|
185 close(f_src);
|
|
186 return 0;
|
|
187 }
|
|
188 else if ( reindex )
|
|
189 {
|
|
190 if ( argc>optind )
|
|
191 {
|
|
192 fp = bgzf_open(argv[optind], "r");
|
|
193 if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
|
|
194 }
|
|
195 else
|
|
196 {
|
|
197 if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
|
|
198 fp = bgzf_fdopen(fileno(stdin), "r");
|
|
199 if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
|
|
200 }
|
|
201
|
|
202 buffer = malloc(BGZF_BLOCK_SIZE);
|
|
203 bgzf_index_build_init(fp);
|
|
204 int ret;
|
|
205 while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
|
|
206 free(buffer);
|
|
207 if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");
|
|
208
|
|
209 if ( index_fname )
|
|
210 bgzf_index_dump(fp, index_fname, NULL);
|
|
211 else
|
|
212 bgzf_index_dump(fp, argv[optind], ".gzi");
|
|
213
|
|
214 if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
|
|
215 return 0;
|
|
216 }
|
|
217 else
|
|
218 {
|
|
219 struct stat sbuf;
|
|
220 int f_dst;
|
|
221
|
|
222 if ( argc>optind )
|
|
223 {
|
|
224 if ( stat(argv[optind],&sbuf)<0 )
|
|
225 {
|
|
226 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
|
|
227 return 1;
|
|
228 }
|
|
229 char *name;
|
|
230 int len = strlen(argv[optind]);
|
|
231 if ( strcmp(argv[optind]+len-3,".gz") )
|
|
232 {
|
|
233 fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
|
|
234 return 1;
|
|
235 }
|
|
236 fp = bgzf_open(argv[optind], "r");
|
|
237 if (fp == NULL) {
|
|
238 fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
|
|
239 return 1;
|
|
240 }
|
|
241
|
|
242 if (pstdout) {
|
|
243 f_dst = fileno(stdout);
|
|
244 }
|
|
245 else {
|
|
246 name = strdup(argv[optind]);
|
|
247 name[strlen(name) - 3] = '\0';
|
|
248 f_dst = write_open(name, is_forced);
|
|
249 free(name);
|
|
250 }
|
|
251 }
|
|
252 else if (!pstdout && isatty(fileno((FILE *)stdin)) )
|
|
253 return bgzip_main_usage();
|
|
254 else
|
|
255 {
|
|
256 f_dst = fileno(stdout);
|
|
257 fp = bgzf_fdopen(fileno(stdin), "r");
|
|
258 if (fp == NULL) {
|
|
259 fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
|
|
260 return 1;
|
|
261 }
|
|
262 }
|
|
263 buffer = malloc(WINDOW_SIZE);
|
|
264 if ( start>0 )
|
|
265 {
|
|
266 if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
|
|
267 if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
|
|
268 }
|
|
269 while (1) {
|
|
270 if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
|
|
271 else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
|
|
272 if (c == 0) break;
|
|
273 if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode);
|
|
274 start += c;
|
|
275 if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c);
|
|
276 if (end >= 0 && start >= end) break;
|
|
277 }
|
|
278 free(buffer);
|
|
279 if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
|
|
280 if (!pstdout) unlink(argv[optind]);
|
|
281 return 0;
|
|
282 }
|
|
283 return 0;
|
|
284 }
|