comparison ezBAMQC/src/htslib/bgzip.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* bgzip.c -- Block compression/decompression utility.
2
3 Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
4 Copyright (C) 2010, 2013, 2014 Genome Research Ltd.
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notices and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24
25 #include <stdlib.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <errno.h>
31 #include <stdarg.h>
32 #include <getopt.h>
33 #include <sys/select.h>
34 #include <sys/stat.h>
35 #include "htslib/bgzf.h"
36 #include "htslib/hts.h"
37
/* Size in bytes of the buffer main() allocates for streaming data. */
static const int WINDOW_SIZE = 64 * 1024;
39
/*
 * Print a printf-style diagnostic on stderr and terminate the process
 * with EXIT_FAILURE.  This function never returns to its caller.
 *
 * format: printf-style format string; the variadic arguments must match it.
 */
static void error(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    (void) vfprintf(stderr, format, args);
    va_end(args);

    exit(EXIT_FAILURE);
}
48
/*
 * Open `fn` for writing, creating or truncating it, and return the
 * file descriptor.
 *
 * fn:        path of the output file.
 * is_forced: non-zero to overwrite an existing file without asking.
 *
 * When is_forced is zero the file is first opened exclusively; if it
 * already exists the user is asked on stderr and the answer is read from
 * stdin.  Any answer other than 'y'/'Y' (or a failed read) terminates the
 * process.  The process also terminates if the file cannot be opened, so
 * a returned descriptor is always valid.
 */
static int write_open(const char *fn, int is_forced)
{
    const int create_flags = O_WRONLY | O_CREAT | O_TRUNC;
    int out_fd;

    if (!is_forced) {
        /* O_EXCL makes this attempt fail with EEXIST instead of clobbering. */
        out_fd = open(fn, create_flags | O_EXCL, 0666);
        if (out_fd >= 0)
            return out_fd;

        if (errno == EEXIST) {
            char answer;

            fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
            if (scanf("%c", &answer) != 1)
                answer = 'n';
            if (!(answer == 'Y' || answer == 'y')) {
                fprintf(stderr, "[bgzip] not overwritten\n");
                exit(EXIT_FAILURE);
            }
        }
    }

    /* Forced, confirmed, or the exclusive attempt failed for another reason. */
    out_fd = open(fn, create_flags, 0666);
    if (out_fd < 0) {
        fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
        exit(EXIT_FAILURE);
    }
    return out_fd;
}
71
/*
 * Write the version and command-line help to stderr.
 *
 * Always returns 1 so that callers can `return bgzip_main_usage();`
 * to exit with a failure status.
 */
static int bgzip_main_usage(void)
{
    /* Fixed help text emitted after the version line, one entry per line. */
    static const char *const help_lines[] = {
        "Usage: bgzip [OPTIONS] [FILE] ...\n",
        "Options:\n",
        " -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset)\n",
        " -c, --stdout write on standard output, keep original files unchanged\n",
        " -d, --decompress decompress\n",
        " -f, --force overwrite files without asking\n",
        " -h, --help give this help\n",
        " -i, --index compress and create BGZF index\n",
        " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n",
        " -r, --reindex (re)index compressed file\n",
        " -s, --size INT decompress INT bytes (uncompressed size)\n",
        "\n",
    };
    const size_t n_lines = sizeof help_lines / sizeof help_lines[0];
    size_t i;

    fputs("\n", stderr);
    fprintf(stderr, "Version: %s\n", hts_version());
    for (i = 0; i < n_lines; i++)
        fputs(help_lines[i], stderr);

    return 1;
}
90
91 int main(int argc, char **argv)
92 {
93 int c, compress, pstdout, is_forced, index = 0, reindex = 0;
94 BGZF *fp;
95 void *buffer;
96 long start, end, size;
97 char *index_fname = NULL;
98
99 static struct option loptions[] =
100 {
101 {"help",0,0,'h'},
102 {"offset",1,0,'b'},
103 {"stdout",0,0,'c'},
104 {"decompress",0,0,'d'},
105 {"force",0,0,'f'},
106 {"index",0,0,'i'},
107 {"index-name",1,0,'I'},
108 {"reindex",0,0,'r'},
109 {"size",1,0,'s'},
110 {0,0,0,0}
111 };
112
113 compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
114 while((c = getopt_long(argc, argv, "cdh?fb:s:iI:r",loptions,NULL)) >= 0){
115 switch(c){
116 case 'd': compress = 0; break;
117 case 'c': pstdout = 1; break;
118 case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
119 case 's': size = atol(optarg); pstdout = 1; break;
120 case 'f': is_forced = 1; break;
121 case 'i': index = 1; break;
122 case 'I': index_fname = optarg; break;
123 case 'r': reindex = 1; compress = 0; break;
124 case 'h':
125 case '?': return bgzip_main_usage();
126 }
127 }
128 if (size >= 0) end = start + size;
129 if (end >= 0 && end < start) {
130 fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
131 return 1;
132 }
133 if (compress == 1) {
134 struct stat sbuf;
135 int f_src = fileno(stdin);
136 int f_dst = fileno(stdout);
137
138 if ( argc>optind )
139 {
140 if ( stat(argv[optind],&sbuf)<0 )
141 {
142 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
143 return 1;
144 }
145
146 if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
147 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
148 return 1;
149 }
150
151 if (pstdout)
152 f_dst = fileno(stdout);
153 else
154 {
155 char *name = malloc(strlen(argv[optind]) + 5);
156 strcpy(name, argv[optind]);
157 strcat(name, ".gz");
158 f_dst = write_open(name, is_forced);
159 free(name);
160 if (f_dst < 0) return 1;
161 }
162 }
163 else if (!pstdout && isatty(fileno((FILE *)stdout)) )
164 return bgzip_main_usage();
165 else if ( index && !index_fname )
166 {
167 fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
168 return 1;
169 }
170
171 fp = bgzf_fdopen(f_dst, "w");
172 if ( index ) bgzf_index_build_init(fp);
173 buffer = malloc(WINDOW_SIZE);
174 while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
175 if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
176 // f_dst will be closed here
177 if ( index )
178 {
179 if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL);
180 else bgzf_index_dump(fp, argv[optind], ".gz.gzi");
181 }
182 if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
183 if (argc > optind && !pstdout) unlink(argv[optind]);
184 free(buffer);
185 close(f_src);
186 return 0;
187 }
188 else if ( reindex )
189 {
190 if ( argc>optind )
191 {
192 fp = bgzf_open(argv[optind], "r");
193 if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
194 }
195 else
196 {
197 if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
198 fp = bgzf_fdopen(fileno(stdin), "r");
199 if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
200 }
201
202 buffer = malloc(BGZF_BLOCK_SIZE);
203 bgzf_index_build_init(fp);
204 int ret;
205 while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
206 free(buffer);
207 if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");
208
209 if ( index_fname )
210 bgzf_index_dump(fp, index_fname, NULL);
211 else
212 bgzf_index_dump(fp, argv[optind], ".gzi");
213
214 if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
215 return 0;
216 }
217 else
218 {
219 struct stat sbuf;
220 int f_dst;
221
222 if ( argc>optind )
223 {
224 if ( stat(argv[optind],&sbuf)<0 )
225 {
226 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
227 return 1;
228 }
229 char *name;
230 int len = strlen(argv[optind]);
231 if ( strcmp(argv[optind]+len-3,".gz") )
232 {
233 fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
234 return 1;
235 }
236 fp = bgzf_open(argv[optind], "r");
237 if (fp == NULL) {
238 fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
239 return 1;
240 }
241
242 if (pstdout) {
243 f_dst = fileno(stdout);
244 }
245 else {
246 name = strdup(argv[optind]);
247 name[strlen(name) - 3] = '\0';
248 f_dst = write_open(name, is_forced);
249 free(name);
250 }
251 }
252 else if (!pstdout && isatty(fileno((FILE *)stdin)) )
253 return bgzip_main_usage();
254 else
255 {
256 f_dst = fileno(stdout);
257 fp = bgzf_fdopen(fileno(stdin), "r");
258 if (fp == NULL) {
259 fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
260 return 1;
261 }
262 }
263 buffer = malloc(WINDOW_SIZE);
264 if ( start>0 )
265 {
266 if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
267 if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
268 }
269 while (1) {
270 if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
271 else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
272 if (c == 0) break;
273 if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode);
274 start += c;
275 if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c);
276 if (end >= 0 && start >= end) break;
277 }
278 free(buffer);
279 if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
280 if (!pstdout) unlink(argv[optind]);
281 return 0;
282 }
283 return 0;
284 }