comparison fastx_toolkit-0.0.6/src/libfastx/fastx.c @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
comparison
equal deleted inserted replaced
2:dfe9332138cf 3:997f5136985f
1 /*
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Affero General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Affero General Public License for more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <error.h>
21 #include <err.h>
22 #include <string.h>
23 #include <linux/limits.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <fcntl.h>
28
29
30 #include "chomp.h"
31 #include "fastx.h"
32
33 /*
34 valid_sequence_string -
35 check validity of a given sequence string.
36
37 input -
38 sequence - NULL terminated string to be validated.
39
40 Output -
41 1 (true) - The given sequence is valid - contained only A/C/G/N/T characters.
42 0 (false) - The given string contained invalid characeters.
43
44 Remark -
45 sequences with unknown (N) bases are considered VALID.
46 */
47 static int validate_nucleotides_string(const FASTX *pFASTX)
48 {
49 int match = 1 ;
50 const char* seq = pFASTX->nucleotides;
51
52 while (*seq != '\0' && match) {
53 match &= pFASTX->allowed_nucleotides[ (int) *seq ];
54 seq++;
55 }
56 return match;
57 }
58
59 static void create_lookup_table(FASTX *pFASTX)
60 {
61 int i;
62
63 for (i=0; i<256; i++)
64 pFASTX->allowed_nucleotides[i] = 0 ;
65
66 pFASTX->allowed_nucleotides['A'] = 1;
67 pFASTX->allowed_nucleotides['C'] = 1;
68 pFASTX->allowed_nucleotides['G'] = 1;
69 pFASTX->allowed_nucleotides['T'] = 1;
70
71 if (pFASTX->allow_N)
72 pFASTX->allowed_nucleotides['N'] = 1;
73
74 if (pFASTX->allow_lowercase) {
75 pFASTX->allowed_nucleotides['a'] = 1;
76 pFASTX->allowed_nucleotides['c'] = 1;
77 pFASTX->allowed_nucleotides['g'] = 1;
78 pFASTX->allowed_nucleotides['t'] = 1;
79
80 if (pFASTX->allow_N)
81 pFASTX->allowed_nucleotides['n'] = 1;
82 }
83 }
84
85 static void detect_input_format(FASTX *pFASTX)
86 {
87 //Get the first character in the file,
88 //and put it right back
89 int c = fgetc(pFASTX->input);
90 ungetc(c, pFASTX->input);
91
92 switch(c) {
93 case '>': /* FASTA file */
94 if ( pFASTX->allow_input_filetype==FASTQ_ONLY )
95 errx(1,"input file (%s) is FASTA, but only FASTQ input is allowed.",
96 pFASTX->input_file_name);
97 pFASTX->read_fastq = 0 ;
98 break;
99
100 case '@': /* FASTQ file */
101 if ( pFASTX->allow_input_filetype==FASTA_ONLY )
102 errx(1,"input file (%s) is FASTQ, but only FASTA input is allowed.",
103 pFASTX->input_file_name);
104 pFASTX->read_fastq = 1;
105 break;
106
107 case -1: /* EOF as first character - no input */
108 errx(1, "Premature End-Of-File (filename ='%s')", pFASTX->input_file_name);
109 break;
110
111 default:
112 errx(1, "input file (%s) has unknown file format (not FASTA or FASTQ), first character = %c (%d)",
113 pFASTX->input_file_name, c,c);
114 }
115 }
116
117 static void convert_ascii_quality_score_line(const char* ascii_quality_scores, FASTX *pFASTX)
118 {
119 size_t i;
120
121 if (strlen(ascii_quality_scores) != strlen(pFASTX->nucleotides))
122 errx(1,"number of quality values (%zu) doesn't match number of nucleotides (%zu) on line %lld",
123 strlen(ascii_quality_scores), strlen(pFASTX->nucleotides),
124 pFASTX->input_line_number);
125
126 for (i=0; i<strlen(ascii_quality_scores); i++) {
127 pFASTX->quality[i] = (int) (ascii_quality_scores[i] - 64) ;
128 if (pFASTX->quality[i] < -15 || pFASTX->quality[i] > 40)
129 errx(1, "Invalid quality score value (char '%c' ord %d quality value %d) on line %lld",
130 ascii_quality_scores[i], ascii_quality_scores[i],
131 pFASTX->quality[i], pFASTX->input_line_number );
132 }
133
134 }
135
136 static void convert_numeric_quality_score_line ( const char* numeric_quality_line, FASTX *pFASTX )
137 {
138 size_t index;
139 const char *quality_tok;
140 char *endptr;
141 int quality_value;
142
143 index=0;
144 quality_tok = numeric_quality_line;
145 do {
146 //read the quality score as an integer value
147 quality_value = strtol(quality_tok, &endptr, 10);
148 if (endptr == quality_tok)
149 errx(1,"Error: invalid quality score data on line %lld (quality_tok = \"%s\"",
150 pFASTX->input_line_number ,quality_tok);
151
152 if (quality_value > 40 || quality_value < -15)
153 errx(1, "invalid quality score value (%d) in line %lld.",
154 quality_value, pFASTX->input_line_number);
155
156 //convert it ASCII (as per solexa's encoding)
157 pFASTX->quality[index] = quality_value;
158 index++;
159 quality_tok = endptr;
160 } while (quality_tok != NULL && *quality_tok!='\0') ;
161
162 if (index != strlen(pFASTX->nucleotides)) {
163 errx(1,"number of quality values (%zu) doesn't match number of nucleotides (%zu) on line %lld",
164 index, strlen(pFASTX->nucleotides), pFASTX->input_line_number );
165 }
166 }
167
168 void fastx_init_reader(FASTX *pFASTX, const char* filename,
169 ALLOWED_INPUT_FILE_TYPES allowed_input_filetype,
170 ALLOWED_INPUT_UNKNOWN_BASES allow_N,
171 ALLOWED_INPUT_CASE allow_lowercase)
172 {
173 if (pFASTX==NULL)
174 errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__);
175
176 memset(pFASTX, 0, sizeof(FASTX));
177
178 if (strncmp(filename,"-",5)==0) {
179 pFASTX->input = stdin;
180 } else {
181 pFASTX->input = fopen(filename, "r");
182 if (pFASTX->input==NULL)
183 err(1, "failed to open input file '%s'", filename);
184 }
185
186 strncpy(pFASTX->input_file_name, filename, sizeof(pFASTX->input_file_name)-1);
187
188 pFASTX->allow_input_filetype = allowed_input_filetype;
189 pFASTX->allow_lowercase = allow_lowercase;
190 pFASTX->allow_N = allow_N;
191
192 create_lookup_table(pFASTX);
193
194 detect_input_format(pFASTX);
195 }
196
197 int open_output_file(const char* filename)
198 {
199 int fd ;
200 if (strncmp(filename,"-", 6)==0) {
201 fd = STDOUT_FILENO;
202 } else {
203 fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0666 );
204 if (fd==-1)
205 err(1, "Failed to create output file (%s)", filename);
206 }
207 return fd;
208 }
209
210 int open_output_compressor(FASTX __attribute__((unused)) *pFASTX, const char* filename)
211 {
212 int fd;
213 pid_t child_pid;
214 int parent_pipe[2];
215 if (pipe(parent_pipe)!=0)
216 err(1,"pipe (for gzip) failed");
217
218 child_pid = fork();
219 if (child_pid>0) {
220 /* The parent process */
221 fd = parent_pipe[1];
222 close(parent_pipe[0]);
223 return fd;
224 }
225
226 /* The child process */
227
228 //the compressor's STDIN is the pipe from the parent
229 dup2(parent_pipe[0], STDIN_FILENO);
230 close(parent_pipe[1]);
231
232 //the compressor's STDOUT is the output file
233 //(which can be the parent's STDOUT, too)
234 fd = open_output_file(filename);
235 dup2(fd, STDOUT_FILENO);
236
237 //Run GZIP
238 execlp("gzip","gzip",NULL);
239
240 //Should never get here...
241 err(1,"execlp(gzip) failed");
242 }
243
244
245 void fastx_init_writer(FASTX *pFASTX,
246 const char *filename,
247 OUTPUT_FILE_TYPE output_type,
248 int compress_output)
249 {
250 int fd;
251
252 if (pFASTX==NULL)
253 errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__);
254 if (pFASTX->input==NULL)
255 errx(1,"Internal error: pFASTX not initialized (%s:%d)", __FILE__, __LINE__);
256
257 pFASTX->compress_output = compress_output;
258 if (pFASTX->compress_output)
259 fd = open_output_compressor(pFASTX, filename);
260 else
261 fd = open_output_file(filename);
262
263 pFASTX->output = fdopen(fd,"w");
264 if (pFASTX->output==NULL)
265 err(1,"fdopen failed");
266
267 switch(output_type)
268 {
269 case OUTPUT_FASTA:
270 pFASTX->write_fastq = 0 ;
271 pFASTX->output_sequence_id_prefix = '>';
272 break ;
273
274 case OUTPUT_FASTQ_ASCII_QUAL:
275 if (! pFASTX->read_fastq)
276 errx(1,"Can't output FASTQ when input is FASTA.");
277 pFASTX->write_fastq = 1;
278 pFASTX->write_fastq_ascii = 1;
279 pFASTX->output_sequence_id_prefix = '@';
280 break ;
281
282 case OUTPUT_FASTQ_NUMERIC_QUAL:
283 if (! pFASTX->read_fastq)
284 errx(1,"Can't output FASTQ when input is FASTA.");
285 pFASTX->write_fastq = 1;
286 pFASTX->write_fastq_ascii = 0;
287 pFASTX->output_sequence_id_prefix = '@';
288 break ;
289
290 case OUTPUT_SAME_AS_INPUT:
291 pFASTX->write_fastq = pFASTX->read_fastq;
292
293 //Assume we're writing ASCII format,
294 pFASTX->write_fastq_ascii = 1 ;
295 //But set this flag and the real format will be determined
296 //when we actually read the FASTQ record
297 pFASTX->copy_input_fastq_format_to_output = 1;
298
299 pFASTX->output_sequence_id_prefix = (pFASTX->write_fastq) ? '@' : '>';
300 break;
301
302 default:
303 errx(1, __FILE__ ":%d: Unknown output_type (%d)",
304 __LINE__, output_type ) ;
305 }
306 }
307
308 int fastx_read_next_record(FASTX *pFASTX)
309 {
310 char temp_qual[MAX_SEQ_LINE_LENGTH+1];
311
312 temp_qual[MAX_SEQ_LINE_LENGTH] = 0;
313
314 if (pFASTX==NULL)
315 errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__);
316
317 if (fgets(pFASTX->input_sequence_id_prefix, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL)
318 return 0; //assume end-of-file, if we couldn't read the first line of the foursome
319
320 //for the rest of the lines, if they don't appear, it's an error
321 pFASTX->input_line_number++;
322
323 if (fgets(pFASTX->nucleotides, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL)
324 errx(1,"Failed to read complete record, missing 2nd line (nucleotides), on line %lld\n",
325 pFASTX->input_line_number);
326
327 chomp(pFASTX->name);
328 chomp(pFASTX->nucleotides);
329
330 validate_nucleotides_string(pFASTX);
331
332 if (pFASTX->read_fastq) {
333 pFASTX->input_line_number++;
334 if (fgets(pFASTX->input_name2_prefix, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL)
335 errx(1,"Failed to read complete record, missing 3rd line (name-2), on line %lld\n",
336 pFASTX->input_line_number);
337
338 pFASTX->input_line_number++;
339 if (fgets(temp_qual, sizeof(temp_qual), pFASTX->input) == NULL)
340 errx(1,"Failed to read complete record, missing 4th line (quality), on line %lld\n",
341 pFASTX->input_line_number);
342
343 chomp(pFASTX->name2);
344 chomp(temp_qual);
345
346 if (strlen(temp_qual) == strlen(pFASTX->nucleotides)) {
347 //Assume this is an ASCII quality score line, convert it to values
348 convert_ascii_quality_score_line ( temp_qual, pFASTX ) ;
349 pFASTX->read_fastq_ascii = 1 ;
350 } else {
351 //Assume this is a numeric quality score line, convert it to values
352 convert_numeric_quality_score_line ( temp_qual, pFASTX ) ;
353 pFASTX->read_fastq_ascii = 0 ;
354 }
355
356 //Copy the input format to the output format flag
357 if (pFASTX->copy_input_fastq_format_to_output) {
358 pFASTX->write_fastq_ascii = pFASTX->read_fastq_ascii;
359 }
360
361
362 }
363
364 pFASTX->num_input_sequences++;
365 pFASTX->num_input_reads += get_reads_count(pFASTX);
366
367 return 1;
368 }
369
370 static void write_ascii_qual_string(FASTX *pFASTX, int length)
371 {
372 int i;
373 int rc;
374
375 for (i=0; i<length; i++) {
376 rc = fprintf(pFASTX->output, "%c", pFASTX->quality[i] + 64 ) ;
377 if (rc<=0)
378 err(1,"writing quality scores failed");
379 }
380 rc = fprintf(pFASTX->output, "\n");
381 if (rc<=0)
382 err(1,"writing quality scores failed");
383 }
384
385 static void write_numeric_qual_string(FASTX *pFASTX, int length)
386 {
387 int i;
388 int rc;
389 for (i=0; i<length; i++) {
390 rc = fprintf(pFASTX->output, "%d", pFASTX->quality[i] ) ;
391 if (rc<=0)
392 err(1,"writing quality scores failed");
393 if (i<length-1) {
394 rc = fprintf(pFASTX->output," ");
395 if (rc<=0)
396 err(1,"writing quality scores failed");
397 }
398 }
399 rc = fprintf(pFASTX->output, "\n");
400 if (rc<=0)
401 err(1,"writing quality scores failed");
402 }
403
404 void fastx_write_record(FASTX *pFASTX)
405 {
406 int len;
407 int rc;
408
409 if (pFASTX==NULL)
410 errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__);
411
412
413 rc = fprintf(pFASTX->output, "%c%s\n",
414 pFASTX->output_sequence_id_prefix,
415 pFASTX->name ) ;
416 if (rc<=0)
417 err(1,"writing sequence identifier failed");
418
419 rc = fprintf(pFASTX->output, "%s\n", pFASTX->nucleotides);
420 if (rc<=0)
421 err(1,"writing nucleotides failed");
422
423 if (pFASTX->write_fastq) {
424 rc = fprintf(pFASTX->output, "+%s\n", pFASTX->name2 ) ;
425 if (rc<=0)
426 err(1,"writing 2nd sequence identifier failed");
427
428 len = strlen(pFASTX->nucleotides);
429 if (pFASTX->write_fastq_ascii)
430 write_ascii_qual_string(pFASTX, len);
431 else
432 write_numeric_qual_string(pFASTX, len);
433 }
434
435 pFASTX->num_output_sequences++;
436 pFASTX->num_output_reads += get_reads_count(pFASTX);
437 }
438
439 int get_reads_count(const FASTX *pFASTX)
440 {
441 char *dash = NULL ;
442
443 //FASTQ files are never collapsed (at least not in Gordon's Galaxy)
444 if (pFASTX->read_fastq)
445 return 1;
446
447 dash = strchr(pFASTX->name,'-');
448
449 // minus character wasn't found-
450 // this sequence is most probably not collapsed
451 if (dash==NULL)
452 return 1;
453
454 int count = atoi(dash+1);
455 if (count>0)
456 return count;
457
458 return 1;
459 }
460
461 size_t num_input_sequences(const FASTX *pFASTX)
462 {
463 return pFASTX->num_input_sequences;
464 }
465
466 size_t num_input_reads(const FASTX *pFASTX)
467 {
468 return pFASTX->num_input_reads;
469 }
470
471 size_t num_output_sequences(const FASTX *pFASTX)
472 {
473 return pFASTX->num_output_sequences;
474 }
475
476 size_t num_output_reads(const FASTX *pFASTX)
477 {
478 return pFASTX->num_output_reads;
479 }
480
481