Mercurial > repos > xilinxu > xilinxu
comparison fastx_toolkit-0.0.6/src/libfastx/fastx.c @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:dfe9332138cf | 3:997f5136985f |
---|---|
1 /* | |
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools. | |
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu) | |
4 | |
5 This program is free software: you can redistribute it and/or modify | |
6 it under the terms of the GNU Affero General Public License as | |
7 published by the Free Software Foundation, either version 3 of the | |
8 License, or (at your option) any later version. | |
9 | |
10 This program is distributed in the hope that it will be useful, | |
11 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 GNU Affero General Public License for more details. | |
14 | |
15 You should have received a copy of the GNU Affero General Public License | |
16 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
17 */ | |
18 #include <stdio.h> | |
19 #include <stdlib.h> | |
20 #include <error.h> | |
21 #include <err.h> | |
22 #include <string.h> | |
23 #include <linux/limits.h> | |
24 #include <unistd.h> | |
25 #include <sys/types.h> | |
26 #include <sys/stat.h> | |
27 #include <fcntl.h> | |
28 | |
29 | |
30 #include "chomp.h" | |
31 #include "fastx.h" | |
32 | |
33 /* | |
34 valid_sequence_string - | |
35 check validity of a given sequence string. | |
36 | |
37 input - | |
38 sequence - NULL terminated string to be validated. | |
39 | |
40 Output - | |
41 1 (true) - The given sequence is valid - contained only A/C/G/N/T characters. | |
42 0 (false) - The given string contained invalid characeters. | |
43 | |
44 Remark - | |
45 sequences with unknown (N) bases are considered VALID. | |
46 */ | |
47 static int validate_nucleotides_string(const FASTX *pFASTX) | |
48 { | |
49 int match = 1 ; | |
50 const char* seq = pFASTX->nucleotides; | |
51 | |
52 while (*seq != '\0' && match) { | |
53 match &= pFASTX->allowed_nucleotides[ (int) *seq ]; | |
54 seq++; | |
55 } | |
56 return match; | |
57 } | |
58 | |
59 static void create_lookup_table(FASTX *pFASTX) | |
60 { | |
61 int i; | |
62 | |
63 for (i=0; i<256; i++) | |
64 pFASTX->allowed_nucleotides[i] = 0 ; | |
65 | |
66 pFASTX->allowed_nucleotides['A'] = 1; | |
67 pFASTX->allowed_nucleotides['C'] = 1; | |
68 pFASTX->allowed_nucleotides['G'] = 1; | |
69 pFASTX->allowed_nucleotides['T'] = 1; | |
70 | |
71 if (pFASTX->allow_N) | |
72 pFASTX->allowed_nucleotides['N'] = 1; | |
73 | |
74 if (pFASTX->allow_lowercase) { | |
75 pFASTX->allowed_nucleotides['a'] = 1; | |
76 pFASTX->allowed_nucleotides['c'] = 1; | |
77 pFASTX->allowed_nucleotides['g'] = 1; | |
78 pFASTX->allowed_nucleotides['t'] = 1; | |
79 | |
80 if (pFASTX->allow_N) | |
81 pFASTX->allowed_nucleotides['n'] = 1; | |
82 } | |
83 } | |
84 | |
85 static void detect_input_format(FASTX *pFASTX) | |
86 { | |
87 //Get the first character in the file, | |
88 //and put it right back | |
89 int c = fgetc(pFASTX->input); | |
90 ungetc(c, pFASTX->input); | |
91 | |
92 switch(c) { | |
93 case '>': /* FASTA file */ | |
94 if ( pFASTX->allow_input_filetype==FASTQ_ONLY ) | |
95 errx(1,"input file (%s) is FASTA, but only FASTQ input is allowed.", | |
96 pFASTX->input_file_name); | |
97 pFASTX->read_fastq = 0 ; | |
98 break; | |
99 | |
100 case '@': /* FASTQ file */ | |
101 if ( pFASTX->allow_input_filetype==FASTA_ONLY ) | |
102 errx(1,"input file (%s) is FASTQ, but only FASTA input is allowed.", | |
103 pFASTX->input_file_name); | |
104 pFASTX->read_fastq = 1; | |
105 break; | |
106 | |
107 case -1: /* EOF as first character - no input */ | |
108 errx(1, "Premature End-Of-File (filename ='%s')", pFASTX->input_file_name); | |
109 break; | |
110 | |
111 default: | |
112 errx(1, "input file (%s) has unknown file format (not FASTA or FASTQ), first character = %c (%d)", | |
113 pFASTX->input_file_name, c,c); | |
114 } | |
115 } | |
116 | |
117 static void convert_ascii_quality_score_line(const char* ascii_quality_scores, FASTX *pFASTX) | |
118 { | |
119 size_t i; | |
120 | |
121 if (strlen(ascii_quality_scores) != strlen(pFASTX->nucleotides)) | |
122 errx(1,"number of quality values (%zu) doesn't match number of nucleotides (%zu) on line %lld", | |
123 strlen(ascii_quality_scores), strlen(pFASTX->nucleotides), | |
124 pFASTX->input_line_number); | |
125 | |
126 for (i=0; i<strlen(ascii_quality_scores); i++) { | |
127 pFASTX->quality[i] = (int) (ascii_quality_scores[i] - 64) ; | |
128 if (pFASTX->quality[i] < -15 || pFASTX->quality[i] > 40) | |
129 errx(1, "Invalid quality score value (char '%c' ord %d quality value %d) on line %lld", | |
130 ascii_quality_scores[i], ascii_quality_scores[i], | |
131 pFASTX->quality[i], pFASTX->input_line_number ); | |
132 } | |
133 | |
134 } | |
135 | |
136 static void convert_numeric_quality_score_line ( const char* numeric_quality_line, FASTX *pFASTX ) | |
137 { | |
138 size_t index; | |
139 const char *quality_tok; | |
140 char *endptr; | |
141 int quality_value; | |
142 | |
143 index=0; | |
144 quality_tok = numeric_quality_line; | |
145 do { | |
146 //read the quality score as an integer value | |
147 quality_value = strtol(quality_tok, &endptr, 10); | |
148 if (endptr == quality_tok) | |
149 errx(1,"Error: invalid quality score data on line %lld (quality_tok = \"%s\"", | |
150 pFASTX->input_line_number ,quality_tok); | |
151 | |
152 if (quality_value > 40 || quality_value < -15) | |
153 errx(1, "invalid quality score value (%d) in line %lld.", | |
154 quality_value, pFASTX->input_line_number); | |
155 | |
156 //convert it ASCII (as per solexa's encoding) | |
157 pFASTX->quality[index] = quality_value; | |
158 index++; | |
159 quality_tok = endptr; | |
160 } while (quality_tok != NULL && *quality_tok!='\0') ; | |
161 | |
162 if (index != strlen(pFASTX->nucleotides)) { | |
163 errx(1,"number of quality values (%zu) doesn't match number of nucleotides (%zu) on line %lld", | |
164 index, strlen(pFASTX->nucleotides), pFASTX->input_line_number ); | |
165 } | |
166 } | |
167 | |
168 void fastx_init_reader(FASTX *pFASTX, const char* filename, | |
169 ALLOWED_INPUT_FILE_TYPES allowed_input_filetype, | |
170 ALLOWED_INPUT_UNKNOWN_BASES allow_N, | |
171 ALLOWED_INPUT_CASE allow_lowercase) | |
172 { | |
173 if (pFASTX==NULL) | |
174 errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); | |
175 | |
176 memset(pFASTX, 0, sizeof(FASTX)); | |
177 | |
178 if (strncmp(filename,"-",5)==0) { | |
179 pFASTX->input = stdin; | |
180 } else { | |
181 pFASTX->input = fopen(filename, "r"); | |
182 if (pFASTX->input==NULL) | |
183 err(1, "failed to open input file '%s'", filename); | |
184 } | |
185 | |
186 strncpy(pFASTX->input_file_name, filename, sizeof(pFASTX->input_file_name)-1); | |
187 | |
188 pFASTX->allow_input_filetype = allowed_input_filetype; | |
189 pFASTX->allow_lowercase = allow_lowercase; | |
190 pFASTX->allow_N = allow_N; | |
191 | |
192 create_lookup_table(pFASTX); | |
193 | |
194 detect_input_format(pFASTX); | |
195 } | |
196 | |
197 int open_output_file(const char* filename) | |
198 { | |
199 int fd ; | |
200 if (strncmp(filename,"-", 6)==0) { | |
201 fd = STDOUT_FILENO; | |
202 } else { | |
203 fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0666 ); | |
204 if (fd==-1) | |
205 err(1, "Failed to create output file (%s)", filename); | |
206 } | |
207 return fd; | |
208 } | |
209 | |
210 int open_output_compressor(FASTX __attribute__((unused)) *pFASTX, const char* filename) | |
211 { | |
212 int fd; | |
213 pid_t child_pid; | |
214 int parent_pipe[2]; | |
215 if (pipe(parent_pipe)!=0) | |
216 err(1,"pipe (for gzip) failed"); | |
217 | |
218 child_pid = fork(); | |
219 if (child_pid>0) { | |
220 /* The parent process */ | |
221 fd = parent_pipe[1]; | |
222 close(parent_pipe[0]); | |
223 return fd; | |
224 } | |
225 | |
226 /* The child process */ | |
227 | |
228 //the compressor's STDIN is the pipe from the parent | |
229 dup2(parent_pipe[0], STDIN_FILENO); | |
230 close(parent_pipe[1]); | |
231 | |
232 //the compressor's STDOUT is the output file | |
233 //(which can be the parent's STDOUT, too) | |
234 fd = open_output_file(filename); | |
235 dup2(fd, STDOUT_FILENO); | |
236 | |
237 //Run GZIP | |
238 execlp("gzip","gzip",NULL); | |
239 | |
240 //Should never get here... | |
241 err(1,"execlp(gzip) failed"); | |
242 } | |
243 | |
244 | |
245 void fastx_init_writer(FASTX *pFASTX, | |
246 const char *filename, | |
247 OUTPUT_FILE_TYPE output_type, | |
248 int compress_output) | |
249 { | |
250 int fd; | |
251 | |
252 if (pFASTX==NULL) | |
253 errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); | |
254 if (pFASTX->input==NULL) | |
255 errx(1,"Internal error: pFASTX not initialized (%s:%d)", __FILE__, __LINE__); | |
256 | |
257 pFASTX->compress_output = compress_output; | |
258 if (pFASTX->compress_output) | |
259 fd = open_output_compressor(pFASTX, filename); | |
260 else | |
261 fd = open_output_file(filename); | |
262 | |
263 pFASTX->output = fdopen(fd,"w"); | |
264 if (pFASTX->output==NULL) | |
265 err(1,"fdopen failed"); | |
266 | |
267 switch(output_type) | |
268 { | |
269 case OUTPUT_FASTA: | |
270 pFASTX->write_fastq = 0 ; | |
271 pFASTX->output_sequence_id_prefix = '>'; | |
272 break ; | |
273 | |
274 case OUTPUT_FASTQ_ASCII_QUAL: | |
275 if (! pFASTX->read_fastq) | |
276 errx(1,"Can't output FASTQ when input is FASTA."); | |
277 pFASTX->write_fastq = 1; | |
278 pFASTX->write_fastq_ascii = 1; | |
279 pFASTX->output_sequence_id_prefix = '@'; | |
280 break ; | |
281 | |
282 case OUTPUT_FASTQ_NUMERIC_QUAL: | |
283 if (! pFASTX->read_fastq) | |
284 errx(1,"Can't output FASTQ when input is FASTA."); | |
285 pFASTX->write_fastq = 1; | |
286 pFASTX->write_fastq_ascii = 0; | |
287 pFASTX->output_sequence_id_prefix = '@'; | |
288 break ; | |
289 | |
290 case OUTPUT_SAME_AS_INPUT: | |
291 pFASTX->write_fastq = pFASTX->read_fastq; | |
292 | |
293 //Assume we're writing ASCII format, | |
294 pFASTX->write_fastq_ascii = 1 ; | |
295 //But set this flag and the real format will be determined | |
296 //when we actually read the FASTQ record | |
297 pFASTX->copy_input_fastq_format_to_output = 1; | |
298 | |
299 pFASTX->output_sequence_id_prefix = (pFASTX->write_fastq) ? '@' : '>'; | |
300 break; | |
301 | |
302 default: | |
303 errx(1, __FILE__ ":%d: Unknown output_type (%d)", | |
304 __LINE__, output_type ) ; | |
305 } | |
306 } | |
307 | |
308 int fastx_read_next_record(FASTX *pFASTX) | |
309 { | |
310 char temp_qual[MAX_SEQ_LINE_LENGTH+1]; | |
311 | |
312 temp_qual[MAX_SEQ_LINE_LENGTH] = 0; | |
313 | |
314 if (pFASTX==NULL) | |
315 errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); | |
316 | |
317 if (fgets(pFASTX->input_sequence_id_prefix, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) | |
318 return 0; //assume end-of-file, if we couldn't read the first line of the foursome | |
319 | |
320 //for the rest of the lines, if they don't appear, it's an error | |
321 pFASTX->input_line_number++; | |
322 | |
323 if (fgets(pFASTX->nucleotides, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) | |
324 errx(1,"Failed to read complete record, missing 2nd line (nucleotides), on line %lld\n", | |
325 pFASTX->input_line_number); | |
326 | |
327 chomp(pFASTX->name); | |
328 chomp(pFASTX->nucleotides); | |
329 | |
330 validate_nucleotides_string(pFASTX); | |
331 | |
332 if (pFASTX->read_fastq) { | |
333 pFASTX->input_line_number++; | |
334 if (fgets(pFASTX->input_name2_prefix, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) | |
335 errx(1,"Failed to read complete record, missing 3rd line (name-2), on line %lld\n", | |
336 pFASTX->input_line_number); | |
337 | |
338 pFASTX->input_line_number++; | |
339 if (fgets(temp_qual, sizeof(temp_qual), pFASTX->input) == NULL) | |
340 errx(1,"Failed to read complete record, missing 4th line (quality), on line %lld\n", | |
341 pFASTX->input_line_number); | |
342 | |
343 chomp(pFASTX->name2); | |
344 chomp(temp_qual); | |
345 | |
346 if (strlen(temp_qual) == strlen(pFASTX->nucleotides)) { | |
347 //Assume this is an ASCII quality score line, convert it to values | |
348 convert_ascii_quality_score_line ( temp_qual, pFASTX ) ; | |
349 pFASTX->read_fastq_ascii = 1 ; | |
350 } else { | |
351 //Assume this is a numeric quality score line, convert it to values | |
352 convert_numeric_quality_score_line ( temp_qual, pFASTX ) ; | |
353 pFASTX->read_fastq_ascii = 0 ; | |
354 } | |
355 | |
356 //Copy the input format to the output format flag | |
357 if (pFASTX->copy_input_fastq_format_to_output) { | |
358 pFASTX->write_fastq_ascii = pFASTX->read_fastq_ascii; | |
359 } | |
360 | |
361 | |
362 } | |
363 | |
364 pFASTX->num_input_sequences++; | |
365 pFASTX->num_input_reads += get_reads_count(pFASTX); | |
366 | |
367 return 1; | |
368 } | |
369 | |
370 static void write_ascii_qual_string(FASTX *pFASTX, int length) | |
371 { | |
372 int i; | |
373 int rc; | |
374 | |
375 for (i=0; i<length; i++) { | |
376 rc = fprintf(pFASTX->output, "%c", pFASTX->quality[i] + 64 ) ; | |
377 if (rc<=0) | |
378 err(1,"writing quality scores failed"); | |
379 } | |
380 rc = fprintf(pFASTX->output, "\n"); | |
381 if (rc<=0) | |
382 err(1,"writing quality scores failed"); | |
383 } | |
384 | |
385 static void write_numeric_qual_string(FASTX *pFASTX, int length) | |
386 { | |
387 int i; | |
388 int rc; | |
389 for (i=0; i<length; i++) { | |
390 rc = fprintf(pFASTX->output, "%d", pFASTX->quality[i] ) ; | |
391 if (rc<=0) | |
392 err(1,"writing quality scores failed"); | |
393 if (i<length-1) { | |
394 rc = fprintf(pFASTX->output," "); | |
395 if (rc<=0) | |
396 err(1,"writing quality scores failed"); | |
397 } | |
398 } | |
399 rc = fprintf(pFASTX->output, "\n"); | |
400 if (rc<=0) | |
401 err(1,"writing quality scores failed"); | |
402 } | |
403 | |
404 void fastx_write_record(FASTX *pFASTX) | |
405 { | |
406 int len; | |
407 int rc; | |
408 | |
409 if (pFASTX==NULL) | |
410 errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); | |
411 | |
412 | |
413 rc = fprintf(pFASTX->output, "%c%s\n", | |
414 pFASTX->output_sequence_id_prefix, | |
415 pFASTX->name ) ; | |
416 if (rc<=0) | |
417 err(1,"writing sequence identifier failed"); | |
418 | |
419 rc = fprintf(pFASTX->output, "%s\n", pFASTX->nucleotides); | |
420 if (rc<=0) | |
421 err(1,"writing nucleotides failed"); | |
422 | |
423 if (pFASTX->write_fastq) { | |
424 rc = fprintf(pFASTX->output, "+%s\n", pFASTX->name2 ) ; | |
425 if (rc<=0) | |
426 err(1,"writing 2nd sequence identifier failed"); | |
427 | |
428 len = strlen(pFASTX->nucleotides); | |
429 if (pFASTX->write_fastq_ascii) | |
430 write_ascii_qual_string(pFASTX, len); | |
431 else | |
432 write_numeric_qual_string(pFASTX, len); | |
433 } | |
434 | |
435 pFASTX->num_output_sequences++; | |
436 pFASTX->num_output_reads += get_reads_count(pFASTX); | |
437 } | |
438 | |
439 int get_reads_count(const FASTX *pFASTX) | |
440 { | |
441 char *dash = NULL ; | |
442 | |
443 //FASTQ files are never collapsed (at least not in Gordon's Galaxy) | |
444 if (pFASTX->read_fastq) | |
445 return 1; | |
446 | |
447 dash = strchr(pFASTX->name,'-'); | |
448 | |
449 // minus character wasn't found- | |
450 // this sequence is most probably not collapsed | |
451 if (dash==NULL) | |
452 return 1; | |
453 | |
454 int count = atoi(dash+1); | |
455 if (count>0) | |
456 return count; | |
457 | |
458 return 1; | |
459 } | |
460 | |
461 size_t num_input_sequences(const FASTX *pFASTX) | |
462 { | |
463 return pFASTX->num_input_sequences; | |
464 } | |
465 | |
466 size_t num_input_reads(const FASTX *pFASTX) | |
467 { | |
468 return pFASTX->num_input_reads; | |
469 } | |
470 | |
471 size_t num_output_sequences(const FASTX *pFASTX) | |
472 { | |
473 return pFASTX->num_output_sequences; | |
474 } | |
475 | |
476 size_t num_output_reads(const FASTX *pFASTX) | |
477 { | |
478 return pFASTX->num_output_reads; | |
479 } | |
480 | |
481 |