Mercurial > repos > xilinxu > xilinxu
view fastx_toolkit-0.0.6/src/libfastx/fastx.c @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
line wrap: on
line source
/* FASTX-toolkit - FASTA/FASTQ preprocessing tools. Copyright (C) 2009 A. Gordon (gordon@cshl.edu) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <stdio.h> #include <stdlib.h> #include <error.h> #include <err.h> #include <string.h> #include <linux/limits.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include "chomp.h" #include "fastx.h" /* valid_sequence_string - check validity of a given sequence string. input - sequence - NULL terminated string to be validated. Output - 1 (true) - The given sequence is valid - contained only A/C/G/N/T characters. 0 (false) - The given string contained invalid characeters. Remark - sequences with unknown (N) bases are considered VALID. */ static int validate_nucleotides_string(const FASTX *pFASTX) { int match = 1 ; const char* seq = pFASTX->nucleotides; while (*seq != '\0' && match) { match &= pFASTX->allowed_nucleotides[ (int) *seq ]; seq++; } return match; } static void create_lookup_table(FASTX *pFASTX) { int i; for (i=0; i<256; i++) pFASTX->allowed_nucleotides[i] = 0 ; pFASTX->allowed_nucleotides['A'] = 1; pFASTX->allowed_nucleotides['C'] = 1; pFASTX->allowed_nucleotides['G'] = 1; pFASTX->allowed_nucleotides['T'] = 1; if (pFASTX->allow_N) pFASTX->allowed_nucleotides['N'] = 1; if (pFASTX->allow_lowercase) { pFASTX->allowed_nucleotides['a'] = 1; pFASTX->allowed_nucleotides['c'] = 1; pFASTX->allowed_nucleotides['g'] = 1; pFASTX->allowed_nucleotides['t'] = 1; if (pFASTX->allow_N) pFASTX->allowed_nucleotides['n'] = 1; } } static void detect_input_format(FASTX *pFASTX) { //Get the first character in the file, //and put it right back int c = fgetc(pFASTX->input); ungetc(c, pFASTX->input); switch(c) { case '>': /* FASTA file */ if ( pFASTX->allow_input_filetype==FASTQ_ONLY ) errx(1,"input file (%s) is FASTA, but only FASTQ input is allowed.", pFASTX->input_file_name); pFASTX->read_fastq = 0 ; break; case '@': /* FASTQ file */ if ( pFASTX->allow_input_filetype==FASTA_ONLY ) errx(1,"input file (%s) is FASTQ, but only FASTA input is allowed.", pFASTX->input_file_name); pFASTX->read_fastq = 1; break; case -1: /* EOF as first character - no input */ errx(1, "Premature End-Of-File (filename ='%s')", pFASTX->input_file_name); break; default: errx(1, "input file (%s) has unknown file format (not FASTA or FASTQ), first character = %c (%d)", pFASTX->input_file_name, c,c); } } static void convert_ascii_quality_score_line(const char* ascii_quality_scores, FASTX *pFASTX) { size_t i; if (strlen(ascii_quality_scores) != strlen(pFASTX->nucleotides)) errx(1,"number of quality values (%zu) doesn't match number of nucleotides (%zu) on line %lld", strlen(ascii_quality_scores), strlen(pFASTX->nucleotides), pFASTX->input_line_number); for (i=0; i<strlen(ascii_quality_scores); i++) { pFASTX->quality[i] = (int) (ascii_quality_scores[i] - 64) ; if (pFASTX->quality[i] < -15 || pFASTX->quality[i] > 40) errx(1, "Invalid quality score value (char '%c' ord %d quality value %d) on line %lld", ascii_quality_scores[i], ascii_quality_scores[i], pFASTX->quality[i], pFASTX->input_line_number ); } } static void convert_numeric_quality_score_line ( const char* numeric_quality_line, FASTX *pFASTX ) { size_t index; const char *quality_tok; char *endptr; int quality_value; index=0; quality_tok = numeric_quality_line; do { //read the quality score as an integer value quality_value = strtol(quality_tok, &endptr, 10); if (endptr == quality_tok) errx(1,"Error: invalid quality score data on line %lld (quality_tok = \"%s\"", pFASTX->input_line_number ,quality_tok); if (quality_value > 40 || quality_value < -15) errx(1, "invalid quality score value (%d) in line %lld.", quality_value, pFASTX->input_line_number); //convert it ASCII (as per solexa's encoding) pFASTX->quality[index] = quality_value; index++; quality_tok = endptr; } while (quality_tok != NULL && *quality_tok!='\0') ; if (index != strlen(pFASTX->nucleotides)) { errx(1,"number of quality values (%zu) doesn't match number of nucleotides (%zu) on line %lld", index, strlen(pFASTX->nucleotides), pFASTX->input_line_number ); } } void fastx_init_reader(FASTX *pFASTX, const char* filename, ALLOWED_INPUT_FILE_TYPES allowed_input_filetype, ALLOWED_INPUT_UNKNOWN_BASES allow_N, ALLOWED_INPUT_CASE allow_lowercase) { if (pFASTX==NULL) errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); memset(pFASTX, 0, sizeof(FASTX)); if (strncmp(filename,"-",5)==0) { pFASTX->input = stdin; } else { pFASTX->input = fopen(filename, "r"); if (pFASTX->input==NULL) err(1, "failed to open input file '%s'", filename); } strncpy(pFASTX->input_file_name, filename, sizeof(pFASTX->input_file_name)-1); pFASTX->allow_input_filetype = allowed_input_filetype; pFASTX->allow_lowercase = allow_lowercase; pFASTX->allow_N = allow_N; create_lookup_table(pFASTX); detect_input_format(pFASTX); } int open_output_file(const char* filename) { int fd ; if (strncmp(filename,"-", 6)==0) { fd = STDOUT_FILENO; } else { fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0666 ); if (fd==-1) err(1, "Failed to create output file (%s)", filename); } return fd; } int open_output_compressor(FASTX __attribute__((unused)) *pFASTX, const char* filename) { int fd; pid_t child_pid; int parent_pipe[2]; if (pipe(parent_pipe)!=0) err(1,"pipe (for gzip) failed"); child_pid = fork(); if (child_pid>0) { /* The parent process */ fd = parent_pipe[1]; close(parent_pipe[0]); return fd; } /* The child process */ //the compressor's STDIN is the pipe from the parent dup2(parent_pipe[0], STDIN_FILENO); close(parent_pipe[1]); //the compressor's STDOUT is the output file //(which can be the parent's STDOUT, too) fd = open_output_file(filename); dup2(fd, STDOUT_FILENO); //Run GZIP execlp("gzip","gzip",NULL); //Should never get here... err(1,"execlp(gzip) failed"); } void fastx_init_writer(FASTX *pFASTX, const char *filename, OUTPUT_FILE_TYPE output_type, int compress_output) { int fd; if (pFASTX==NULL) errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); if (pFASTX->input==NULL) errx(1,"Internal error: pFASTX not initialized (%s:%d)", __FILE__, __LINE__); pFASTX->compress_output = compress_output; if (pFASTX->compress_output) fd = open_output_compressor(pFASTX, filename); else fd = open_output_file(filename); pFASTX->output = fdopen(fd,"w"); if (pFASTX->output==NULL) err(1,"fdopen failed"); switch(output_type) { case OUTPUT_FASTA: pFASTX->write_fastq = 0 ; pFASTX->output_sequence_id_prefix = '>'; break ; case OUTPUT_FASTQ_ASCII_QUAL: if (! pFASTX->read_fastq) errx(1,"Can't output FASTQ when input is FASTA."); pFASTX->write_fastq = 1; pFASTX->write_fastq_ascii = 1; pFASTX->output_sequence_id_prefix = '@'; break ; case OUTPUT_FASTQ_NUMERIC_QUAL: if (! pFASTX->read_fastq) errx(1,"Can't output FASTQ when input is FASTA."); pFASTX->write_fastq = 1; pFASTX->write_fastq_ascii = 0; pFASTX->output_sequence_id_prefix = '@'; break ; case OUTPUT_SAME_AS_INPUT: pFASTX->write_fastq = pFASTX->read_fastq; //Assume we're writing ASCII format, pFASTX->write_fastq_ascii = 1 ; //But set this flag and the real format will be determined //when we actually read the FASTQ record pFASTX->copy_input_fastq_format_to_output = 1; pFASTX->output_sequence_id_prefix = (pFASTX->write_fastq) ? '@' : '>'; break; default: errx(1, __FILE__ ":%d: Unknown output_type (%d)", __LINE__, output_type ) ; } } int fastx_read_next_record(FASTX *pFASTX) { char temp_qual[MAX_SEQ_LINE_LENGTH+1]; temp_qual[MAX_SEQ_LINE_LENGTH] = 0; if (pFASTX==NULL) errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); if (fgets(pFASTX->input_sequence_id_prefix, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) return 0; //assume end-of-file, if we couldn't read the first line of the foursome //for the rest of the lines, if they don't appear, it's an error pFASTX->input_line_number++; if (fgets(pFASTX->nucleotides, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) errx(1,"Failed to read complete record, missing 2nd line (nucleotides), on line %lld\n", pFASTX->input_line_number); chomp(pFASTX->name); chomp(pFASTX->nucleotides); validate_nucleotides_string(pFASTX); if (pFASTX->read_fastq) { pFASTX->input_line_number++; if (fgets(pFASTX->input_name2_prefix, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) errx(1,"Failed to read complete record, missing 3rd line (name-2), on line %lld\n", pFASTX->input_line_number); pFASTX->input_line_number++; if (fgets(temp_qual, sizeof(temp_qual), pFASTX->input) == NULL) errx(1,"Failed to read complete record, missing 4th line (quality), on line %lld\n", pFASTX->input_line_number); chomp(pFASTX->name2); chomp(temp_qual); if (strlen(temp_qual) == strlen(pFASTX->nucleotides)) { //Assume this is an ASCII quality score line, convert it to values convert_ascii_quality_score_line ( temp_qual, pFASTX ) ; pFASTX->read_fastq_ascii = 1 ; } else { //Assume this is a numeric quality score line, convert it to values convert_numeric_quality_score_line ( temp_qual, pFASTX ) ; pFASTX->read_fastq_ascii = 0 ; } //Copy the input format to the output format flag if (pFASTX->copy_input_fastq_format_to_output) { pFASTX->write_fastq_ascii = pFASTX->read_fastq_ascii; } } pFASTX->num_input_sequences++; pFASTX->num_input_reads += get_reads_count(pFASTX); return 1; } static void write_ascii_qual_string(FASTX *pFASTX, int length) { int i; int rc; for (i=0; i<length; i++) { rc = fprintf(pFASTX->output, "%c", pFASTX->quality[i] + 64 ) ; if (rc<=0) err(1,"writing quality scores failed"); } rc = fprintf(pFASTX->output, "\n"); if (rc<=0) err(1,"writing quality scores failed"); } static void write_numeric_qual_string(FASTX *pFASTX, int length) { int i; int rc; for (i=0; i<length; i++) { rc = fprintf(pFASTX->output, "%d", pFASTX->quality[i] ) ; if (rc<=0) err(1,"writing quality scores failed"); if (i<length-1) { rc = fprintf(pFASTX->output," "); if (rc<=0) err(1,"writing quality scores failed"); } } rc = fprintf(pFASTX->output, "\n"); if (rc<=0) err(1,"writing quality scores failed"); } void fastx_write_record(FASTX *pFASTX) { int len; int rc; if (pFASTX==NULL) errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); rc = fprintf(pFASTX->output, "%c%s\n", pFASTX->output_sequence_id_prefix, pFASTX->name ) ; if (rc<=0) err(1,"writing sequence identifier failed"); rc = fprintf(pFASTX->output, "%s\n", pFASTX->nucleotides); if (rc<=0) err(1,"writing nucleotides failed"); if (pFASTX->write_fastq) { rc = fprintf(pFASTX->output, "+%s\n", pFASTX->name2 ) ; if (rc<=0) err(1,"writing 2nd sequence identifier failed"); len = strlen(pFASTX->nucleotides); if (pFASTX->write_fastq_ascii) write_ascii_qual_string(pFASTX, len); else write_numeric_qual_string(pFASTX, len); } pFASTX->num_output_sequences++; pFASTX->num_output_reads += get_reads_count(pFASTX); } int get_reads_count(const FASTX *pFASTX) { char *dash = NULL ; //FASTQ files are never collapsed (at least not in Gordon's Galaxy) if (pFASTX->read_fastq) return 1; dash = strchr(pFASTX->name,'-'); // minus character wasn't found- // this sequence is most probably not collapsed if (dash==NULL) return 1; int count = atoi(dash+1); if (count>0) return count; return 1; } size_t num_input_sequences(const FASTX *pFASTX) { return pFASTX->num_input_sequences; } size_t num_input_reads(const FASTX *pFASTX) { return pFASTX->num_input_reads; } size_t num_output_sequences(const FASTX *pFASTX) { return pFASTX->num_output_sequences; } size_t num_output_reads(const FASTX *pFASTX) { return pFASTX->num_output_reads; }