Mercurial > repos > xilinxu > xilinxu
diff fastx_toolkit-0.0.6/src/libfastx/fastx.c @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_toolkit-0.0.6/src/libfastx/fastx.c Thu Aug 14 04:52:17 2014 -0400 @@ -0,0 +1,481 @@ +/* + FASTX-toolkit - FASTA/FASTQ preprocessing tools. + Copyright (C) 2009 A. Gordon (gordon@cshl.edu) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +#include <stdio.h> +#include <stdlib.h> +#include <error.h> +#include <err.h> +#include <string.h> +#include <linux/limits.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + + +#include "chomp.h" +#include "fastx.h" + +/* + valid_sequence_string - + check validity of a given sequence string. + + input - + sequence - NULL terminated string to be validated. + + Output - + 1 (true) - The given sequence is valid - contained only A/C/G/N/T characters. + 0 (false) - The given string contained invalid characeters. + + Remark - + sequences with unknown (N) bases are considered VALID. +*/ +static int validate_nucleotides_string(const FASTX *pFASTX) +{ + int match = 1 ; + const char* seq = pFASTX->nucleotides; + + while (*seq != '\0' && match) { + match &= pFASTX->allowed_nucleotides[ (int) *seq ]; + seq++; + } + return match; +} + +static void create_lookup_table(FASTX *pFASTX) +{ + int i; + + for (i=0; i<256; i++) + pFASTX->allowed_nucleotides[i] = 0 ; + + pFASTX->allowed_nucleotides['A'] = 1; + pFASTX->allowed_nucleotides['C'] = 1; + pFASTX->allowed_nucleotides['G'] = 1; + pFASTX->allowed_nucleotides['T'] = 1; + + if (pFASTX->allow_N) + pFASTX->allowed_nucleotides['N'] = 1; + + if (pFASTX->allow_lowercase) { + pFASTX->allowed_nucleotides['a'] = 1; + pFASTX->allowed_nucleotides['c'] = 1; + pFASTX->allowed_nucleotides['g'] = 1; + pFASTX->allowed_nucleotides['t'] = 1; + + if (pFASTX->allow_N) + pFASTX->allowed_nucleotides['n'] = 1; + } +} + +static void detect_input_format(FASTX *pFASTX) +{ + //Get the first character in the file, + //and put it right back + int c = fgetc(pFASTX->input); + ungetc(c, pFASTX->input); + + switch(c) { + case '>': /* FASTA file */ + if ( pFASTX->allow_input_filetype==FASTQ_ONLY ) + errx(1,"input file (%s) is FASTA, but only FASTQ input is allowed.", + pFASTX->input_file_name); + pFASTX->read_fastq = 0 ; + break; + + case '@': /* FASTQ file */ + if ( pFASTX->allow_input_filetype==FASTA_ONLY ) + errx(1,"input file (%s) is FASTQ, but only FASTA input is allowed.", + pFASTX->input_file_name); + pFASTX->read_fastq = 1; + break; + + case -1: /* EOF as first character - no input */ + errx(1, "Premature End-Of-File (filename ='%s')", pFASTX->input_file_name); + break; + + default: + errx(1, "input file (%s) has unknown file format (not FASTA or FASTQ), first character = %c (%d)", + pFASTX->input_file_name, c,c); + } +} + +static void convert_ascii_quality_score_line(const char* ascii_quality_scores, FASTX *pFASTX) +{ + size_t i; + + if (strlen(ascii_quality_scores) != strlen(pFASTX->nucleotides)) + errx(1,"number of quality values (%zu) doesn't match number of nucleotides (%zu) on line %lld", + strlen(ascii_quality_scores), strlen(pFASTX->nucleotides), + pFASTX->input_line_number); + + for (i=0; i<strlen(ascii_quality_scores); i++) { + pFASTX->quality[i] = (int) (ascii_quality_scores[i] - 64) ; + if (pFASTX->quality[i] < -15 || pFASTX->quality[i] > 40) + errx(1, "Invalid quality score value (char '%c' ord %d quality value %d) on line %lld", + ascii_quality_scores[i], ascii_quality_scores[i], + pFASTX->quality[i], pFASTX->input_line_number ); + } + +} + +static void convert_numeric_quality_score_line ( const char* numeric_quality_line, FASTX *pFASTX ) +{ + size_t index; + const char *quality_tok; + char *endptr; + int quality_value; + + index=0; + quality_tok = numeric_quality_line; + do { + //read the quality score as an integer value + quality_value = strtol(quality_tok, &endptr, 10); + if (endptr == quality_tok) + errx(1,"Error: invalid quality score data on line %lld (quality_tok = \"%s\"", + pFASTX->input_line_number ,quality_tok); + + if (quality_value > 40 || quality_value < -15) + errx(1, "invalid quality score value (%d) in line %lld.", + quality_value, pFASTX->input_line_number); + + //convert it ASCII (as per solexa's encoding) + pFASTX->quality[index] = quality_value; + index++; + quality_tok = endptr; + } while (quality_tok != NULL && *quality_tok!='\0') ; + + if (index != strlen(pFASTX->nucleotides)) { + errx(1,"number of quality values (%zu) doesn't match number of nucleotides (%zu) on line %lld", + index, strlen(pFASTX->nucleotides), pFASTX->input_line_number ); + } +} + +void fastx_init_reader(FASTX *pFASTX, const char* filename, + ALLOWED_INPUT_FILE_TYPES allowed_input_filetype, + ALLOWED_INPUT_UNKNOWN_BASES allow_N, + ALLOWED_INPUT_CASE allow_lowercase) +{ + if (pFASTX==NULL) + errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); + + memset(pFASTX, 0, sizeof(FASTX)); + + if (strncmp(filename,"-",5)==0) { + pFASTX->input = stdin; + } else { + pFASTX->input = fopen(filename, "r"); + if (pFASTX->input==NULL) + err(1, "failed to open input file '%s'", filename); + } + + strncpy(pFASTX->input_file_name, filename, sizeof(pFASTX->input_file_name)-1); + + pFASTX->allow_input_filetype = allowed_input_filetype; + pFASTX->allow_lowercase = allow_lowercase; + pFASTX->allow_N = allow_N; + + create_lookup_table(pFASTX); + + detect_input_format(pFASTX); +} + +int open_output_file(const char* filename) +{ + int fd ; + if (strncmp(filename,"-", 6)==0) { + fd = STDOUT_FILENO; + } else { + fd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0666 ); + if (fd==-1) + err(1, "Failed to create output file (%s)", filename); + } + return fd; +} + +int open_output_compressor(FASTX __attribute__((unused)) *pFASTX, const char* filename) +{ + int fd; + pid_t child_pid; + int parent_pipe[2]; + if (pipe(parent_pipe)!=0) + err(1,"pipe (for gzip) failed"); + + child_pid = fork(); + if (child_pid>0) { + /* The parent process */ + fd = parent_pipe[1]; + close(parent_pipe[0]); + return fd; + } + + /* The child process */ + + //the compressor's STDIN is the pipe from the parent + dup2(parent_pipe[0], STDIN_FILENO); + close(parent_pipe[1]); + + //the compressor's STDOUT is the output file + //(which can be the parent's STDOUT, too) + fd = open_output_file(filename); + dup2(fd, STDOUT_FILENO); + + //Run GZIP + execlp("gzip","gzip",NULL); + + //Should never get here... + err(1,"execlp(gzip) failed"); +} + + +void fastx_init_writer(FASTX *pFASTX, + const char *filename, + OUTPUT_FILE_TYPE output_type, + int compress_output) +{ + int fd; + + if (pFASTX==NULL) + errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); + if (pFASTX->input==NULL) + errx(1,"Internal error: pFASTX not initialized (%s:%d)", __FILE__, __LINE__); + + pFASTX->compress_output = compress_output; + if (pFASTX->compress_output) + fd = open_output_compressor(pFASTX, filename); + else + fd = open_output_file(filename); + + pFASTX->output = fdopen(fd,"w"); + if (pFASTX->output==NULL) + err(1,"fdopen failed"); + + switch(output_type) + { + case OUTPUT_FASTA: + pFASTX->write_fastq = 0 ; + pFASTX->output_sequence_id_prefix = '>'; + break ; + + case OUTPUT_FASTQ_ASCII_QUAL: + if (! pFASTX->read_fastq) + errx(1,"Can't output FASTQ when input is FASTA."); + pFASTX->write_fastq = 1; + pFASTX->write_fastq_ascii = 1; + pFASTX->output_sequence_id_prefix = '@'; + break ; + + case OUTPUT_FASTQ_NUMERIC_QUAL: + if (! pFASTX->read_fastq) + errx(1,"Can't output FASTQ when input is FASTA."); + pFASTX->write_fastq = 1; + pFASTX->write_fastq_ascii = 0; + pFASTX->output_sequence_id_prefix = '@'; + break ; + + case OUTPUT_SAME_AS_INPUT: + pFASTX->write_fastq = pFASTX->read_fastq; + + //Assume we're writing ASCII format, + pFASTX->write_fastq_ascii = 1 ; + //But set this flag and the real format will be determined + //when we actually read the FASTQ record + pFASTX->copy_input_fastq_format_to_output = 1; + + pFASTX->output_sequence_id_prefix = (pFASTX->write_fastq) ? '@' : '>'; + break; + + default: + errx(1, __FILE__ ":%d: Unknown output_type (%d)", + __LINE__, output_type ) ; + } +} + +int fastx_read_next_record(FASTX *pFASTX) +{ + char temp_qual[MAX_SEQ_LINE_LENGTH+1]; + + temp_qual[MAX_SEQ_LINE_LENGTH] = 0; + + if (pFASTX==NULL) + errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); + + if (fgets(pFASTX->input_sequence_id_prefix, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) + return 0; //assume end-of-file, if we couldn't read the first line of the foursome + + //for the rest of the lines, if they don't appear, it's an error + pFASTX->input_line_number++; + + if (fgets(pFASTX->nucleotides, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) + errx(1,"Failed to read complete record, missing 2nd line (nucleotides), on line %lld\n", + pFASTX->input_line_number); + + chomp(pFASTX->name); + chomp(pFASTX->nucleotides); + + validate_nucleotides_string(pFASTX); + + if (pFASTX->read_fastq) { + pFASTX->input_line_number++; + if (fgets(pFASTX->input_name2_prefix, MAX_SEQ_LINE_LENGTH, pFASTX->input) == NULL) + errx(1,"Failed to read complete record, missing 3rd line (name-2), on line %lld\n", + pFASTX->input_line_number); + + pFASTX->input_line_number++; + if (fgets(temp_qual, sizeof(temp_qual), pFASTX->input) == NULL) + errx(1,"Failed to read complete record, missing 4th line (quality), on line %lld\n", + pFASTX->input_line_number); + + chomp(pFASTX->name2); + chomp(temp_qual); + + if (strlen(temp_qual) == strlen(pFASTX->nucleotides)) { + //Assume this is an ASCII quality score line, convert it to values + convert_ascii_quality_score_line ( temp_qual, pFASTX ) ; + pFASTX->read_fastq_ascii = 1 ; + } else { + //Assume this is a numeric quality score line, convert it to values + convert_numeric_quality_score_line ( temp_qual, pFASTX ) ; + pFASTX->read_fastq_ascii = 0 ; + } + + //Copy the input format to the output format flag + if (pFASTX->copy_input_fastq_format_to_output) { + pFASTX->write_fastq_ascii = pFASTX->read_fastq_ascii; + } + + + } + + pFASTX->num_input_sequences++; + pFASTX->num_input_reads += get_reads_count(pFASTX); + + return 1; +} + +static void write_ascii_qual_string(FASTX *pFASTX, int length) +{ + int i; + int rc; + + for (i=0; i<length; i++) { + rc = fprintf(pFASTX->output, "%c", pFASTX->quality[i] + 64 ) ; + if (rc<=0) + err(1,"writing quality scores failed"); + } + rc = fprintf(pFASTX->output, "\n"); + if (rc<=0) + err(1,"writing quality scores failed"); +} + +static void write_numeric_qual_string(FASTX *pFASTX, int length) +{ + int i; + int rc; + for (i=0; i<length; i++) { + rc = fprintf(pFASTX->output, "%d", pFASTX->quality[i] ) ; + if (rc<=0) + err(1,"writing quality scores failed"); + if (i<length-1) { + rc = fprintf(pFASTX->output," "); + if (rc<=0) + err(1,"writing quality scores failed"); + } + } + rc = fprintf(pFASTX->output, "\n"); + if (rc<=0) + err(1,"writing quality scores failed"); +} + +void fastx_write_record(FASTX *pFASTX) +{ + int len; + int rc; + + if (pFASTX==NULL) + errx(1,"Internal error: pFASTX==NULL (%s:%d)", __FILE__,__LINE__); + + + rc = fprintf(pFASTX->output, "%c%s\n", + pFASTX->output_sequence_id_prefix, + pFASTX->name ) ; + if (rc<=0) + err(1,"writing sequence identifier failed"); + + rc = fprintf(pFASTX->output, "%s\n", pFASTX->nucleotides); + if (rc<=0) + err(1,"writing nucleotides failed"); + + if (pFASTX->write_fastq) { + rc = fprintf(pFASTX->output, "+%s\n", pFASTX->name2 ) ; + if (rc<=0) + err(1,"writing 2nd sequence identifier failed"); + + len = strlen(pFASTX->nucleotides); + if (pFASTX->write_fastq_ascii) + write_ascii_qual_string(pFASTX, len); + else + write_numeric_qual_string(pFASTX, len); + } + + pFASTX->num_output_sequences++; + pFASTX->num_output_reads += get_reads_count(pFASTX); +} + +int get_reads_count(const FASTX *pFASTX) +{ + char *dash = NULL ; + + //FASTQ files are never collapsed (at least not in Gordon's Galaxy) + if (pFASTX->read_fastq) + return 1; + + dash = strchr(pFASTX->name,'-'); + + // minus character wasn't found- + // this sequence is most probably not collapsed + if (dash==NULL) + return 1; + + int count = atoi(dash+1); + if (count>0) + return count; + + return 1; +} + +size_t num_input_sequences(const FASTX *pFASTX) +{ + return pFASTX->num_input_sequences; +} + +size_t num_input_reads(const FASTX *pFASTX) +{ + return pFASTX->num_input_reads; +} + +size_t num_output_sequences(const FASTX *pFASTX) +{ + return pFASTX->num_output_sequences; +} + +size_t num_output_reads(const FASTX *pFASTX) +{ + return pFASTX->num_output_reads; +} + +