Mercurial > repos > xilinxu > xilinxu
diff fastx_toolkit-0.0.6/src/libfastx/fastx.h @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_toolkit-0.0.6/src/libfastx/fastx.h Thu Aug 14 04:52:17 2014 -0400 @@ -0,0 +1,136 @@ +/* + FASTX-toolkit - FASTA/FASTQ preprocessing tools. + Copyright (C) 2009 A. Gordon (gordon@cshl.edu) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +#ifndef __FASTX_HEADER__ +#define __FASTX_HEADER__ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef PATH_MAX +#include <linux/limits.h> +#endif + +#define MIN_QUALITY_VALUE (-50) +#define MAX_QUALITY_VALUE 50 +#define QUALITY_VALUES_RANGE (MAX_QUALITY_VALUE-MIN_QUALITY_VALUE) + + +#ifndef MAX_SEQ_LINE_LENGTH +#define MAX_SEQ_LINE_LENGTH (25000) +#endif + +typedef enum { + FASTA_ONLY=0, + FASTA_OR_FASTQ=1, + FASTQ_ONLY=2 +} ALLOWED_INPUT_FILE_TYPES; + +typedef enum { + DISALLOW_N=0, + ALLOW_N=1 +} ALLOWED_INPUT_UNKNOWN_BASES; + +typedef enum { + REQUIRE_UPPERCASE=0, + ALLOW_LOWERCASE=1 +} ALLOWED_INPUT_CASE; + +typedef enum { + OUTPUT_FASTA=0, + OUTPUT_FASTQ_ASCII_QUAL=1, + OUTPUT_FASTQ_NUMERIC_QUAL=2, + OUTPUT_SAME_AS_INPUT=3 +} OUTPUT_FILE_TYPE; + +#pragma pack(1) +typedef struct +{ + /* Record data - common for FASTA/FASTQ */ + char input_sequence_id_prefix[1]; //DON'T touch this - this hack will read the entire name into the variable 'name', + //leaving the prefix ('>' or '@') in 'input_sequence_id_name'. + char name[MAX_SEQ_LINE_LENGTH+1]; + char nucleotides[MAX_SEQ_LINE_LENGTH+1]; + /* Record data - only for FASTQ */ + char input_name2_prefix[1]; //same hack as 'input_sequence_id_prefix' + char name2[MAX_SEQ_LINE_LENGTH+1]; + int quality[MAX_SEQ_LINE_LENGTH+1]; //note: this is NOT ascii values, but numerical values + // numeric quality scores and ASCII quality scores + // are automatically converted to numbers (-15 to 40) + + /* Configuration */ + int allow_input_filetype; // 0 = Allow only FASTA + int allow_N; // 1 = N is valid nucleotide, 0 = only A/G/C/T are valid + int allow_lowercase; + int read_fastq; // 1 = Input is FASTQ (only if allow_input_fastq==1) + int read_fastq_ascii; // 1 = Input is FASTQ with ASCII quality scores (0 = with numeric quality scores) + int write_fastq; // 0 = Write only FASTA (regardless of input type) + int write_fastq_ascii; // 1 = Write ASCII quality scores, 0 = write numeric quality scores + int compress_output; // 1 = pass output through GZIP + + int copy_input_fastq_format_to_output ; // 1 = copy 'read_fastq_ascii' to 'write_fastq_ascii' + // so that the output format is the same as the input + + + /* Internal data */ + int allowed_nucleotides[256]; //quick lookup table for valid input + char output_sequence_id_prefix; // '>' or '@', depending on the requested output type + + char input_file_name[PATH_MAX]; //in linux, PATH_MAX is defined in <linux/limits.h> + unsigned long long input_line_number; + char output_file_name[PATH_MAX]; //in linux, PATH_MAX is defined in <linux/limits.h> + + size_t num_input_sequences; + size_t num_output_sequences; + size_t num_input_reads; + size_t num_output_reads; + + FILE* input; + FILE* output; +} FASTX ; + + +void fastx_init_reader(FASTX *pFASTX, const char* filename, + ALLOWED_INPUT_FILE_TYPES allowed_input_filetype, + ALLOWED_INPUT_UNKNOWN_BASES allow_N, + ALLOWED_INPUT_CASE allow_lowercase); + +// If the sequence identifier is collapsed (= "N-N") returns the reads_count, +// otherwise, returns 1 +int get_reads_count(const FASTX *pFASTX); + +void fastx_init_writer(FASTX *pFASTX, + const char* filename, + OUTPUT_FILE_TYPE output_type, + int compress_output); + +int fastx_read_next_record(FASTX *pFASTX); + +void fastx_write_record(FASTX *pFASTX); + +size_t num_input_sequences(const FASTX *pFASTX); +size_t num_input_reads(const FASTX *pFASTX); +size_t num_output_sequences(const FASTX *pFASTX); +size_t num_output_reads(const FASTX *pFASTX); + +#ifdef __cplusplus +} +#endif + +#endif +