Mercurial > repos > xilinxu > xilinxu
view fastx_toolkit-0.0.6/src/fastx_clipper/fastx_clipper.cpp @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
line wrap: on
line source
/* FASTX-toolkit - FASTA/FASTQ preprocessing tools. Copyright (C) 2009 A. Gordon (gordon@cshl.edu) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <cstddef> #include <cstdlib> #include <algorithm> #include <ostream> #include <iostream> #include <string> #include <vector> #include <string.h> #include "sequence_alignment.h" #include <errno.h> #include <err.h> #include <config.h> #include "fastx.h" #include "fastx_args.h" #define MAX_ADAPTER_LEN 100 const char* usage= "usage: fastx_clipper [-h] [-a ADAPTER] [-D] [-l N] [-n] [-d N] [-c] [-C] [-o] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \ "\n" \ "version " VERSION "\n" \ " [-h] = This helpful help screen.\n" \ " [-a ADAPTER] = ADAPTER string. default is CCTTAAGG (dummy adapter).\n" \ " [-l N] = discard sequences shorter than N nucleotides. default is 5.\n" \ " [-d N] = Keep the adapter and N bases after it.\n" \ " (using '-d 0' is the same as not using '-d' at all. which is the default).\n" \ " [-c] = Discard non-clipped sequences (i.e. - keep only sequences which contained the adapter).\n" \ " [-C] = Discard clipped sequences (i.e. - keep only sequences which did not contained the adapter).\n" \ " [-k] = Report Adapter-Only sequences.\n" \ " [-n] = keep sequences with unknown (N) nucleotides. default is to discard such sequences.\n" \ " [-v] = Verbose - report number of sequences.\n" \ " If [-o] is specified, report will be printed to STDOUT.\n" \ " If [-o] is not specified (and output goes to STDOUT),\n" \ " report will be printed to STDERR.\n" \ " [-z] = Compress output with GZIP.\n" \ " [-D] = DEBUG output.\n" \ " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \ " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \ "\n"; //Default adapter - Dummy sequence char adapter[MAX_ADAPTER_LEN]="CCTTAAGG"; unsigned int min_length=5; int discard_unknown_bases=1; int keep_delta=0; int discard_non_clipped=0; int discard_clipped=0; int show_adapter_only=0; int debug = 0 ; //Statistics for verbose report unsigned int count_input=0 ; unsigned int count_discarded_too_short=0; // see [-l N] option unsigned int count_discarded_adapter_at_index_zero=0; //empty sequences (after clipping) unsigned int count_discarded_no_adapter_found=0; // see [-c] option unsigned int count_discarded_adapter_found=0; // see [-C] option unsigned int count_discarded_N=0; // see [-n] FASTX fastx; HalfLocalSequenceAlignment align; int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg) { switch(optc) { case 'k': show_adapter_only=1; break; case 'D': debug++; break ; case 'c': discard_non_clipped = 1; break; case 'C': discard_clipped = 1 ; break ; case 'd': if (optarg==NULL) errx(1, "[-d] parameter requires an argument value"); keep_delta = strtoul(optarg,NULL,10); if (keep_delta<0) errx(1,"Invalid number bases to keep (-d %s)", optarg); break; case 'a': strncpy(adapter,optarg,sizeof(adapter)-1); //TODO: //if (!valid_sequence_string(adapter)) // errx(1,"Invalid adapter string (-a %s)", adapter); break ; case 'l': if (optarg==NULL) errx(1,"[-l] parameter requires an argument value"); min_length = strtoul(optarg, NULL, 10); break; case 'n': discard_unknown_bases = 0 ; break; default: errx(1,"Unknown argument (%c)", optc ) ; } return 1; } int parse_commandline(int argc, char* argv[]) { fastx_parse_cmdline(argc, argv, "kDCcd:a:s:l:n", parse_program_args); if (keep_delta>0) keep_delta += strlen(adapter); return 1; } int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) __attribute__ ((const)); int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) { #if 0 int mismatches = alignment_results.mismatches ; //The adapter(=target) is expected to align from the first base. //If the start is not zero (=not aligned from first base), //count each skipped base as a mismatch mismatches += alignment_results.target_start ; //The adapter is expected to align up to the end //of the adapter(=target), or the end of the query. //If it doesn't, count the un-aligned bases as mismatches int missing_from_query_end = (alignment_results.query_size - alignment_results.query_end-1); int missing_from_target_end = (alignment_results.target_size - alignment_results.target_end-1); int missing_from_end = std::min(missing_from_query_end, missing_from_target_end); mismatches += missing_from_end ; std::cout << "Missing from start = " << alignment_results.target_start << " Missing from end = " << missing_from_end << " mismatches = " << mismatches << std::endl; if (mismatches > max_mismatches) return -1; return alignment_results.query_start; #endif int alignment_size = alignment_results.neutral_matches + alignment_results.matches + alignment_results.mismatches + alignment_results.gaps ; //No alignment at all? if (alignment_size==0) return -1; //Any good alignment at the end of the query //(even only a single nucleotide) //Example: // The adapter starts with CTGTAG, The Query ends with CT - it's a match. if ( alignment_results.query_end == alignment_results.query_size-1 && alignment_results.mismatches == 0 ) { //printf("--1\n"); return alignment_results.query_start ; } if ( alignment_size > 5 && alignment_results.target_start == 0 && (alignment_results.matches * 100 / alignment_size ) >= 75 ) { //printf("--2\n"); return alignment_results.query_start ; } if ( alignment_size > 11 && (alignment_results.matches * 100 / alignment_size ) >= 80 ) { //printf("--2\n"); return alignment_results.query_start ; } // //Be very lenient regarding alignments at the end of the query sequence if ( alignment_results.query_end >= alignment_results.query_size-2 && alignment_size <= 5 && alignment_results.matches >= 3) { //printf("--3\n"); return alignment_results.query_start ; } return -1; } int main(int argc, char* argv[]) { int i; int reads_count; parse_commandline(argc, argv); fastx_init_reader(&fastx, get_input_filename(), FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE); fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag()); while ( fastx_read_next_record(&fastx) ) { reads_count = get_reads_count(&fastx); #if 0 std::string query = std::string(fastx.nucleotides) + std::string( strlen(adapter), 'N' ); std::string target= std::string( strlen(fastx.nucleotides), 'N' ) + std::string(adapter); #else std::string query = std::string(fastx.nucleotides) ; std::string target= std::string(adapter); #endif align.align( query, target ) ; if (debug>1) align.print_matrix(); if (debug>0) align.results().print(); count_input+= reads_count; //Find the best match with the adapter i = adapter_cutoff_index ( align.results() ) ; if (i!=-1 && i>0) { i += keep_delta; //Just trim the string after this position fastx.nucleotides[i] = 0 ; } if (i==0) { // empty sequence ? (in which the adapter was found at index 0) count_discarded_adapter_at_index_zero += reads_count; if (show_adapter_only) fastx_write_record(&fastx); continue; } if (strlen(fastx.nucleotides) < min_length) { // too-short sequence ? count_discarded_too_short += reads_count; continue; } if ( (i==-1) && discard_non_clipped ) { // adapter not found (i.e. sequence was not clipped) ? count_discarded_no_adapter_found += reads_count; continue ; } if ( (i>0) && discard_clipped ) { // adapter found, and user requested to keep only non-clipped sequences count_discarded_adapter_found += reads_count; continue; } if ( (discard_unknown_bases && strchr(fastx.nucleotides,'N')!=NULL ) ) { // contains unknown bases (after clipping) ? count_discarded_N += reads_count; continue; } if (!show_adapter_only) { //none of the above condition matched, so print this sequence. fastx_write_record(&fastx); } } // //Print verbose report if ( verbose_flag() ) { fprintf(get_report_file(), "Clipping Adapter: %s\n", adapter ); fprintf(get_report_file(), "Min. Length: %d\n", min_length) ; if (discard_clipped) fprintf(get_report_file(), "Clipped reads - discarded.\n" ) ; if (discard_non_clipped) fprintf(get_report_file(), "Non-Clipped reads - discarded.\n" ) ; fprintf(get_report_file(), "Input: %u reads.\n", count_input ) ; fprintf(get_report_file(), "Output: %u reads.\n", count_input - count_discarded_too_short - count_discarded_no_adapter_found - count_discarded_adapter_found - count_discarded_N - count_discarded_adapter_at_index_zero ) ; fprintf(get_report_file(), "discarded %u too-short reads.\n", count_discarded_too_short ) ; fprintf(get_report_file(), "discarded %u adapter-only reads.\n", count_discarded_adapter_at_index_zero ); if (discard_non_clipped) fprintf(get_report_file(), "discarded %u non-clipped reads.\n", count_discarded_no_adapter_found ); if (discard_clipped) fprintf(get_report_file(), "discarded %u clipped reads.\n", count_discarded_adapter_found ); if (discard_unknown_bases) fprintf(get_report_file(), "discarded %u N reads.\n", count_discarded_N ); } return 0; }