view fastx_toolkit-0.0.6/src/fastx_clipper/fastx_clipper.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
line wrap: on
line source

/*
    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#include <cstddef>
#include <cstdlib>
#include <algorithm>
#include <ostream>
#include <iostream>
#include <string>
#include <vector>
#include <string.h>

#include "sequence_alignment.h"

#include <errno.h>
#include <err.h>

#include <config.h>

#include "fastx.h"
#include "fastx_args.h"


#define MAX_ADAPTER_LEN 100

const char* usage=
"usage: fastx_clipper [-h] [-a ADAPTER] [-D] [-l N] [-n] [-d N] [-c] [-C] [-o] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \
"\n" \
"version " VERSION "\n" \
"   [-h]         = This helpful help screen.\n" \
"   [-a ADAPTER] = ADAPTER string. default is CCTTAAGG (dummy adapter).\n" \
"   [-l N]       = discard sequences shorter than N nucleotides. default is 5.\n" \
"   [-d N]       = Keep the adapter and N bases after it.\n" \
"                  (using '-d 0' is the same as not using '-d' at all. which is the default).\n" \
"   [-c]         = Discard non-clipped sequences (i.e. - keep only sequences which contained the adapter).\n" \
"   [-C]         = Discard clipped sequences (i.e. - keep only sequences which did not contained the adapter).\n" \
"   [-k]         = Report Adapter-Only sequences.\n" \
"   [-n]         = keep sequences with unknown (N) nucleotides. default is to discard such sequences.\n" \
"   [-v]         = Verbose - report number of sequences.\n" \
"                  If [-o] is specified,  report will be printed to STDOUT.\n" \
"                  If [-o] is not specified (and output goes to STDOUT),\n" \
"                  report will be printed to STDERR.\n" \
"   [-z]         = Compress output with GZIP.\n" \
"   [-D]	 = DEBUG output.\n" \
"   [-i INFILE]  = FASTA/Q input file. default is STDIN.\n" \
"   [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
"\n";

//Default adapter - Dummy sequence
char adapter[MAX_ADAPTER_LEN]="CCTTAAGG";
unsigned int min_length=5;
int discard_unknown_bases=1;
int keep_delta=0;
int discard_non_clipped=0;
int discard_clipped=0;
int show_adapter_only=0;
int debug = 0 ;


//Statistics for verbose report
unsigned int count_input=0 ;
unsigned int count_discarded_too_short=0; // see [-l N] option
unsigned int count_discarded_adapter_at_index_zero=0;  //empty sequences (after clipping)
unsigned int count_discarded_no_adapter_found=0; // see [-c] option
unsigned int count_discarded_adapter_found=0; // see [-C] option
unsigned int count_discarded_N=0; // see [-n]

FASTX fastx;
HalfLocalSequenceAlignment align;

int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg)
{
	switch(optc) {
		case 'k':
			show_adapter_only=1;
			break;

		case 'D':
			debug++;
			break ;

		case 'c':
			discard_non_clipped = 1;
			break;

		case 'C':
			discard_clipped = 1 ;
			break ;
		case 'd':
			if (optarg==NULL) 
				errx(1, "[-d] parameter requires an argument value");
			keep_delta = strtoul(optarg,NULL,10);
			if (keep_delta<0) 
				errx(1,"Invalid number bases to keep (-d %s)", optarg);
			break;
		case 'a':
			strncpy(adapter,optarg,sizeof(adapter)-1);
			//TODO:
			//if (!valid_sequence_string(adapter)) 
			//	errx(1,"Invalid adapter string (-a %s)", adapter);
			break ;
			
		case 'l':
			if (optarg==NULL) 
				errx(1,"[-l] parameter requires an argument value");
			
			min_length = strtoul(optarg, NULL, 10);
			break;
			
		case 'n':
			discard_unknown_bases = 0 ;
			break;

		default:
			errx(1,"Unknown argument (%c)", optc ) ;

	}
	return 1;
}

int parse_commandline(int argc, char* argv[])
{

	fastx_parse_cmdline(argc, argv, "kDCcd:a:s:l:n", parse_program_args);

	if (keep_delta>0) 
		keep_delta += strlen(adapter);
	return 1;
}

int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) __attribute__ ((const));
int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results )
{
	#if 0
	int mismatches = alignment_results.mismatches ;
	
	//The adapter(=target) is expected to align from the first base.
	//If the start is not zero (=not aligned from first base),
	//count each skipped base as a mismatch
	mismatches += alignment_results.target_start ;

	//The adapter is expected to align up to the end
	//of the adapter(=target), or the end of the query.
	//If it doesn't, count the un-aligned bases as mismatches
	int missing_from_query_end = (alignment_results.query_size - alignment_results.query_end-1);
	int missing_from_target_end = (alignment_results.target_size - alignment_results.target_end-1);

	int missing_from_end = std::min(missing_from_query_end, missing_from_target_end);
	
	mismatches += missing_from_end ;
	

	 
	std::cout << "Missing from start = " << alignment_results.target_start
		  << " Missing from end = " << missing_from_end
		  << " mismatches = " << mismatches 
		  << std::endl;
	
	if (mismatches > max_mismatches)
		return -1;

	return alignment_results.query_start;
	#endif

	int alignment_size = alignment_results.neutral_matches +
			     alignment_results.matches + 
			     alignment_results.mismatches +
			     alignment_results.gaps ;

	//No alignment at all?
	if (alignment_size==0)
		return -1;

	//Any good alignment at the end of the query
	//(even only a single nucleotide)
	//Example:
	//  The adapter starts with CTGTAG, The Query ends with CT - it's a match.
	if ( alignment_results.query_end == alignment_results.query_size-1
	     &&
	     alignment_results.mismatches == 0 ) {
	     	//printf("--1\n");
		return alignment_results.query_start ;
	}

	if ( alignment_size > 5
	     &&
	     alignment_results.target_start == 0
	     &&
	     (alignment_results.matches * 100 / alignment_size ) >= 75 ) {
	     	//printf("--2\n");
		return alignment_results.query_start ;
	}

	if ( alignment_size > 11 
	     &&
	     (alignment_results.matches * 100 / alignment_size ) >= 80 ) {
	     	//printf("--2\n");
		return alignment_results.query_start ;
	}

	//
	//Be very lenient regarding alignments at the end of the query sequence
	if ( alignment_results.query_end >= alignment_results.query_size-2
	     &&
	     alignment_size <= 5 && alignment_results.matches >= 3) {
			//printf("--3\n");
			return alignment_results.query_start ;
		}

	return -1;
}


int main(int argc, char* argv[])
{
	int i;
	int reads_count;

	parse_commandline(argc, argv);

	fastx_init_reader(&fastx, get_input_filename(), 
		FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);

	fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag());

	while ( fastx_read_next_record(&fastx) ) {

		reads_count = get_reads_count(&fastx);
		
		#if 0
		std::string query = std::string(fastx.nucleotides) + std::string( strlen(adapter), 'N' ); 
		std::string target= std::string( strlen(fastx.nucleotides), 'N' ) + std::string(adapter);
		#else
		std::string query = std::string(fastx.nucleotides) ;
		std::string target= std::string(adapter);
		#endif
		
		
		align.align( query, target ) ;

		if (debug>1) 
			align.print_matrix();
		if (debug>0)
			align.results().print();
		
		count_input+= reads_count;

		//Find the best match with the adapter
		i = adapter_cutoff_index ( align.results() ) ;
		
		if (i!=-1 && i>0) {
			i += keep_delta;
			//Just trim the string after this position
			fastx.nucleotides[i] = 0 ;
		}

		if (i==0) { // empty sequence ? (in which the adapter was found at index 0)
			count_discarded_adapter_at_index_zero += reads_count;
			
			if (show_adapter_only)
				fastx_write_record(&fastx);
			continue;
		}

		if (strlen(fastx.nucleotides) < min_length) { // too-short sequence ?
			count_discarded_too_short += reads_count;
			continue;
		}

		if ( (i==-1) && discard_non_clipped ) { // adapter not found (i.e. sequence was not clipped) ?
			count_discarded_no_adapter_found += reads_count;
			continue ;
		}

		if ( (i>0) && discard_clipped ) { // adapter found, and user requested to keep only non-clipped sequences 
			count_discarded_adapter_found += reads_count;
			continue;
		}

		if ( (discard_unknown_bases && strchr(fastx.nucleotides,'N')!=NULL ) ) { // contains unknown bases (after clipping) ?
			count_discarded_N += reads_count;
			continue;
		}
		
		if (!show_adapter_only)  {
			//none of the above condition matched, so print this sequence.
			fastx_write_record(&fastx);
		}
	}

	//
	//Print verbose report
	if ( verbose_flag() ) {
		fprintf(get_report_file(), "Clipping Adapter: %s\n", adapter );
		fprintf(get_report_file(), "Min. Length: %d\n", min_length) ;

		if (discard_clipped)
			fprintf(get_report_file(), "Clipped reads - discarded.\n"  ) ;
		if (discard_non_clipped)
			fprintf(get_report_file(), "Non-Clipped reads - discarded.\n"  ) ;

		
		fprintf(get_report_file(), "Input: %u reads.\n", count_input ) ;
		fprintf(get_report_file(), "Output: %u reads.\n", 
			count_input - count_discarded_too_short - count_discarded_no_adapter_found - count_discarded_adapter_found -
			count_discarded_N - count_discarded_adapter_at_index_zero ) ;

		fprintf(get_report_file(), "discarded %u too-short reads.\n", count_discarded_too_short ) ;
		fprintf(get_report_file(), "discarded %u adapter-only reads.\n", count_discarded_adapter_at_index_zero );
		if (discard_non_clipped)
			fprintf(get_report_file(), "discarded %u non-clipped reads.\n", count_discarded_no_adapter_found );
		if (discard_clipped)
			fprintf(get_report_file(), "discarded %u clipped reads.\n", count_discarded_adapter_found );
		if (discard_unknown_bases)
			fprintf(get_report_file(), "discarded %u N reads.\n", count_discarded_N );
	}

	return 0;
}