Mercurial > repos > xilinxu > xilinxu
diff fastx_toolkit-0.0.6/src/fastx_collapser/fastx_collapser.cpp @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_toolkit-0.0.6/src/fastx_collapser/fastx_collapser.cpp Thu Aug 14 04:52:17 2014 -0400 @@ -0,0 +1,116 @@ +/* + FASTX-toolkit - FASTA/FASTQ preprocessing tools. + Copyright (C) 2009 A. Gordon (gordon@cshl.edu) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +#include <err.h> +#include <getopt.h> +#include <string.h> +#include <algorithm> +#include <cstdlib> +#include <ios> +#include <iostream> +#include <string> +#include <ostream> +#include <fstream> +#include <map> +#include <list> + +#include <config.h> + +#include "fastx.h" +#include "fastx_args.h" + +using namespace std; + +const char* usage= +"usage: fastx_collapser [-h] [-v] [-i INFILE] [-o OUTFILE]\n" \ +"\n" \ +"version " VERSION "\n" \ +" [-h] = This helpful help screen.\n" \ +" [-v] = verbose: print short summary of input/output counts\n" \ +" [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \ +" [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \ +"\n"; + +FASTX fastx; +#include <tr1/unordered_map> +std::tr1::unordered_map<string,size_t> collapsed_sequences; +std::list< pair<string,size_t> > sorted_collapsed_sequences ; + +struct PrintCollapsedSequence +{ + size_t counter; + size_t total_reads ; + + ostream &output ; + PrintCollapsedSequence( ostream& _output ) : + counter(0), + total_reads(0), + output(_output) {} + + void operator() ( const std::pair<string, int> & sequence ) + { + counter++; + total_reads += sequence.second ; + output << ">" << counter << "-" << sequence.second << endl << sequence.first << endl ; + } +}; + +bool sort_by_abundance_count ( const pair<string, size_t> & sequence1, const pair<string, size_t>& sequence2 ) +{ + return sequence1.second < sequence2.second ; +} + +int main(int argc, char* argv[]) +{ + ofstream output_file ; + + fastx_parse_cmdline(argc, argv, "", NULL ); + + fastx_init_reader(&fastx, get_input_filename(), + FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE); + + bool use_stdout = true; + if ( strcmp(get_output_filename(), "-")!=0 ) { + use_stdout = false; + output_file.open(get_output_filename()); + if (!output_file) + errx(1,"Failed to create output file (%s)", get_output_filename() ); + } + ostream& real_output = (use_stdout) ? cout : output_file ; + + while ( fastx_read_next_record(&fastx) ) { + collapsed_sequences[string(fastx.nucleotides)]++ ; + } + + copy ( collapsed_sequences.begin(), collapsed_sequences.end(), + back_inserter(sorted_collapsed_sequences) ) ; + + sorted_collapsed_sequences.sort ( sort_by_abundance_count ) ; + + PrintCollapsedSequence stats = for_each ( sorted_collapsed_sequences.rbegin(), + sorted_collapsed_sequences.rend(), PrintCollapsedSequence(real_output) ) ; + + if (stats.total_reads != num_input_reads(&fastx)) + errx(1,"Internal error: stats.total_reads (%zu) != num_input_reads(&fastx) (%zu).\n", + stats.total_reads, num_input_reads(&fastx) ); + + if ( verbose_flag() ) { + fprintf(get_report_file(), "Collapsd %zu reads into %zu unique sequences.\n", + num_input_reads(&fastx), stats.counter) ; + } + return 0; +}