diff fastx_toolkit-0.0.6/src/fastx_collapser/fastx_collapser.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_toolkit-0.0.6/src/fastx_collapser/fastx_collapser.cpp	Thu Aug 14 04:52:17 2014 -0400
@@ -0,0 +1,116 @@
+/*
+    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
+    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as
+    published by the Free Software Foundation, either version 3 of the
+    License, or (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <err.h>
+#include <getopt.h>
+#include <string.h>
+#include <algorithm>
+#include <cstdlib>
+#include <ios>
+#include <iostream>
+#include <string>
+#include <ostream>
+#include <fstream>
+#include <map>
+#include <list>
+
+#include <config.h>
+
+#include "fastx.h"
+#include "fastx_args.h"
+
+using namespace std;
+
+const char* usage=
+"usage: fastx_collapser [-h] [-v] [-i INFILE] [-o OUTFILE]\n" \
+"\n" \
+"version " VERSION "\n" \
+"   [-h]         = This helpful help screen.\n" \
+"   [-v]         = verbose: print short summary of input/output counts\n" \
+"   [-i INFILE]  = FASTA/Q input file. default is STDIN.\n" \
+"   [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
+"\n";
+
+FASTX fastx;
+#include <tr1/unordered_map>
+std::tr1::unordered_map<string,size_t> collapsed_sequences;
+std::list< pair<string,size_t> > sorted_collapsed_sequences ;
+
+struct PrintCollapsedSequence
+{
+	size_t counter;
+	size_t total_reads ;
+
+	ostream &output ;
+	PrintCollapsedSequence( ostream& _output ) : 
+		counter(0), 
+		total_reads(0),
+		output(_output) {}
+
+	void operator() ( const std::pair<string, int> & sequence )
+	{
+		counter++;
+		total_reads += sequence.second ;
+		output << ">" << counter << "-" << sequence.second << endl << sequence.first << endl ;
+	}
+};
+
+bool sort_by_abundance_count ( const pair<string, size_t> & sequence1, const pair<string, size_t>& sequence2 )
+{
+	return sequence1.second < sequence2.second ;
+}
+
+int main(int argc, char* argv[])
+{
+	ofstream output_file ;
+
+	fastx_parse_cmdline(argc, argv, "", NULL );
+
+	fastx_init_reader(&fastx, get_input_filename(), 
+		FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
+
+	bool use_stdout = true;
+	if ( strcmp(get_output_filename(), "-")!=0 ) {
+		use_stdout = false;
+		output_file.open(get_output_filename());
+		if (!output_file) 
+			errx(1,"Failed to create output file (%s)", get_output_filename() );
+	}
+	ostream& real_output = (use_stdout) ? cout : output_file ;
+
+	while ( fastx_read_next_record(&fastx) ) {
+		collapsed_sequences[string(fastx.nucleotides)]++ ;
+	}
+	
+	copy ( collapsed_sequences.begin(), collapsed_sequences.end(), 
+		back_inserter(sorted_collapsed_sequences) ) ;
+
+	sorted_collapsed_sequences.sort ( sort_by_abundance_count ) ;
+
+	PrintCollapsedSequence stats =  for_each ( sorted_collapsed_sequences.rbegin(), 
+			sorted_collapsed_sequences.rend(), PrintCollapsedSequence(real_output) ) ;
+
+	if (stats.total_reads != num_input_reads(&fastx))
+		errx(1,"Internal error: stats.total_reads (%zu) != num_input_reads(&fastx) (%zu).\n", 
+			stats.total_reads, num_input_reads(&fastx) ); 
+
+	if ( verbose_flag() ) {
+		fprintf(get_report_file(), "Collapsd %zu reads into %zu unique sequences.\n",
+			num_input_reads(&fastx), stats.counter) ;
+	}
+	return 0;
+}