comparison fastx_toolkit-0.0.6/src/fastx_collapser/fastx_collapser.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
comparison
equal deleted inserted replaced
2:dfe9332138cf 3:997f5136985f
1 /*
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Affero General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Affero General Public License for more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 #include <err.h>
19 #include <getopt.h>
20 #include <string.h>
21 #include <algorithm>
22 #include <cstdlib>
23 #include <ios>
24 #include <iostream>
25 #include <string>
26 #include <ostream>
27 #include <fstream>
28 #include <map>
29 #include <list>
30
31 #include <config.h>
32
33 #include "fastx.h"
34 #include "fastx_args.h"
35
36 using namespace std;
37
38 const char* usage=
39 "usage: fastx_collapser [-h] [-v] [-i INFILE] [-o OUTFILE]\n" \
40 "\n" \
41 "version " VERSION "\n" \
42 " [-h] = This helpful help screen.\n" \
43 " [-v] = verbose: print short summary of input/output counts\n" \
44 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
45 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
46 "\n";
47
48 FASTX fastx;
49 #include <tr1/unordered_map>
50 std::tr1::unordered_map<string,size_t> collapsed_sequences;
51 std::list< pair<string,size_t> > sorted_collapsed_sequences ;
52
53 struct PrintCollapsedSequence
54 {
55 size_t counter;
56 size_t total_reads ;
57
58 ostream &output ;
59 PrintCollapsedSequence( ostream& _output ) :
60 counter(0),
61 total_reads(0),
62 output(_output) {}
63
64 void operator() ( const std::pair<string, int> & sequence )
65 {
66 counter++;
67 total_reads += sequence.second ;
68 output << ">" << counter << "-" << sequence.second << endl << sequence.first << endl ;
69 }
70 };
71
72 bool sort_by_abundance_count ( const pair<string, size_t> & sequence1, const pair<string, size_t>& sequence2 )
73 {
74 return sequence1.second < sequence2.second ;
75 }
76
77 int main(int argc, char* argv[])
78 {
79 ofstream output_file ;
80
81 fastx_parse_cmdline(argc, argv, "", NULL );
82
83 fastx_init_reader(&fastx, get_input_filename(),
84 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
85
86 bool use_stdout = true;
87 if ( strcmp(get_output_filename(), "-")!=0 ) {
88 use_stdout = false;
89 output_file.open(get_output_filename());
90 if (!output_file)
91 errx(1,"Failed to create output file (%s)", get_output_filename() );
92 }
93 ostream& real_output = (use_stdout) ? cout : output_file ;
94
95 while ( fastx_read_next_record(&fastx) ) {
96 collapsed_sequences[string(fastx.nucleotides)]++ ;
97 }
98
99 copy ( collapsed_sequences.begin(), collapsed_sequences.end(),
100 back_inserter(sorted_collapsed_sequences) ) ;
101
102 sorted_collapsed_sequences.sort ( sort_by_abundance_count ) ;
103
104 PrintCollapsedSequence stats = for_each ( sorted_collapsed_sequences.rbegin(),
105 sorted_collapsed_sequences.rend(), PrintCollapsedSequence(real_output) ) ;
106
107 if (stats.total_reads != num_input_reads(&fastx))
108 errx(1,"Internal error: stats.total_reads (%zu) != num_input_reads(&fastx) (%zu).\n",
109 stats.total_reads, num_input_reads(&fastx) );
110
111 if ( verbose_flag() ) {
112 fprintf(get_report_file(), "Collapsd %zu reads into %zu unique sequences.\n",
113 num_input_reads(&fastx), stats.counter) ;
114 }
115 return 0;
116 }