/*
    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
|
|
18 #include <err.h>
|
|
19 #include <getopt.h>
|
|
20 #include <string.h>
|
|
21 #include <algorithm>
|
|
22 #include <cstdlib>
|
|
23 #include <ios>
|
|
24 #include <iostream>
|
|
25 #include <string>
|
|
26 #include <ostream>
|
|
27 #include <fstream>
|
|
28 #include <map>
|
|
29 #include <list>
|
|
30
|
|
31 #include <config.h>
|
|
32
|
|
33 #include "fastx.h"
|
|
34 #include "fastx_args.h"
|
|
35
|
|
36 using namespace std;
|
|
37
|
|
38 const char* usage=
|
|
39 "usage: fastx_collapser [-h] [-v] [-i INFILE] [-o OUTFILE]\n" \
|
|
40 "\n" \
|
|
41 "version " VERSION "\n" \
|
|
42 " [-h] = This helpful help screen.\n" \
|
|
43 " [-v] = verbose: print short summary of input/output counts\n" \
|
|
44 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
|
|
45 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
|
|
46 "\n";
|
|
47
|
|
48 FASTX fastx;
|
|
49 #include <tr1/unordered_map>
|
|
50 std::tr1::unordered_map<string,size_t> collapsed_sequences;
|
|
51 std::list< pair<string,size_t> > sorted_collapsed_sequences ;
|
|
52
|
|
/*
 * Output functor: writes each (sequence, count) record it is called with as a
 * two-line FASTA entry
 *
 *     >SERIAL-COUNT
 *     SEQUENCE
 *
 * and keeps running totals that the caller can read back afterwards
 * (e.g. from the functor returned by std::for_each).
 */
struct PrintCollapsedSequence
{
	size_t counter;      /* records written so far; also the SERIAL of the last header */
	size_t total_reads;  /* sum of the counts of all records written so far */

	std::ostream &output;

	PrintCollapsedSequence( std::ostream& _output ) :
		counter(0),
		total_reads(0),
		output(_output) {}

	/*
	 * Writes one record and updates the totals.
	 *
	 * The parameter type matches the element type of the record containers
	 * (pair<string, size_t>) exactly. Taking pair<string, int> instead would
	 * force an implicit pair conversion on every call: a temporary copy of
	 * the sequence string, and a size_t -> int narrowing of the count.
	 */
	void operator() ( const std::pair<std::string, size_t> & sequence )
	{
		counter++;
		total_reads += sequence.second ;
		/* '\n' instead of std::endl: no forced flush after every line. */
		output << ">" << counter << "-" << sequence.second << '\n'
		       << sequence.first << '\n' ;
	}
};
|
|
71
|
|
/*
 * Ordering predicate for (sequence, count) records: true when the first
 * record's count is strictly smaller than the second's. Only the counts are
 * compared; the sequence text does not take part in the ordering.
 */
bool sort_by_abundance_count ( const std::pair<std::string, size_t> & sequence1, const std::pair<std::string, size_t>& sequence2 )
{
	const size_t first_count = sequence1.second ;
	const size_t second_count = sequence2.second ;
	return first_count < second_count ;
}
|
|
76
|
|
77 int main(int argc, char* argv[])
|
|
78 {
|
|
79 ofstream output_file ;
|
|
80
|
|
81 fastx_parse_cmdline(argc, argv, "", NULL );
|
|
82
|
|
83 fastx_init_reader(&fastx, get_input_filename(),
|
|
84 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
|
|
85
|
|
86 bool use_stdout = true;
|
|
87 if ( strcmp(get_output_filename(), "-")!=0 ) {
|
|
88 use_stdout = false;
|
|
89 output_file.open(get_output_filename());
|
|
90 if (!output_file)
|
|
91 errx(1,"Failed to create output file (%s)", get_output_filename() );
|
|
92 }
|
|
93 ostream& real_output = (use_stdout) ? cout : output_file ;
|
|
94
|
|
95 while ( fastx_read_next_record(&fastx) ) {
|
|
96 collapsed_sequences[string(fastx.nucleotides)]++ ;
|
|
97 }
|
|
98
|
|
99 copy ( collapsed_sequences.begin(), collapsed_sequences.end(),
|
|
100 back_inserter(sorted_collapsed_sequences) ) ;
|
|
101
|
|
102 sorted_collapsed_sequences.sort ( sort_by_abundance_count ) ;
|
|
103
|
|
104 PrintCollapsedSequence stats = for_each ( sorted_collapsed_sequences.rbegin(),
|
|
105 sorted_collapsed_sequences.rend(), PrintCollapsedSequence(real_output) ) ;
|
|
106
|
|
107 if (stats.total_reads != num_input_reads(&fastx))
|
|
108 errx(1,"Internal error: stats.total_reads (%zu) != num_input_reads(&fastx) (%zu).\n",
|
|
109 stats.total_reads, num_input_reads(&fastx) );
|
|
110
|
|
111 if ( verbose_flag() ) {
|
|
112 fprintf(get_report_file(), "Collapsd %zu reads into %zu unique sequences.\n",
|
|
113 num_input_reads(&fastx), stats.counter) ;
|
|
114 }
|
|
115 return 0;
|
|
116 }
|