annotate fastx_toolkit-0.0.6/src/fastx_collapser/fastx_collapser.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
997f5136985f Uploaded
xilinxu
parents:
diff changeset
1 /*
997f5136985f Uploaded
xilinxu
parents:
diff changeset
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
3 Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
4
997f5136985f Uploaded
xilinxu
parents:
diff changeset
5 This program is free software: you can redistribute it and/or modify
997f5136985f Uploaded
xilinxu
parents:
diff changeset
6 it under the terms of the GNU Affero General Public License as
997f5136985f Uploaded
xilinxu
parents:
diff changeset
7 published by the Free Software Foundation, either version 3 of the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
8 License, or (at your option) any later version.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
9
997f5136985f Uploaded
xilinxu
parents:
diff changeset
10 This program is distributed in the hope that it will be useful,
997f5136985f Uploaded
xilinxu
parents:
diff changeset
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
997f5136985f Uploaded
xilinxu
parents:
diff changeset
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
997f5136985f Uploaded
xilinxu
parents:
diff changeset
13 GNU Affero General Public License for more details.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
14
997f5136985f Uploaded
xilinxu
parents:
diff changeset
15 You should have received a copy of the GNU Affero General Public License
997f5136985f Uploaded
xilinxu
parents:
diff changeset
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
997f5136985f Uploaded
xilinxu
parents:
diff changeset
17 */
997f5136985f Uploaded
xilinxu
parents:
diff changeset
18 #include <err.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
19 #include <getopt.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
20 #include <string.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
21 #include <algorithm>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
22 #include <cstdlib>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
23 #include <ios>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
24 #include <iostream>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
25 #include <string>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
26 #include <ostream>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
27 #include <fstream>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
28 #include <map>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
29 #include <list>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
30
997f5136985f Uploaded
xilinxu
parents:
diff changeset
31 #include <config.h>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
32
997f5136985f Uploaded
xilinxu
parents:
diff changeset
33 #include "fastx.h"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
34 #include "fastx_args.h"
997f5136985f Uploaded
xilinxu
parents:
diff changeset
35
997f5136985f Uploaded
xilinxu
parents:
diff changeset
36 using namespace std;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
37
997f5136985f Uploaded
xilinxu
parents:
diff changeset
38 const char* usage=
997f5136985f Uploaded
xilinxu
parents:
diff changeset
39 "usage: fastx_collapser [-h] [-v] [-i INFILE] [-o OUTFILE]\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
40 "\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
41 "version " VERSION "\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
42 " [-h] = This helpful help screen.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
43 " [-v] = verbose: print short summary of input/output counts\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
44 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
45 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
997f5136985f Uploaded
xilinxu
parents:
diff changeset
46 "\n";
997f5136985f Uploaded
xilinxu
parents:
diff changeset
47
997f5136985f Uploaded
xilinxu
parents:
diff changeset
48 FASTX fastx;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
49 #include <tr1/unordered_map>
997f5136985f Uploaded
xilinxu
parents:
diff changeset
50 std::tr1::unordered_map<string,size_t> collapsed_sequences;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
51 std::list< pair<string,size_t> > sorted_collapsed_sequences ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
52
997f5136985f Uploaded
xilinxu
parents:
diff changeset
53 struct PrintCollapsedSequence
997f5136985f Uploaded
xilinxu
parents:
diff changeset
54 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
55 size_t counter;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
56 size_t total_reads ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
57
997f5136985f Uploaded
xilinxu
parents:
diff changeset
58 ostream &output ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
59 PrintCollapsedSequence( ostream& _output ) :
997f5136985f Uploaded
xilinxu
parents:
diff changeset
60 counter(0),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
61 total_reads(0),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
62 output(_output) {}
997f5136985f Uploaded
xilinxu
parents:
diff changeset
63
997f5136985f Uploaded
xilinxu
parents:
diff changeset
64 void operator() ( const std::pair<string, int> & sequence )
997f5136985f Uploaded
xilinxu
parents:
diff changeset
65 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
66 counter++;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
67 total_reads += sequence.second ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
68 output << ">" << counter << "-" << sequence.second << endl << sequence.first << endl ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
69 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
70 };
997f5136985f Uploaded
xilinxu
parents:
diff changeset
71
997f5136985f Uploaded
xilinxu
parents:
diff changeset
72 bool sort_by_abundance_count ( const pair<string, size_t> & sequence1, const pair<string, size_t>& sequence2 )
997f5136985f Uploaded
xilinxu
parents:
diff changeset
73 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
74 return sequence1.second < sequence2.second ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
75 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
76
997f5136985f Uploaded
xilinxu
parents:
diff changeset
77 int main(int argc, char* argv[])
997f5136985f Uploaded
xilinxu
parents:
diff changeset
78 {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
79 ofstream output_file ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
80
997f5136985f Uploaded
xilinxu
parents:
diff changeset
81 fastx_parse_cmdline(argc, argv, "", NULL );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
82
997f5136985f Uploaded
xilinxu
parents:
diff changeset
83 fastx_init_reader(&fastx, get_input_filename(),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
84 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE);
997f5136985f Uploaded
xilinxu
parents:
diff changeset
85
997f5136985f Uploaded
xilinxu
parents:
diff changeset
86 bool use_stdout = true;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
87 if ( strcmp(get_output_filename(), "-")!=0 ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
88 use_stdout = false;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
89 output_file.open(get_output_filename());
997f5136985f Uploaded
xilinxu
parents:
diff changeset
90 if (!output_file)
997f5136985f Uploaded
xilinxu
parents:
diff changeset
91 errx(1,"Failed to create output file (%s)", get_output_filename() );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
92 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
93 ostream& real_output = (use_stdout) ? cout : output_file ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
94
997f5136985f Uploaded
xilinxu
parents:
diff changeset
95 while ( fastx_read_next_record(&fastx) ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
96 collapsed_sequences[string(fastx.nucleotides)]++ ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
97 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
98
997f5136985f Uploaded
xilinxu
parents:
diff changeset
99 copy ( collapsed_sequences.begin(), collapsed_sequences.end(),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
100 back_inserter(sorted_collapsed_sequences) ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
101
997f5136985f Uploaded
xilinxu
parents:
diff changeset
102 sorted_collapsed_sequences.sort ( sort_by_abundance_count ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
103
997f5136985f Uploaded
xilinxu
parents:
diff changeset
104 PrintCollapsedSequence stats = for_each ( sorted_collapsed_sequences.rbegin(),
997f5136985f Uploaded
xilinxu
parents:
diff changeset
105 sorted_collapsed_sequences.rend(), PrintCollapsedSequence(real_output) ) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
106
997f5136985f Uploaded
xilinxu
parents:
diff changeset
107 if (stats.total_reads != num_input_reads(&fastx))
997f5136985f Uploaded
xilinxu
parents:
diff changeset
108 errx(1,"Internal error: stats.total_reads (%zu) != num_input_reads(&fastx) (%zu).\n",
997f5136985f Uploaded
xilinxu
parents:
diff changeset
109 stats.total_reads, num_input_reads(&fastx) );
997f5136985f Uploaded
xilinxu
parents:
diff changeset
110
997f5136985f Uploaded
xilinxu
parents:
diff changeset
111 if ( verbose_flag() ) {
997f5136985f Uploaded
xilinxu
parents:
diff changeset
112 fprintf(get_report_file(), "Collapsd %zu reads into %zu unique sequences.\n",
997f5136985f Uploaded
xilinxu
parents:
diff changeset
113 num_input_reads(&fastx), stats.counter) ;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
114 }
997f5136985f Uploaded
xilinxu
parents:
diff changeset
115 return 0;
997f5136985f Uploaded
xilinxu
parents:
diff changeset
116 }