annotate tools/spades_3_0/filter_spades_output.pl @ 8:ff058438080a draft

Version 0.8, supports SPAdes 3.0.0
author lionelguy
date Wed, 05 Feb 2014 05:19:03 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
1 #!/usr/bin/perl -w
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
2
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
3 =head1 SYNOPSIS
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
4
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
5 filter_spades_output.pl - Filters contigs or scaffolds based on contig length and coverage.
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
6
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
7 =head1 USAGE
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
8
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
9 filter_spades_output.pl [-c|--coverage-cutoff] [-l|--length-cutoff] [-o|--filtered-out out.fasta] -t|--tab stats.tab seqs.fasta
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
10
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
11 =head1 INPUT
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
12
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
13 =head2 [-c|--coverage-cutoff]
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
14
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
15 Mininum coverage. Contigs with lower coverage will be discarded. Default 10.
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
16
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
17 =head2 [-l|--length-cutoff]
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
18
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
19 Mininum coverage. Smaller ontigs will be discarded. Default 500.
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
20
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
21 =head2 -t|--tab stats.tab
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
22
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
23 A tabular file, with three columns: contig name, length, and coverage:
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
24
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
25 NODE_1 31438 24.5116
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
26 NODE_2 31354 2316.96
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
27 NODE_3 26948 82.3294
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
28
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
29 Such a file is produced by spades.xml. Contigs should be in the same order as in the fasta file.
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
30
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
31 =head2 [-o|--filtered-out out.fasta]
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
32
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
33 If specified, filtered out sequences will be written to this file.
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
34
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
35 =head2 seqs.fasta
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
36
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
37 Sequences in fasta format. Start of IDs must match ids in the tabular file.
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
38
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
39 =head1 OUTPUT
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
40
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
41 A fasta file on stdout.
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
42
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
43 =head1 AUTHOR
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
44
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
45 Lionel Guy (lionel.guy@icm.uu.se)
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
46
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
47 =head1 DATE
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
48
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
49 Thu Aug 29 13:51:13 CEST 2013
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
50
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
51 =cut
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
52
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
53 # libraries
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
54 use strict;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
55 use Getopt::Long;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
56 use Bio::SeqIO;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
57
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
58 my $coverage_co = 10;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
59 my $length_co = 500;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
60 my $out_filtered;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
61 my $tab_file;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
62
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
63 GetOptions(
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
64 'c|coverage-cutoff=s' => \$coverage_co,
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
65 'l|length-cutoff=s' => \$length_co,
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
66 'o|filtered-out=s' => \$out_filtered,
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
67 't|tab=s' => \$tab_file,
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
68 );
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
69 my $fasta_file = shift(@ARGV);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
70 die ("No tab file specified") unless ($tab_file);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
71 die ("No fasta file specified") unless ($fasta_file);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
72
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
73 ## Read tab file, discard rows with comments
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
74 open TAB, '<', $tab_file or die "$?";
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
75 my @stats;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
76 while (<TAB>){
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
77 chomp;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
78 push @stats, $_ unless (/^#/);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
79 }
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
80
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
81 ## Read fasta
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
82 my $seq_in = Bio::SeqIO->new(-file => $fasta_file,
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
83 -format => 'fasta');
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
84 my $seq_out = Bio::SeqIO->new(-fh => \*STDOUT,
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
85 -format => 'fasta');
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
86 my $seq_out_filt = Bio::SeqIO->new(-file => ">$out_filtered",
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
87 -format => 'fasta') if ($out_filtered);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
88 while (my $seq = $seq_in->next_seq){
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
89 my $stat = shift @stats;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
90 die "Less rows in tab than sequences in seq file" unless $stat;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
91 my ($id_tab, $length, $coverage) = split(/\t+/, $stat);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
92 die "id, length or coverate not defined at $stat\n"
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
93 unless ($id_tab && $length && $coverage);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
94 my $id_seq = $seq->id;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
95 die "Unmatched ids $id_seq and $id_tab\n" unless ($id_seq =~ /^$id_tab/i);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
96 if ($length >= $length_co && $coverage >= $coverage_co){
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
97 $seq_out->write_seq($seq);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
98 } elsif ($out_filtered){
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
99 $seq_out_filt->write_seq($seq);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
100 } else {
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
101 # do nothing
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
102 }
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
103 }
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
104 die "More rows in tab than sequences in seq file" if (scalar(@stats) > 0);
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
105 exit 0;
ff058438080a Version 0.8, supports SPAdes 3.0.0
lionelguy
parents:
diff changeset
106