# HG changeset patch
# User aaronpetkau
# Date 1436017530 14400
# Node ID ddd1e15df88cbb7711819d29ea117506ebad3e2a
Uploaded
diff -r 000000000000 -r ddd1e15df88c filter_spades_repeats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_spades_repeats.xml Sat Jul 04 09:45:30 2015 -0400
@@ -0,0 +1,160 @@
+
+ Remove short and repeat contigs/scaffolds
+
+ perl
+
+ nml_filter_spades_repeats.pl -i $fasta_input -t $tab_input -c $cov_cutoff -r $rep_cutoff -l $len_cutoff -o $output_with_repeats -u $output_without_repeats -n $repeat_sequences_only -e $cov_len_cutoff -f $discarded_sequences -s $summary
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ keep_leftover == "yes"
+
+
+ print_summary == "yes"
+
+
+
+
+
+
+
+================
+**What it does**
+================
+
+Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a calculated coverage. Repeated contigs are detected based on coverage.
+
+--------------------------------------
+
+==========
+**Output**
+==========
+
+- **Filtered sequences (with repeats)**
+ - Will contain the filtered contigs/scaffolds including the repeats. These are the sequences that passed the length and minumum coverage cutoffs.
+ - For workflows, this output is named **output_with_repeats**
+- **Filtered sequences (no repeats)**
+ - Will contain the filtered contigs/scaffolds excluding the repeats. These are the sequences that passed the length, minimum coverage and repeat cutoffs.
+ - For workflows, this output is named **output_without_repeats**
+- **Repeat sequences**
+ - Will contain the repeated contigs/scaffolds only. These are the sequences that were exluded for having high coverage (determined by the repeat cutoff).
+ - For workflows, this output is named **repeat_sequences_only**
+- **Discarded sequences**
+ - If selected, will contain the discarded sequences. These are the sequences that fell below the length and minumum coverage cutoffs, and got discarded.
+ - For workflows, this output is named **discarded_sequences**
+- **Results summary** : If selected, will contain a summary of all the results.
+
+------------------------------------------
+
+============
+**Example**
+============
+
+Stats file input:
+------------------
+
+ +------------+------------+------------+
+ |#name |length |coverage |
+ +============+============+============+
+ |NODE_1 |2500 |15.5 |
+ +------------+------------+------------+
+ |NODE_2 |102 |3.0 |
+ +------------+------------+------------+
+ |NODE_3 |1300 |50.0 |
+ +------------+------------+------------+
+ |NODE_4 |1000 |2.3 |
+ +------------+------------+------------+
+ |NODE_5 |5000 |14.3 |
+ +------------+------------+------------+
+ |NODE_6 |450 |25.2 |
+ +------------+------------+------------+
+
+User Inputs:
+------------
+
+- Coverage cut-off ratio = 0.33
+- Repeat cut-off ratio = 1.75
+- Length cut-off = 500
+- Length for average coverage calculation = 1000
+
+Calculations:
+-------------
+
+**Average coverage will be calculatd based on contigs with length >= 1000bp**
+
+
+- Average coverage = 15.5 + 50.0 + 2.3 + 14.3 / 4 = 20.5
+
+**Contigs that have coverage in the lower 1/3 of the average coverage will be eliminated.**
+
+- Coverage cut-off = 20.5 * 0.33 = 6.8
+
+**Contigs with high coverage (larger than 1.75 times the average coverage) are considered to be repeated contigs.**
+
+- Repeat cut-off = 20.5 * 1.75 = 35.9
+
+**Number of copies are calculated by dividing the sequence coverage by the average coverage.**
+
+- Number of repeats for NODE_3 = 50 / 20.5 = 2 copies
+
+
+Output (in fasta format):
+--------------------------
+
+**Filtered sequences (with repeats)**
+
+::
+
+ >NODE_1
+ >NODE_3 (2 copies)
+ >NODE_5
+
+**Filtered sequences (no repeats)**
+
+::
+
+ >NODE_1
+ >NODE_5
+
+**Repeat sequences**
+
+::
+
+ >NODE_3 (2 copies)
+
+**Discarded sequences**
+
+::
+
+ >NODE_2
+ >NODE_4
+ >NODE_6
+
+---------------------------------------
+
+
+
+
+
+
+
diff -r 000000000000 -r ddd1e15df88c nml_filter_spades_repeats.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nml_filter_spades_repeats.pl Sat Jul 04 09:45:30 2015 -0400
@@ -0,0 +1,315 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+use Getopt::Long;
+use Bio::SeqIO;
+use Pod::Usage;
+
+my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help);
+
+GetOptions(
+ 'c|coverage-cutoff=s' => \$coverage_co,
+ 'l|length-cutoff=s' => \$length_co,
+ 'e|coverage-length-cutoff=s' => \$coverage_length_co,
+ 'r|repeat_cutoff=s' => \$repeat_co,
+ 'i|input=s' => \$fasta_file,
+ 't|tab=s' => \$tab_file,
+ 'f|filtered-out=s' => \$out_filtered,
+ 'o|output-repeats=s' => \$out_repeats,
+ 'u|output-norepeats=s' => \$out_norepeats,
+ 'n|filtered-repeats=s' => \$filtered_repeats,
+ 's|summary=s' => \$summary_out,
+ 'h|help' => \$help
+);
+
+pod2usage(-verbose => 2) if ($help);
+print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file);
+print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file);
+pod2usage(1) unless $fasta_file && $tab_file;
+
+if (!$coverage_co)
+{
+ $coverage_co = 0.33;
+}
+if (!$length_co)
+{
+ $length_co = 1000;
+}
+if (!$coverage_length_co)
+{
+ $coverage_length_co = 5000;
+}
+if (!$repeat_co)
+{
+ $repeat_co = 1.75;
+}
+if (!$out_filtered)
+{
+ $out_filtered = "Discarded_sequences.fasta";
+ print "Discarded sequences will be printed out to $out_filtered\n";
+}
+if (!$out_repeats)
+{
+ $out_repeats = "Filtered_sequences_with_repeats.fasta";
+ print "Filtered sequences with repeats will be printed out to $out_repeats\n";
+}
+if (!$out_norepeats)
+{
+ $out_norepeats = "Filtered_sequences_no_repeats.fasta";
+ print "Filtered sequences without repeats will be printed out to $out_norepeats\n";
+}
+if (!$filtered_repeats)
+{
+ $filtered_repeats = "Repeat_sequences.fasta";
+ print "Repeat sequences will be printed out to $filtered_repeats\n";
+}
+
+die ("No tab file specified") unless ($tab_file);
+die ("No fasta file specified") unless ($fasta_file);
+
+##Read tab file and discard rows with comments
+open TAB, '<', $tab_file or die "Could not open tab file: $?";
+open SEQIN, '<', $fasta_file or die "Could not open tab file: $?";
+open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?";
+open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?";
+open SEQOUT_FILT, '>', $out_filtered if ($out_filtered);
+open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?";
+open SUMMARY, '>', $summary_out if ($summary_out);
+
+
+my $avg_coverage = 0;
+my $num_contigs = 0;
+my $cutoff_coverage;
+my $cutoff_repeats;
+my @stats;
+
+
+while ()
+{
+ chomp;
+ push @stats, $_ unless (/^#/);
+}
+
+#Calculate average coverage.
+foreach my $stat(@stats)
+{
+ my ($length, $coverage);
+ (undef,$length, $coverage) = split(/\t+/, $stat);
+ die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0));
+ if ($length >= $coverage_length_co)
+ {
+ $avg_coverage = $avg_coverage + $coverage;
+ $num_contigs++;
+ }
+}
+
+$avg_coverage = $avg_coverage / $num_contigs;
+$cutoff_coverage = $avg_coverage * $coverage_co;
+$cutoff_repeats = $avg_coverage * $repeat_co;
+
+print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out);
+print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out);
+
+print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out);
+
+my ($header, $seq_id, $seq);
+my $repeated = 0;
+my $valid = 0;
+
+#Summary strings:
+my $discarded = "";
+my $repeats = "";
+my $filtered_rep = "";
+my $filtered_norep = "";
+
+while (my $line = )
+{
+ if ($line =~ />/)
+ {
+ chomp $line;
+ #Get the sequence name to compare against tab file
+ $header = $line;
+ $seq_id = $line =~ /(\w+)_length/;
+ $seq = "";
+
+ my $stat = shift @stats;
+ die "Less rows in tab than sequences in seq file" unless $stat;
+ my($name, $length, $coverage) = split(/\t+/, $stat);
+ die "name or length not defined at $stat\n" unless ($name && $length);
+ die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0);
+ die "Unmatched names $header and $name\n" unless ($header =~ /$name/i);
+
+ #Entry passes the length and coverage cutoffs?
+ if ($length >= $length_co && $coverage >= $cutoff_coverage)
+ {
+ $valid = 1;
+ #Repeats
+ if ($coverage >= $cutoff_repeats)
+ {
+ my $num_repeats = int($coverage/$avg_coverage);
+ $header = $header."(".$num_repeats." copies)";
+ print SEQOUT_REP $header,"\n";
+ $filtered_rep = $filtered_rep.$header."\n";
+ print SEQOUT_FILT_REP $header, "\n";
+ $repeats = $repeats.$header."\n";
+ $repeated = 1;
+ }
+ else
+ {
+ print SEQOUT_REP $header, "\n";
+ $filtered_rep = $filtered_rep.$header."\n";
+ print SEQOUT_NOREP $header, "\n";
+ $filtered_norep = $filtered_norep.$header."\n";
+ $repeated = 0;
+ }
+ }
+ elsif ($out_filtered)
+ {
+ $valid = 0;
+ print SEQOUT_FILT $header,"\n";
+ $discarded = $discarded.$header."\n";
+ }
+ }
+ else
+ {
+ if ($valid)
+ {
+ print SEQOUT_REP $line;
+ if (!$repeated)
+ {
+ print SEQOUT_NOREP $line;
+ }
+ else
+ {
+ print SEQOUT_FILT_REP $line;
+ }
+ }
+ elsif ($out_filtered)
+ {
+ print SEQOUT_FILT $line;
+ }
+ }
+
+}
+
+close TAB;
+close SEQIN;
+close SEQOUT_REP;
+close SEQOUT_NOREP;
+close SEQOUT_FILT;
+close SEQOUT_FILT_REP;
+
+
+#Get summary info:
+if ($summary_out)
+{
+ print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n";
+ print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n";
+ print SUMMARY "Repeat sequences:\n$repeats\n";
+ if ($out_filtered)
+ {
+ print SUMMARY "Discarded sequences:\n$discarded\n";
+ }
+
+ close SUMMARY;
+}
+
+die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0);
+exit 0;
+
+
+__END__
+
+
+
+=head1 NAME
+
+ filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage.
+
+
+
+=head1 USAGE
+
+ filter_spades_output.pl -i
+ -t
+ -o