# HG changeset patch
# User aaronpetkau
# Date 1436021320 14400
# Node ID 27f6392e254f227cc4b0a656438fd516162acedb
# Parent ddd1e15df88cbb7711819d29ea117506ebad3e2a
Deleted selected files
diff -r ddd1e15df88c -r 27f6392e254f filter_spades_repeats.xml
--- a/filter_spades_repeats.xml Sat Jul 04 09:45:30 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,160 +0,0 @@
-
- Remove short and repeat contigs/scaffolds
-
- perl
-
- nml_filter_spades_repeats.pl -i $fasta_input -t $tab_input -c $cov_cutoff -r $rep_cutoff -l $len_cutoff -o $output_with_repeats -u $output_without_repeats -n $repeat_sequences_only -e $cov_len_cutoff -f $discarded_sequences -s $summary
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- keep_leftover == "yes"
-
-
- print_summary == "yes"
-
-
-
-
-
-
-
-================
-**What it does**
-================
-
-Using the output of SPAdes (a fasta and a stats file, either from contigs or scaffolds), it filters the fasta files, discarding all sequences that are under a given length or under a calculated coverage. Repeated contigs are detected based on coverage.
-
---------------------------------------
-
-==========
-**Output**
-==========
-
-- **Filtered sequences (with repeats)**
- - Will contain the filtered contigs/scaffolds including the repeats. These are the sequences that passed the length and minumum coverage cutoffs.
- - For workflows, this output is named **output_with_repeats**
-- **Filtered sequences (no repeats)**
- - Will contain the filtered contigs/scaffolds excluding the repeats. These are the sequences that passed the length, minimum coverage and repeat cutoffs.
- - For workflows, this output is named **output_without_repeats**
-- **Repeat sequences**
- - Will contain the repeated contigs/scaffolds only. These are the sequences that were exluded for having high coverage (determined by the repeat cutoff).
- - For workflows, this output is named **repeat_sequences_only**
-- **Discarded sequences**
- - If selected, will contain the discarded sequences. These are the sequences that fell below the length and minumum coverage cutoffs, and got discarded.
- - For workflows, this output is named **discarded_sequences**
-- **Results summary** : If selected, will contain a summary of all the results.
-
-------------------------------------------
-
-============
-**Example**
-============
-
-Stats file input:
-------------------
-
- +------------+------------+------------+
- |#name |length |coverage |
- +============+============+============+
- |NODE_1 |2500 |15.5 |
- +------------+------------+------------+
- |NODE_2 |102 |3.0 |
- +------------+------------+------------+
- |NODE_3 |1300 |50.0 |
- +------------+------------+------------+
- |NODE_4 |1000 |2.3 |
- +------------+------------+------------+
- |NODE_5 |5000 |14.3 |
- +------------+------------+------------+
- |NODE_6 |450 |25.2 |
- +------------+------------+------------+
-
-User Inputs:
-------------
-
-- Coverage cut-off ratio = 0.33
-- Repeat cut-off ratio = 1.75
-- Length cut-off = 500
-- Length for average coverage calculation = 1000
-
-Calculations:
--------------
-
-**Average coverage will be calculatd based on contigs with length >= 1000bp**
-
-
-- Average coverage = 15.5 + 50.0 + 2.3 + 14.3 / 4 = 20.5
-
-**Contigs that have coverage in the lower 1/3 of the average coverage will be eliminated.**
-
-- Coverage cut-off = 20.5 * 0.33 = 6.8
-
-**Contigs with high coverage (larger than 1.75 times the average coverage) are considered to be repeated contigs.**
-
-- Repeat cut-off = 20.5 * 1.75 = 35.9
-
-**Number of copies are calculated by dividing the sequence coverage by the average coverage.**
-
-- Number of repeats for NODE_3 = 50 / 20.5 = 2 copies
-
-
-Output (in fasta format):
---------------------------
-
-**Filtered sequences (with repeats)**
-
-::
-
- >NODE_1
- >NODE_3 (2 copies)
- >NODE_5
-
-**Filtered sequences (no repeats)**
-
-::
-
- >NODE_1
- >NODE_5
-
-**Repeat sequences**
-
-::
-
- >NODE_3 (2 copies)
-
-**Discarded sequences**
-
-::
-
- >NODE_2
- >NODE_4
- >NODE_6
-
----------------------------------------
-
-
-
-
-
-
-
diff -r ddd1e15df88c -r 27f6392e254f nml_filter_spades_repeats.pl
--- a/nml_filter_spades_repeats.pl Sat Jul 04 09:45:30 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,315 +0,0 @@
-#!/usr/bin/env perl
-
-use warnings;
-use strict;
-use Getopt::Long;
-use Bio::SeqIO;
-use Pod::Usage;
-
-my ($fasta_file, $tab_file, $coverage_co, $length_co, $repeat_co, $out_filtered, $out_repeats, $out_norepeats,$coverage_length_co, $summary_out, $filtered_repeats, $help);
-
-GetOptions(
- 'c|coverage-cutoff=s' => \$coverage_co,
- 'l|length-cutoff=s' => \$length_co,
- 'e|coverage-length-cutoff=s' => \$coverage_length_co,
- 'r|repeat_cutoff=s' => \$repeat_co,
- 'i|input=s' => \$fasta_file,
- 't|tab=s' => \$tab_file,
- 'f|filtered-out=s' => \$out_filtered,
- 'o|output-repeats=s' => \$out_repeats,
- 'u|output-norepeats=s' => \$out_norepeats,
- 'n|filtered-repeats=s' => \$filtered_repeats,
- 's|summary=s' => \$summary_out,
- 'h|help' => \$help
-);
-
-pod2usage(-verbose => 2) if ($help);
-print "A fasta file is required. Please enter a fasta file using the -i flag.\n" if (!$fasta_file);
-print "A spades tabs file is required. Please enter a tabs file using the -t flag\n" if (!$tab_file);
-pod2usage(1) unless $fasta_file && $tab_file;
-
-if (!$coverage_co)
-{
- $coverage_co = 0.33;
-}
-if (!$length_co)
-{
- $length_co = 1000;
-}
-if (!$coverage_length_co)
-{
- $coverage_length_co = 5000;
-}
-if (!$repeat_co)
-{
- $repeat_co = 1.75;
-}
-if (!$out_filtered)
-{
- $out_filtered = "Discarded_sequences.fasta";
- print "Discarded sequences will be printed out to $out_filtered\n";
-}
-if (!$out_repeats)
-{
- $out_repeats = "Filtered_sequences_with_repeats.fasta";
- print "Filtered sequences with repeats will be printed out to $out_repeats\n";
-}
-if (!$out_norepeats)
-{
- $out_norepeats = "Filtered_sequences_no_repeats.fasta";
- print "Filtered sequences without repeats will be printed out to $out_norepeats\n";
-}
-if (!$filtered_repeats)
-{
- $filtered_repeats = "Repeat_sequences.fasta";
- print "Repeat sequences will be printed out to $filtered_repeats\n";
-}
-
-die ("No tab file specified") unless ($tab_file);
-die ("No fasta file specified") unless ($fasta_file);
-
-##Read tab file and discard rows with comments
-open TAB, '<', $tab_file or die "Could not open tab file: $?";
-open SEQIN, '<', $fasta_file or die "Could not open tab file: $?";
-open SEQOUT_REP, '>', $out_repeats or die "Could not open file for writing: $?";
-open SEQOUT_NOREP, '>', $out_norepeats or die "Could not open file for writing: $?";
-open SEQOUT_FILT, '>', $out_filtered if ($out_filtered);
-open SEQOUT_FILT_REP, '>', $filtered_repeats or die "Could not open file for writing: $?";
-open SUMMARY, '>', $summary_out if ($summary_out);
-
-
-my $avg_coverage = 0;
-my $num_contigs = 0;
-my $cutoff_coverage;
-my $cutoff_repeats;
-my @stats;
-
-
-while ()
-{
- chomp;
- push @stats, $_ unless (/^#/);
-}
-
-#Calculate average coverage.
-foreach my $stat(@stats)
-{
- my ($length, $coverage);
- (undef,$length, $coverage) = split(/\t+/, $stat);
- die "length or coverage not defined at $stat\n" unless ($length && ($coverage ne '' && $coverage >= 0));
- if ($length >= $coverage_length_co)
- {
- $avg_coverage = $avg_coverage + $coverage;
- $num_contigs++;
- }
-}
-
-$avg_coverage = $avg_coverage / $num_contigs;
-$cutoff_coverage = $avg_coverage * $coverage_co;
-$cutoff_repeats = $avg_coverage * $repeat_co;
-
-print SUMMARY "Filter SPAdes repeats Results Summary\n======================================\n\n" if ($summary_out);
-print SUMMARY "Paramaters used:\nLength cutoff for calcularing average cutoff: $coverage_length_co\nCoverage cutoff ratio: $coverage_co\nRepeat cutoff ratio: $repeat_co\nLength cutoff: $length_co\n\n" if ($summary_out);
-
-print SUMMARY "Calculations:\nAverage coverage: $avg_coverage\nCoverage cutoff: $cutoff_coverage\nRepeat cutoff: $cutoff_repeats\n\nFile headers:\n" if ($summary_out);
-
-my ($header, $seq_id, $seq);
-my $repeated = 0;
-my $valid = 0;
-
-#Summary strings:
-my $discarded = "";
-my $repeats = "";
-my $filtered_rep = "";
-my $filtered_norep = "";
-
-while (my $line = )
-{
- if ($line =~ />/)
- {
- chomp $line;
- #Get the sequence name to compare against tab file
- $header = $line;
- $seq_id = $line =~ /(\w+)_length/;
- $seq = "";
-
- my $stat = shift @stats;
- die "Less rows in tab than sequences in seq file" unless $stat;
- my($name, $length, $coverage) = split(/\t+/, $stat);
- die "name or length not defined at $stat\n" unless ($name && $length);
- die "coverage is not defined at $stat\n" unless ($coverage ne '' && $coverage >= 0);
- die "Unmatched names $header and $name\n" unless ($header =~ /$name/i);
-
- #Entry passes the length and coverage cutoffs?
- if ($length >= $length_co && $coverage >= $cutoff_coverage)
- {
- $valid = 1;
- #Repeats
- if ($coverage >= $cutoff_repeats)
- {
- my $num_repeats = int($coverage/$avg_coverage);
- $header = $header."(".$num_repeats." copies)";
- print SEQOUT_REP $header,"\n";
- $filtered_rep = $filtered_rep.$header."\n";
- print SEQOUT_FILT_REP $header, "\n";
- $repeats = $repeats.$header."\n";
- $repeated = 1;
- }
- else
- {
- print SEQOUT_REP $header, "\n";
- $filtered_rep = $filtered_rep.$header."\n";
- print SEQOUT_NOREP $header, "\n";
- $filtered_norep = $filtered_norep.$header."\n";
- $repeated = 0;
- }
- }
- elsif ($out_filtered)
- {
- $valid = 0;
- print SEQOUT_FILT $header,"\n";
- $discarded = $discarded.$header."\n";
- }
- }
- else
- {
- if ($valid)
- {
- print SEQOUT_REP $line;
- if (!$repeated)
- {
- print SEQOUT_NOREP $line;
- }
- else
- {
- print SEQOUT_FILT_REP $line;
- }
- }
- elsif ($out_filtered)
- {
- print SEQOUT_FILT $line;
- }
- }
-
-}
-
-close TAB;
-close SEQIN;
-close SEQOUT_REP;
-close SEQOUT_NOREP;
-close SEQOUT_FILT;
-close SEQOUT_FILT_REP;
-
-
-#Get summary info:
-if ($summary_out)
-{
- print SUMMARY "Filtered sequences (with repeats):\n$filtered_rep\n";
- print SUMMARY "Filtered sequences (no repeats):\n$filtered_norep\n";
- print SUMMARY "Repeat sequences:\n$repeats\n";
- if ($out_filtered)
- {
- print SUMMARY "Discarded sequences:\n$discarded\n";
- }
-
- close SUMMARY;
-}
-
-die "More rows in stats file than sequences in the fasta file\n" if (scalar(@stats) > 0);
-exit 0;
-
-
-__END__
-
-
-
-=head1 NAME
-
- filter_spades_repeats.pl - Filters contigs or scaffolds based on contig length and detects contigs/scaffolds with very high coverage.
-
-
-
-=head1 USAGE
-
- filter_spades_output.pl -i
- -t
- -o