Mercurial > repos > nml > fasta_extract
changeset 0:75e70a6d8d60 draft
Uploaded
| author | nml | 
|---|---|
| date | Mon, 06 Feb 2017 10:27:59 -0500 | 
| parents | |
| children | 6590023d80a9 | 
| files | fa-extract-few.pl fa-extract-sequence.xml tool_dependencies.xml | 
| diffstat | 3 files changed, 176 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
#!/usr/bin/perl -w
#
# fa-extract-few.pl - extract (or exclude) FASTA records by id/description.
#
# Search patterns come from the non-option command-line arguments or, when
# --list is given, from a file holding one pattern per line (the list then
# replaces any patterns given on the command line).  All patterns are joined
# with '|' and used as one regular expression.
#
#   default   : a record matches when the pattern is found in its
#               description or in its display id
#   --exact   : a record matches only when its whole display id matches
#   --inverse : write the records that do NOT match
#
# Matching records are written to STDOUT in FASTA format.
use strict;
use Bio::SeqIO;

my (@Options, $verbose, $inverse, $file, $list, $exact);
setOptions();

my $in  = Bio::SeqIO->new(-file => $file,     -format => 'Fasta');
my $out = Bio::SeqIO->new(-fh   => \*STDOUT,  -format => 'Fasta');
my $nread  = 0;
my $nwrote = 0;

# An empty pattern would add an empty alternative to the regex, which
# matches every record, so empty entries are dropped up front.
my @patterns = grep { length } ($list ? read_patterns($list) : @ARGV);
if (!@patterns) {
    die "$0: no search patterns given (pass ids as arguments or use --list)\n";
}
my $pattern = join('|', @patterns);

# Compile once instead of re-interpolating the pattern for every record.
my $partial_re = qr/($pattern)/;
my $exact_re   = qr/^($pattern)$/;

while (my $seq = $in->next_seq) {
    $nread++;

    # Guard against undefined values so that -w stays quiet.
    my $id   = $seq->display_id;
    my $desc = $seq->description;
    $id   = '' unless defined $id;
    $desc = '' unless defined $desc;

    my $match = $exact
        ? ($id =~ $exact_re)
        : ($desc =~ $partial_re or $id =~ $partial_re);

    # Logical xor: write the record when exactly one of "matched" and
    # "inverse requested" is true.
    if ($match xor $inverse) {
        $out->write_seq($seq);
        $nwrote++;
    }
}

print STDERR "Read $nread sequences, wrote $nwrote, with pattern: $pattern\n"
    if $verbose;
exit(0);

#----------------------------------------------------------------------
# Pattern list reading

# read_patterns($path)
# Returns the non-empty lines of $path, with line endings (LF or CRLF)
# removed.  Dies when the file cannot be opened.
sub read_patterns {
    my ($path) = @_;

    open my $fh, '<', $path
        or die "$0: cannot open pattern list '$path': $!\n";

    my @found;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        next if $line eq '';
        push @found, $line;
    }
    close $fh;

    return @found;
}

#----------------------------------------------------------------------
# Option setting routines

# setOptions()
# Fills @Options with the option table, parses the command line with
# Getopt::Long (leaving the non-option arguments in @ARGV) and then assigns
# each DEFAULT to its variable when the option was not supplied.  Prints the
# usage and exits when there are no arguments or option parsing fails.
sub setOptions {
    use Getopt::Long;

    @Options = (
        {OPT=>"h|help",    VAR=>\&usage,                DESC=>"This help"},
        {OPT=>"verbose!",  VAR=>\$verbose, DEFAULT=>0,  DESC=>"Verbose"},
        {OPT=>"v|inverse!",VAR=>\$inverse, DEFAULT=>0,  DESC=>"Output NON-matching sequences instead"},
        {OPT=>"f|file=s",  VAR=>\$file,    DEFAULT=>"", DESC=>"The fasta file to extract sequences from"},
        {OPT=>"exact",     VAR=>\$exact,   DEFAULT=>"", DESC=>"Exact matches for display id only"},
        {OPT=>"l|list=s",  VAR=>\$list,    DEFAULT=>"", DESC=>"List of pattern to look from"},
    );

    (!@ARGV) && (usage());

    GetOptions(map { ($_->{OPT}, $_->{VAR}) } @Options) || usage();

    # Now setup default values.  Entries without a DEFAULT (the help
    # callback) are skipped before their VAR is dereferenced.
    foreach my $opt (@Options) {
        if (defined($opt->{DEFAULT}) && !defined(${$opt->{VAR}})) {
            ${$opt->{VAR}} = $opt->{DEFAULT};
        }
    }
}

# usage()
# Prints the usage line and one line per entry of @Options, then exits
# with status 1.
sub usage {
    print "Usage: $0 [options] id1 [id2 ...] < input.fasta > output.fasta\n";
    foreach my $opt (@Options) {
        printf "  --%-13s %s%s.\n", $opt->{OPT}, $opt->{DESC},
            defined($opt->{DEFAULT}) ? " (default '$opt->{DEFAULT}')" : "";
    }
    exit(1);
}

#----------------------------------------------------------------------
<tool id="fa-extract-sequence" name="Fasta Extract Sequence" version="1.0.0">
    <description>Extract a single sequence from a fasta file.</description>
    <requirements>
        <requirement type="package" version="5.18.1">perl</requirement>
        <requirement type="package" version="1.6.924">bioperl</requirement>
    </requirements>
    <!--
        Command line for fa-extract-few.pl.
        $exact and $inverse expand to their option flag when checked and to
        nothing when unchecked (see falsevalue="" on the params below).
        The id text is left unquoted so that several whitespace-separated
        ids are passed to the script as separate patterns.
    -->
    <command interpreter="perl">
        fa-extract-few.pl
        -f '$dataset'
        $exact
        $inverse
        #if $file_or_type.select == "list"
            --list '$file_or_type.list_file'
        #else
            $file_or_type.id
        #end if

        >
        '$output'
    </command>
    <inputs>
        <param name="dataset" type="data" format="fasta" label="fasta or multifasta file" help="Fasta dataset to extract sequences from."/>
        <!-- falsevalue="" is required: without it an unchecked box puts the
             literal word "false" on the command line, where the script
             would treat it as a search pattern. -->
        <param name="exact" type="boolean" truevalue="--exact" falsevalue="" checked="false" label="Exact matches only" help="Will only match exact matches for fasta id"/>
        <param name="inverse" type="boolean" truevalue="--inverse" falsevalue="" checked="false" label="Entries NOT matching" help="Will return the sequences not matching the given ids"/>

        <conditional name="file_or_type">
            <param name="select" type="select" label="List file or single pattern">
                <option value="list">List file</option>
                <option value="single">Single Pattern</option>
            </param>
            <when value="list">
                <param name="list_file" type="data" format="txt" help="List of pattern to find." label="List file"/>
            </when>
            <when value="single">
                <param name="id" type="text" label="Sequence ID (or partial)" help="Name of the sequence to extract. Will also match partial names and return all matches."
                />
            </when>
        </conditional>


    </inputs>
    <outputs>
        <data name="output" format="fasta" label="${tool.name} on ${on_string}: Fasta"/>
    </outputs>
    <!-- TODO: placeholder test; add input parameters and an expected output file. -->
    <tests>
        <test>
            <output/>
        </test>
    </tests>
    <help>
**Fasta Extract Sequence**
Extracts a fasta sequence from a multifasta by id (exact or partial)

Latest author:
Written by Philip Mabon - Public Health Agency of Canada

Original authors:
Written by Torsten Seemann - Victorian Bioinformatics Consortium

Wrapped by Simon Gladman - Victorian Bioinformatics Consortium


------

Outputs in fasta format.

------

Inputs:

Fasta dataset

Sequence id
    </help>
    <citations>
    </citations>
</tool>
<?xml version="1.0"?>
<!--
    Tool dependencies for the fasta_extract repository.
    Declares two packages: perl 5.18.1, taken from the iuc
    package_perl_5_18 repository, and bioperl 1.6.924, installed through a
    setup_perl_environment action that lists XML::Parser and the
    BioPerl-1.6.924 tarball.
-->
<tool_dependency>
    <package name="perl" version="5.18.1">
        <repository
            toolshed="https://toolshed.g2.bx.psu.edu"
            owner="iuc"
            name="package_perl_5_18"
            changeset_revision="35f117d7396b"
            prior_installation_required="True" />
    </package>
    <package name="bioperl" version="1.6.924">
        <install version="1.0">
            <actions>
                <action type="setup_perl_environment">
                    <repository
                        toolshed="https://toolshed.g2.bx.psu.edu"
                        owner="iuc"
                        name="package_perl_5_18"
                        changeset_revision="35f117d7396b">
                        <package name="perl" version="5.18.1" />
                    </repository>
                    <!-- allow downloading and installing a Perl package from cpan.org-->
                    <package>XML::Parser</package>
                    <package>http://search.cpan.org/CPAN/authors/id/C/CJ/CJFIELDS/BioPerl-1.6.924.tar.gz</package>
                </action>
            </actions>
        </install>
        <readme>
            Bundle::BioPerl
        </readme>
    </package>
</tool_dependency>
