changeset 0:c342ebb50f0b draft default tip

Uploaded
author fernando
date Thu, 22 May 2014 05:09:07 -0400
parents
children
files interpro/Galaxy-Workflow-Protein_Funcional_Analysis_Similarities.ga interpro/INSTALL.txt interpro/paso1.pl interpro/paso1.xml interpro/paso2.sh interpro/paso2.xml interpro/paso3.sh interpro/paso3.xml interpro/paso4.pl interpro/paso4.xml
diffstat 10 files changed, 783 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/Galaxy-Workflow-Protein_Funcional_Analysis_Similarities.ga	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,119 @@
+{
+    "a_galaxy_workflow": "true", 
+    "annotation": "Search common attributes in protein sequences", 
+    "format-version": "0.1", 
+    "name": "Protein Functional Analysis Similarities", 
+    "steps": {
+        "0": {
+            "annotation": "", 
+            "id": 0, 
+            "input_connections": {}, 
+            "inputs": [], 
+            "name": "RemoteBlast", 
+            "outputs": [
+                {
+                    "name": "outfile", 
+                    "type": "txt"
+                }
+            ], 
+            "position": {
+                "left": 200, 
+                "top": 205
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "ALaGiFer_1", 
+            "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"infile\": \"null\"}", 
+            "tool_version": "1.0.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "1": {
+            "annotation": "", 
+            "id": 1, 
+            "input_connections": {
+                "infile": {
+                    "id": 0, 
+                    "output_name": "outfile"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Sequences assortment", 
+            "outputs": [
+                {
+                    "name": "outfile", 
+                    "type": "fasta"
+                }
+            ], 
+            "position": {
+                "left": 308, 
+                "top": 339
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "BLaGiFer_2", 
+            "tool_state": "{\"__page__\": 0, \"numsequences\": \"\\\"5\\\"\", \"__rerun_remap_job_id__\": null, \"infile\": \"null\"}", 
+            "tool_version": "1.0.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "2": {
+            "annotation": "", 
+            "id": 2, 
+            "input_connections": {
+                "infile": {
+                    "id": 1, 
+                    "output_name": "outfile"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Sequences attributes", 
+            "outputs": [
+                {
+                    "name": "outfile", 
+                    "type": "gff"
+                }
+            ], 
+            "position": {
+                "left": 385, 
+                "top": 463
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "CLaGiFer_3", 
+            "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"infile\": \"null\"}", 
+            "tool_version": "1.0.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "3": {
+            "annotation": "", 
+            "id": 3, 
+            "input_connections": {
+                "infile": {
+                    "id": 2, 
+                    "output_name": "outfile"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Common attributes selection", 
+            "outputs": [
+                {
+                    "name": "outfile", 
+                    "type": "txt"
+                }
+            ], 
+            "position": {
+                "left": 441, 
+                "top": 589
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "DLaGiFer_3", 
+            "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"infile\": \"null\"}", 
+            "tool_version": "1.0.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }
+    }
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/INSTALL.txt	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,55 @@
+DEPENDENCIES
+
+The wrapper requires BioPerl and InterProScan, and it runs on Linux systems.
+
+To install BioPerl:
+http://www.bioperl.org/wiki/Installing_BioPerl
+
+To install InterProScan:
+http://code.google.com/p/interproscan/wiki/HowToDownload
+
+
+
+## INSTALLATION WRAPPER INSTRUCTIONS
+
+# Stop your local Galaxy
+
+# Change to your_galaxy_path/tools  
+	> cd your_galaxy_path/tools
+
+# Uncompress the tar.gz file into the "tools" directory.
+	> tar -xvzf interpro.tar.gz
+
+# Edit "paso3.sh" file and change InterProScan path to your local InterProScan path.
+
+# Go to the Galaxy root directory, edit "tool_conf.xml" and insert this block at the position where you would like the tools to appear in the tool panel:
+
+  <section id="LaGiFer" name="Protein Functional Analysis Similarities">
+    <tool file="interpro/paso1.xml" />
+    <tool file="interpro/paso2.xml" />
+    <tool file="interpro/paso3.xml" />
+    <tool file="interpro/paso4.xml" />
+  </section>
+
+
+# Run your Galaxy
+	> ./run.sh
+
+## INSTALLATION WORKFLOW INSTRUCTIONS
+
+# To install the workflow that runs these tools, choose "Upload or import workflow" from the workflow menu, then select the .ga file with the "browse" (examinar) button and import it.
+
+Select "Configure your workflow menu", click on the "Show in menu" item and press "send".
+
+
+
+Thank you very much for using our "Protein Functional Analysis Similarities" wrapper.
+
+For additional details or to report errors, please contact us:
+
+
+gines.a@hotmail.com
+fernando.perez8@um.es
+lentram@um.es
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/paso1.pl	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,77 @@
+# Galaxy wrapper step 1: submit every protein sequence of a FASTA file to the
+# NCBI remote BLAST service (blastp against SwissProt) and save the reports.
+#
+# Usage: perl paso1.pl <input_fasta> <output_report>
+#   <input_fasta>    FASTA file with the protein sequences to search
+#   <output_report>  text file that receives the BLAST reports
+
+use strict;
+use warnings;
+use Bio::SeqIO;
+use Bio::Tools::Run::RemoteBlast;
+use File::Temp qw(tempfile);
+
+my ($input, $output) = @ARGV;
+die "Usage: $0 <input_fasta> <output_report>\n"
+  unless defined $input && defined $output;
+
+# Remote-blast "factory object" creation and blast-parameter initialization
+my $prog  = 'blastp';
+my $db    = 'swissprot';
+my $e_val = '1e-10';
+my @params = ( '-prog'       => $prog,
+               '-data'       => $db,
+               '-expect'     => $e_val,
+               '-readmethod' => 'SearchIO' );
+
+my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
+
+# CGI parameters of the request can be changed or removed through %HEADER:
+#   $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]';
+#   $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'}  = 'BLOSUM25';
+#   delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};
+
+# $v > 0 turns on the progress messages written to STDERR
+my $v = 1;
+
+my $str = Bio::SeqIO->new( -file => $input, '-format' => 'fasta' );
+
+# Start from an empty output file. Every report is appended to it below, so an
+# input with several sequences keeps all reports instead of only the last one.
+open( my $out_fh, '>', $output ) or die "Cannot write \"$output\": $!\n";
+
+while ( my $seq = $str->next_seq() ) {
+  # Blast one sequence against the database
+  $factory->submit_blast($seq);
+
+  print STDERR "waiting..." if ( $v > 0 );
+  # Poll the server until every pending request ID (RID) has been handled
+  while ( my @rids = $factory->each_rid ) {
+    foreach my $rid (@rids) {
+      my $rc = $factory->retrieve_blast($rid);
+      if ( !ref($rc) ) {
+        if ( $rc < 0 ) {
+          # The request failed: report it and stop polling this RID
+          warn "\nRemote BLAST request $rid failed\n";
+          $factory->remove_rid($rid);
+        }
+        print STDERR "." if ( $v > 0 );
+        sleep 5;
+      } else {
+        # save_output() overwrites its target file, so the report goes to a
+        # temporary file first and is then appended to the real output.
+        my ( $tmp_fh, $tmp_name ) = tempfile();
+        close $tmp_fh;
+        $factory->save_output($tmp_name);
+        open( my $in_fh, '<', $tmp_name ) or die "Cannot read \"$tmp_name\": $!\n";
+        print {$out_fh} $_ while <$in_fh>;
+        close $in_fh;
+        unlink $tmp_name;
+        $factory->remove_rid($rid);
+      }
+    }
+  }
+}
+
+# Flush the collected reports; a failed close means that data was lost
+close $out_fh or die "Cannot close \"$output\": $!\n";
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/paso1.xml	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,107 @@
+<tool id="ALaGiFer_1" name="RemoteBlast" version="1.0.0">
+    <description>Blast against UniProt/SwissProt database</description>
+    <command interpreter="perl">
+    paso1.pl "$infile" "$outfile"
+    </command>
+
+        <inputs>
+            <param name="infile" type="data" format="fasta" label="Sequence in Fasta File"/>
+
+          
+           </inputs>
+<outputs>
+<data format="txt" name="outfile"/>
+</outputs>
+
+<stdio><exit_code range="1:" level="fatal" description="Error" /></stdio>
+    <help>
+
+**What it does**
+
+This tool runs a remote blast against Uniprot/Swissprot database.
+
+
+**Dependencies**
+
+Bioperl is necessary for running this tool.
+
+#####
+Input
+#####
+
+Required is a FASTA file containing protein sequences.
+
+
+######
+Output
+######
+
+Generates an output text file.
+
+
+Example Output
+--------------
+BLASTP 2.2.29+
+Reference: Stephen F. Altschul, Thomas L. Madden, Alejandro
+A. Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and
+David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new
+generation of protein database search programs", Nucleic
+Acids Res. 25:3389-3402.
+
+
+RID: M2E3HVH4014
+
+
+Database: Non-redundant UniProtKB/SwissProt sequences
+           457,803 sequences; 170,956,615 total letters
+Query= C08B11.6 CE30856 WBGene00007434  actin   status:Confirmed       
+UniProt:Q09443  protein_id:CAA86667.3
+
+Length=418
+
+
+                                                                   Score     E
+Sequences producing significant alignments:                       (Bits)  Value
+
+sp|Q09443.3|ARP6_CAEEL  RecName: Full=Actin-like protein C08B11.6    853   0.0  
+sp|Q9D864.2|ARP6_MOUSE  RecName: Full=Actin-related protein 6;...    280   2e-88
+sp|Q9GZN1.1|ARP6_HUMAN  RecName: Full=Actin-related protein 6;...    279   5e-88
+sp|Q9DEE9.1|ARP6_CHICK  RecName: Full=Actin-related protein 6;...    274   5e-86
+sp|P45890.1|ARP6_DROME  RecName: Full=Actin-related protein 6;...    265   1e-82
+sp|Q6C982.1|ARP6_YARLI  RecName: Full=Actin-like protein ARP6        232   4e-70
+sp|O94241.1|ARP6_SCHPO  RecName: Full=Actin-like protein arp6        228   2e-68
+sp|Q7S6X6.1|ARP6_NEUCR  RecName: Full=Actin-related protein 6        212   6e-62
+sp|Q5NBI2.1|ARP6_ORYSJ  RecName: Full=Actin-related protein 6 ...    210   3e-61
+
+ALIGNMENTS
+>sp|Q09443.3|ARP6_CAEEL RecName: Full=Actin-like protein C08B11.6
+Length=418
+
+ Score =   853 bits (2204),  Expect = 0.0
+ Identities = 418/418 (100%), Positives = 418/418 (100%), Gaps = 0/418 (0%)
+
+Query  1    MSLTTIIFDNGGHNMKIGTIDSESPRLVPNSIVKAKHEKKRVFVAHEQEECSDKFSLFYV  60
+            MSLTTIIFDNGGHNMKIGTIDSESPRLVPNSIVKAKHEKKRVFVAHEQEECSDKFSLFYV
+Sbjct  1    MSLTTIIFDNGGHNMKIGTIDSESPRLVPNSIVKAKHEKKRVFVAHEQEECSDKFSLFYV  60
+
+Query  61   RPIERGYVVNWDTQQQIWEKTFGSMDVEASTSRIALTDNNYLIPALPDVSSEILFDYFGF  120
+            RPIERGYVVNWDTQQQIWEKTFGSMDVEASTSRIALTDNNYLIPALPDVSSEILFDYFGF
+Sbjct  61   RPIERGYVVNWDTQQQIWEKTFGSMDVEASTSRIALTDNNYLIPALPDVSSEILFDYFGF  120
+
+Query  121  TEVHKTSASTLVAKHSNKINNEKCAVVVDSGFSWTTVASFVNGMLIQDSVIRIDVGGKAL  180
+            TEVHKTSASTLVAKHSNKINNEKCAVVVDSGFSWTTVASFVNGMLIQDSVIRIDVGGKAL
+Sbjct  121  TEVHKTSASTLVAKHSNKINNEKCAVVVDSGFSWTTVASFVNGMLIQDSVIRIDVGGKAL  180
+
+Query  181  TNKLKDWVSYRQLNVSEETYVINECKEDLCFVSQNFDESMKEARNRFQENTTMKRYIMPD  240
+            TNKLKDWVSYRQLNVSEETYVINECKEDLCFVSQNFDESMKEARNRFQENTTMKRYIMPD
+Sbjct  181  TNKLKDWVSYRQLNVSEETYVINECKEDLCFVSQNFDESMKEARNRFQENTTMKRYIMPD  240
+
+
+**Galaxy Wrapper Authors**::
+
+    * Laura Entrambasaguas
+    * Ginés Almagro
+    * Fernando Pérez
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/paso2.sh	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,15 @@
+#!/bin/bash
+# -*- ENCODING: UTF-8 -*-
+# fichero de entrada es salida.out
+input=$1
+output=$2
+working_dir=$PWD
+sift_output="$working_dir/sift_output.txt"
+
+grep sp\| "$input" |head -n$3 |awk 'BEGIN {FS="|"};{print $2}' | awk 'BEGIN {FS="."};{print $1".fasta"}' >"$sift_output"
+while read line
+do 
+   echo "wget http://www.uniprot.org/uniprot/$line" | sh
+   cat $line >>"$output"
+   rm "$line"
+done < "$sift_output"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/paso2.xml	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,88 @@
+<tool id="BLaGiFer_2" name="Sequences assortment" version="1.0.0">
+    <description> Select the most similar sequences</description>
+    <command interpreter="bash">
+    ./paso2.sh "$infile" "$outfile" "$numsequences"
+    </command>
+
+        <inputs>
+            <param name="infile" type="data" format="txt" label="RemoteBlast output file"/>
+	
+	     <param name="numsequences" type="select" label="Select number of sequences" help="">
+                <option value="1">1</option>
+                <option value="2">2</option>
+		<option value="3">3</option>
+		<option value="4">4</option>
+		<option value="5" selected="true">5</option>
+            </param>
+      </inputs>
+<outputs>
+<data format="fasta" name="outfile"/>
+</outputs>
+
+<stdio><exit_code range="1:" level="fatal" description="Error" /></stdio>
+    <help>
+
+**What it does**
+
+From the text report generated by the RemoteBlast tool, the user chooses how many of the first hits (from one to five) are kept. Their sequences are stored in a FASTA file.
+
+**Dependencies**
+
+This tool only runs UNIX commands; grep, head, awk and wget must be installed.
+
+
+#####
+Input
+#####
+
+The input text file generated by RemoteBlast is necessary.
+
+######
+Output
+######
+
+The selected sequences are saved in an output FASTA file.
+
+Example Output
+--------------
+
+>sp|Q09443|ARP6_CAEEL Actin-like protein C08B11.6 OS=Caenorhabditis elegans GN=arp-6 PE=2 SV=3
+MSLTTIIFDNGGHNMKIGTIDSESPRLVPNSIVKAKHEKKRVFVAHEQEECSDKFSLFYV
+RPIERGYVVNWDTQQQIWEKTFGSMDVEASTSRIALTDNNYLIPALPDVSSEILFDYFGF
+TEVHKTSASTLVAKHSNKINNEKCAVVVDSGFSWTTVASFVNGMLIQDSVIRIDVGGKAL
+TNKLKDWVSYRQLNVSEETYVINECKEDLCFVSQNFDESMKEARNRFQENTTMKRYIMPD
+FHSTFRGVVKDVKEPHDPQIPSIHLGVERFAIPEILFNPSDIDIDQCGVAEAVIESICQC
+PEALRPALAENIIVIGGSSCFPGFRERLEREVRSMLPAEYGLNVSNDVINPQTHSWHCGQ
+ELLTASKVPWINRKDWDERGDSLEFSNFFQTLVQSDELKGTRNFDDQREKSPKEDEDF
+>sp|Q9D864|ARP6_MOUSE Actin-related protein 6 OS=Mus musculus GN=Actr6 PE=1 SV=2
+MTTLVLDNGAYNAKIGYSHDSVSVIPNCQFRSKTARLKTFTANQIDEIKDPSGLFYILPF
+QKGYLVNWDVQRQVWDYLFGKEMYQVDFLDTNIIITEPYFNFTSIQESMNEILFEEYQFQ
+AVLRVNAGALSAHRYFRDNPSELCCIIVDSGYSFTHIVPYCRSKKKKEAIIRINVGGKLL
+TNHLKEIISYRQLHVMDETHVINQVKEDVCYVSQDFYRDMDIAKLKGEDNTVMIDYVLPD
+FSTIKKGFCKPREEMVLSGKYKSGEQILRLANERFAVPEILFNPSDIGIQEMGIPEAIVY
+SIQNLPEEMQPHFFKNIVLTGGNSLFPGFRERVYSEVRCLTPTDYDVSVVLPENPITYSW
+EGGKLISENDDFEDMVVTREDYEENGHSVCEEKFDI
+>sp|Q9GZN1|ARP6_HUMAN Actin-related protein 6 OS=Homo sapiens GN=ACTR6 PE=1 SV=1
+MTTLVLDNGAYNAKIGYSHENVSVIPNCQFRSKTARLKTFTANQIDEIKDPSGLFYILPF
+QKGYLVNWDVQRQVWDYLFGKEMYQVDFLDTNIIITEPYFNFTSIQESMNEILFEEYQFQ
+AVLRVNAGALSAHRYFRDNPSELCCIIVDSGYSFTHIVPYCRSKKKKEAIIRINVGGKLL
+TNHLKEIISYRQLHVMDETHVINQVKEDVCYVSQDFYRDMDIAKLKGEENTVMIDYVLPD
+FSTIKKGFCKPREEMVLSGKYKSGEQILRLANERFAVPEILFNPSDIGIQEMGIPEAIVY
+SIQNLPEEMQPHFFKNIVLTGGNSLFPGFRDRVYSEVRCLTPTDYDVSVVLPENPITYAW
+EGGKLISENDDFEDMVVTREDYEENGHSVCEEKFDI
+>sp|Q9DEE9|ARP6_CHICK Actin-related protein 6 OS=Gallus gallus GN=ACTR6 PE=2 SV=1
+MATLVLDNGAYNAKIGYSHAHVSVIPNCQFRSKTARLKTFTANQLDEIKDPSGLFYILPF
+QKGYLVNWDVQRQVWDYLFGKEMYQVDFVDTNIIITEPYFNFSSIQESMNEILFEEYQFQ
+AVLRVNAGALSAHRYFRDNPSELCCIIVDSGYSFTHIVPYCRSKKKKEAIIRINVGGKLL
+TNHLKEIISYRQLHVMDETHVINQVKEDVCYVSQDFYKDMEIAKLKGEENTVMVDYVLPD
+FSTIKKGFCKPREEMVLSGKYKTGEQILRLTNERFAVPEILFHPSDIGIQEMGIPEAIVD
+SIQNLPEEMQPHFFKNIVLTGGNTLFPGFRDRVYSEVRCLTPTDYDVSVVLPENPITYSW
+EGGKLISENDDFEDLVVTREDYEEHGHNICEEKFDI
+
+**Galaxy Wrapper Authors**::
+
+    * Ginés Almagro
+    * Laura Entrambasaguas
+    * Fernando Pérez
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/paso3.sh	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,6 @@
+#!/bin/bash
+# -*- ENCODING: UTF-8 -*
+
+input=$1
+output=$2
+~/bioapps/interproscan/interproscan-5.3-46.0/interproscan.sh -i "$input" -o "$output" -f gff3 --iprlookup --goterms -pa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/paso3.xml	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,116 @@
+<tool id="CLaGiFer_3" name="Sequences attributes" version="1.0.0">
+    <description>Download gff file from InterPro</description>
+    <command interpreter="bash">
+    ./paso3.sh "$infile" "$outfile"
+    </command>
+
+        <inputs>
+            <param name="infile" type="data" format="fasta" label="Fasta file"/>
+      </inputs>
+<outputs>
+<data format="gff" name="outfile"/>
+</outputs>
+
+<stdio><exit_code range="1:" level="fatal" description="Error" /></stdio>
+    <help>
+
+
+**What it does**
+
+Interproscan is a batch tool to query the Interpro database. It provides annotations based on multiple searches of profile and other functional databases.
+
+
+**Dependencies**
+
+InterProscan package is required to be installed (http://code.google.com/p/interproscan/wiki/HowToDownload).
+
+
+
+#####
+Input
+#####
+
+A FASTA file containing protein sequences is required.
+
+
+######
+Output
+######
+
+Generic Feature Format Version 3 (GFF3)
+
+The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find documentation of all the columns and attributes used at http://www.sequenceontology.org/gff3.shtml.
+
+Example Output
+--------------
+
+::
+
+  ##gff-version 3
+  ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269
+  ##sequence-region AACH01000027 1 1347
+  ##seqid|source|type|start|end|score|strand|phase|attributes
+  AACH01000027 provided_by_user nucleic_acid 1 1347 . + . Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027
+  AACH01000027 getorf ORF 1 1347 . + . Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347
+  AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347
+  AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase family;Target=null 84 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13"
+  ##sequence-region 2
+  ...
+  >pep_AACH01000027_1_1347
+  LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV
+  LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA
+  GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI
+  LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ
+  ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA
+  TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV
+  DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML
+  RSQKAKGVLIYRDDWISITPEIQLLFTEF
+  ...
+  >match$8_84_314
+  KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK
+  RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL
+  LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR
+  AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS
+  
+
+
+----------
+References
+----------
+
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following papers:
+
+Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
+Galaxy tools and workflows for sequence analysis with applications
+in molecular plant pathology. PeerJ 1:e167
+http://dx.doi.org/10.7717/peerj.167
+
+Zdobnov EM, Apweiler R (2001)
+InterProScan an integration platform for the signature-recognition methods in InterPro.
+Bioinformatics 17, 847-848.
+http://dx.doi.org/10.1093/bioinformatics/17.9.847
+
+Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005)
+InterProScan: protein domains identifier.
+Nucleic Acids Research 33 (Web Server issue), W116-W120.
+http://dx.doi.org/10.1093/nar/gki442
+
+Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J, McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ, Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009)
+InterPro: the integrative protein signature database.
+Nucleic Acids Research 37 (Database Issue), D224-228.
+http://dx.doi.org/10.1093/nar/gkn785
+
+
+This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at
+http://toolshed.g2.bx.psu.edu/view/bgruening/interproscan5
+
+
+**Galaxy Wrapper Author**::
+
+    * Fernando Pérez
+    * Ginés Almagro
+    * Laura Entrambasaguas
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/paso4.pl	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,132 @@
+#!/usr/bin/perl -w
+$| = 1;
+
+# Given a GFF3 file that holds the analysis of several sequences, this program
+# writes a text file with the values common to all of those sequences for the
+# attributes:
+    # Name -> database entry from which a given feature was obtained;
+    # Ontology_term -> Gene Ontology entries for a given feature;
+    # Dbxref -> InterPro entry for a given feature.
+#
+# Usage: perl paso4.pl <input_gff3> <output_txt>
+#
+# Processing steps:
+#   1. read the annotation part of the file (everything before "##FASTA"),
+#      taking the sequence IDs from the "##sequence-region" directives and
+#      grouping the feature lines by the sequence ID of their first column;
+#   2. for every sequence, collect the distinct values of each attribute tag;
+#   3. intersect those value lists across all the sequences;
+#   4. write one result line per attribute tag to the output file.
+#
+# Sequence IDs, tags and values are always compared as exact strings. They are
+# never interpolated into a regular expression, so that neither a substring
+# (PF0004 / PF00042) nor a metacharacter ("|", ".") can produce a false match.
+#
+# Output format:
+#   The common atributes of the sequences <ID> <ID> ... are:
+#   For atribute Name : <value> <value> ...
+#   For atribute Ontology_term : <value> <value> ...
+#   For atribute Dbxref : <value> <value> ...
+# "No common atributes" replaces the values of a tag that has none in common.
+
+use strict;
+
+
+# Variable declaration and initialization
+
+my @etiquetas = ("Name","Ontology_term","Dbxref");  #The three attribute tags extracted from the file
+my @ids = ();                                       #Every sequence ID of the file, in order of appearance
+my %atributos_por_id = ();                          #Sequence ID -> attribute columns of its feature lines
+my @valores_rep = ([], [], []);                     #Per tag: values common to all the sequences seen so far
+my $primero = 1;                                    #True until the first sequence has been processed
+
+
+######## Open the file and select its lines #########
+
+my ($fichero_ent, $output) = @ARGV;                 #Input GFF3 file and output text file
+die "Usage: $0 <input_gff3> <output_txt>\n"
+    unless defined $fichero_ent && defined $output;
+
+open(my $gff_fh, '<', $fichero_ent) || die "Failure to open the file \"$fichero_ent\"\n\n";
+
+#Read the annotation part of the file line by line
+while (my $linea = <$gff_fh>) {
+    chomp $linea;
+    if ($linea =~ /^##FASTA/) {                         #The FASTA section is not analysed
+        last;
+    } elsif ($linea =~ /^##sequence-region./) {         #Directive "##sequence-region <ID> ...": keep the ID
+        my @directiva = split(/\s/, $linea, 3);
+        push (@ids, $directiva[1]) if defined $directiva[1];
+    } elsif ($linea =~ /^#+/) {                         #Any other comment or directive is skipped
+        next;
+    } else {
+        #Feature line: column 1 is the sequence ID and column 9 holds the attributes
+        my @columnas = split(/\t/, $linea);
+        next unless defined $columnas[8];               #Incomplete lines carry no attributes
+        push (@{$atributos_por_id{$columnas[0]}}, $columnas[8]);
+    }
+}
+close $gff_fh;
+
+
+###########  Collect, for every sequence ID, the distinct values of each attribute  ##################
+
+foreach my $id (@ids) {
+    my @valores = ([], [], []);                         #Per tag: distinct values of this sequence, in order
+    my @vistos  = ({}, {}, {});                         #Per tag: values already stored for this sequence
+
+    foreach my $atributo (@{$atributos_por_id{$id} || []}) {
+
+        my @pares = split(/;/, $atributo);              #Each element is one "tag=value[,value...]" pair
+
+        for my $cont (0..2) {
+
+            #Only the first pair carrying the tag is used; the tag name must match exactly
+            my ($par) = grep { /^\Q$etiquetas[$cont]\E=/ } @pares;
+            next unless defined $par;                   #The tag is absent from this line
+
+            my (undef, $atrib) = split (/=/, $par, 2);
+            foreach my $valor (split (/,/, $atrib)) {
+                next if $vistos[$cont]{$valor}++;       #Skip values already stored for this sequence
+                push (@{$valores[$cont]}, $valor);
+            }
+        }
+    }
+
+###########  Keep only the values common to all the sequences processed so far  #################
+
+    if ($primero) {                                     #The first sequence contributes all its values:
+        @valores_rep = @valores;                        #the final result is, at most, this set
+        $primero = 0;
+    } else {
+        #Keep the values of this sequence that were also common to all the previous ones
+        for my $cont (0..2) {
+            my %comun = map { $_ => 1 } @{$valores_rep[$cont]};
+            $valores_rep[$cont] = [ grep { $comun{$_} } @{$valores[$cont]} ];
+        }
+    }
+
+}
+
+
+############ Print the results ######################
+
+#A failure to open the output is fatal, so that the exit status reports it
+open(my $out_fh, '>', $output) || die "The file \"$output\" can not be opened\n";
+
+#Header line, then one line per attribute tag
+print $out_fh "The common atributes of the sequences ","@ids"," are:\n";
+for my $cont (0..2) {
+    print $out_fh "For atribute ","$etiquetas[$cont]" ," : ";
+    if (scalar @{$valores_rep[$cont]} == 0) {
+        print $out_fh "No common atributes \n";
+    } else {
+        print $out_fh "@{$valores_rep[$cont]}", "\n";
+    }
+}
+close ($out_fh) || die "The file \"$output\" can not be closed\n";
+exit;
+  
+
+    
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/interpro/paso4.xml	Thu May 22 05:09:07 2014 -0400
@@ -0,0 +1,68 @@
+<tool id="DLaGiFer_3" name="Common attributes selection" version="1.0.0">
+    <description>Extraction of common sequences attributes</description>
+    <command interpreter="perl">
+    paso4.pl "$infile" "$outfile"
+    </command>
+
+         <inputs>
+            <param name="infile" type="data" format="gff" label="GFF3 file"/>
+
+          
+           </inputs>
+	<outputs><data format="txt" name="outfile"/>
+	</outputs>
+
+<stdio><exit_code range="1:" level="fatal" description="Error" /></stdio>
+    <help>
+
+
+**What it does**
+
+This tool obtains certain common attributes of the sequences contained in a gff3 format file.  
+
+
+**Dependencies**
+
+Perl is required for running this tool.
+
+
+
+#####
+Input
+#####
+
+A gff3 file with protein attributes is required.  
+
+Generic Feature Format Version 3 (GFF3)
+
+The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find documentation of all the columns and attributes used at http://www.sequenceontology.org/gff3.shtml.
+
+
+######
+Output
+######
+
+
+A text file is generated by this tool.
+
+Example Output
+--------------
+
+The common atributes of the sequences Q0KIY3 P02185 P02178 Q0KIY5 Q0KIY1 are:
+
+For atribute Name : SSF46458 PF00042 PR00613 G3DSA:1.10.490.10 PS01033
+
+For atribute Ontology_term : "GO:0005506" "GO:0020037" "GO:0015671" "GO:0019825"
+
+For atribute Dbxref : "InterPro:IPR009050" "InterPro:IPR000971" "InterPro:IPR002335" "InterPro:IPR012292"  
+
+
+
+**Galaxy Wrapper Authors**::
+
+    * Laura Entrambasaguas
+    * Ginés Almagro
+    * Fernando Pérez
+
+    </help>
+</tool>