Mercurial > repos > fernando > protein_funcional_analysis_similarities
changeset 0:c342ebb50f0b draft default tip
Uploaded
author | fernando |
---|---|
date | Thu, 22 May 2014 05:09:07 -0400 |
parents | |
children | |
files | interpro/Galaxy-Workflow-Protein_Funcional_Analysis_Similarities.ga interpro/INSTALL.txt interpro/paso1.pl interpro/paso1.xml interpro/paso2.sh interpro/paso2.xml interpro/paso3.sh interpro/paso3.xml interpro/paso4.pl interpro/paso4.xml |
diffstat | 10 files changed, 783 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/Galaxy-Workflow-Protein_Funcional_Analysis_Similarities.ga Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,119 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "Search common attributes in protein sequences", + "format-version": "0.1", + "name": "Protein Funcional Analysis Similarities", + "steps": { + "0": { + "annotation": "", + "id": 0, + "input_connections": {}, + "inputs": [], + "name": "RemoteBlast", + "outputs": [ + { + "name": "outfile", + "type": "txt" + } + ], + "position": { + "left": 200, + "top": 205 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "ALaGiFer_1", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"infile\": \"null\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "1": { + "annotation": "", + "id": 1, + "input_connections": { + "infile": { + "id": 0, + "output_name": "outfile" + } + }, + "inputs": [], + "name": "Sequences assortment", + "outputs": [ + { + "name": "outfile", + "type": "fasta" + } + ], + "position": { + "left": 308, + "top": 339 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "BLaGiFer_2", + "tool_state": "{\"__page__\": 0, \"numsequences\": \"\\\"5\\\"\", \"__rerun_remap_job_id__\": null, \"infile\": \"null\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "2": { + "annotation": "", + "id": 2, + "input_connections": { + "infile": { + "id": 1, + "output_name": "outfile" + } + }, + "inputs": [], + "name": "Sequences attributes", + "outputs": [ + { + "name": "outfile", + "type": "gff" + } + ], + "position": { + "left": 385, + "top": 463 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "CLaGiFer_3", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"infile\": \"null\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + }, + "3": { + "annotation": "", + "id": 3, + "input_connections": { + "infile": { + 
"id": 2, + "output_name": "outfile" + } + }, + "inputs": [], + "name": "Common attributes selection", + "outputs": [ + { + "name": "outfile", + "type": "txt" + } + ], + "position": { + "left": 441, + "top": 589 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "DLaGiFer_3", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"infile\": \"null\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [] + } + } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/INSTALL.txt Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,55 @@ +DEPENDENCIES + +The wrapper needs BioPerl and InterProScan, and it runs on a Linux system. + +To install BioPerl: +http://www.bioperl.org/wiki/Installing_BioPerl + +To install InterProScan: +http://code.google.com/p/interproscan/wiki/HowToDownload + + + +## INSTALLATION WRAPPER INSTRUCTIONS + +# Stop your local Galaxy + +# Change to your_galaxy_path/tools + > cd your_galaxy_path/tools + +# Uncompress the tar.gz file into the "tools" directory. + > tar -xvzf interpro.tar.gz + +# Edit the "paso3.sh" file and change the InterProScan path to your local InterProScan path. + +# Go to the root Galaxy directory, edit "tool_conf.xml" and insert this code block at the place where you would like to see the tool on the toolbar: + + <section id="LaGiFer" name="Protein Functional Analysis Similarities"> + <tool file="interpro/paso1.xml" /> + <tool file="interpro/paso2.xml" /> + <tool file="interpro/paso3.xml" /> + <tool file="interpro/paso4.xml" /> + </section> + + +# Run your Galaxy + > ./run.sh + +## INSTALLATION WORKFLOW INSTRUCTIONS + +# To install the workflow that runs these tools, press "Upload or import workflow" in the workflow menu, then select the .ga file with the "browse" (examinar) button and import it. + +Select "Configure your workflow menu", click on the "Show in menu" item and press "send". + + + +Thank you very much for using our "Protein Functional Analysis Similarities" wrapper. + +For additional details or to report errors, please contact us. + + +gines.a@hotmail.com +fernando.perez8@um.es +lentram@um.es + +
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/paso1.pl Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,77 @@
#
# paso1.pl - submit every sequence of a FASTA file to a remote BLAST search
# and store the returned reports in a single text file.
#
# Usage: paso1.pl <input.fasta> <output.txt>
#
# The search runs 'blastp' against 'swissprot' with an expect threshold of
# 1e-10 (see @params below). The script exits with a non-zero status when the
# arguments are missing, the output cannot be written, or any BLAST request
# is reported as failed by the remote service.

use strict;
use warnings;

use Bio::SeqIO;
use Bio::Tools::Run::RemoteBlast;
use File::Temp qw(tempfile);

my ($input, $output) = @ARGV;
die "Usage: $0 <input.fasta> <output.txt>\n"
    unless defined $input && defined $output;

# Remote-blast "factory object" creation and blast-parameter initialization.
my $prog  = 'blastp';
my $db    = 'swissprot';
my $e_val = '1e-10';

my @params = (
    '-prog'       => $prog,
    '-data'       => $db,
    '-expect'     => $e_val,
    '-readmethod' => 'SearchIO',
);

my $factory = Bio::Tools::Run::RemoteBlast->new(@params);

# CGI parameters of the request can be changed or removed through the
# module's HEADER hash, for example:
#   $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]';
#   $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'}  = 'BLOSUM25';
#   delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};

# $verbose only turns the progress messages on STDERR on and off.
my $verbose = 1;

my $seq_in = Bio::SeqIO->new(-file => $input, '-format' => 'fasta');

# The output is opened (and truncated) once; every report is appended to it.
# save_output() rewrites its target file on each call, so saving straight
# into $output would keep only the report of the last query sequence.
open(my $out, '>', $output)
    or die "Cannot open \"$output\" for writing: $!\n";

my $failed = 0;    # number of requests the remote service reported as failed

while (my $seq = $seq_in->next_seq()) {

    # Blast one sequence against the database.
    $factory->submit_blast($seq);

    print STDERR "waiting..." if $verbose > 0;

    # Poll until every pending request id has been answered or has failed.
    while (my @rids = $factory->each_rid) {
        for my $rid (@rids) {
            my $rc = $factory->retrieve_blast($rid);

            if (!ref $rc) {
                # Not a result object: negative means the request failed,
                # otherwise the result is not ready yet.
                if ($rc < 0) {
                    $factory->remove_rid($rid);
                    $failed++;
                }
                print STDERR "." if $verbose > 0;
                sleep 5;
            }
            else {
                # Save this report to a scratch file, then append it.
                my ($tmp_fh, $tmp_name) = tempfile(UNLINK => 1);
                close $tmp_fh;
                $factory->save_output($tmp_name);

                open(my $report, '<', $tmp_name)
                    or die "Cannot read BLAST report \"$tmp_name\": $!\n";
                while (my $line = <$report>) {
                    print {$out} $line;
                }
                close $report;
                unlink $tmp_name;

                $factory->remove_rid($rid);
            }
        }
    }
}

close $out or die "Cannot close \"$output\": $!\n";

# Make failures visible to the caller through the exit status.
die "$failed BLAST request(s) failed\n" if $failed;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/paso1.xml Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,107 @@ +<tool id="ALaGiFer_1" name="RemoteBlast" version="1.0.0"> + <description>Blast against UniProt/SwissProt database</description> + <command interpreter="perl"> + paso1.pl $infile $outfile + </command> + + <inputs> + <param name="infile" type="data" format="fasta" label="Sequence in Fasta File"/> + + + </inputs> +<outputs> +<data format="txt" name="outfile"/> +</outputs> + +<stdio><exit_code range="1:" level="fatal" description="Error" /></stdio> + <help> + +**What it does** + +This tool runs a remote blast against Uniprot/Swissprot database. + + +**Dependencies** + +Bioperl is necessary for running this tool. + +##### +Input +##### + +Required is a FASTA file containing protein sequences. + + +###### +Output +###### + +Generates an output text file. + + +Example Output +-------------- +BLASTP 2.2.29+ +Reference: Stephen F. Altschul, Thomas L. Madden, Alejandro +A. Schaffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and +David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new +generation of protein database search programs", Nucleic +Acids Res. 25:3389-3402. + + +RID: M2E3HVH4014 + + +Database: Non-redundant UniProtKB/SwissProt sequences + 457,803 sequences; 170,956,615 total letters +Query= C08B11.6 CE30856 WBGene00007434 actin status:Confirmed +UniProt:Q09443 protein_id:CAA86667.3 + +Length=418 + + + Score E +Sequences producing significant alignments: (Bits) Value + +sp|Q09443.3|ARP6_CAEEL RecName: Full=Actin-like protein C08B11.6 853 0.0 +sp|Q9D864.2|ARP6_MOUSE RecName: Full=Actin-related protein 6;... 280 2e-88 +sp|Q9GZN1.1|ARP6_HUMAN RecName: Full=Actin-related protein 6;... 279 5e-88 +sp|Q9DEE9.1|ARP6_CHICK RecName: Full=Actin-related protein 6;... 274 5e-86 +sp|P45890.1|ARP6_DROME RecName: Full=Actin-related protein 6;... 
265 1e-82 +sp|Q6C982.1|ARP6_YARLI RecName: Full=Actin-like protein ARP6 232 4e-70 +sp|O94241.1|ARP6_SCHPO RecName: Full=Actin-like protein arp6 228 2e-68 +sp|Q7S6X6.1|ARP6_NEUCR RecName: Full=Actin-related protein 6 212 6e-62 +sp|Q5NBI2.1|ARP6_ORYSJ RecName: Full=Actin-related protein 6 ... 210 3e-61 + +ALIGNMENTS +>sp|Q09443.3|ARP6_CAEEL RecName: Full=Actin-like protein C08B11.6 +Length=418 + + Score = 853 bits (2204), Expect = 0.0 + Identities = 418/418 (100%), Positives = 418/418 (100%), Gaps = 0/418 (0%) + +Query 1 MSLTTIIFDNGGHNMKIGTIDSESPRLVPNSIVKAKHEKKRVFVAHEQEECSDKFSLFYV 60 + MSLTTIIFDNGGHNMKIGTIDSESPRLVPNSIVKAKHEKKRVFVAHEQEECSDKFSLFYV +Sbjct 1 MSLTTIIFDNGGHNMKIGTIDSESPRLVPNSIVKAKHEKKRVFVAHEQEECSDKFSLFYV 60 + +Query 61 RPIERGYVVNWDTQQQIWEKTFGSMDVEASTSRIALTDNNYLIPALPDVSSEILFDYFGF 120 + RPIERGYVVNWDTQQQIWEKTFGSMDVEASTSRIALTDNNYLIPALPDVSSEILFDYFGF +Sbjct 61 RPIERGYVVNWDTQQQIWEKTFGSMDVEASTSRIALTDNNYLIPALPDVSSEILFDYFGF 120 + +Query 121 TEVHKTSASTLVAKHSNKINNEKCAVVVDSGFSWTTVASFVNGMLIQDSVIRIDVGGKAL 180 + TEVHKTSASTLVAKHSNKINNEKCAVVVDSGFSWTTVASFVNGMLIQDSVIRIDVGGKAL +Sbjct 121 TEVHKTSASTLVAKHSNKINNEKCAVVVDSGFSWTTVASFVNGMLIQDSVIRIDVGGKAL 180 + +Query 181 TNKLKDWVSYRQLNVSEETYVINECKEDLCFVSQNFDESMKEARNRFQENTTMKRYIMPD 240 + TNKLKDWVSYRQLNVSEETYVINECKEDLCFVSQNFDESMKEARNRFQENTTMKRYIMPD +Sbjct 181 TNKLKDWVSYRQLNVSEETYVINECKEDLCFVSQNFDESMKEARNRFQENTTMKRYIMPD 240 + + +**Galaxy Wrapper Authors**:: + + * Laura Entrambasaguas + * Ginés Almagro + * Fernando Pérez + + </help> +</tool>
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/paso2.sh Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,15 @@
#!/bin/bash
# -*- ENCODING: UTF-8 -*-
#
# paso2.sh - download the FASTA entries of the first hits of a BLAST report.
#
# Usage: paso2.sh <blast_report.txt> <output.fasta> <num_sequences>
#
#   $1  text report containing hit lines such as "sp|Q09443.3|ARP6_CAEEL ..."
#   $2  FASTA file to create; it is truncated first, then one entry is
#       appended per selected hit
#   $3  number of "sp|" lines (from the top of the report) to use
#
# Exits with status 1 on bad arguments or when a download fails.

input=$1
output=$2
count=$3

if [ "$#" -lt 3 ]; then
    echo "Usage: $0 <blast_report.txt> <output.fasta> <num_sequences>" >&2
    exit 1
fi

# The count goes straight to head(1); accept plain digits only.
case "$count" in
    ''|*[!0-9]*)
        echo "Number of sequences must be a non-negative integer: '$count'" >&2
        exit 1
        ;;
esac

# Scratch files live outside the working directory and are always removed.
id_list=$(mktemp) || exit 1
download=$(mktemp) || exit 1
trap 'rm -f "$id_list" "$download"' EXIT

# Take field 2 of each "sp|ACCESSION.VERSION|NAME" line, drop the version
# suffix and append ".fasta" to obtain the file name to request.
grep -F 'sp|' "$input" \
    | head -n "$count" \
    | awk -F'|' '{ split($2, parts, "."); print parts[1] ".fasta" }' > "$id_list"

: > "$output"

while IFS= read -r entry
do
    # Call wget directly: the report content is never evaluated by a shell.
    if ! wget -q -O "$download" "http://www.uniprot.org/uniprot/$entry"; then
        echo "Failed to download $entry" >&2
        exit 1
    fi
    cat "$download" >> "$output"
done < "$id_list"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/paso2.xml Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,88 @@ +<tool id="BLaGiFer_2" name="Sequences assortment" version="1.0.0"> + <description> Select the most similar sequences</description> + <command interpreter="bash"> + ./paso2.sh "$infile" "$outfile" "$numsequences" + </command> + + <inputs> + <param name="infile" type="data" format="txt" label="ids file"/> + + <param name="numsequences" type="select" label="Select number of sequences" help=""> + <option value="1">1</option> + <option value="2">2</option> + <option value="3">3</option> + <option value="4">4</option> + <option value="5" selected="true">5</option> + </param> + </inputs> +<outputs> +<data format="fasta" name="outfile"/> +</outputs> + +<stdio><exit_code range="1:" level="fatal" description="Error" /></stdio> + <help> + +**What it does** + +From the output text file generated in RemoteBlast wrapper, the user may choose from the five first sequences. The result is stored in a fasta file. + +**Dependencies** + +This tool only runs UNIX commands, awk is necessary to be installed. + + +##### +Input +##### + +The input text file generated by RemoteBlast is necessary. + +###### + +The Blast results are saved on an output text file. 
+ + + +Example Output +-------------- + +>sp|Q09443|ARP6_CAEEL Actin-like protein C08B11.6 OS=Caenorhabditis elegans GN=arp-6 PE=2 SV=3 +MSLTTIIFDNGGHNMKIGTIDSESPRLVPNSIVKAKHEKKRVFVAHEQEECSDKFSLFYV +RPIERGYVVNWDTQQQIWEKTFGSMDVEASTSRIALTDNNYLIPALPDVSSEILFDYFGF +TEVHKTSASTLVAKHSNKINNEKCAVVVDSGFSWTTVASFVNGMLIQDSVIRIDVGGKAL +TNKLKDWVSYRQLNVSEETYVINECKEDLCFVSQNFDESMKEARNRFQENTTMKRYIMPD +FHSTFRGVVKDVKEPHDPQIPSIHLGVERFAIPEILFNPSDIDIDQCGVAEAVIESICQC +PEALRPALAENIIVIGGSSCFPGFRERLEREVRSMLPAEYGLNVSNDVINPQTHSWHCGQ +ELLTASKVPWINRKDWDERGDSLEFSNFFQTLVQSDELKGTRNFDDQREKSPKEDEDF +>sp|Q9D864|ARP6_MOUSE Actin-related protein 6 OS=Mus musculus GN=Actr6 PE=1 SV=2 +MTTLVLDNGAYNAKIGYSHDSVSVIPNCQFRSKTARLKTFTANQIDEIKDPSGLFYILPF +QKGYLVNWDVQRQVWDYLFGKEMYQVDFLDTNIIITEPYFNFTSIQESMNEILFEEYQFQ +AVLRVNAGALSAHRYFRDNPSELCCIIVDSGYSFTHIVPYCRSKKKKEAIIRINVGGKLL +TNHLKEIISYRQLHVMDETHVINQVKEDVCYVSQDFYRDMDIAKLKGEDNTVMIDYVLPD +FSTIKKGFCKPREEMVLSGKYKSGEQILRLANERFAVPEILFNPSDIGIQEMGIPEAIVY +SIQNLPEEMQPHFFKNIVLTGGNSLFPGFRERVYSEVRCLTPTDYDVSVVLPENPITYSW +EGGKLISENDDFEDMVVTREDYEENGHSVCEEKFDI +>sp|Q9GZN1|ARP6_HUMAN Actin-related protein 6 OS=Homo sapiens GN=ACTR6 PE=1 SV=1 +MTTLVLDNGAYNAKIGYSHENVSVIPNCQFRSKTARLKTFTANQIDEIKDPSGLFYILPF +QKGYLVNWDVQRQVWDYLFGKEMYQVDFLDTNIIITEPYFNFTSIQESMNEILFEEYQFQ +AVLRVNAGALSAHRYFRDNPSELCCIIVDSGYSFTHIVPYCRSKKKKEAIIRINVGGKLL +TNHLKEIISYRQLHVMDETHVINQVKEDVCYVSQDFYRDMDIAKLKGEENTVMIDYVLPD +FSTIKKGFCKPREEMVLSGKYKSGEQILRLANERFAVPEILFNPSDIGIQEMGIPEAIVY +SIQNLPEEMQPHFFKNIVLTGGNSLFPGFRDRVYSEVRCLTPTDYDVSVVLPENPITYAW +EGGKLISENDDFEDMVVTREDYEENGHSVCEEKFDI +>sp|Q9DEE9|ARP6_CHICK Actin-related protein 6 OS=Gallus gallus GN=ACTR6 PE=2 SV=1 +MATLVLDNGAYNAKIGYSHAHVSVIPNCQFRSKTARLKTFTANQLDEIKDPSGLFYILPF +QKGYLVNWDVQRQVWDYLFGKEMYQVDFVDTNIIITEPYFNFSSIQESMNEILFEEYQFQ +AVLRVNAGALSAHRYFRDNPSELCCIIVDSGYSFTHIVPYCRSKKKKEAIIRINVGGKLL +TNHLKEIISYRQLHVMDETHVINQVKEDVCYVSQDFYKDMEIAKLKGEENTVMVDYVLPD +FSTIKKGFCKPREEMVLSGKYKTGEQILRLTNERFAVPEILFHPSDIGIQEMGIPEAIVD 
+SIQNLPEEMQPHFFKNIVLTGGNTLFPGFRDRVYSEVRCLTPTDYDVSVVLPENPITYSW +EGGKLISENDDFEDLVVTREDYEEHGHNICEEKFDI + +**Galaxy Wrapper Authors**:: + + * Ginés Almagro + * Laura Entrambasaguas + * Fernando Pérez + </help> +</tool>
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/paso3.sh Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,6 @@
#!/bin/bash
# -*- ENCODING: UTF-8 -*-
#
# paso3.sh - run InterProScan on a FASTA file and write the result as GFF3.
#
# Usage: paso3.sh <input.fasta> <output.gff3>
#
# The launcher script is taken from the INTERPROSCAN_SH environment variable
# when it is set; otherwise the original default location under the user's
# home directory is used. Either set the variable or edit the default below
# to match the local installation.

input=$1
output=$2

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <input.fasta> <output.gff3>" >&2
    exit 1
fi

interproscan=${INTERPROSCAN_SH:-$HOME/bioapps/interproscan/interproscan-5.3-46.0/interproscan.sh}

if [ ! -x "$interproscan" ]; then
    echo "InterProScan launcher not found or not executable: $interproscan" >&2
    exit 1
fi

# exec hands the launcher's exit status straight back to the caller.
exec "$interproscan" -i "$input" -o "$output" -f gff3 --iprlookup --goterms -pa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/paso3.xml Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,116 @@ +<tool id="CLaGiFer_3" name="Sequences attributes" version="1.0.0"> + <description>Download gff file from InterPro</description> + <command interpreter="bash"> + ./paso3.sh "$infile" "$outfile" + </command> + + <inputs> + <param name="infile" type="data" format="fasta" label="Fasta file"/> + </inputs> +<outputs> +<data format="gff" name="outfile"/> +</outputs> + +<stdio><exit_code range="1:" level="fatal" description="Error" /></stdio> + <help> + + +**What it does** + +Interproscan is a batch tool to query the Interpro database. It provides annotations based on multiple searches of profile and other functional databases. + + +**Dependencies** + +InterProscan package is required to be installed (http://code.google.com/p/interproscan/wiki/HowToDownload). + + + +##### +Input +##### + +A FASTA file containing protein sequences is required. + + +###### +Output +###### + +Generic Feature Format Version 3 (GFF3) + +The GFF3 format is a flat tab-delimited file, which is much richer then the TSV output format. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find a documentation of all the columns and attributes used on [http://www.sequenceontology.org/gff3.shtml]. + +Example Output +-------------- + +:: + + ##gff-version 3 + ##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269 + ##sequence-region AACH01000027 1 1347 + ##seqid|source|type|start|end|score|strand|phase|attributes + AACH01000027 provided_by_user nucleic_acid 1 1347 . + . Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027 + AACH01000027 getorf ORF 1 1347 . + . 
Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347 + AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347 + AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase family;Target=null 84 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13" + ##sequence-region 2 + ... + >pep_AACH01000027_1_1347 + LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV + LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA + GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI + LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ + ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA + TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV + DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML + RSQKAKGVLIYRDDWISITPEIQLLFTEF + ... + >match$8_84_314 + KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK + RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL + LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR + AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS + + + +---------- +References +---------- + + +If you use this Galaxy tool in work leading to a scientific publication please +cite the following papers: + +Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). +Galaxy tools and workflows for sequence analysis with applications +in molecular plant pathology. PeerJ 1:e167 +http://dx.doi.org/10.7717/peerj.167 + +Zdobnov EM, Apweiler R (2001) +InterProScan an integration platform for the signature-recognition methods in InterPro. +Bioinformatics 17, 847-848. +http://dx.doi.org/10.1093/bioinformatics/17.9.847 + +Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005) +InterProScan: protein domains identifier. 
+Nucleic Acids Research 33 (Web Server issue), W116-W120. +http://dx.doi.org/10.1093/nar/gki442 + +Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J, McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ, Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009) +InterPro: the integrative protein signature database. +Nucleic Acids Research 37 (Database Issue), D224-228. +http://dx.doi.org/10.1093/nar/gkn785 + + +This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at +http://toolshed.g2.bx.psu.edu/view/bgruening/interproscan5 + + +**Galaxy Wrapper Author**:: + + * Fernando Pérez + * Ginés Almagro + * Laura Entrambasaguas + </help> +</tool>
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/paso4.pl Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,132 @@
#!/usr/bin/perl
#
# paso4.pl - given a GFF3 file holding the analysis of several sequences,
# write a text file with the values that are common to ALL of those
# sequences for the attributes:
#   Name          -> database entry a given feature was obtained from;
#   Ontology_term -> Gene Ontology entries of a given feature;
#   Dbxref        -> cross references (e.g. InterPro entry) of a given feature.
#
# Usage: paso4.pl <input.gff3> <output.txt>
#
# Sequence IDs are read from the "##sequence-region" directives; feature
# lines are assigned to a sequence by exact match of their first column.
# Everything from the "##FASTA" directive onwards is ignored.
# The script dies (non-zero exit status) when the arguments are missing or
# when the input or output file cannot be opened.

use strict;
use warnings;

$| = 1;

# The three attribute tags extracted from column 9, in report order.
my @labels = ("Name", "Ontology_term", "Dbxref");

my ($gff_path, $out_path) = @ARGV;
die "Usage: $0 <input.gff3> <output.txt>\n"
    unless defined $gff_path && defined $out_path;

######## Open the file and select lines #########

my @seq_ids;        # every sequence ID, in order of appearance
my %attr_cols_of;   # sequence ID => list of column-9 strings of its features

open(my $gff, '<', $gff_path)
    or die "Failure to open the file \"$gff_path\"\n\n";

while (my $line = <$gff>) {
    chomp $line;

    last if $line =~ /^##FASTA/;            # drop the FASTA sequence section

    if ($line =~ /^##sequence-region\s+(\S+)/) {
        push @seq_ids, $1;                  # second field of the directive
        next;
    }

    next if $line =~ /^#/;                  # other comments and directives

    my @cols = split /\t/, $line;
    next unless defined $cols[8];           # not a full nine-column feature

    push @{ $attr_cols_of{ $cols[0] } }, $cols[8];
}
close $gff;

######## Collect the values of every sequence and intersect them #########

# $common[$i] holds the values of $labels[$i] shared by all sequences seen
# so far. Starting from empty lists keeps the report valid even when the
# file declares no sequence at all.
my @common = map { [] } @labels;
my $first  = 1;

for my $id (@seq_ids) {

    # Distinct values of each label for this sequence, in first-seen order.
    my @found = map { [] } @labels;
    my @seen  = map { {} } @labels;

    for my $attr_col (@{ $attr_cols_of{$id} || [] }) {

        # Column 9 is "tag=value;tag=value;..."; keep the first value of
        # each tag.
        my %value_of;
        for my $pair (split /;/, $attr_col) {
            my ($tag, $value) = split /=/, $pair, 2;
            next unless defined $value;
            $value_of{$tag} = $value unless exists $value_of{$tag};
        }

        for my $i (0 .. $#labels) {
            my $raw = $value_of{ $labels[$i] };
            next unless defined $raw;       # tag absent on this feature

            # A tag may carry several comma-separated values. Compare them
            # as exact strings: they contain '.', '|' and similar characters
            # that must not be read as regex syntax.
            for my $value (split /,/, $raw) {
                push @{ $found[$i] }, $value unless $seen[$i]{$value}++;
            }
        }
    }

    if ($first) {
        # The first sequence contributes all of its values; the final
        # answer can be at most this set.
        @common = @found;
        $first  = 0;
        next;
    }

    for my $i (0 .. $#labels) {
        my %is_common = map { $_ => 1 } @{ $common[$i] };
        $common[$i] = [ grep { $is_common{$_} } @{ $found[$i] } ];
    }
}

############ Print the results ######################

open(my $out, '>', $out_path)
    or die "The file \"$out_path\" can not be opened\n";

print {$out} "The common atributes of the sequences ", "@seq_ids", " are:\n";
for my $i (0 .. $#labels) {
    print {$out} "For atribute ", "$labels[$i]", " : ";
    if (@{ $common[$i] } == 0) {
        print {$out} "No common atributes \n";
    }
    else {
        print {$out} "@{ $common[$i] }", "\n";
    }
}

close $out or die "Cannot close \"$out_path\": $!\n";
exit;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interpro/paso4.xml Thu May 22 05:09:07 2014 -0400 @@ -0,0 +1,68 @@ +<tool id="DLaGiFer_3" name="Common attributes selection" version="1.0.0"> + <description>Extraction of common sequence attributes</description> + <command interpreter="perl"> + paso4.pl "$infile" "$outfile" + </command> + + <inputs> + <param name="infile" type="data" format="gff" label="Sequence attributes in GFF3 file"/> + + + </inputs> + <outputs><data format="txt" name="outfile"/> + </outputs> + +<stdio><exit_code range="1:" level="fatal" description="Error" /></stdio> + <help> + + +**What it does** + +This tool obtains certain common attributes of the sequences contained in a gff3 format file. + + +**Dependencies** + +Perl is required for running this tool. + + + +##### +Input +##### + +A gff3 file with protein attributes is required. + +Generic Feature Format Version 3 (GFF3) + +The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output format. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find documentation of all the columns and attributes used at [http://www.sequenceontology.org/gff3.shtml]. + + +###### +Output +###### + + +A text file is generated by this tool. + +Example Output +-------------- + +The common atributes of the sequences Q0KIY3 P02185 P02178 Q0KIY5 Q0KIY1 are: + +For atribute Name : SSF46458 PF00042 PR00613 G3DSA:1.10.490.10 PS01033 + +For atribute Ontology_term : "GO:0005506" "GO:0020037" "GO:0015671" "GO:0019825" + +For atribute Dbxref : "InterPro:IPR009050" "InterPro:IPR000971" "InterPro:IPR002335" "InterPro:IPR012292" + + + +**Galaxy Wrapper Authors**:: + + * Laura Entrambasaguas + * Ginés Almagro + * Fernando Pérez + + </help> +</tool>