changeset 3:70df762b48a8 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
author earlhaminst
date Tue, 03 Oct 2017 04:51:45 -0400
parents 376ed15e0d27
children 363f3480622d
files blast_parser.pl blast_parser.py blast_parser.xml test-data/output.tabular test-data/output2.tabular
diffstat 5 files changed, 133 insertions(+), 66 deletions(-) [+]
line wrap: on
line diff
--- a/blast_parser.pl	Fri Mar 24 12:14:47 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use warnings;
-use List::Util qw(min max);
-
-# A simple Perl parser to convert a BLAST 12-column or 24-column output into a
-# 3-column input for hcluster_hg (id1, id2, weight):
-# parse_blast.pl <file>
-
-use constant LOG_E_10 => log(10);
-
-my $file1 = $ARGV[0];
-open my $fh1, '<', $file1;
-
-while (my $line = <$fh1>) {
-    my @row = split(/\t/, $line);
-
-    if ($row[0] eq $row[1]) {
-        # ignore self matching hits
-    } else {
-        # Convert evalue to an integer weight with max 100
-        my $weight = 100;
-
-        #if the evalue is 0, leave weight at 100
-        if ($row[10] != 0 && $row[10] != 0.0) {
-            $weight = min(100, positive_round(-1 * log10($row[10])));
-        }
-        print"$row[0]\t$row[1]\t$weight\n";
-    }
-}
-close $fh1;
-
-# Calculate logarithm to base 10 of a number
-sub log10 {
-    my $n = shift;
-    return log($n) / LOG_E_10;
-}
-
-# Round a positive float to the nearest integer
-sub positive_round{
-    my $n = shift;
-    return int($n + 0.5);
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_parser.py	Tue Oct 03 04:51:45 2017 -0400
@@ -0,0 +1,54 @@
+"""
+Simple parser to convert a BLAST 12-column or 24-column tabular output into a
+3-column tabular input for hcluster_sg (id1, id2, weight).
+"""
+import argparse
+import math
+from collections import OrderedDict
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file')
+
+    parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file')
+
+    parser.add_argument('-r', action='store_true', default=False,
+                        dest='reciprocal',
+                        help='Annotate homolog pair')
+
+    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
+
+    options = parser.parse_args()
+
+    results = OrderedDict()
+
+    for line in options.i:
+        line = line.rstrip()
+        line_cols = line.split('\t')
+        sequence1_id = line_cols[0]
+        sequence2_id = line_cols[1]
+        evalue = float(line_cols[10])
+
+        # Ignore self-matching hits
+        if sequence1_id != sequence2_id:
+            # Convert evalue to an integer weight with max 100
+            weight = 100
+
+            # If the evalue is 0, leave weight at 100
+            if evalue != 0.0:
+                weight = min(100, round(math.log10(evalue) / -2.0))
+
+            if (sequence1_id, sequence2_id) not in results:
+                results[(sequence1_id, sequence2_id)] = weight
+            else:
+                results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight)
+
+    for (sequence1_id, sequence2_id), weight in results.items():
+        if not options.reciprocal or (sequence2_id, sequence1_id) in results:
+            options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight))
+
+
+if __name__ == "__main__":
+    main()
--- a/blast_parser.xml	Fri Mar 24 12:14:47 2017 -0400
+++ b/blast_parser.xml	Tue Oct 03 04:51:45 2017 -0400
@@ -1,18 +1,22 @@
-<tool id="blast_parser" name="BLAST parser" version="0.1.1">
+<tool id="blast_parser" name="BLAST parser" version="0.1.2">
     <description>
         Convert 12- or 24-column BLAST output into 3-column hcluster_sg input
     </description>
 
     <command detect_errors="exit_code">
 <![CDATA[
-perl '$__tool_directory__/blast_parser.pl'
-'$input'
-> '$output'
+python '$__tool_directory__/blast_parser.py'
+-i '$input'
+-o '$output'
+#if $reciprocal
+    -r
+#end if
 ]]>
     </command>
 
     <inputs>
         <param name="input" type="data" format="tabular" label="Tabular data" help="BLAST 12 column tabular format data"/>
+        <param name="reciprocal" type="boolean" checked="false" label="Reciprocal results" help="returns only reciprocal results"/>
     </inputs>
 
     <outputs>
@@ -22,12 +26,18 @@
     <tests>
         <test>
             <param name="input" ftype="tabular" value="input.tabular" />
+            <param name="reciprocal" value="false" />
             <output name="output" file="output.tabular" />
         </test>
+        <test>
+            <param name="input" ftype="tabular" value="input.tabular" />
+            <param name="reciprocal" value="true" />
+            <output name="output" file="output2.tabular" />
+        </test>
     </tests>
     <help>
 <![CDATA[
-Simple tool to convert a 12- or 24-column BLAST output into a 3-column format (qseqid, sseqid, round(-1 * log10(evalue))) usable as input for the hcluster_sg tool.
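+Simple tool to convert a 12- or 24-column BLAST output into a 3-column format (qseqid, sseqid, weight) usable as input for the hcluster_sg tool. The weight is round(-1 * log10(evalue)/2), capped at 100; an e-value of 0 gives a weight of 100. Self-matching hits are ignored and, when the same (qseqid, sseqid) pair appears more than once, only its highest weight is reported. If "Reciprocal results" is enabled, a pair is reported only when the reverse pair (sseqid, qseqid) is also present in the input.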
 ]]>
     </help>
     <citations>
--- a/test-data/output.tabular	Fri Mar 24 12:14:47 2017 -0400
+++ b/test-data/output.tabular	Tue Oct 03 04:51:45 2017 -0400
@@ -4,24 +4,22 @@
 ENSPCAT00000008534_procaviacapensis_1	ENST00000378069_homosapiens_1	100
 ENSLOCT00000017020_lepisosteusoculatus_1	ENSLACT00000026689_latimeriachalumnae_1	100
 ENSCPOT00000000986_caviaporcellus_1	ENSCAFT00000022963_canisfamiliaris_1	100
-ENSTGUT00000016508_taeniopygiaguttata_1	ENSTGUT00000006603_taeniopygiaguttata_1	100
+ENSTGUT00000016508_taeniopygiaguttata_1	ENSTGUT00000006603_taeniopygiaguttata_1	79
 ENSPFOT00000010657_poeciliaformosa_1	ENSXMAT00000001796_xiphophorusmaculatus_1	100
 ENSDNOT00000016434_dasypusnovemcinctus_1	ENSDNOT00000036768_dasypusnovemcinctus_1	100
-ENSPMAT00000010398_petromyzonmarinus_1	ENSLACT00000015911_latimeriachalumnae_1	64
+ENSPMAT00000010398_petromyzonmarinus_1	ENSLACT00000015911_latimeriachalumnae_1	32
 ENSAMET00000018099_ailuropodamelanoleuca_1	ENSCAFT00000022939_canisfamiliaris_1	100
 ENSEEUT00000005606_erinaceuseuropaeus_1	ENSMPUT00000012759_mustelaputoriusfuro_1	100
 ENSSHAT00000006757_sarcophilusharrisii_1	ENSMODT00000026841_monodelphisdomestica_1	100
 ENSPSIT00000017454_pelodiscussinensis_1	ENSPSIT00000017443_pelodiscussinensis_1	100
 ENSPFOT00000022544_poeciliaformosa_1	ENSXMAT00000001796_xiphophorusmaculatus_1	100
 ENSMICT00000002052_microcebusmurinus_1	ENSCAFT00000022963_canisfamiliaris_1	100
-ENSMICT00000002052_microcebusmurinus_1	ENSCAFT00000022963_canisfamiliaris_1	46
 ENSRNOT00000066674_rattusnorvegicus_1	ENSMUST00000026013_musmusculus_1	100
 ENSFCAT00000013090_feliscatus_1	ENSAMET00000018029_ailuropodamelanoleuca_1	100
 ENSONIT00000020514_oreochromisniloticus_1	ENSPFOT00000009022_poeciliaformosa_1	100
 ENSLACT00000026572_latimeriachalumnae_1	ENSLACT00000015911_latimeriachalumnae_1	100
-ENSPMAT00000003449_petromyzonmarinus_1	ENSGGOT00000000206_gorillagorilla_1	100
+ENSPMAT00000003449_petromyzonmarinus_1	ENSGGOT00000000206_gorillagorilla_1	75
 ENSSART00000014230_sorexaraneus_1	ENSSTOT00000004965_ictidomystridecemlineatus_1	100
-ENSSART00000014230_sorexaraneus_1	ENSSTOT00000004965_ictidomystridecemlineatus_1	44
 ENSBTAT00000001698_bostaurus_1	ENSCAFT00000022963_canisfamiliaris_1	100
 ENSTBET00000006983_tupaiabelangeri_1	ENSAMET00000018099_ailuropodamelanoleuca_1	100
 ENSLACT00000014274_latimeriachalumnae_1	ENSLACT00000026689_latimeriachalumnae_1	100
@@ -38,8 +36,7 @@
 ENSXMAT00000001796_xiphophorusmaculatus_1	ENSONIT00000016435_oreochromisniloticus_1	100
 ENSSSCT00000013404_susscrofa_1	ENSPPYT00000023637_pongoabelii_1	100
 ENSGALT00000036672_gallusgallus_1	ENSMGAT00000016429_meleagrisgallopavo_1	100
-ENSOPRT00000017156_ochotonaprinceps_1	ENSOCUT00000001438_oryctolaguscuniculus_1	100
-ENSOPRT00000017156_ochotonaprinceps_1	ENSOCUT00000001438_oryctolaguscuniculus_1	17
+ENSOPRT00000017156_ochotonaprinceps_1	ENSOCUT00000001438_oryctolaguscuniculus_1	70
 ENSSTOT00000004988_ictidomystridecemlineatus_1	ENSPANT00000027606_papioanubis_1	100
 ENSECAT00000024641_equuscaballus_1	ENSCAFT00000022939_canisfamiliaris_1	100
 ENSAPLT00000013855_anasplatyrhynchos_1	ENSMGAT00000016431_meleagrisgallopavo_1	100
@@ -48,7 +45,7 @@
 ENSPTRT00000040521_pantroglodytes_1	ENSGGOT00000000206_gorillagorilla_1	100
 ENSPTRT00000040520_pantroglodytes_1	ENSPPYT00000023637_pongoabelii_1	100
 ENSMEUT00000003745_macropuseugenii_1	ENSMODT00000026841_monodelphisdomestica_1	100
-ENSMICT00000002042_microcebusmurinus_1	ENSDNOT00000039756_dasypusnovemcinctus_1	100
+ENSMICT00000002042_microcebusmurinus_1	ENSDNOT00000039756_dasypusnovemcinctus_1	85
 ENSXETT00000010517_xenopustropicalis_1	ENSXETT00000010521_xenopustropicalis_1	100
 ENSMODT00000026840_monodelphisdomestica_1	ENSCAFT00000022963_canisfamiliaris_1	100
 ENSMLUT00000001428_myotislucifugus_1	ENSAMET00000018099_ailuropodamelanoleuca_1	100
@@ -59,8 +56,8 @@
 ENSTRUT00000035430_takifugurubripes_1	ENSTNIT00000014720_tetraodonnigroviridis_1	100
 ENSMEUT00000006183_macropuseugenii_1	ENSMODT00000026840_monodelphisdomestica_1	100
 ENSFALT00000001591_ficedulaalbicollis_1	ENSTGUT00000006603_taeniopygiaguttata_1	100
-ENSMUST00000168613_musmusculus_1	ENSMUST00000040820_musmusculus_1	100
-ENSSSCT00000032764_susscrofa_1	ENSSSCT00000013404_susscrofa_1	100
+ENSMUST00000168613_musmusculus_1	ENSMUST00000040820_musmusculus_1	76
+ENSSSCT00000032764_susscrofa_1	ENSSSCT00000013404_susscrofa_1	53
 ENSGALT00000026158_gallusgallus_1	ENSMGAT00000016431_meleagrisgallopavo_1	100
 ENSDART00000160057_daniorerio_1	ENSDART00000132084_daniorerio_1	100
 ENSMPUT00000012759_mustelaputoriusfuro_1	ENSCAFT00000022939_canisfamiliaris_1	100
@@ -92,11 +89,10 @@
 ENSSTOT00000004965_ictidomystridecemlineatus_1	ENSECAT00000024641_equuscaballus_1	100
 ENSMLUT00000001440_myotislucifugus_1	ENSPANT00000027606_papioanubis_1	100
 ENSORLT00000017214_oryziaslatipes_1	ENSPFOT00000009022_poeciliaformosa_1	100
-ENSMUST00000163344_musmusculus_1	ENSMUST00000168613_musmusculus_1	63
+ENSMUST00000163344_musmusculus_1	ENSMUST00000168613_musmusculus_1	31
 ENSACAT00000017993_anoliscarolinensis_1	ENSPSIT00000017443_pelodiscussinensis_1	100
 ENSCJAT00000021080_callithrixjacchus_1	ENSCJAT00000058575_callithrixjacchus_1	100
-ENSOPRT00000000678_ochotonaprinceps_1	ENSOGAT00000030491_otolemurgarnettii_1	100
-ENSOPRT00000000678_ochotonaprinceps_1	ENSOGAT00000030491_otolemurgarnettii_1	19
+ENSOPRT00000000678_ochotonaprinceps_1	ENSOGAT00000030491_otolemurgarnettii_1	71
 ENSMMUT00000027384_macacamulatta_1	ENSPANT00000027701_papioanubis_1	100
 ENSMMUT00000027387_macacamulatta_1	ENSPANT00000027631_papioanubis_1	100
 ENSLOCT00000019886_lepisosteusoculatus_1	ENSDART00000160057_daniorerio_1	100
@@ -118,7 +114,7 @@
 ENSTTRT00000009129_tursiopstruncatus_1	ENSCAFT00000022963_canisfamiliaris_1	100
 ENSCAFT00000022963_canisfamiliaris_1	ENSAMET00000018029_ailuropodamelanoleuca_1	100
 ENSGGOT00000008973_gorillagorilla_1	ENSPPYT00000023637_pongoabelii_1	100
-ENSOGAT00000031973_otolemurgarnettii_1	ENSOGAT00000005620_otolemurgarnettii_1	100
+ENSOGAT00000031973_otolemurgarnettii_1	ENSOGAT00000005620_otolemurgarnettii_1	77
 ENSGACT00000024065_gasterosteusaculeatus_1	ENSGACT00000024064_gasterosteusaculeatus_1	100
 ENSGACT00000024064_gasterosteusaculeatus_1	ENSGACT00000024065_gasterosteusaculeatus_1	100
 ENSAMET00000018029_ailuropodamelanoleuca_1	ENSCAFT00000022963_canisfamiliaris_1	100
@@ -142,19 +138,19 @@
 ENSLAFT00000027936_loxodontaafricana_1	ENSLAFT00000015029_loxodontaafricana_1	100
 ENSPSIT00000016442_pelodiscussinensis_1	ENSAPLT00000013117_anasplatyrhynchos_1	100
 ENSOART00000003319_ovisaries_1	ENSBTAT00000021570_bostaurus_1	100
-ENSMMUT00000046681_macacamulatta_1	ENSPANT00000027701_papioanubis_1	28
-ENSMMUT00000046680_macacamulatta_1	ENSCSAT00000012035_chlorocebussabaeus_1	44
+ENSMMUT00000046681_macacamulatta_1	ENSPANT00000027701_papioanubis_1	14
+ENSMMUT00000046680_macacamulatta_1	ENSCSAT00000012035_chlorocebussabaeus_1	22
 ENSBTAT00000021570_bostaurus_1	ENSOART00000003319_ovisaries_1	100
 ENST00000378069_homosapiens_1	ENSGGOT00000000206_gorillagorilla_1	100
 ENSPANT00000027606_papioanubis_1	ENSPANT00000027631_papioanubis_1	100
 ENSLAFT00000000504_loxodontaafricana_1	ENSECAT00000024641_equuscaballus_1	100
 ENSPCAT00000006605_procaviacapensis_1	ENSLAFT00000000504_loxodontaafricana_1	100
 ENSLOCT00000002323_lepisosteusoculatus_1	ENSDART00000028225_daniorerio_1	100
-ENSMUST00000173143_musmusculus_1	ENSMUST00000163344_musmusculus_1	5
+ENSMUST00000173143_musmusculus_1	ENSMUST00000163344_musmusculus_1	3
 ENSRNOT00000044009_rattusnorvegicus_1	ENSMUST00000040820_musmusculus_1	100
 ENSMODT00000026841_monodelphisdomestica_1	ENSCAFT00000022939_canisfamiliaris_1	100
 ENSLACT00000015911_latimeriachalumnae_1	ENSLACT00000026572_latimeriachalumnae_1	100
-ENSSSCT00000035258_susscrofa_1	ENSSSCT00000013404_susscrofa_1	97
+ENSSSCT00000035258_susscrofa_1	ENSSSCT00000013404_susscrofa_1	48
 ENSTRUT00000011582_takifugurubripes_1	ENSTRUT00000011581_takifugurubripes_1	100
 ENSPSIT00000017443_pelodiscussinensis_1	ENSPSIT00000017454_pelodiscussinensis_1	100
 ENSTRUT00000011580_takifugurubripes_1	ENSTRUT00000011581_takifugurubripes_1	100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output2.tabular	Tue Oct 03 04:51:45 2017 -0400
@@ -0,0 +1,50 @@
+ENSFCAT00000013089_feliscatus_1	ENSCAFT00000022939_canisfamiliaris_1	100
+ENSCAFT00000022939_canisfamiliaris_1	ENSFCAT00000013089_feliscatus_1	100
+ENSAMXT00000002585_astyanaxmexicanus_1	ENSDART00000028225_daniorerio_1	100
+ENSPSIT00000017454_pelodiscussinensis_1	ENSPSIT00000017443_pelodiscussinensis_1	100
+ENSRNOT00000066674_rattusnorvegicus_1	ENSMUST00000026013_musmusculus_1	100
+ENSLACT00000026572_latimeriachalumnae_1	ENSLACT00000015911_latimeriachalumnae_1	100
+ENSTGUT00000006603_taeniopygiaguttata_1	ENSFALT00000001591_ficedulaalbicollis_1	100
+ENSFALT00000001560_ficedulaalbicollis_1	ENSTGUT00000006498_taeniopygiaguttata_1	100
+ENSXMAT00000001796_xiphophorusmaculatus_1	ENSONIT00000016435_oreochromisniloticus_1	100
+ENSGALT00000036672_gallusgallus_1	ENSMGAT00000016429_meleagrisgallopavo_1	100
+ENSAPLT00000013855_anasplatyrhynchos_1	ENSMGAT00000016431_meleagrisgallopavo_1	100
+ENSDART00000132084_daniorerio_1	ENSDART00000160057_daniorerio_1	100
+ENSPTRT00000040521_pantroglodytes_1	ENSGGOT00000000206_gorillagorilla_1	100
+ENSXETT00000010517_xenopustropicalis_1	ENSXETT00000010521_xenopustropicalis_1	100
+ENSTRUT00000035430_takifugurubripes_1	ENSTNIT00000014720_tetraodonnigroviridis_1	100
+ENSFALT00000001591_ficedulaalbicollis_1	ENSTGUT00000006603_taeniopygiaguttata_1	100
+ENSDART00000160057_daniorerio_1	ENSDART00000132084_daniorerio_1	100
+ENSLACT00000014695_latimeriachalumnae_1	ENSLACT00000026689_latimeriachalumnae_1	100
+ENSDART00000028225_daniorerio_1	ENSAMXT00000002585_astyanaxmexicanus_1	100
+ENSPANT00000027631_papioanubis_1	ENSMMUT00000027387_macacamulatta_1	100
+ENSMUST00000026013_musmusculus_1	ENSRNOT00000066674_rattusnorvegicus_1	100
+ENSONIT00000016435_oreochromisniloticus_1	ENSXMAT00000001796_xiphophorusmaculatus_1	100
+ENSTNIT00000014720_tetraodonnigroviridis_1	ENSTRUT00000035430_takifugurubripes_1	100
+ENSCJAT00000058575_callithrixjacchus_1	ENSCJAT00000021080_callithrixjacchus_1	100
+ENSCJAT00000021080_callithrixjacchus_1	ENSCJAT00000058575_callithrixjacchus_1	100
+ENSMMUT00000027387_macacamulatta_1	ENSPANT00000027631_papioanubis_1	100
+ENSMGAT00000016429_meleagrisgallopavo_1	ENSGALT00000036672_gallusgallus_1	100
+ENSSSCT00000023183_susscrofa_1	ENSSSCT00000033745_susscrofa_1	100
+ENSGGOT00000000206_gorillagorilla_1	ENSPTRT00000040521_pantroglodytes_1	100
+ENSXETT00000010521_xenopustropicalis_1	ENSXETT00000010517_xenopustropicalis_1	100
+ENSCAFT00000022963_canisfamiliaris_1	ENSAMET00000018029_ailuropodamelanoleuca_1	100
+ENSGACT00000024065_gasterosteusaculeatus_1	ENSGACT00000024064_gasterosteusaculeatus_1	100
+ENSGACT00000024064_gasterosteusaculeatus_1	ENSGACT00000024065_gasterosteusaculeatus_1	100
+ENSAMET00000018029_ailuropodamelanoleuca_1	ENSCAFT00000022963_canisfamiliaris_1	100
+ENSSSCT00000033745_susscrofa_1	ENSSSCT00000023183_susscrofa_1	100
+ENSMGAT00000016431_meleagrisgallopavo_1	ENSAPLT00000013855_anasplatyrhynchos_1	100
+ENSPPYT00000023640_pongoabelii_1	ENSPPYT00000023641_pongoabelii_1	100
+ENSPPYT00000023641_pongoabelii_1	ENSPPYT00000023640_pongoabelii_1	100
+ENSTGUT00000006498_taeniopygiaguttata_1	ENSFALT00000001560_ficedulaalbicollis_1	100
+ENSLACT00000026689_latimeriachalumnae_1	ENSLACT00000014695_latimeriachalumnae_1	100
+ENSPANT00000027701_papioanubis_1	ENSPPYT00000023637_pongoabelii_1	100
+ENSPPYT00000023637_pongoabelii_1	ENSPANT00000027701_papioanubis_1	100
+ENSMUST00000040820_musmusculus_1	ENSRNOT00000044009_rattusnorvegicus_1	100
+ENSOART00000003319_ovisaries_1	ENSBTAT00000021570_bostaurus_1	100
+ENSBTAT00000021570_bostaurus_1	ENSOART00000003319_ovisaries_1	100
+ENSRNOT00000044009_rattusnorvegicus_1	ENSMUST00000040820_musmusculus_1	100
+ENSLACT00000015911_latimeriachalumnae_1	ENSLACT00000026572_latimeriachalumnae_1	100
+ENSTRUT00000011582_takifugurubripes_1	ENSTRUT00000011581_takifugurubripes_1	100
+ENSPSIT00000017443_pelodiscussinensis_1	ENSPSIT00000017454_pelodiscussinensis_1	100
+ENSTRUT00000011581_takifugurubripes_1	ENSTRUT00000011582_takifugurubripes_1	100