diff external_tools/darwin/lib/hh/scripts/create_profile_from_hhm.pl @ 6:2277dd59b9f9 draft

Uploaded
author hammock
date Wed, 01 Nov 2017 05:54:28 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/external_tools/darwin/lib/hh/scripts/create_profile_from_hhm.pl	Wed Nov 01 05:54:28 2017 -0400
@@ -0,0 +1,167 @@
+#!/usr/bin/env perl
+# 
+# create_profile_from_hhm.pl 
+# Create a profile (.prf) from a given HHM file
+
+#     HHsuite version 2.0.16 (January 2013)
+#
+#     Reference: 
+#     Remmert M., Biegert A., Hauser A., and Soding J.
+#     HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
+#     Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
+
+#     (C) Michael Remmert and Johannes Soeding, 2012
+
+#     This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#     the Free Software Foundation, either version 3 of the License, or
+#     (at your option) any later version.
+
+#     This program is distributed in the hope that it will be useful,
+#     but WITHOUT ANY WARRANTY; without even the implied warranty of
+#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#     GNU General Public License for more details.
+
+#     You should have received a copy of the GNU General Public License
+#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#     We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
+
+use lib $ENV{"HHLIB"}."/scripts";
+use HHPaths;   # config file with path variables for nr, blast, psipred, pdb, dssp etc.
+use strict;
+
+$|= 1; # Activate autoflushing on STDOUT
+
+# Default values:
+our $v=2;              # verbose mode
+
+my $help="
+create_profile_from_hhm.pl from HHsuite $VERSION  
+Create a profile (.prf) from a given HHM file
+
+Usage: perl create_profile_from_hhm.pl -i <infile> [-o <outfile>]
+
+Options:
+  -i <infile>   Input file in HHM format
+  -o <outfile>  Output file in prf-format (default: infile.prf)
+
+  -v [0-5]      verbose mode (default: $v)
+\n";
+
+# Variable declarations
+my $line;
+my $infile;
+my $outfile;
+my $i;
+my $a;
+
+my @counts;     # count profile
+my @neffs;
+my $name;       # name of HHM profile
+my $len;        # length of HHM profile
+
+                 #   A  C  D  E   F  G  H  I   K   L   M  N   P  Q  R   S   T   V   W   Y   
+my @hhmaa2csaa = ( 0, 4, 3, 6, 13, 7, 8, 9, 11, 10, 12, 2, 14, 5, 1, 15, 16, 19, 17, 18);
+my @aminoacids   = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V');
+my %aa2i;
+for ($a = 0; $a < 20; $a++) {
+    $aa2i{$aminoacids[$a]} = $a;
+}
+
+###############################################################################################
+# Processing command line input
+###############################################################################################
+
+if (@ARGV<1) {die ($help);}
+
+my $options="";
+for (my $i=0; $i<@ARGV; $i++) {$options.=" $ARGV[$i] ";}
+
+if ($options=~s/ -i\s+(\S+) //) {$infile=$1;}
+if ($options=~s/ -o\s+(\S+) //) {$outfile=$1;}
+
+if ($options=~s/ -v\s+(\S+) //) {$v=$1;}
+
+if (!$infile) {print($help); print "ERROR! No input file!\n"; exit(1);}
+
+if (!$outfile) {
+    $infile =~ /^(\S+)\.\S+?$/;
+    $outfile = "$1.prf";
+}
+
+##############################################################################################
+# Main part
+##############################################################################################
+
+######################################
+# Read HHM profile
+######################################
+open (IN, $infile);
+while ($line = <IN>) {
+    if ($line =~ /^NAME\s+(\S+)/) {
+	$name = $1;
+    } elsif ($line =~ /^LENG\s+(\d+)/) {
+	$len = $1;
+    } elsif ($line =~ /^HMM/) {
+	last;
+    }
+}
+
+$i = 0;
+while ($line = <IN>) {
+    if ($line =~ /^\/\//) { last; }
+
+    if ($line =~ s/^\S \d+ //) {
+
+	for ($a = 0; $a < 20; $a++) {
+	    $line =~ s/^\s*(\S+)\s/ /;
+	    $counts[$i][$hhmaa2csaa[$a]] = $1;
+	    if ($counts[$i][$hhmaa2csaa[$a]] !~ /\*/ && $counts[$i][$hhmaa2csaa[$a]] == 0) { 
+		$counts[$i][$hhmaa2csaa[$a]] = 1; 
+	    }
+	}
+
+	$line = <IN>;
+	$line =~ /^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+/;
+	$neffs[$i] = $1;
+
+	$i++;
+    }
+}
+
+######################################
+# write count_profile
+######################################
+
+open (OUT, ">$outfile");
+# Write header
+printf(OUT "CountProfile\n");
+printf(OUT "NAME\t%s\n", $name);
+printf(OUT "LENG\t%i\n", $len);
+printf(OUT "ALPH\t20\n");              # 20 amino acid alphabet
+printf(OUT "COUNTS");
+for ($a = 0; $a < 20; $a++) {
+    printf(OUT "\t%s", $aminoacids[$a]);
+}
+printf(OUT "\tNEFF\n");
+
+# Write profile
+for ($i = 0; $i < $len; $i++) {
+    printf(OUT "%i", $i+1);
+    for ($a = 0; $a < 20; $a++) {
+	if ($counts[$i][$a] == '*') {
+	    printf(OUT "\t*");
+	} else {
+	    printf(OUT "\t%i", $counts[$i][$a]);
+	}
+    }
+    printf(OUT "\t%i\n", $neffs[$i]);
+}
+
+printf(OUT "//\n");
+close OUT;
+
+exit;
+
+