diff alignment/seqfill.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alignment/seqfill.pl	Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,139 @@
+#!/usr/bin/perl
+
+# seqfill.pl
+#
+# Removes every alignment column in which all sequences carry one of the
+# selected "missing data" characters, and optionally rewrites a partition
+# file so that its ranges match the shortened alignment.
+#
+# Positional arguments:
+#   1. alignment file: line 1 is a header, line 2 is
+#      "<number of species> <sequence length>", and every following line
+#      is "<species name><TAB><sequence>"
+#   2. "true" to treat '?' as missing data
+#   3. "true" to treat '-' as missing data
+#   4. "true" to treat 'N' as missing data
+#   5. "true" to also rewrite the partition file
+#   6. partition file with one "<name>=<lower>-<upper>" entry per line
+#
+# Results are written to out.phylipnon and, when requested, partOut.txt.
+
+my $file = $ARGV[0];
+my $q_mark = $ARGV[1];
+my $hyphen = $ARGV[2];
+my $N = $ARGV[3];
+my $usePartFile = $ARGV[4];
+my $partFile = $ARGV[5];
+
+my $out = "out.phylipnon";	# output file
+
+defined($file)
+	or die "Usage: $0 <alignment> <q_mark> <hyphen> <N> <usePartFile> [<partFile>]\n";
+
+# Input line k is split at tabs: its first field is stored in
+# $speciesNames[k] and its second field in $sequenceLines[k].  The two
+# header lines contain no tab, so they end up whole in $speciesNames[0]
+# and $speciesNames[1].
+my @speciesNames;
+my @sequenceLines;
+
+my $i = 0;
+open(my $alignmentFh, '<', $file)
+	or die "Cannot open alignment file '$file': $!\n";
+while (my $currentLine = <$alignmentFh>) {
+	chomp($currentLine);
+	my @currentLineContent = split(/\t/, $currentLine);
+	$speciesNames[$i] = $currentLineContent[0];
+	$sequenceLines[$i] = $currentLineContent[1];
+	$i++;
+}
+close($alignmentFh);
+
+my $dataInfo = $speciesNames[1];	# gets num of species and sequence length
+defined($dataInfo)
+	or die "Alignment file '$file' has no dimensions line (line 2)\n";
+
+# split(' ', ...) skips leading whitespace and collapses runs of blanks, so
+# padded headers such as "  12   3400" are read correctly.
+my @numbers = split(' ', $dataInfo);
+
+my $numberOfSpecies = $numbers[0];
+my $sequenceLength = $numbers[1];
+
+for my $dimension ($numberOfSpecies, $sequenceLength) {
+	(defined($dimension) && $dimension =~ /\A\d+\z/)
+		or die "Line 2 of '$file' must be \"<species> <length>\", got: $dataInfo\n";
+}
+
+# Rows 0 and 1 of @speciesNames/@sequenceLines are the two header lines, so
+# the $numberOfSpecies sequences occupy rows 2 .. $numberOfSpecies+1.
+my $firstSequenceRow = 2;
+my $lastSequenceRow = $numberOfSpecies + 1;
+
+# $columnData[c] collects, top to bottom, the characters that the sequences
+# have at alignment position c.  It has $sequenceLength elements.
+my @columnData = ('') x $sequenceLength;
+for my $row ($firstSequenceRow .. $lastSequenceRow) {
+	my $sequence = defined($sequenceLines[$row]) ? $sequenceLines[$row] : '';
+	for my $column (0 .. $sequenceLength - 1) {
+		# substr yields undef past the end of a too-short sequence
+		my $currChar = substr($sequence, $column, 1);
+		$columnData[$column] .= $currChar if defined($currChar);
+	}
+}
+
+# mark locations that will be removed: $flagMap[c] is 1 when every one of
+# the $numberOfSpecies characters in column c is an enabled missing-data
+# character, and 0 otherwise
+my @flagMap = (0) x $sequenceLength;
+for my $column (0 .. $sequenceLength - 1) {
+	my $el = $columnData[$column];
+	my $tot = 0;
+
+	# tr/X// with an empty replacement counts occurrences without modifying
+	$tot += ($el =~ tr/?//) if $q_mark eq "true";
+	$tot += ($el =~ tr/-//) if $hyphen eq "true";
+	$tot += ($el =~ tr/N//) if $N eq "true";
+
+	$flagMap[$column] = 1 if $tot == $numberOfSpecies;
+}
+
+# number of columns that survive the filtering
+my $newSequenceLength = grep { $_ == 0 } @flagMap;
+
+open(my $outFh, '>', $out)
+	or die "Cannot open output file '$out' for writing: $!\n";
+
+print {$outFh} $speciesNames[0]."\n";
+print {$outFh} $numberOfSpecies." ".$newSequenceLength."\n";
+
+# Write exactly the sequence rows; going one row further would emit a
+# spurious "\t\n" line after the last species.
+for my $row ($firstSequenceRow .. $lastSequenceRow) {
+	my $sequence = defined($sequenceLines[$row]) ? $sequenceLines[$row] : '';
+	my $kept = '';
+	for my $column (0 .. $sequenceLength - 1) {
+		next if $flagMap[$column];
+		my $character = substr($sequence, $column, 1);
+		$kept .= $character if defined($character);
+	}
+	print {$outFh} $speciesNames[$row]."\t".$kept."\n";
+}
+
+# buffered write errors only surface when the handle is closed
+close($outFh)
+	or die "Error while writing '$out': $!\n";
+
+my $partOut = "partOut.txt";
+
+if($usePartFile eq "true") {
+	# update the partition file: each "<name>=<lower>-<upper>" entry is
+	# rewritten as "<name> = <lower> - <upper>" with bounds that refer to
+	# the alignment after the flagged columns have been removed
+	my @names;
+	my @ranges;		# one [lower, upper] pair per partition
+
+	open(my $partFh, '<', $partFile)
+		or die "Cannot open partition file '$partFile': $!\n";
+	while (my $currentLine = <$partFh>) {
+		next unless $currentLine =~ /\S/;	# ignore blank lines
+
+		my ($name, $range) = split(/=/, $currentLine, 2);
+
+		# Extract the two numbers; this also drops the trailing newline and
+		# any padding that would otherwise leak into the output.
+		my ($lower, $upper) = defined($range)
+			? $range =~ /\A\s*(\d+)\s*-\s*(\d+)/
+			: ();
+		unless (defined($upper)) {
+			warn "Skipping unparsable partition line $.: $currentLine";
+			next;
+		}
+
+		$name =~ s/\A\s+//;
+		$name =~ s/\s+\z//;
+
+		push @names, $name;
+		push @ranges, [$lower, $upper];
+	}
+	close($partFh);
+
+	open(my $partOutFh, '>', $partOut)
+		or die "Cannot open partition output '$partOut' for writing: $!\n";
+
+	my $newLower;
+	for my $index (0 .. $#names) {
+		my ($currLower, $currUpper) = @{ $ranges[$index] };
+
+		# the first partition keeps its lower bound; every later one starts
+		# right after the previous partition's new upper bound
+		$newLower = $currLower unless defined($newLower);
+
+		# An upper bound is an absolute position, so it shifts left by the
+		# number of removed columns among the first $currUpper columns of
+		# the alignment, not just those inside this partition.  The count
+		# therefore always starts at column 0.
+		# NOTE(review): assumes 1-based inclusive bounds -- confirm against
+		# the partition files this is run on.
+		my $removedBefore = 0;
+		for my $position (0 .. $currUpper - 1) {
+			$removedBefore++ if $flagMap[$position];
+		}
+		my $newUpper = $currUpper - $removedBefore;
+
+		# A partition whose columns were all removed comes out as an empty
+		# range (upper bound one less than lower bound).
+		print {$partOutFh} $names[$index]." = ".$newLower." - ".$newUpper."\n";
+		$newLower = $newUpper + 1;
+	}
+
+	# buffered write errors only surface when the handle is closed
+	close($partOutFh)
+		or die "Error while writing '$partOut': $!\n";
+}