view CreateMatrixMultiple.pl @ 2:cabe6c55780a draft

Uploaded
author jnavarro
date Thu, 26 Oct 2017 07:37:17 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
#V1.0.0
use strict;
use warnings;
use Getopt::Long;


my $input_matrix_files;
GetOptions (
"input_matrix_files=s" => \$input_matrix_files
) or die("Error in command line arguments\n");

my @files = split(/,/,$input_matrix_files);
my @tbl_hash;
my %global_hash;
my @tbl_genotype_names;
my %chromosome_hash;
my %result_by_chr;
my @tbl_chr_name;

for (my $i=0;$i<=$#files;$i++){
	my $current_file = $files[$i];
	my $current_genotype = "NA";
	my %current_hash;
	
	open (CF,$current_file) or die ("Can't open file $current_file\n");
	my $header = <CF>;
	if ($header =~ /Chrom\s*Pos\s*Ref\s*(.*?)\s*$/){
		$current_genotype = $1;
	}
	else {
		print STDERR "Unable to recognize header in matrix file\n$header\n";
		exit(0);
	}
	while (my $line=<CF>){
		if ($line!~/^\s*$/){
			my @fields = split (/\t+/,$line);
			my $chr;
			my $pos;
			my $ref;
			my $variant;
			
			if ($fields[0]=~/^\s*([\w\-]+)\s*$/){
				$chr = $1;
			}
			else {
				print STDERR "Unable to detect chromosome in matrix file\n$line\n";
				exit(0);
			}
			if ($fields[1]=~/^\s*(\d+)\s*$/){
				$pos = $1;
			}
			else {
				print STDERR "Unable to detect position in matrix file\n$line\n";
				exit(0);
			}
			
			if ($fields[2]=~/^\s*([ATGCNX])\s*$/i){
				$ref = $1;
			}
			else {
				print STDERR "Unable to detect reference base in matrix file\n$line\n";
				exit(0);
			}
			
			if ($fields[3]=~/^\s*([\w\/]+)\s*$/i){
				$variant = $1;
			}
			else {
				print STDERR "Unable to detect variant in matrix file\n$line\n";
				exit(0);
			}
			$current_hash{"$chr#$pos"} = $variant;
			$global_hash{"$chr#$pos"} = $ref;
		}
	}
	close CF;
	push(@tbl_genotype_names,$current_genotype);
	push(@tbl_hash,\%current_hash);
}

print "Chrom\tPos\tRef";
for (my $i=0;$i<=$#tbl_genotype_names;$i++){
	print "\t".$tbl_genotype_names[$i];
}
print "\n";

my @tbl_line_to_display;

#exit(0);

foreach my $key (keys %global_hash){
	my @tbl = split (/\#/,$key);
	my $chr = $tbl[0];
	my $pos = $tbl[1];
	my $ref = $global_hash{$key};
	my $line = "$chr\t$pos\t$ref";
	my $isvariant = 0;
	for (my $i=0;$i<=$#tbl_hash;$i++){
		#my %current_hash = %{$tbl_hash[$i]};
		if ($tbl_hash[$i]->{$key}){
			$line.="\t".$tbl_hash[$i]->{$key};
			if ($tbl_hash[$i]->{$key} ne $ref){
				$isvariant = 1;
			}
		}
		else {
			$line.="\t"."NA";
		}
		
	}
	
	$line .="\n";
	if ($isvariant == 1){
		push(@tbl_line_to_display,$line);
	}
}


for (my $i=0;$i<=$#tbl_line_to_display;$i++){
	my @tbl = split (/\s+/,$tbl_line_to_display[$i]);
	my $current_chr = $tbl[0];
	my @current_tbl;
	if ($result_by_chr{$current_chr}){
		push (@{$result_by_chr{$current_chr}},$tbl_line_to_display[$i]);
	}
	else {
		push (@current_tbl,$tbl_line_to_display[$i]);
		$result_by_chr{$current_chr} = \@current_tbl;
	}
}

foreach my $key (sort keys %result_by_chr){
	my @current_tbl = sort mysort @{$result_by_chr{$key}};
	for (my $i=0;$i<=$#current_tbl;$i++){
		print $current_tbl[$i];
	}
}

sub mysort {
	my @tbla = split (/\s+/,$a);
	my @tblb = split (/\s+/,$b);
	$tbla[0] cmp $tblb[0] || $tbla[1]<=>$tblb[1];
}