changeset 2:cabe6c55780a draft

Uploaded
author jnavarro
date Thu, 26 Oct 2017 07:37:17 -0400
parents 9e0251d2ecb5
children 0354759bf0ce
files CreateMatrixMultiple.pl
diffstat 1 files changed, 146 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CreateMatrixMultiple.pl	Thu Oct 26 07:37:17 2017 -0400
@@ -0,0 +1,146 @@
+#!/usr/bin/perl
+#V1.0.0
+use strict;
+use warnings;
+use Getopt::Long;
+
+
+my $input_matrix_files;
+GetOptions (
+"input_matrix_files=s" => \$input_matrix_files
+) or die("Error in command line arguments\n");
+
+my @files = split(/,/,$input_matrix_files);
+my @tbl_hash;
+my %global_hash;
+my @tbl_genotype_names;
+my %chromosome_hash;
+my %result_by_chr;
+my @tbl_chr_name;
+
+for (my $i=0;$i<=$#files;$i++){
+	my $current_file = $files[$i];
+	my $current_genotype = "NA";
+	my %current_hash;
+	
+	open (CF,$current_file) or die ("Can't open file $current_file\n");
+	my $header = <CF>;
+	if ($header =~ /Chrom\s*Pos\s*Ref\s*(.*?)\s*$/){
+		$current_genotype = $1;
+	}
+	else {
+		print STDERR "Unable to recognize header in matrix file\n$header\n";
+		exit(0);
+	}
+	while (my $line=<CF>){
+		if ($line!~/^\s*$/){
+			my @fields = split (/\t+/,$line);
+			my $chr;
+			my $pos;
+			my $ref;
+			my $variant;
+			
+			if ($fields[0]=~/^\s*([\w\-]+)\s*$/){
+				$chr = $1;
+			}
+			else {
+				print STDERR "Unable to detect chromosome in matrix file\n$line\n";
+				exit(0);
+			}
+			if ($fields[1]=~/^\s*(\d+)\s*$/){
+				$pos = $1;
+			}
+			else {
+				print STDERR "Unable to detect position in matrix file\n$line\n";
+				exit(0);
+			}
+			
+			if ($fields[2]=~/^\s*([ATGCNX])\s*$/i){
+				$ref = $1;
+			}
+			else {
+				print STDERR "Unable to detect reference base in matrix file\n$line\n";
+				exit(0);
+			}
+			
+			if ($fields[3]=~/^\s*([\w\/]+)\s*$/i){
+				$variant = $1;
+			}
+			else {
+				print STDERR "Unable to detect variant in matrix file\n$line\n";
+				exit(0);
+			}
+			$current_hash{"$chr#$pos"} = $variant;
+			$global_hash{"$chr#$pos"} = $ref;
+		}
+	}
+	close CF;
+	push(@tbl_genotype_names,$current_genotype);
+	push(@tbl_hash,\%current_hash);
+}
+
+print "Chrom\tPos\tRef";
+for (my $i=0;$i<=$#tbl_genotype_names;$i++){
+	print "\t".$tbl_genotype_names[$i];
+}
+print "\n";
+
+my @tbl_line_to_display;
+
+#exit(0);
+
+foreach my $key (keys %global_hash){
+	my @tbl = split (/\#/,$key);
+	my $chr = $tbl[0];
+	my $pos = $tbl[1];
+	my $ref = $global_hash{$key};
+	my $line = "$chr\t$pos\t$ref";
+	my $isvariant = 0;
+	for (my $i=0;$i<=$#tbl_hash;$i++){
+		#my %current_hash = %{$tbl_hash[$i]};
+		if ($tbl_hash[$i]->{$key}){
+			$line.="\t".$tbl_hash[$i]->{$key};
+			if ($tbl_hash[$i]->{$key} ne $ref){
+				$isvariant = 1;
+			}
+		}
+		else {
+			$line.="\t"."NA";
+		}
+		
+	}
+	
+	$line .="\n";
+	if ($isvariant == 1){
+		push(@tbl_line_to_display,$line);
+	}
+}
+
+
+for (my $i=0;$i<=$#tbl_line_to_display;$i++){
+	my @tbl = split (/\s+/,$tbl_line_to_display[$i]);
+	my $current_chr = $tbl[0];
+	my @current_tbl;
+	if ($result_by_chr{$current_chr}){
+		push (@{$result_by_chr{$current_chr}},$tbl_line_to_display[$i]);
+	}
+	else {
+		push (@current_tbl,$tbl_line_to_display[$i]);
+		$result_by_chr{$current_chr} = \@current_tbl;
+	}
+}
+
+foreach my $key (sort keys %result_by_chr){
+	my @current_tbl = sort mysort @{$result_by_chr{$key}};
+	for (my $i=0;$i<=$#current_tbl;$i++){
+		print $current_tbl[$i];
+	}
+}
+
+sub mysort {
+	my @tbla = split (/\s+/,$a);
+	my @tblb = split (/\s+/,$b);
+	$tbla[0] cmp $tblb[0] || $tbla[1]<=>$tblb[1];
+}
+
+