| 
2
 | 
     1 #!/usr/bin/perl
 | 
| 
 | 
     2 #V1.0.0
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 use Getopt::Long;
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 my $input_matrix_files;
 | 
| 
 | 
     9 GetOptions (
 | 
| 
 | 
    10 "input_matrix_files=s" => \$input_matrix_files
 | 
| 
 | 
    11 ) or die("Error in command line arguments\n");
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 my @files = split(/,/,$input_matrix_files);
 | 
| 
 | 
    14 my @tbl_hash;
 | 
| 
 | 
    15 my %global_hash;
 | 
| 
 | 
    16 my @tbl_genotype_names;
 | 
| 
 | 
    17 my %chromosome_hash;
 | 
| 
 | 
    18 my %result_by_chr;
 | 
| 
 | 
    19 my @tbl_chr_name;
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 for (my $i=0;$i<=$#files;$i++){
 | 
| 
 | 
    22 	my $current_file = $files[$i];
 | 
| 
 | 
    23 	my $current_genotype = "NA";
 | 
| 
 | 
    24 	my %current_hash;
 | 
| 
 | 
    25 	
 | 
| 
 | 
    26 	open (CF,$current_file) or die ("Can't open file $current_file\n");
 | 
| 
 | 
    27 	my $header = <CF>;
 | 
| 
 | 
    28 	if ($header =~ /Chrom\s*Pos\s*Ref\s*(.*?)\s*$/){
 | 
| 
 | 
    29 		$current_genotype = $1;
 | 
| 
 | 
    30 	}
 | 
| 
 | 
    31 	else {
 | 
| 
 | 
    32 		print STDERR "Unable to recognize header in matrix file\n$header\n";
 | 
| 
 | 
    33 		exit(0);
 | 
| 
 | 
    34 	}
 | 
| 
 | 
    35 	while (my $line=<CF>){
 | 
| 
 | 
    36 		if ($line!~/^\s*$/){
 | 
| 
 | 
    37 			my @fields = split (/\t+/,$line);
 | 
| 
 | 
    38 			my $chr;
 | 
| 
 | 
    39 			my $pos;
 | 
| 
 | 
    40 			my $ref;
 | 
| 
 | 
    41 			my $variant;
 | 
| 
 | 
    42 			
 | 
| 
 | 
    43 			if ($fields[0]=~/^\s*([\w\-]+)\s*$/){
 | 
| 
 | 
    44 				$chr = $1;
 | 
| 
 | 
    45 			}
 | 
| 
 | 
    46 			else {
 | 
| 
 | 
    47 				print STDERR "Unable to detect chromosome in matrix file\n$line\n";
 | 
| 
 | 
    48 				exit(0);
 | 
| 
 | 
    49 			}
 | 
| 
 | 
    50 			if ($fields[1]=~/^\s*(\d+)\s*$/){
 | 
| 
 | 
    51 				$pos = $1;
 | 
| 
 | 
    52 			}
 | 
| 
 | 
    53 			else {
 | 
| 
 | 
    54 				print STDERR "Unable to detect position in matrix file\n$line\n";
 | 
| 
 | 
    55 				exit(0);
 | 
| 
 | 
    56 			}
 | 
| 
 | 
    57 			
 | 
| 
 | 
    58 			if ($fields[2]=~/^\s*([ATGCNX])\s*$/i){
 | 
| 
 | 
    59 				$ref = $1;
 | 
| 
 | 
    60 			}
 | 
| 
 | 
    61 			else {
 | 
| 
 | 
    62 				print STDERR "Unable to detect reference base in matrix file\n$line\n";
 | 
| 
 | 
    63 				exit(0);
 | 
| 
 | 
    64 			}
 | 
| 
 | 
    65 			
 | 
| 
 | 
    66 			if ($fields[3]=~/^\s*([\w\/]+)\s*$/i){
 | 
| 
 | 
    67 				$variant = $1;
 | 
| 
 | 
    68 			}
 | 
| 
 | 
    69 			else {
 | 
| 
 | 
    70 				print STDERR "Unable to detect variant in matrix file\n$line\n";
 | 
| 
 | 
    71 				exit(0);
 | 
| 
 | 
    72 			}
 | 
| 
 | 
    73 			$current_hash{"$chr#$pos"} = $variant;
 | 
| 
 | 
    74 			$global_hash{"$chr#$pos"} = $ref;
 | 
| 
 | 
    75 		}
 | 
| 
 | 
    76 	}
 | 
| 
 | 
    77 	close CF;
 | 
| 
 | 
    78 	push(@tbl_genotype_names,$current_genotype);
 | 
| 
 | 
    79 	push(@tbl_hash,\%current_hash);
 | 
| 
 | 
    80 }
 | 
| 
 | 
    81 
 | 
| 
 | 
    82 print "Chrom\tPos\tRef";
 | 
| 
 | 
    83 for (my $i=0;$i<=$#tbl_genotype_names;$i++){
 | 
| 
 | 
    84 	print "\t".$tbl_genotype_names[$i];
 | 
| 
 | 
    85 }
 | 
| 
 | 
    86 print "\n";
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 my @tbl_line_to_display;
 | 
| 
 | 
    89 
 | 
| 
 | 
    90 #exit(0);
 | 
| 
 | 
    91 
 | 
| 
 | 
    92 foreach my $key (keys %global_hash){
 | 
| 
 | 
    93 	my @tbl = split (/\#/,$key);
 | 
| 
 | 
    94 	my $chr = $tbl[0];
 | 
| 
 | 
    95 	my $pos = $tbl[1];
 | 
| 
 | 
    96 	my $ref = $global_hash{$key};
 | 
| 
 | 
    97 	my $line = "$chr\t$pos\t$ref";
 | 
| 
 | 
    98 	my $isvariant = 0;
 | 
| 
 | 
    99 	for (my $i=0;$i<=$#tbl_hash;$i++){
 | 
| 
 | 
   100 		#my %current_hash = %{$tbl_hash[$i]};
 | 
| 
 | 
   101 		if ($tbl_hash[$i]->{$key}){
 | 
| 
 | 
   102 			$line.="\t".$tbl_hash[$i]->{$key};
 | 
| 
 | 
   103 			if ($tbl_hash[$i]->{$key} ne $ref){
 | 
| 
 | 
   104 				$isvariant = 1;
 | 
| 
 | 
   105 			}
 | 
| 
 | 
   106 		}
 | 
| 
 | 
   107 		else {
 | 
| 
 | 
   108 			$line.="\t"."NA";
 | 
| 
 | 
   109 		}
 | 
| 
 | 
   110 		
 | 
| 
 | 
   111 	}
 | 
| 
 | 
   112 	
 | 
| 
 | 
   113 	$line .="\n";
 | 
| 
 | 
   114 	if ($isvariant == 1){
 | 
| 
 | 
   115 		push(@tbl_line_to_display,$line);
 | 
| 
 | 
   116 	}
 | 
| 
 | 
   117 }
 | 
| 
 | 
   118 
 | 
| 
 | 
   119 
 | 
| 
 | 
   120 for (my $i=0;$i<=$#tbl_line_to_display;$i++){
 | 
| 
 | 
   121 	my @tbl = split (/\s+/,$tbl_line_to_display[$i]);
 | 
| 
 | 
   122 	my $current_chr = $tbl[0];
 | 
| 
 | 
   123 	my @current_tbl;
 | 
| 
 | 
   124 	if ($result_by_chr{$current_chr}){
 | 
| 
 | 
   125 		push (@{$result_by_chr{$current_chr}},$tbl_line_to_display[$i]);
 | 
| 
 | 
   126 	}
 | 
| 
 | 
   127 	else {
 | 
| 
 | 
   128 		push (@current_tbl,$tbl_line_to_display[$i]);
 | 
| 
 | 
   129 		$result_by_chr{$current_chr} = \@current_tbl;
 | 
| 
 | 
   130 	}
 | 
| 
 | 
   131 }
 | 
| 
 | 
   132 
 | 
| 
 | 
   133 foreach my $key (sort keys %result_by_chr){
 | 
| 
 | 
   134 	my @current_tbl = sort mysort @{$result_by_chr{$key}};
 | 
| 
 | 
   135 	for (my $i=0;$i<=$#current_tbl;$i++){
 | 
| 
 | 
   136 		print $current_tbl[$i];
 | 
| 
 | 
   137 	}
 | 
| 
 | 
   138 }
 | 
| 
 | 
   139 
 | 
| 
 | 
   140 sub mysort {
 | 
| 
 | 
   141 	my @tbla = split (/\s+/,$a);
 | 
| 
 | 
   142 	my @tblb = split (/\s+/,$b);
 | 
| 
 | 
   143 	$tbla[0] cmp $tblb[0] || $tbla[1]<=>$tblb[1];
 | 
| 
 | 
   144 }
 | 
| 
 | 
   145 
 | 
| 
 | 
   146 
 |