10
|
1 #!/usr/bin/perl
|
|
2 #V1.0.0
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use Getopt::Long;
|
|
6
|
|
7
|
|
8 my $input_matrix_files;
|
|
9 GetOptions (
|
|
10 "input_matrix_files=s" => \$input_matrix_files
|
|
11 ) or die("Error in command line arguments\n");
|
|
12
|
|
13 my @files = split(/,/,$input_matrix_files);
|
15
|
14 my @tbl_hash;
|
|
15 my %global_hash;
|
|
16 my @tbl_genotype_names;
|
|
17 my %chromosome_hash;
|
|
18 my %result_by_chr;
|
|
19 my @tbl_chr_name;
|
|
20
|
10
|
21 for (my $i=0;$i<=$#files;$i++){
|
15
|
22 my $current_file = $files[$i];
|
|
23 my $current_genotype = "NA";
|
|
24 my %current_hash;
|
|
25
|
|
26 open (CF,$current_file) or die ("Can't open file $current_file\n");
|
|
27 my $header = <CF>;
|
|
28 if ($header =~ /Chrom\s*Pos\s*Ref\s*(.*?)\s*$/){
|
|
29 $current_genotype = $1;
|
|
30 }
|
|
31 else {
|
|
32 print STDERR "Unable to recognize header in matrix file\n$header\n";
|
|
33 exit(0);
|
|
34 }
|
|
35 while (my $line=<CF>){
|
|
36 if ($line!~/^\s*$/){
|
|
37 my @fields = split (/\t+/,$line);
|
|
38 my $chr;
|
|
39 my $pos;
|
|
40 my $ref;
|
|
41 my $variant;
|
|
42
|
|
43 if ($fields[0]=~/^\s*([\w\-]+)\s*$/){
|
|
44 $chr = $1;
|
|
45 }
|
|
46 else {
|
|
47 print STDERR "Unable to detect chromosome in matrix file\n$line\n";
|
|
48 exit(0);
|
|
49 }
|
|
50 if ($fields[1]=~/^\s*(\d+)\s*$/){
|
|
51 $pos = $1;
|
|
52 }
|
|
53 else {
|
|
54 print STDERR "Unable to detect position in matrix file\n$line\n";
|
|
55 exit(0);
|
|
56 }
|
|
57
|
|
58 if ($fields[2]=~/^\s*([ATGCNX])\s*$/i){
|
|
59 $ref = $1;
|
|
60 }
|
|
61 else {
|
|
62 print STDERR "Unable to detect reference base in matrix file\n$line\n";
|
|
63 exit(0);
|
|
64 }
|
|
65
|
|
66 if ($fields[3]=~/^\s*([\w\/]+)\s*$/i){
|
|
67 $variant = $1;
|
|
68 }
|
|
69 else {
|
|
70 print STDERR "Unable to detect variant in matrix file\n$line\n";
|
|
71 exit(0);
|
|
72 }
|
|
73 $current_hash{"$chr#$pos"} = $variant;
|
|
74 $global_hash{"$chr#$pos"} = $ref;
|
|
75 }
|
|
76 }
|
|
77 close CF;
|
|
78 push(@tbl_genotype_names,$current_genotype);
|
|
79 push(@tbl_hash,\%current_hash);
|
10
|
80 }
|
|
81
|
15
|
82 print "Chrom\tPos\tRef";
|
|
83 for (my $i=0;$i<=$#tbl_genotype_names;$i++){
|
|
84 print "\t".$tbl_genotype_names[$i];
|
|
85 }
|
|
86 print "\n";
|
|
87
|
|
88 my @tbl_line_to_display;
|
|
89
|
|
90 #exit(0);
|
|
91
|
|
92 foreach my $key (keys %global_hash){
|
|
93 my @tbl = split (/\#/,$key);
|
|
94 my $chr = $tbl[0];
|
|
95 my $pos = $tbl[1];
|
|
96 my $ref = $global_hash{$key};
|
|
97 my $line = "$chr\t$pos\t$ref";
|
|
98 my $isvariant = 0;
|
|
99 for (my $i=0;$i<=$#tbl_hash;$i++){
|
|
100 #my %current_hash = %{$tbl_hash[$i]};
|
|
101 if ($tbl_hash[$i]->{$key}){
|
|
102 $line.="\t".$tbl_hash[$i]->{$key};
|
|
103 if ($tbl_hash[$i]->{$key} ne $ref){
|
|
104 $isvariant = 1;
|
|
105 }
|
|
106 }
|
|
107 else {
|
|
108 $line.="\t"."NA";
|
|
109 }
|
|
110
|
|
111 }
|
|
112
|
|
113 $line .="\n";
|
|
114 if ($isvariant == 1){
|
|
115 push(@tbl_line_to_display,$line);
|
|
116 }
|
|
117 }
|
|
118
|
|
119
|
|
120 for (my $i=0;$i<=$#tbl_line_to_display;$i++){
|
|
121 my @tbl = split (/\s+/,$tbl_line_to_display[$i]);
|
|
122 my $current_chr = $tbl[0];
|
|
123 my @current_tbl;
|
|
124 if ($result_by_chr{$current_chr}){
|
|
125 push (@{$result_by_chr{$current_chr}},$tbl_line_to_display[$i]);
|
|
126 }
|
|
127 else {
|
|
128 push (@current_tbl,$tbl_line_to_display[$i]);
|
|
129 $result_by_chr{$current_chr} = \@current_tbl;
|
|
130 }
|
|
131 }
|
|
132
|
|
133 foreach my $key (sort keys %result_by_chr){
|
|
134 my @current_tbl = sort mysort @{$result_by_chr{$key}};
|
|
135 for (my $i=0;$i<=$#current_tbl;$i++){
|
|
136 print $current_tbl[$i];
|
|
137 }
|
|
138 }
|
|
139
|
|
140 sub mysort {
|
|
141 my @tbla = split (/\s+/,$a);
|
|
142 my @tblb = split (/\s+/,$b);
|
|
143 $tbla[0] cmp $tblb[0] || $tbla[1]<=>$tblb[1];
|
|
144 }
|
|
145
|
|
146
|