Mercurial > repos > mcharles > rapsosnp
diff rapsodyn/CreateMatrixMultiple.pl @ 15:56d328bce3a7 draft default tip
Uploaded
author | mcharles |
---|---|
date | Thu, 29 Jan 2015 08:54:06 -0500 |
parents | 0a6c1cfe4dc8 |
children |
line wrap: on
line diff
--- a/rapsodyn/CreateMatrixMultiple.pl Mon Jan 26 18:10:52 2015 -0500 +++ b/rapsodyn/CreateMatrixMultiple.pl Thu Jan 29 08:54:06 2015 -0500 @@ -11,7 +11,136 @@ ) or die("Error in command line arguments\n"); my @files = split(/,/,$input_matrix_files); +my @tbl_hash; +my %global_hash; +my @tbl_genotype_names; +my %chromosome_hash; +my %result_by_chr; +my @tbl_chr_name; + for (my $i=0;$i<=$#files;$i++){ - print $files[$i],"\n"; + my $current_file = $files[$i]; + my $current_genotype = "NA"; + my %current_hash; + + open (CF,$current_file) or die ("Can't open file $current_file\n"); + my $header = <CF>; + if ($header =~ /Chrom\s*Pos\s*Ref\s*(.*?)\s*$/){ + $current_genotype = $1; + } + else { + print STDERR "Unable to recognize header in matrix file\n$header\n"; + exit(0); + } + while (my $line=<CF>){ + if ($line!~/^\s*$/){ + my @fields = split (/\t+/,$line); + my $chr; + my $pos; + my $ref; + my $variant; + + if ($fields[0]=~/^\s*([\w\-]+)\s*$/){ + $chr = $1; + } + else { + print STDERR "Unable to detect chromosome in matrix file\n$line\n"; + exit(0); + } + if ($fields[1]=~/^\s*(\d+)\s*$/){ + $pos = $1; + } + else { + print STDERR "Unable to detect position in matrix file\n$line\n"; + exit(0); + } + + if ($fields[2]=~/^\s*([ATGCNX])\s*$/i){ + $ref = $1; + } + else { + print STDERR "Unable to detect reference base in matrix file\n$line\n"; + exit(0); + } + + if ($fields[3]=~/^\s*([\w\/]+)\s*$/i){ + $variant = $1; + } + else { + print STDERR "Unable to detect variant in matrix file\n$line\n"; + exit(0); + } + $current_hash{"$chr#$pos"} = $variant; + $global_hash{"$chr#$pos"} = $ref; + } + } + close CF; + push(@tbl_genotype_names,$current_genotype); + push(@tbl_hash,\%current_hash); } +print "Chrom\tPos\tRef"; +for (my $i=0;$i<=$#tbl_genotype_names;$i++){ + print "\t".$tbl_genotype_names[$i]; +} +print "\n"; + +my @tbl_line_to_display; + +#exit(0); + +foreach my $key (keys %global_hash){ + my @tbl = split (/\#/,$key); + my $chr = $tbl[0]; + my $pos = $tbl[1]; + my $ref = $global_hash{$key}; + my $line = "$chr\t$pos\t$ref"; + my $isvariant = 0; + for (my $i=0;$i<=$#tbl_hash;$i++){ + #my %current_hash = %{$tbl_hash[$i]}; + if ($tbl_hash[$i]->{$key}){ + $line.="\t".$tbl_hash[$i]->{$key}; + if ($tbl_hash[$i]->{$key} ne $ref){ + $isvariant = 1; + } + } + else { + $line.="\t"."NA"; + } + + } + + $line .="\n"; + if ($isvariant == 1){ + push(@tbl_line_to_display,$line); + } +} + + +for (my $i=0;$i<=$#tbl_line_to_display;$i++){ + my @tbl = split (/\s+/,$tbl_line_to_display[$i]); + my $current_chr = $tbl[0]; + my @current_tbl; + if ($result_by_chr{$current_chr}){ + push (@{$result_by_chr{$current_chr}},$tbl_line_to_display[$i]); + } + else { + push (@current_tbl,$tbl_line_to_display[$i]); + $result_by_chr{$current_chr} = \@current_tbl; + } +} + +foreach my $key (sort keys %result_by_chr){ + my @current_tbl = sort mysort @{$result_by_chr{$key}}; + for (my $i=0;$i<=$#current_tbl;$i++){ + print $current_tbl[$i]; + } +} + +sub mysort { + my @tbla = split (/\s+/,$a); + my @tblb = split (/\s+/,$b); + $tbla[0] cmp $tblb[0] || $tbla[1]<=>$tblb[1]; +} + +