1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4 use Switch;
|
|
5 use Getopt::Long;
|
|
6
|
|
7 my $usage = qq~Usage:$0 <args> [<opts>]
|
|
8 where <args> are:
|
|
9 -h, --hapmap <Hapmap input file>
|
|
10 -t, --trait <Trait input file>
|
|
11 -o, --out <Output base name>
|
|
12 ~;
|
|
13 $usage .= "\n";
|
|
14
|
|
15 my ($hapmap,$trait,$out);
|
|
16
|
|
17
|
|
18 GetOptions(
|
|
19 "trait=s" => \$trait,
|
|
20 "out=s" => \$out,
|
|
21 "hapmap=s" => \$hapmap
|
|
22 );
|
|
23
|
|
24
|
|
25 die $usage
|
|
26 if ( !$trait || !$out || !$hapmap);
|
|
27
|
|
28 my %inds;
|
|
29
|
|
30 #######################################
|
|
31 # get individuals in trait file
|
|
32 #######################################
|
|
33 my %traits;
|
|
34 my $head_trait = `head -1 $trait`;
|
|
35 open(my $T,$trait);
|
|
36 <$T>;
|
|
37 while(<$T>)
|
|
38 {
|
|
39 my @infos = split(/\t/,$_);
|
|
40 my $ind = $infos[0];
|
|
41 $inds{$ind}++;
|
|
42 $traits{$ind} = $_;
|
|
43 }
|
|
44 close($T);
|
|
45 my $nb_ind_trait = scalar keys(%traits);
|
|
46
|
|
47 #######################################
|
|
48 # get individuals in hapmap file
|
|
49 #######################################
|
|
50 my $line_ind = `head -1 $hapmap`;
|
|
51 chomp($line_ind);
|
|
52 my @infos = split(/\t/,$line_ind);
|
|
53 for (my $i = 11; $i <= $#infos; $i++)
|
|
54 {
|
|
55 my $ind = $infos[$i];
|
|
56 $inds{$ind}++;
|
|
57 }
|
|
58 my $nb_ind_hapmap = scalar @infos - 11;
|
|
59
|
|
60 #################################################################
|
|
61 # create trait output by keeping individuals found in both files
|
|
62 #################################################################
|
|
63 open(my $O,">$out.trait");
|
|
64 print $O $head_trait;
|
|
65 my $nb_common = 0;
|
|
66 foreach my $ind(keys(%inds))
|
|
67 {
|
|
68 my $nb_found = $inds{$ind};
|
|
69 if ($nb_found == 2)
|
|
70 {
|
|
71 $nb_common++;
|
|
72 print $O $traits{$ind};
|
|
73 }
|
|
74 }
|
|
75 close($O);
|
|
76
|
|
77
|
|
78 #####################################################################
|
|
79 # create hapmap output after keeping individuals found in both files
|
|
80 # and removing monomorphic positions
|
|
81 #####################################################################
|
|
82 open(my $O2,">$out.hapmap");
|
|
83 my $numline = 0;
|
|
84 my %genotypes;
|
|
85 my %columns_to_keep;
|
|
86 my $nb_monomorphic = 0;
|
|
87 my $not_biallelic = 0;
|
|
88 my $diff_variation = 0;
|
|
89 open(my $H,$hapmap);
|
|
90 while(<$H>)
|
|
91 {
|
|
92 $numline++;
|
|
93 my $line = $_;
|
|
94 $line =~s/\n//g;
|
|
95 $line =~s/\r//g;
|
|
96 my @infos = split(/\t/,$line);
|
|
97 if ($numline == 1)
|
|
98 {
|
|
99 my @titles;
|
|
100 for (my $i = 0; $i <= 10; $i++)
|
|
101 {
|
|
102 my $title = $infos[$i];
|
|
103 push(@titles,$title);
|
|
104 }
|
|
105 print $O2 join("\t",@titles);
|
|
106 for (my $i = 11; $i <= $#infos; $i++)
|
|
107 {
|
|
108 my $ind = $infos[$i];
|
|
109 my $nb_found = $inds{$ind};
|
|
110 if ($nb_found == 2)
|
|
111 {
|
|
112 print $O2 " $ind";
|
|
113 $columns_to_keep{$i} = 1;
|
|
114 }
|
|
115 }
|
|
116 print $O2 "\n";
|
|
117 }
|
|
118 else
|
|
119 {
|
|
120 my $to_be_printed = "";
|
|
121 my $variation = $infos[1];
|
|
122 for (my $i = 0; $i <= 10; $i++)
|
|
123 {
|
|
124 my $title = $infos[$i];
|
|
125 $to_be_printed .= "$title ";
|
|
126 }
|
|
127 my %letters;
|
|
128 for (my $i = 11; $i <= $#infos; $i++)
|
|
129 {
|
|
130 if ($columns_to_keep{$i})
|
|
131 {
|
|
132 my $genotype = $infos[$i];
|
|
133 if ($genotype ne 'NN')
|
|
134 {
|
|
135 my ($allele1,$allele2) = split(//,$genotype);
|
|
136 $letters{$allele1}=1;
|
|
137 $letters{$allele2}=1;
|
|
138 }
|
|
139 $to_be_printed .= "$genotype ";
|
|
140 }
|
|
141 }
|
|
142 chop($to_be_printed);
|
|
143
|
|
144 my $variation_obs = join("/",sort keys(%letters));
|
|
145
|
|
146 # print only if polymorphic
|
|
147 if (scalar keys(%letters) < 2)
|
|
148 {
|
|
149 $nb_monomorphic++;
|
|
150 }
|
|
151 elsif (scalar keys(%letters) > 2)
|
|
152 {
|
|
153 $not_biallelic++;
|
|
154 }
|
|
155 else
|
|
156 {
|
|
157 if ($variation ne $variation_obs)
|
|
158 {
|
|
159 $to_be_printed =~s/$variation/$variation_obs/;
|
|
160 $diff_variation++;
|
|
161 }
|
|
162
|
|
163 print $O2 $to_be_printed . "\n";
|
|
164 }
|
|
165 }
|
|
166 }
|
|
167 close($H);
|
|
168 close($O2);
|
|
169
|
|
170 print "==============================================\n";
|
|
171 print "Individuals\n";
|
|
172 print "==============================================\n";
|
|
173 print "Individuals in hapmap file: $nb_ind_hapmap\n";
|
|
174 print "Individuals in trait file: $nb_ind_trait\n";
|
|
175 print "Individuals found in both files: $nb_common\n";
|
|
176 print "==============================================\n";
|
|
177 print "Markers\n";
|
|
178 print "==============================================\n";
|
|
179 print "Discarded markers:\n";
|
|
180 print "Monomorphic: $nb_monomorphic\n";
|
|
181 print "Not biallelic: $not_biallelic\n";
|
|
182 print "Modified markers:\n";
|
|
183 print "Difference in variation: $diff_variation\n";
|
|
184
|