annotate check_gwas_inputs/CheckGWASInputs.pl @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
2
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
3 use strict;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
4 use Switch;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
5 use Getopt::Long;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
6
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
7 my $usage = qq~Usage:$0 <args> [<opts>]
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
8 where <args> are:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
9 -h, --hapmap <Hapmap input file>
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
10 -t, --trait <Trait input file>
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
11 -o, --out <Output base name>
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
12 ~;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
13 $usage .= "\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
14
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
15 my ($hapmap,$trait,$out);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
16
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
17
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
18 GetOptions(
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
19 "trait=s" => \$trait,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
20 "out=s" => \$out,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
21 "hapmap=s" => \$hapmap
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
22 );
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
23
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
24
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
25 die $usage
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
26 if ( !$trait || !$out || !$hapmap);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
27
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
28 my %inds;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
29
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
30 #######################################
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
31 # get individuals in trait file
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
32 #######################################
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
33 my %traits;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
34 my $head_trait = `head -1 $trait`;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
35 open(my $T,$trait);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
36 <$T>;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
37 while(<$T>)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
38 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
39 my @infos = split(/\t/,$_);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
40 my $ind = $infos[0];
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
41 $inds{$ind}++;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
42 $traits{$ind} = $_;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
43 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
44 close($T);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
45 my $nb_ind_trait = scalar keys(%traits);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
46
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
47 #######################################
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
48 # get individuals in hapmap file
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
49 #######################################
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
50 my $line_ind = `head -1 $hapmap`;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
51 chomp($line_ind);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
52 my @infos = split(/\t/,$line_ind);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
53 for (my $i = 11; $i <= $#infos; $i++)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
54 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
55 my $ind = $infos[$i];
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
56 $inds{$ind}++;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
57 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
58 my $nb_ind_hapmap = scalar @infos - 11;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
59
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
60 #################################################################
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
61 # create trait output by keeping individuals found in both files
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
62 #################################################################
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
63 open(my $O,">$out.trait");
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
64 print $O $head_trait;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
65 my $nb_common = 0;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
66 foreach my $ind(keys(%inds))
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
67 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
68 my $nb_found = $inds{$ind};
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
69 if ($nb_found == 2)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
70 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
71 $nb_common++;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
72 print $O $traits{$ind};
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
73 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
74 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
75 close($O);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
76
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
77
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
78 #####################################################################
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
79 # create hapmap output after keeping individuals found in both files
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
80 # and removing monomorphic positions
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
81 #####################################################################
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
82 open(my $O2,">$out.hapmap");
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
83 my $numline = 0;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
84 my %genotypes;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
85 my %columns_to_keep;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
86 my $nb_monomorphic = 0;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
87 my $not_biallelic = 0;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
88 my $diff_variation = 0;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
89 open(my $H,$hapmap);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
90 while(<$H>)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
91 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
92 $numline++;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
93 my $line = $_;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
94 $line =~s/\n//g;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
95 $line =~s/\r//g;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
96 my @infos = split(/\t/,$line);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
97 if ($numline == 1)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
98 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
99 my @titles;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
100 for (my $i = 0; $i <= 10; $i++)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
101 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
102 my $title = $infos[$i];
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
103 push(@titles,$title);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
104 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
105 print $O2 join("\t",@titles);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
106 for (my $i = 11; $i <= $#infos; $i++)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
107 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
108 my $ind = $infos[$i];
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
109 my $nb_found = $inds{$ind};
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
110 if ($nb_found == 2)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
111 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
112 print $O2 " $ind";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
113 $columns_to_keep{$i} = 1;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
114 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
115 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
116 print $O2 "\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
117 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
118 else
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
119 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
120 my $to_be_printed = "";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
121 my $variation = $infos[1];
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
122 for (my $i = 0; $i <= 10; $i++)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
123 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
124 my $title = $infos[$i];
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
125 $to_be_printed .= "$title ";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
126 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
127 my %letters;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
128 for (my $i = 11; $i <= $#infos; $i++)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
129 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
130 if ($columns_to_keep{$i})
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
131 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
132 my $genotype = $infos[$i];
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
133 if ($genotype ne 'NN')
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
134 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
135 my ($allele1,$allele2) = split(//,$genotype);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
136 $letters{$allele1}=1;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
137 $letters{$allele2}=1;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
138 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
139 $to_be_printed .= "$genotype ";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
140 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
141 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
142 chop($to_be_printed);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
143
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
144 my $variation_obs = join("/",sort keys(%letters));
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
145
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
146 # print only if polymorphic
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
147 if (scalar keys(%letters) < 2)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
148 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
149 $nb_monomorphic++;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
150 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
151 elsif (scalar keys(%letters) > 2)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
152 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
153 $not_biallelic++;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
154 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
155 else
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
156 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
157 if ($variation ne $variation_obs)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
158 {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
159 $to_be_printed =~s/$variation/$variation_obs/;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
160 $diff_variation++;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
161 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
162
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
163 print $O2 $to_be_printed . "\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
164 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
165 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
166 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
167 close($H);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
168 close($O2);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
169
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
170 print "==============================================\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
171 print "Individuals\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
172 print "==============================================\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
173 print "Individuals in hapmap file: $nb_ind_hapmap\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
174 print "Individuals in trait file: $nb_ind_trait\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
175 print "Individuals found in both files: $nb_common\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
176 print "==============================================\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
177 print "Markers\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
178 print "==============================================\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
179 print "Discarded markers:\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
180 print "Monomorphic: $nb_monomorphic\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
181 print "Not biallelic: $not_biallelic\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
182 print "Modified markers:\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
183 print "Difference in variation: $diff_variation\n";
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
184