annotate check_gwas_inputs/CheckGWASInputs.pl @ 13:734a3572c1d6 draft

Uploaded
author dereeper
date Tue, 08 Jan 2019 08:45:34 -0500
parents 98c37a5d67f4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
2
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
3 use strict;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
4
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
5 use Getopt::Long;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
6
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
7 my $usage = qq~Usage:$0 <args> [<opts>]
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
8 where <args> are:
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
9 -h, --hapmap <Hapmap input file>
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
10 -t, --trait <Trait input file>
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
11 -o, --out <Output base name>
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
12 ~;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
13 $usage .= "\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
14
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
15 my ($hapmap,$trait,$out);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
16
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
17
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
18 GetOptions(
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
19 "trait=s" => \$trait,
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
20 "out=s" => \$out,
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
21 "hapmap=s" => \$hapmap
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
22 );
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
23
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
24
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
25 die $usage
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
26 if ( !$trait || !$out || !$hapmap);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
27
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
28 my %inds;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
29
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
30 #######################################
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
31 # get individuals in trait file
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
32 #######################################
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
33 my %traits;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
34 my $head_trait = `head -1 $trait`;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
35 open(my $T,$trait);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
36 <$T>;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
37 while(<$T>)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
38 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
39 my @infos = split(/\t/,$_);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
40 my $ind = $infos[0];
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
41 $inds{$ind}++;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
42 $traits{$ind} = $_;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
43 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
44 close($T);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
45 my $nb_ind_trait = scalar keys(%traits);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
46
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
47 #######################################
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
48 # get individuals in hapmap file
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
49 #######################################
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
50 my $line_ind = `head -1 $hapmap`;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
51 chomp($line_ind);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
52 my @infos = split(/\t/,$line_ind);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
53 for (my $i = 11; $i <= $#infos; $i++)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
54 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
55 my $ind = $infos[$i];
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
56 $inds{$ind}++;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
57 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
58 my $nb_ind_hapmap = scalar @infos - 11;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
59
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
60 #################################################################
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
61 # create trait output by keeping individuals found in both files
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
62 #################################################################
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
63 open(my $O,">$out.trait");
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
64 print $O $head_trait;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
65 my $nb_common = 0;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
66 foreach my $ind(keys(%inds))
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
67 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
68 my $nb_found = $inds{$ind};
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
69 if ($nb_found == 2)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
70 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
71 $nb_common++;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
72 print $O $traits{$ind};
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
73 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
74 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
75 close($O);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
76
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
77
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
78 #####################################################################
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
79 # create hapmap output after keeping individuals found in both files
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
80 # and removing monomorphic positions
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
81 #####################################################################
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
82 open(my $O2,">$out.hapmap");
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
83 my $numline = 0;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
84 my %genotypes;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
85 my %columns_to_keep;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
86 my $nb_monomorphic = 0;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
87 my $not_biallelic = 0;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
88 my $diff_variation = 0;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
89 open(my $H,$hapmap);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
90 while(<$H>)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
91 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
92 $numline++;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
93 my $line = $_;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
94 $line =~s/\n//g;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
95 $line =~s/\r//g;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
96 my @infos = split(/\t/,$line);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
97 if ($numline == 1)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
98 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
99 my @titles;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
100 for (my $i = 0; $i <= 10; $i++)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
101 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
102 my $title = $infos[$i];
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
103 push(@titles,$title);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
104 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
105 print $O2 join("\t",@titles);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
106 for (my $i = 11; $i <= $#infos; $i++)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
107 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
108 my $ind = $infos[$i];
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
109 my $nb_found = $inds{$ind};
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
110 if ($nb_found == 2)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
111 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
112 print $O2 " $ind";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
113 $columns_to_keep{$i} = 1;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
114 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
115 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
116 print $O2 "\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
117 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
118 else
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
119 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
120 my $to_be_printed = "";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
121 my $variation = $infos[1];
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
122 for (my $i = 0; $i <= 10; $i++)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
123 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
124 my $title = $infos[$i];
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
125 $to_be_printed .= "$title ";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
126 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
127 my %letters;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
128 for (my $i = 11; $i <= $#infos; $i++)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
129 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
130 if ($columns_to_keep{$i})
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
131 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
132 my $genotype = $infos[$i];
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
133 if ($genotype ne 'NN')
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
134 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
135 my ($allele1,$allele2) = split(//,$genotype);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
136 $letters{$allele1}=1;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
137 $letters{$allele2}=1;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
138 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
139 $to_be_printed .= "$genotype ";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
140 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
141 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
142 chop($to_be_printed);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
143
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
144 my $variation_obs = join("/",sort keys(%letters));
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
145
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
146 # print only if polymorphic
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
147 if (scalar keys(%letters) < 2)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
148 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
149 $nb_monomorphic++;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
150 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
151 elsif (scalar keys(%letters) > 2)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
152 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
153 $not_biallelic++;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
154 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
155 else
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
156 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
157 if ($variation ne $variation_obs)
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
158 {
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
159 $to_be_printed =~s/$variation/$variation_obs/;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
160 $diff_variation++;
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
161 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
162
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
163 print $O2 $to_be_printed . "\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
164 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
165 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
166 }
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
167 close($H);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
168 close($O2);
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
169
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
170 print "==============================================\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
171 print "Individuals\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
172 print "==============================================\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
173 print "Individuals in hapmap file: $nb_ind_hapmap\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
174 print "Individuals in trait file: $nb_ind_trait\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
175 print "Individuals found in both files: $nb_common\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
176 print "==============================================\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
177 print "Markers\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
178 print "==============================================\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
179 print "Discarded markers:\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
180 print "Monomorphic: $nb_monomorphic\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
181 print "Not biallelic: $not_biallelic\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
182 print "Modified markers:\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
183 print "Difference in variation: $diff_variation\n";
98c37a5d67f4 Uploaded
dereeper
parents:
diff changeset
184