Mercurial > repos > dereeper > sniplay
comparison check_gwas_inputs/CheckGWASInputs.pl @ 1:420b57c3c185 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 10 Jul 2015 04:39:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3e19d0dfcf3e | 1:420b57c3c185 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 use Switch; | |
5 use Getopt::Long; | |
6 | |
7 my $usage = qq~Usage:$0 <args> [<opts>] | |
8 where <args> are: | |
9 -h, --hapmap <Hapmap input file> | |
10 -t, --trait <Trait input file> | |
11 -o, --out <Output base name> | |
12 ~; | |
13 $usage .= "\n"; | |
14 | |
15 my ($hapmap,$trait,$out); | |
16 | |
17 | |
18 GetOptions( | |
19 "trait=s" => \$trait, | |
20 "out=s" => \$out, | |
21 "hapmap=s" => \$hapmap | |
22 ); | |
23 | |
24 | |
25 die $usage | |
26 if ( !$trait || !$out || !$hapmap); | |
27 | |
28 my %inds; | |
29 | |
30 ####################################### | |
31 # get individuals in trait file | |
32 ####################################### | |
33 my %traits; | |
34 my $head_trait = `head -1 $trait`; | |
35 open(my $T,$trait); | |
36 <$T>; | |
37 while(<$T>) | |
38 { | |
39 my @infos = split(/\t/,$_); | |
40 my $ind = $infos[0]; | |
41 $inds{$ind}++; | |
42 $traits{$ind} = $_; | |
43 } | |
44 close($T); | |
45 my $nb_ind_trait = scalar keys(%traits); | |
46 | |
47 ####################################### | |
48 # get individuals in hapmap file | |
49 ####################################### | |
50 my $line_ind = `head -1 $hapmap`; | |
51 chomp($line_ind); | |
52 my @infos = split(/\t/,$line_ind); | |
53 for (my $i = 11; $i <= $#infos; $i++) | |
54 { | |
55 my $ind = $infos[$i]; | |
56 $inds{$ind}++; | |
57 } | |
58 my $nb_ind_hapmap = scalar @infos - 11; | |
59 | |
60 ################################################################# | |
61 # create trait output by keeping individuals found in both files | |
62 ################################################################# | |
63 open(my $O,">$out.trait"); | |
64 print $O $head_trait; | |
65 my $nb_common = 0; | |
66 foreach my $ind(keys(%inds)) | |
67 { | |
68 my $nb_found = $inds{$ind}; | |
69 if ($nb_found == 2) | |
70 { | |
71 $nb_common++; | |
72 print $O $traits{$ind}; | |
73 } | |
74 } | |
75 close($O); | |
76 | |
77 | |
78 ##################################################################### | |
79 # create hapmap output after keeping individuals found in both files | |
80 # and removing monomorphic positions | |
81 ##################################################################### | |
82 open(my $O2,">$out.hapmap"); | |
83 my $numline = 0; | |
84 my %genotypes; | |
85 my %columns_to_keep; | |
86 my $nb_monomorphic = 0; | |
87 my $not_biallelic = 0; | |
88 my $diff_variation = 0; | |
89 open(my $H,$hapmap); | |
90 while(<$H>) | |
91 { | |
92 $numline++; | |
93 my $line = $_; | |
94 $line =~s/\n//g; | |
95 $line =~s/\r//g; | |
96 my @infos = split(/\t/,$line); | |
97 if ($numline == 1) | |
98 { | |
99 my @titles; | |
100 for (my $i = 0; $i <= 10; $i++) | |
101 { | |
102 my $title = $infos[$i]; | |
103 push(@titles,$title); | |
104 } | |
105 print $O2 join("\t",@titles); | |
106 for (my $i = 11; $i <= $#infos; $i++) | |
107 { | |
108 my $ind = $infos[$i]; | |
109 my $nb_found = $inds{$ind}; | |
110 if ($nb_found == 2) | |
111 { | |
112 print $O2 " $ind"; | |
113 $columns_to_keep{$i} = 1; | |
114 } | |
115 } | |
116 print $O2 "\n"; | |
117 } | |
118 else | |
119 { | |
120 my $to_be_printed = ""; | |
121 my $variation = $infos[1]; | |
122 for (my $i = 0; $i <= 10; $i++) | |
123 { | |
124 my $title = $infos[$i]; | |
125 $to_be_printed .= "$title "; | |
126 } | |
127 my %letters; | |
128 for (my $i = 11; $i <= $#infos; $i++) | |
129 { | |
130 if ($columns_to_keep{$i}) | |
131 { | |
132 my $genotype = $infos[$i]; | |
133 if ($genotype ne 'NN') | |
134 { | |
135 my ($allele1,$allele2) = split(//,$genotype); | |
136 $letters{$allele1}=1; | |
137 $letters{$allele2}=1; | |
138 } | |
139 $to_be_printed .= "$genotype "; | |
140 } | |
141 } | |
142 chop($to_be_printed); | |
143 | |
144 my $variation_obs = join("/",sort keys(%letters)); | |
145 | |
146 # print only if polymorphic | |
147 if (scalar keys(%letters) < 2) | |
148 { | |
149 $nb_monomorphic++; | |
150 } | |
151 elsif (scalar keys(%letters) > 2) | |
152 { | |
153 $not_biallelic++; | |
154 } | |
155 else | |
156 { | |
157 if ($variation ne $variation_obs) | |
158 { | |
159 $to_be_printed =~s/$variation/$variation_obs/; | |
160 $diff_variation++; | |
161 } | |
162 | |
163 print $O2 $to_be_printed . "\n"; | |
164 } | |
165 } | |
166 } | |
167 close($H); | |
168 close($O2); | |
169 | |
170 print "==============================================\n"; | |
171 print "Individuals\n"; | |
172 print "==============================================\n"; | |
173 print "Individuals in hapmap file: $nb_ind_hapmap\n"; | |
174 print "Individuals in trait file: $nb_ind_trait\n"; | |
175 print "Individuals found in both files: $nb_common\n"; | |
176 print "==============================================\n"; | |
177 print "Markers\n"; | |
178 print "==============================================\n"; | |
179 print "Discarded markers:\n"; | |
180 print "Monomorphic: $nb_monomorphic\n"; | |
181 print "Not biallelic: $not_biallelic\n"; | |
182 print "Modified markers:\n"; | |
183 print "Difference in variation: $diff_variation\n"; | |
184 |