Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison alignment/phylocatenator.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 #! /usr/bin/perl -w | |
2 | |
3 use strict; | |
4 use warnings; | |
5 #phylocatenator.pl | |
6 #Written by Todd H. Oakley UCSB | |
7 #Version 1.0 May, 2012 | |
8 # | |
9 #phyloconcatenator.pl [infile] [also requires arguments 1-8 detailed below] | |
10 # | |
11 #Version 1.0.1 Sept 2012 -- fixed a bug that led sometimes to species with no data being retained. | |
12 #This bug would not affect the data that was retained, so if raxml ran all was fine. However, | |
13 #raxml will not run with species that have completely missing data. Added extra section to remove | |
14 #those species. | |
15 | |
16 #For debugging command line pass, uncomment next | |
17 #for(my $i=0; $i < @ARGV; $i++){ | |
18 # print "Arg $i: $ARGV[$i] \n\n"; | |
19 #} | |
20 #exit; | |
21 | |
22 die "Check arguments" unless @ARGV == 9; | |
23 | |
24 #Obtain Arguments | |
25 my $infile1=shift(@ARGV); #0 input file | |
26 my $mingf_persp=shift(@ARGV); #1 minimum number of genefamiles to keep species | |
27 my $minsp_pergf=shift(@ARGV); #2 minimum number of species to keep genefamily | |
28 my $min_gf_len =shift(@ARGV); #3 minimum gene family length | |
29 my $speciesfile=shift(@ARGV); #4 optional species file 'None' if false | |
30 my $modelsfile=shift(@ARGV); #5 models file | |
31 my $outfile=shift(@ARGV); #6 data outfile | |
32 my $partfile=shift(@ARGV); #7 partition file | |
33 my $htmlfile=shift(@ARGV); #8 html file | |
34 | |
35 #Open 2 output files | |
36 open (OUT, ">$outfile") or die "Cannot create $outfile \n"; | |
37 open (PART, ">$partfile") or die "Cannot create $partfile\n"; | |
38 open (TABLE, ">$htmlfile") or die "Cannot create $htmlfile\n"; | |
39 | |
40 my %HoAdatatable; | |
41 my %HoGF; | |
42 my %lengthof; | |
43 my %ModelFor; | |
44 my $line = ""; | |
45 | |
46 my @specieslist; | |
47 my @genelist; | |
48 my @nsp; | |
49 my $nsl=1; | |
50 my @uniquemodels; | |
51 my @nullmodels; | |
52 my @retainedspecies; | |
53 | |
54 # read file with species | |
55 unless($speciesfile eq 'None'){ | |
56 open SPFILE, $speciesfile or die "ERROR: Cannot open file $speciesfile\n\n"; | |
57 $nsl=0; | |
58 while (<SPFILE>) { | |
59 chomp; | |
60 my $currentinput = "$_"; | |
61 if($currentinput =~m /\w/){ #must have a word otherwise empty | |
62 push(@nsp, $currentinput); | |
63 }else{ | |
64 die "ERROR: Fewer than 4 species meet your specified criteria.\n"; | |
65 } | |
66 # if(@nsp < 4){ | |
67 # die "ERROR: Species file must have more than 4 species.\n"; | |
68 # } | |
69 } | |
70 #print "\n\n"; | |
71 } #end of unless | |
72 | |
73 #Determine models used for each partition, make hash | |
74 unless($modelsfile eq 'None'){ | |
75 open MODFILE, $modelsfile or die "ERROR: Cannot open file $modelsfile\n\n"; | |
76 while (<MODFILE>) { | |
77 chomp; | |
78 my $currentinput = "$_"; | |
79 if($currentinput =~m /\t/){ #must have a tab otherwise wrong file format | |
80 if($currentinput =~m /\t\t/){ | |
81 print OUT "ERROR: file contains 2 tabs in a row. Check phytab format.\n"; | |
82 die; | |
83 }else{ | |
84 my @genemodel = split(/\t/, $currentinput); | |
85 my $genefamily=$genemodel[0]; | |
86 my $curmodel = $genemodel[1]; | |
87 if (exists $ModelFor{$genefamily}) { | |
88 print OUT "ERROR: Model specification for $genefamily is duplicated\n"; | |
89 die; | |
90 }else{ | |
91 $ModelFor{$genefamily}=$curmodel; | |
92 push(@uniquemodels,$curmodel); | |
93 } | |
94 } | |
95 }else{ | |
96 die "ERROR: Model LUT must be genefamily\tmodel and contain no blank lines\n"; | |
97 } | |
98 } | |
99 } | |
100 @uniquemodels = uniq(@uniquemodels); #remove redundant models - uniq is subroutine at end of script | |
101 | |
102 #Now check that models are valid raxml models NOT DONE YET | |
103 checkraxmlmodel(); | |
104 | |
105 #INPUT ALL DATA | |
106 open(INFILE1, $infile1); | |
107 foreach $line(<INFILE1>) { | |
108 chop($line); | |
109 my $getline = $line; | |
110 my @column = split(/\t/, $getline); | |
111 my $species = $column[0]; | |
112 my $genefamily = $column[1]; | |
113 my $genename = $column[2]; | |
114 my $sequence = $column[3]; | |
115 | |
116 if (exists $HoAdatatable{$species}{$genefamily}) { | |
117 print OUT "ERROR: $species $genefamily is duplicated\n"; | |
118 die; | |
119 }else{ | |
120 $HoAdatatable{$species}{$genefamily} = $sequence; | |
121 $HoGF{$genefamily}{$species}=$sequence; | |
122 } | |
123 } | |
124 | |
125 #If no models file selected, set every gene family to GTR model | |
126 if($modelsfile eq 'None'){ | |
127 foreach my $gfkey (sort keys %HoGF){ | |
128 $ModelFor{$gfkey} = 'GTR'; | |
129 } | |
130 push(@uniquemodels, 'GTR'); | |
131 } | |
132 | |
133 #First, keep all species with enough total partitions present | |
134 foreach my $specieskey (sort keys %HoAdatatable) | |
135 { | |
136 #Count species with minimum gfs | |
137 my $ngf_persp=0; | |
138 foreach my $genefamilykey (sort keys %{$HoAdatatable{$specieskey}}) | |
139 { | |
140 $ngf_persp++; | |
141 } | |
142 unless($ngf_persp < $mingf_persp){ #too few genes for this species, delete the sp | |
143 if($nsl){ #No species list supplied, push all species into list | |
144 push(@specieslist,$specieskey); | |
145 }else{ | |
146 #See if current specieskey is in inputted species list | |
147 my $nsp; | |
148 foreach $nsp(@nsp){ | |
149 if (index($nsp,$specieskey) ge 0){ | |
150 push(@specieslist,$specieskey); | |
151 } | |
152 } | |
153 } | |
154 } | |
155 } | |
156 print OUT "\n"; | |
157 unless (@specieslist){ | |
158 print OUT "ERROR: No species with more than $mingf_persp genes\n"; | |
159 die; | |
160 } | |
161 if(@specieslist < 4){ | |
162 print OUT "ERROR: Less than 4 species with more than $mingf_persp genes\n"; | |
163 die; | |
164 } | |
165 | |
166 | |
167 my $oldgenelen = 0; | |
168 my $currentseqlen = 0; | |
169 foreach my $gfkey (sort keys %HoGF) | |
170 { | |
171 $oldgenelen=0; | |
172 #Count gfs with minimum species | |
173 my $nsp_pergf=0; | |
174 for(my $j=0; $j<@specieslist; $j++){ | |
175 if(exists $HoGF{$gfkey}{$specieslist[$j]}){ | |
176 $nsp_pergf++ ; ###if exists $HoGF{$gfkey}{$specieslist[$j]}; | |
177 | |
178 | |
179 ##get length of gene and check it is consistent | |
180 if(exists $lengthof{$gfkey}){ | |
181 $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]}); | |
182 # $lengthof{$gfkey} = $currentseqlen; | |
183 if($currentseqlen == $oldgenelen){ | |
184 #$oldgenelen = $lengthof{$gfkey}; | |
185 #okay | |
186 }else{ | |
187 print OUT "ERROR: $specieslist[$j] $gfkey sequences ". | |
188 "different lengths than previous. Sequences must be aligned. If ". | |
189 "sequences are aligned, check that the line ". | |
190 "does not have an extra data column before the ". | |
191 "sequence.\n\n"; | |
192 die "ERROR: $specieslist[$j] $gfkey sequences ". | |
193 "different lengths than previous. Sequences must be aligned. If ". | |
194 "sequences are aligned, check that the line ". | |
195 "does not have an extra data column before the ". | |
196 "sequence.\n\n"; | |
197 } | |
198 }else{ | |
199 $currentseqlen = length($HoGF{$gfkey}{$specieslist[$j]}); | |
200 if($currentseqlen == 0){ | |
201 die "ERROR: Zero length sequence in file.\n" | |
202 } | |
203 $lengthof{$gfkey} = $currentseqlen; | |
204 $oldgenelen = $lengthof{$gfkey}; | |
205 } | |
206 } | |
207 } | |
208 if($nsp_pergf < $minsp_pergf){ | |
209 #too few species for this gene family | |
210 }else{ | |
211 if(exists $lengthof{$gfkey}){ | |
212 if($lengthof{$gfkey}>$min_gf_len){ | |
213 push(@genelist,$gfkey); | |
214 } | |
215 } | |
216 } | |
217 } | |
218 if (@genelist==0){ | |
219 print OUT "ERROR: No gene families/partitions meet the specified criteria\n"; | |
220 die "ERROR: No gene families/partitions meet the specified criteria\n"; | |
221 } | |
222 | |
223 #Now must delete species that lack any *retained* partitions, ie some species may be all missing | |
224 foreach(@specieslist) | |
225 { | |
226 my $curspecies=$_; | |
227 #Count species with minimum gfs | |
228 my $ngf_persp=0; | |
229 my $nonmissing=0; | |
230 foreach (@genelist) | |
231 { | |
232 if (exists $HoAdatatable{$curspecies}{$_}){ | |
233 my $cursequence = $HoAdatatable{$curspecies}{$_}; | |
234 $cursequence =~ s/\?//g; | |
235 $cursequence =~ s/\-//g; | |
236 #Will not remove N's assumes those are not missing data. Revisit? | |
237 my $curlen = length($cursequence); | |
238 $nonmissing = $nonmissing + $curlen; | |
239 } | |
240 } | |
241 if($nonmissing > 0) #must remove species as it contains none of the genes retained | |
242 { | |
243 print "$curspecies has $nonmissing Non-Missing characters\n"; | |
244 push(@retainedspecies, $curspecies); | |
245 } | |
246 } | |
247 @specieslist=@retainedspecies; | |
248 | |
249 #Remove blankspecies from | |
250 #print phylip file | |
251 #calculate n characters | |
252 my $nchar=0; #total characters | |
253 my $ncharPs =0; #start count for characters in current partition | |
254 my $ncharPe =0; #end count for characters in current partition | |
255 | |
256 #First count total characters for first line | |
257 for(my $k=0; $k<@genelist; $k++){ | |
258 if (exists $lengthof{$genelist[$k]}){ #check that length was calculated | |
259 $nchar = $nchar + $lengthof{$genelist[$k]}; | |
260 }else{ | |
261 die "ERROR: $genelist[$k] LENGTH MISSING\n"; | |
262 } | |
263 } | |
264 print OUT @specieslist." ".$nchar."\n"; | |
265 htmlheader(); | |
266 | |
267 #Need to determine gene list order, which will change due to partitioning | |
268 #then write header line of gene names, hopefully in correct order | |
269 #print TABLE "<td bgcolor=white></td>"; #Blank line in species column | |
270 print TABLE "<td style'width:2pc'><font size='-3'>Partition:</td>"; | |
271 for(my $part=0; $part < @uniquemodels; $part++){ | |
272 for(my $k=0; $k<@genelist; $k++){ | |
273 #First check if current gf matches current partition | |
274 if(exists $ModelFor{$genelist[$k]}){ | |
275 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ | |
276 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ | |
277 print TABLE "<td style'width:2pc'><font size='-3'>$genelist[$k]</td>"; | |
278 } | |
279 } | |
280 } | |
281 } | |
282 } | |
283 print TABLE "</tr>"; | |
284 | |
285 #print TABLE "<td bgcolor=white></td>"; #Blank line in species column | |
286 print TABLE "<td style'width:2pc'><font size='-3'>Model:</td>"; | |
287 for(my $part=0; $part < @uniquemodels; $part++){ | |
288 for(my $k=0; $k<@genelist; $k++){ | |
289 #First check if current gf matches current partition | |
290 if(exists $ModelFor{$genelist[$k]}){ | |
291 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ | |
292 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ | |
293 print TABLE "<td style'width:2pc'><font size='-3'>$uniquemodels[$part]</td>"; | |
294 } | |
295 } | |
296 } | |
297 } | |
298 } | |
299 #End of htmlheader printing | |
300 | |
301 for(my $j=0; $j<@specieslist; $j++){ | |
302 print OUT "$specieslist[$j]\t"; | |
303 print TABLE " | |
304 <tr> | |
305 <td>$specieslist[$j]</td>"; | |
306 for(my $part=0; $part < @uniquemodels; $part++){ | |
307 for(my $k=0; $k<@genelist; $k++){ | |
308 #First check if current gf matches current partition | |
309 if(exists $ModelFor{$genelist[$k]}){ | |
310 if($ModelFor{$genelist[$k]} eq $uniquemodels[$part]){ | |
311 if (exists $HoAdatatable{$specieslist[$j]}{$genelist[$k]}){ | |
312 print OUT $HoAdatatable{$specieslist[$j]}{$genelist[$k]}; | |
313 print TABLE "<td bgcolor=black></td>"; | |
314 $ncharPe = $ncharPe + $lengthof{$genelist[$k]}; | |
315 }else{ | |
316 if (exists $lengthof{$genelist[$k]}){ | |
317 for(my $gap=0; $gap<$lengthof{$genelist[$k]}; $gap++){ | |
318 print OUT "?"; | |
319 } | |
320 $ncharPe = $ncharPe + $lengthof{$genelist[$k]}; | |
321 print TABLE "<td bgcolor=white></td>"; | |
322 #print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$k]</td>"; | |
323 }else{ | |
324 die "ERROR: BUG!! $genelist[$k] LENGTH MISSING\n"; | |
325 } | |
326 } | |
327 } | |
328 }else{ | |
329 die "ERROR: $genelist[$k] is not assigned a model. Check model LUT input.\n"; | |
330 } | |
331 | |
332 } | |
333 if($j==0){ #print partitions first time through gene lists | |
334 if(($ncharPe==0) || ($ncharPs > $ncharPe)){ | |
335 push(@nullmodels, $uniquemodels[$part]); | |
336 #print "NOTE: no partitions under the model $uniquemodels[$part] made final dataset\n"; | |
337 }else{ | |
338 print PART "$uniquemodels[$part], $uniquemodels[$part] = $ncharPs - $ncharPe \n"; | |
339 } | |
340 $ncharPs=$ncharPe + 1; | |
341 } | |
342 } | |
343 print TABLE "</tr>"; | |
344 print OUT "\n"; | |
345 } | |
346 if($ncharPe/@genelist != $nchar){ | |
347 # Have to account for multiple times through gene lists -- ncharPe is summed multiple times but printed once | |
348 #die "ERROR: BUG!! Last partition number doesn't match total n characters\n"; | |
349 } | |
350 | |
351 #print Statistics to screen can be redirected for Log File | |
352 print "\nSPECIES:\n"; | |
353 unless($speciesfile eq 'None'){ | |
354 print "Used species file to select species. \n"; | |
355 } | |
356 print "Number of species with $mingf_persp or more genefamilies/partitions: ".@specieslist."\n"; | |
357 print "Species list: @specieslist\n\n\n"; | |
358 print "\nPARTITIONS/GENE FAMILIES:\n"; | |
359 print "Number genefamilies/partitions longer than $min_gf_len characters and present in at least $minsp_pergf genefamilies/partitions: ".@genelist."\n"; | |
360 for(my $i=0; $i < @genelist; $i++){ | |
361 print "$genelist[$i] Length:$lengthof{$genelist[$i]}\n"; | |
362 } | |
363 | |
364 #Printing Model stats | |
365 print "\nMODELS:\n"; | |
366 if($modelsfile eq 'None'){ | |
367 print "All partitions set to GTR (no model LUT supplied)\n\n"; | |
368 }else{ | |
369 print "A LUT of models was supplied.\n"; | |
370 print "The following models were present in the model LUT file:\n"; | |
371 print join(" ", @uniquemodels), "\n"; | |
372 print "\n"; | |
373 for(my $i=0; $i < @nullmodels; $i++){ | |
374 print "NOTE: no partitions under the model $nullmodels[$i] made final dataset\n"; | |
375 } | |
376 } | |
377 close PART; | |
378 close OUT; | |
379 close SPFILE; | |
380 close TABLE; | |
381 | |
382 sub uniq { | |
383 return keys %{{ map { $_ => 1 } @_ }}; | |
384 } | |
385 | |
386 sub checkraxmlmodel { | |
387 my @raxmlmodels = ("DNA","BIN","MULTI","DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG", "GTR", "MTART", "MTZOA", "FLU","PMB", "HIVB","HIVW","JTTDCMUT"); | |
388 return(1); #Not yet implemented | |
389 } | |
390 sub htmlheader { | |
391 print TABLE '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" | |
392 "http://www.w3.org/TR/html4/loose.dtd"> | |
393 <html> | |
394 <head> | |
395 <title>Dataset Presences and Absences</title> | |
396 <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> | |
397 </head> | |
398 <hr><table border="1"> | |
399 <tr> | |
400 <th align="center">Species<br>'; | |
401 for(my $i=1; $i < @genelist+1; $i++){ | |
402 #Genes are printing in wrong order | |
403 # print TABLE "<td style'width:2pc'><font size='-2'>$genelist[$i]</td>"; | |
404 print TABLE "<td style'width:2pc'><font size='-2'>$i</td>"; | |
405 } | |
406 print TABLE "</th> | |
407 </tr>"; | |
408 } | |
409 sub htmltail { | |
410 print TABLE ' | |
411 </body> | |
412 </html>'; | |
413 } |