Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison getdata/genbankstrip.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # Modified for Osiris by THO. | |
4 # | |
5 # | |
6 # GenBankStrip.pl v2.0 | |
7 # Last modified July 25, 2005 14:29 | |
8 # (c) Olaf R.P. Bininda-Emonds | |
9 # | |
10 # Input: | |
11 # 1) A GenBank output file | |
12 # 2) A file containing gene names, each on a separate line | |
13 # | |
14 # Generates: | |
15 # 1) A Se-Al formatted (and optionally a nexus-formatted) datafile containing stripped sequences | |
16 # 2) A summary file giving status of each entry in GenBank file | |
17 # | |
18 # Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v] | |
19 # options: -f<filename> = file containing sequences in GenBank format | |
20 # -g<filename> = file containing specific genes to be stripped | |
21 # -i<genename> = strip a single gene | |
22 # -k<number> = number of (longest) sequences to retain per species for a given gene (default = all) | |
23 # -l<number> = minimum length required for all non-tRNA genes (default = none) | |
24 # -o<n|s> = provide output in nexus (n) phytab (p) and/or Se-Al (s) format in addition to fasta format | |
25 # -s = only process sequences with valid species names (i.e., no species with sp. or cf. in names) | |
26 # -t<g<number>|s<number>> = minimum number of sequences (g; default = 0) or species (s; default = 0) a gene must have to be retained; both can be specified simultaneously | |
27 # -h = print this message and quit | |
28 # -v = verbose output | |
29 # -a = Accession number appears first in stripped Fasta File -- Added by THO | |
30 use strict; | |
31 | |
32 # Set default values | |
33 # Genes and sequences | |
34 my %synonym; | |
35 my %complement = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A', | |
36 'M' => 'K', 'R' => 'Y', 'W' => 'S', 'S' => 'W', 'Y' => 'R', 'K' => 'M', | |
37 'B' => 'V', 'D' => 'H', 'H' => 'D', 'V' => 'B', 'N' => 'N', '-' => '-', '?' => '?'); | |
38 my %maxLength; | |
39 | |
40 # I/O | |
41 my $gbFile; | |
42 my ($countFile, $rejectFile, $stripFile); | |
43 my $geneFile; | |
44 my %userGene; | |
45 my $singleGene; | |
46 my $keepGene = 0; | |
47 my $minLength; | |
48 my $minLengthTRNA; | |
49 my $speciesBlock = 0; | |
50 my $seqThreshold = 0; | |
51 my $speciesThreshold = 0; | |
52 my $sealPrint = 0; | |
53 my $nexusPrint = 0; | |
54 my $phytabPrint = 0; | |
55 | |
56 # Global stats | |
57 my (@globalGeneList, %globalGenePresent, %geneStatus, @rejectList, $stripCount); | |
58 my (@speciesList, %speciesPresent, %speciesGenePresent, %quickSpeciesCount, %speciesCount, %quickSequenceCount, %sequenceCount); | |
59 $sequenceCount{"all"} = 0; | |
60 | |
61 # Miscellaneous | |
62 my $debug = 0; | |
63 my $verbose = 0; | |
64 my $accessionFirst = 0; #Added by THO as option to have accession # first in FASTA output | |
65 | |
66 for (my $i = 0; $i <= $#ARGV; $i++) | |
67 { | |
68 #Original stripped out directory structure, required by Galaxy | |
69 # if ($ARGV[$i] =~ /^-f([\w+\.?\w+?]*)/) | |
70 if ($ARGV[$i] =~ /^-f(.+)/) | |
71 { | |
72 $gbFile = $1; | |
73 | |
74 # (my $baseFile = $gbFile) =~ s/.\w+$//; | |
75 # $countFile .= $baseFile . "_geneCount.txt"; | |
76 # $rejectFile .= $baseFile . "_gbs.rejectlist.txt"; | |
77 # $stripFile .= $baseFile . "_gbs.striplist.txt"; | |
78 # } | |
79 #for Galaxy, easier to have same file name each time it'r run | |
80 (my $baseFile = $gbFile) =~ s/.\w+$//; | |
81 $countFile .= "geneCount.txt"; | |
82 $rejectFile .= "rejectlist.txt"; | |
83 $stripFile .= "striplist.txt"; | |
84 } | |
85 elsif ($ARGV[$i] =~ /^-g([\w+\.?\w+?]*)/) | |
86 { | |
87 $geneFile = $1; | |
88 | |
89 #This is a hack to write the name of a single gene into a new genefile based on the Galaxy call | |
90 | |
91 open (GENE,">$geneFile") or die "Cannot open file $geneFile containing gene names.\n"; | |
92 print GENE "$geneFile\n"; | |
93 close GENE; | |
94 } | |
95 elsif ($ARGV[$i] =~ /^-i([\w+\.?\w+?]*)/) | |
96 { | |
97 $singleGene = $1; | |
98 } | |
99 elsif ($ARGV[$i] =~ /^-k(\d+)/) | |
100 { | |
101 $keepGene = $1; | |
102 } | |
103 elsif ($ARGV[$i] =~ /^-l(\d+)/) | |
104 { | |
105 $minLength = $1; | |
106 $minLengthTRNA = 50; | |
107 } | |
108 elsif ($ARGV[$i] =~ /^-on/) | |
109 { | |
110 $nexusPrint = 1; | |
111 } | |
112 elsif ($ARGV[$i] =~ /^-op/) | |
113 { | |
114 $phytabPrint = 1; | |
115 } | |
116 elsif ($ARGV[$i] =~ /^-os/) | |
117 { | |
118 $sealPrint = 1; | |
119 } | |
120 elsif ($ARGV[$i] =~ /^-s/) | |
121 { | |
122 $speciesBlock = 1; | |
123 } | |
124 elsif ($ARGV[$i] =~ /^-tg(\d+)/) | |
125 { | |
126 $seqThreshold = $1; | |
127 } | |
128 elsif ($ARGV[$i] =~ /^-ts(\d+)/) | |
129 { | |
130 $speciesThreshold = $1; | |
131 } | |
132 elsif ($ARGV[$i] =~ /^-v/) | |
133 { | |
134 $verbose = 1; | |
135 } | |
136 elsif ($ARGV[$i] =~ /^-a/) | |
137 { | |
138 $accessionFirst = 1; | |
139 } | |
140 elsif ($ARGV[$i] =~ /^-x/) | |
141 { | |
142 $debug = 1; | |
143 $verbose = 1; | |
144 } | |
145 elsif ($ARGV[$i] =~ /^-h/) | |
146 { | |
147 print "Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v]\n"; | |
148 print "Options: -f<filename> = file containing sequences in GenBank format\n"; | |
149 print " -g<filename> = file containing specific genes to be stripped\n"; | |
150 print " -k<number> = number of (longest) sequences to retain per species for a given gene (default = all)\n"; | |
151 print " -l<number> = minimum length required for all non-tRNA genes (default = none)\n"; | |
152 print " -o<n|s> = provide output in nexus (n) phytab (p) and/or Se-Al (s) format in addition to fasta format\n"; | |
153 print " -s = only process sequences for valid species names (i.e., no species with sp. or cf. in names)\n"; | |
154 print " -t<g<number>|s<number>> = minimum number of sequences (g; default = 0) or species (s; default = 0)\n"; | |
155 print " a gene must have to be retained; both can be specified simultaneously\n"; | |
156 print " -h = print this message and quit\n"; | |
157 print " -v = verbose output\n"; | |
158 exit(0); | |
159 } | |
160 else | |
161 { | |
162 print "Don't understand argument: $ARGV[$i]\n"; | |
163 print "Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v]\n"; | |
164 exit(1); | |
165 } | |
166 } | |
167 | |
168 die "ERROR: Must supply name of GenBank output file using flag -f.\n" if (not $gbFile); | |
169 | |
170 # Load in hardwired gene synonyms | |
171 geneSynonyms(); | |
172 | |
173 # Get list of target genes (if desired) | |
174 if ($geneFile) | |
175 { | |
176 my $userGeneCount = 0; | |
177 my %userGenePresent; | |
178 setLineBreak($geneFile); | |
179 open (GENE,"<$geneFile") or die "Cannot open file $geneFile containing gene names.\n"; | |
180 print "Gene(s) to be stripped:\n"; | |
181 while (<GENE>) | |
182 { | |
183 chomp; | |
184 next unless ($_); | |
185 my $gene = $_; | |
186 $gene = $synonym{$gene} if (defined $synonym{$gene}); | |
187 $userGene{$gene} = 1; | |
188 unless ($userGenePresent{$gene}) | |
189 { | |
190 $userGeneCount++; | |
191 $userGenePresent{$gene}++; | |
192 print "\t$gene\n"; | |
193 } | |
194 } | |
195 close GENE; | |
196 | |
197 # die "ERROR: No genes read in from file $geneFile\n"; | |
198 } | |
199 #THO added -h command to easilly pass a single gene for Galaxy | |
200 if($singleGene) | |
201 { | |
202 my $userGeneCount = 1; | |
203 my %userGenePresent; | |
204 print "Gene(s) to be stripped:\n"; | |
205 my $gene = $singleGene; | |
206 $gene = $synonym{$gene} if (defined $synonym{$gene}); | |
207 print "\t$gene\n"; | |
208 $geneFile = $singleGene; | |
209 } | |
210 | |
211 # Print parameter summary | |
212 print "The following parameters have been set by the user:\n"; | |
213 print "\tFile containing GenBank sequence data: $gbFile\n"; | |
214 print "\tFile containing target genes to be stripped: $geneFile\n" if ($geneFile); | |
215 print "\tUser-defined constraints(s):\n"; | |
216 print "\t\tNumber of sequences: $seqThreshold\n" if ($seqThreshold); | |
217 print "\t\tNumber of species: $speciesThreshold\n" if ($speciesThreshold); | |
218 print "\t\tMinimum sequence length: global - $minLength bp; tRNAs - $minLengthTRNA bp\n" if (defined $minLength); | |
219 print "\t\tOnly using species with valid names\n" if ($speciesBlock); | |
220 print "\t\tNone\n" if (not $seqThreshold and not $speciesThreshold and not defined $minLength and not $speciesBlock); | |
221 print "\tNumber of sequences to keep per species for each gene: "; | |
222 if ($keepGene) | |
223 { | |
224 print "$keepGene\n"; | |
225 } | |
226 else | |
227 { | |
228 print "all\n"; | |
229 } | |
230 print "\tOutput format(s): fasta"; | |
231 print ", Se-Al" if ($sealPrint); | |
232 print ", nexus" if ($nexusPrint); | |
233 print ", phytab" if ($phytabPrint); | |
234 print "\n"; | |
235 | |
236 # Do quick gene count if thresholds indicated; takes longer but 1) saves many disk operations and 2) less memory intensive | |
237 geneCount($gbFile) if (not defined $geneFile and ($seqThreshold or $speciesThreshold)); # Don't bother if user gene list given; will usually be small enough so that benefits won't come into play | |
238 | |
239 # Read in GenBank file and strip genes | |
240 my $stripZero = time; | |
241 print "\nProcessing GenBank file $gbFile ...\n"; | |
242 setLineBreak($gbFile); | |
243 open (DATA, "<$gbFile") or die "Cannot open GenBank output file $gbFile\n"; | |
244 my @allAccNum; | |
245 my ($accNum, %accRead, $duplEntry, $organism, %species, $geneName); | |
246 my $speciesFlag = 0; | |
247 my (%genePresent, @geneList); | |
248 my (@startList, @stopList, $complementFlag, $typeStatus, %pseudoStatus, %seqType, $joinLine, @joinSegments, %geneStart, %geneStop, %compStatus, $fullSeq); | |
249 my $nameFlag = 0; | |
250 my $joinFlag = 0; | |
251 my $pseudoFlag = 0; | |
252 my $readSeqs = 0; | |
253 | |
254 while (<DATA>) | |
255 { | |
256 chomp; | |
257 my $readLine = $_; | |
258 next if ($readLine =~ /^\s*LOCUS/ or $readLine =~ /^\s*DEFINITION/ or $readLine =~ /^\s*VERSION/ or $readLine =~ /^\s*KEYWORDS/ or $readLine =~ /^\s*SOURCE/ or $readLine =~ /^\s*ORGANISM/ or $readLine =~ /^\s*REFERENCE/ or $readLine =~ /^\s*AUTHORS/ or $readLine =~ /^\s*TITLE/ or $readLine =~ /^\s*JOURNAL/ or $readLine =~ /^\s*MEDLINE/ or $readLine =~ /^\s*PUBMED/ or $readLine =~ /^\s*FEATURES/ or $readLine =~ /^\s*COMMENT/ or $readLine =~ /^\s*BASE COUNT/ or $readLine =~ /^\s*source/ or $readLine =~ /^\s*\/codon/ or $readLine =~ /^\s*\/transl/ or $readLine =~ /^\s*\/db_/ or $readLine =~ /^\s*CONTIG/); | |
259 | |
260 # Get accession number | |
261 if ($readLine =~ /^\s*ACCESSION\s+(.+)/) | |
262 { | |
263 $accNum = $1; | |
264 print "$readLine\n" if ($debug); | |
265 # Clear variables | |
266 undef @geneList; | |
267 undef %genePresent; | |
268 undef $fullSeq; | |
269 undef @startList; | |
270 undef @stopList; | |
271 undef %geneStart; | |
272 undef %geneStop; | |
273 undef %pseudoStatus; | |
274 undef $geneName; | |
275 $speciesFlag = $nameFlag = $joinFlag = $pseudoFlag = $readSeqs = $duplEntry = 0; | |
276 | |
277 if (not $accRead{$accNum}) # Check for duplicate entries | |
278 { | |
279 $accRead{$accNum} = 1; | |
280 push @allAccNum, $accNum; | |
281 if (scalar(@allAccNum) == int(scalar(@allAccNum)/10000)*10000 and $verbose) | |
282 { | |
283 print "\tSequences read in: ".scalar(@allAccNum)."\n"; | |
284 } | |
285 print "\tAccession number: $accNum\n" if ($debug); | |
286 } | |
287 else | |
288 { | |
289 print "*****NOTE -- DUPLICATE ENTRY Accession $accNum\n"; | |
290 $duplEntry = 1; | |
291 } | |
292 } | |
293 | |
294 # Get organism name | |
295 if ($readLine =~ /^\s*\/organism=\"(.+)\"/) | |
296 { | |
297 $organism = $1; | |
298 $organism =~ s/\s/_/g; | |
299 print "$readLine\n" if ($debug); | |
300 $species{$accNum} = $organism; | |
301 $speciesFlag = 1 if ($organism =~ /sp\./ or $organism =~ /cf\./ or $organism =~ /_X_/i); | |
302 if ($debug) | |
303 { | |
304 print "\t\tOrganism: $organism"; | |
305 print " (blocked)" if ($speciesFlag and $speciesBlock); | |
306 print "\n"; | |
307 } | |
308 } | |
309 next if ($speciesFlag and $speciesBlock); # Entry pertains to invalid species name; skip parsing rest of entry | |
310 | |
311 | |
312 # Get gene boundaries; process previous set of CDs | |
313 if ($readLine =~ /\<?(\d+)\<?\.\.\>?(\d+)\>?/ or $joinFlag == 1 or $readLine =~ /^\s*ORIGIN/) # ORIGIN will process last set of CDs | |
314 { | |
315 next if ($readLine =~ /^\s+\/\w+/); # Prevents spurious matches with lines beginning with "/feature" | |
316 $readSeqs = 1 if ($readLine =~ /^\s*ORIGIN/); # Indicates that remaining lines will contain sequence information | |
317 | |
318 # Process previous gene; need to do here to account for a posteriori declarations of pseudogene status | |
319 if ($geneName and $nameFlag == 2 and @startList and @stopList) # Process complete gene | |
320 { | |
321 print "\t\t\t\tParsed name: $geneName (type: $typeStatus)\n" if ($debug); | |
322 | |
323 # Clean up gene name and misleading punctuation | |
324 $geneName = geneClean($geneName); | |
325 $pseudoStatus{$geneName} = 1 if ($pseudoFlag); | |
326 | |
327 if (defined @ { $geneStart{$geneName} } and ((defined $pseudoStatus{$geneName} and $pseudoStatus{$geneName} == 1) or $typeStatus =~ /intron/i or $typeStatus =~ /UTR/i)) # Gene has previously stored CDs that might not have been recognized as a pseudogene or non-coding region | |
328 { | |
329 print "\t\t\t\t\tSubsequent occurrence of now recognized pseudogene or non-coding region; comparing new and stored CDs\n" if ($debug); | |
330 for (my $i = 0; $i < scalar(@startList); $i++) # Check each occurrence in new CDs for matches in stored ones | |
331 { | |
332 my $newStart = $startList[$i]; | |
333 my $newStop = $stopList[$i]; | |
334 print "\t\t\t\t\t\tChecking new CDs $newStart to $newStop\n" if ($debug); | |
335 for (my $j = 0; $j < scalar(@ { $geneStart{$geneName} }); $j++) | |
336 { | |
337 if ($newStart == $geneStart{$geneName}[$j] and $newStop == $geneStop{$geneName}[$j]) | |
338 { | |
339 print "\t\t\t\t\t\t\tMatch with stored CDs (no. $j); deleted\n" if ($debug); | |
340 splice(@ { $geneStart{$geneName} }, $j, 1); | |
341 splice(@ { $geneStop{$geneName} }, $j, 1); | |
342 } | |
343 } | |
344 } | |
345 if ($debug) | |
346 { | |
347 print "\n\t\t\t\t\tCurrent gene boundaries after pseudogene / non-coding check (type = $seqType{$geneName}):\n"; | |
348 if (scalar(@ { $geneStart{$geneName} }) < 1) | |
349 { | |
350 print "\t\t\t\t\t\tnone\n"; | |
351 } | |
352 else | |
353 { | |
354 for (my $i = 0; $i < scalar(@ { $geneStart{$geneName} }); $i++) | |
355 { | |
356 print "\t\t\t\t\t\t$geneStart{$geneName}[$i]\t$geneStop{$geneName}[$i]\n"; | |
357 } | |
358 } | |
359 } | |
360 } | |
361 | |
362 # Only process coding regions of user-desired genes (if applicable), genes with sensible CDs, non-blocked genes, and genes that are not obvious singletons or pseudogenes | |
363 unless (($geneFile and not defined $userGene{$geneName}) or | |
364 (defined $geneStatus{$geneName} and $geneStatus{$geneName} eq "rejected") or | |
365 (defined $pseudoStatus{$geneName} and $pseudoStatus{$geneName} == 1) or | |
366 $geneName =~ /hypothetical/i or $geneName =~ /open reading frame/i or $geneName =~ /similar/i or $geneName =~ /homolog/i or $geneName =~ /putative/i or $geneName =~ /unknown/i or $geneName =~ /unnamed/i or $geneName =~ /\d+rik/i or $geneName =~ /possible/i or $geneName =~ /pseudo/i | |
367 or $typeStatus =~ /UTR/i or $typeStatus =~ /intron/i or $typeStatus =~ /misc_feature/i) | |
368 { | |
369 if (not defined $genePresent{$geneName}) # Process first occurrence of gene in entry | |
370 { | |
371 if ($debug) | |
372 { | |
373 print "\t\t\t\t\tFirst occurrence of $geneName\n" if ($debug); | |
374 for (my $i = 0; $i < scalar(@startList); $i++) | |
375 { | |
376 print "\t\t\t\t\t\t$startList[$i]\t$stopList[$i]\n"; | |
377 } | |
378 } | |
379 $genePresent{$geneName} = 1; | |
380 push @geneList, $geneName; | |
381 push @ { $geneStart{$geneName} }, $_ foreach (@startList); | |
382 push @ { $geneStop{$geneName} }, $_ foreach (@stopList); | |
383 | |
384 $seqType{$geneName} = $typeStatus; | |
385 $compStatus{$geneName} = 0; # Note whether gene is complemented or not | |
386 $compStatus{$geneName} = 1 if ($complementFlag); | |
387 } | |
388 else # Attempt to add secondary occurrences | |
389 { | |
390 my $storedSegments = scalar(@ { $geneStop{$geneName} }) - 1; | |
391 my $lastStop = $geneStop{$geneName}[$storedSegments]; | |
392 $lastStop = 0 if (not defined $lastStop); | |
393 my $newStart = $startList[0]; | |
394 | |
395 if ($debug) | |
396 { | |
397 print "\t\t\t\t\tSecondary occurrence of $geneName with boundaries\n" if ($debug); | |
398 for (my $i = 0; $i < scalar(@startList); $i++) | |
399 { | |
400 print "\t\t\t\t\t\t$startList[$i]\t$stopList[$i]\n"; | |
401 } | |
402 } | |
403 if ($seqType{$geneName} eq "gene" and $typeStatus ne "gene") # New information probably more precise and better accounts for structure | |
404 { | |
405 print "\n\t\t\t\t\t\tNew segment more precisely defined by type; replaced\n" if ($debug); | |
406 undef @ { $geneStart{$geneName} }; | |
407 undef @ { $geneStop{$geneName} }; | |
408 push @ { $geneStart{$geneName} }, $_ foreach (@startList); | |
409 push @ { $geneStop{$geneName} }, $_ foreach (@stopList); | |
410 $seqType{$geneName} = $typeStatus; | |
411 } | |
412 | |
413 elsif ($newStart > $lastStop) # New segment occurs distal to last stored segment (could also be contiguous); append to boundaries | |
414 { | |
415 print "\n\t\t\t\t\t\tContiguous with or occurs after last stored segment; appended\n" if ($debug); | |
416 push @ { $geneStart{$geneName} }, $_ foreach (@startList); | |
417 push @ { $geneStop{$geneName} }, $_ foreach (@stopList); | |
418 $seqType{$geneName} = "composite"; | |
419 } | |
420 elsif (scalar(@ { $geneStart{$geneName} }) == 1 and scalar(@startList) > 1) # Replace single stored segment with new segments derived from join statement; probably better accounts for intron/exon structure | |
421 { | |
422 print "\n\t\t\t\t\t\tNew segments subdivide single stored segment; replaced\n" if ($debug); | |
423 undef @ { $geneStart{$geneName} }; | |
424 undef @ { $geneStop{$geneName} }; | |
425 push @ { $geneStart{$geneName} }, $_ foreach (@startList); | |
426 push @ { $geneStop{$geneName} }, $_ foreach (@stopList); | |
427 $seqType{$geneName} = "composite"; | |
428 } | |
429 else | |
430 { | |
431 print "\n\t\t\t\t\t\tNew segment overlaps stored segments; rejected\n" if ($debug); | |
432 } | |
433 } | |
434 if ($debug) | |
435 { | |
436 print "\n\t\t\t\t\tCurrent gene boundaries after CDS processing (type = $seqType{$geneName}):\n"; | |
437 if (scalar(@ { $geneStart{$geneName} }) < 1) | |
438 { | |
439 print "\t\t\t\t\t\tnone\n"; | |
440 } | |
441 else | |
442 { | |
443 for (my $i = 0; $i < scalar(@ { $geneStart{$geneName} }); $i++) | |
444 { | |
445 print "\t\t\t\t\t\t$geneStart{$geneName}[$i]\t$geneStop{$geneName}[$i]\n"; | |
446 } | |
447 } | |
448 } | |
449 } | |
450 | |
451 # Clear entries for gene | |
452 $nameFlag = $complementFlag = 0; | |
453 undef @startList; # Prevents double listing of genes if CDS use both /gene and /product tags | |
454 undef @stopList; | |
455 undef $typeStatus; | |
456 $pseudoStatus{$geneName} = 0; # Clear pseudogene status for previous set of CDs | |
457 } | |
458 next if ($readLine =~ /^\s*ORIGIN/); # No more CDs to worry about | |
459 | |
460 print "$readLine\n" if ($debug); | |
461 | |
462 # Reset gene name and information for current set of CDs | |
463 undef $geneName; | |
464 $nameFlag = $pseudoFlag = $complementFlag = 0; | |
465 $complementFlag = 1 if ($readLine =~ /complement/); #DISABLED BY THO BECAUSE ONLY COMPLEMENTS AND DOESN'T REVERSE | |
466 | |
467 # Get new CD information | |
468 if ($readLine =~ /^\s+\S+\s+join\(/i or $readLine =~ /^\s+\S+\s+order\(/) | |
469 { | |
470 $joinFlag = 1; | |
471 undef @startList; | |
472 undef @stopList; | |
473 undef $joinLine; | |
474 } | |
475 if ($readLine =~ /^\s+(\S+)\s+/i) | |
476 { | |
477 $typeStatus = $1; | |
478 } | |
479 | |
480 if ($joinFlag) | |
481 { | |
482 $joinLine .= $readLine; | |
483 if ($readLine =~ /\)$/) # Have accounted for multiline join statements; process | |
484 { | |
485 $joinFlag = 0; | |
486 $complementFlag = 1 if ($joinLine =~ /complement/); | |
487 | |
488 # Clean up join statement | |
489 $joinLine =~ s/complement\(//; | |
490 $joinLine =~ s/>//g; | |
491 $joinLine =~ s/<//g; | |
492 $joinLine =~ s/^\s+\S+\s+join\(//; | |
493 $joinLine =~ s/^\s+\S+\s+order\(//; | |
494 $joinLine =~ s/\)+$//; | |
495 $joinLine =~ s/\s+//g; | |
496 print "\t\tJoin boundaries: $joinLine\n" if ($debug); | |
497 if ($joinLine =~ /[A-Z]+\d+/i) # Avoid join commands referring to separate accessions | |
498 { | |
499 undef $joinLine; | |
500 print "\t\t\tERROR: Joining accessions; skipping\n" if ($debug); | |
501 next; | |
502 } | |
503 | |
504 # Derive pieces of join statement | |
505 @joinSegments = split(/,/, $joinLine); | |
506 foreach my $segment (@joinSegments) | |
507 { | |
508 my ($start, $stop) = split(/\.\./, $segment); | |
509 # Check safeties | |
510 next unless ($start and $stop); | |
511 next if ($start =~ /\D/ or $stop =~ /\D/); | |
512 next if ($start > $stop); | |
513 push @startList, $start; | |
514 push @stopList, $stop; | |
515 } | |
516 | |
517 undef $joinLine; | |
518 $typeStatus = "join"; | |
519 } | |
520 } | |
521 else | |
522 { | |
523 my ($start, $stop); | |
524 if ($readLine =~ /\<?(\d+)\<?\.\.\>?(\d+)\>?/) | |
525 { | |
526 $start = $1; | |
527 $stop = $2; | |
528 } | |
529 next unless ($start and $stop); | |
530 next if ($start > $stop); | |
531 undef @startList; | |
532 push @startList, $start; | |
533 undef @stopList; | |
534 push @stopList, $stop; | |
535 print "\t\tParsed boundaries: $1\t$2\n" if ($debug); | |
536 } | |
537 } | |
538 | |
539 | |
540 # Check for pseudogene status | |
541 if ($readLine =~ /\s+\/pseudo/ or $readLine =~ /pseudogene/ or $readLine =~ /putative/) | |
542 { | |
543 print "$readLine\n" if ($debug); | |
544 $pseudoStatus{$geneName} = 1 if (defined $geneName); | |
545 $pseudoFlag = 1; | |
546 print "\t\t\t\tWARNING: suspected pseudogene or of uncertain (\"putative\") status\n" if ($debug); | |
547 } | |
548 | |
549 if ($readLine =~ /codon recognized/ and defined $geneName and $geneName =~ /trna/i) | |
550 { | |
551 print "$readLine\n" if ($debug); | |
552 $pseudoStatus{$geneName} = 1; | |
553 $pseudoFlag = 1; | |
554 print "\t\t\t\tWARNING: tRNA for an alternative codon; ignored\n" if ($debug); | |
555 } | |
556 | |
557 # Get gene name | |
558 if ($readLine =~ /^\s*(\/gene)|(\/product)=\"(.+)/ and $nameFlag == 0) # Get start of gene name | |
559 { | |
560 $geneName = $1 if ($readLine =~ /=\"(.+)/); | |
561 print "$readLine\n" if ($debug); | |
562 | |
563 # Check whether name wraps onto a new line | |
564 if (substr($readLine, -1) ne "\"") | |
565 { | |
566 $nameFlag = 1; | |
567 next; | |
568 } | |
569 else | |
570 { | |
571 $nameFlag = 2; | |
572 $geneName =~ s/\"$// if ($geneName =~ /\"$/); | |
573 } | |
574 } | |
575 | |
576 if ($nameFlag == 1) # Get continuation / end of gene name | |
577 { | |
578 print "$readLine\n" if ($debug); | |
579 $geneName .= $readLine; | |
580 $nameFlag = 2 if ($readLine =~ /\"$/) # Gene name is complete | |
581 } | |
582 | |
583 # Read in sequence information and append | |
584 if ($readLine =~ /^\s*\d*1\s\w+/ and @geneList and $readSeqs) | |
585 { | |
586 my $seqFrag = $readLine; | |
587 $seqFrag =~ s/\s+//g; | |
588 $seqFrag =~ s/\d+//g; | |
589 $fullSeq .= uc($seqFrag); | |
590 } | |
591 | |
592 # End of entry; process | |
593 if ($readLine =~ /\/\//) | |
594 { | |
595 next if ($duplEntry); | |
596 next unless (@geneList and defined $fullSeq); | |
597 foreach my $gene (@geneList) | |
598 { | |
599 # Safeties; shouldn't come into play | |
600 next if ($geneFile and not defined $userGene{$gene}); | |
601 next if (defined $geneStatus{$gene} and $geneStatus{$gene} eq "rejected"); | |
602 next if (defined $pseudoStatus{$gene} and $pseudoStatus{$gene} == 1); | |
603 | |
604 print "\n\t\tProcessing gene $gene\n" if ($debug); | |
605 | |
606 # Strip gene out of full sequence and format as fasta | |
607 next unless (@ { $geneStart{$gene} } and @ { $geneStop{$gene} } ); | |
608 | |
609 my $geneSeq; | |
610 for (my $i = 0; $i < scalar(@ { $geneStart{$gene} }); $i++) | |
611 { | |
612 print "\t\t\tGetting sequence between positions $geneStart{$gene}[$i] and $geneStop{$gene}[$i]\n" if ($debug); | |
613 next if ($geneStart{$gene}[$i] > length($fullSeq) or $geneStop{$gene}[$i] > length($fullSeq)); | |
614 my $geneSegment = substr($fullSeq,$geneStart{$gene}[$i]-1,$geneStop{$gene}[$i]-$geneStart{$gene}[$i]+1); | |
615 print "\t\t\t\t$geneSegment\n" if ($debug); | |
616 $geneSeq .= $geneSegment; | |
617 } | |
618 | |
619 next unless ($geneSeq); | |
620 $geneSeq = complement($geneSeq) if ($compStatus{$gene}); | |
621 $maxLength{$gene} = length($geneSeq) if (not defined $maxLength{$gene} or length($geneSeq) > $maxLength{$gene}); | |
622 | |
623 # Check if sequence matches length threshold (if appropriate) | |
624 if (defined $minLength) | |
625 { | |
626 if ($gene =~ /^trna-\w\w\w/) | |
627 { | |
628 if (length($geneSeq) < $minLengthTRNA) | |
629 { | |
630 printf "\t\t\tRejected: length (%s bp) does not meet tRNA length threshold ($minLengthTRNA bp)\n", length($geneSeq) if ($debug); | |
631 next; | |
632 } | |
633 } | |
634 else | |
635 { | |
636 if (length($geneSeq) < $minLength) | |
637 { | |
638 printf "\t\t\tRejected: length (%s bp) does not meet global threshold ($minLength bp)\n", length($geneSeq) if ($debug); | |
639 next; | |
640 } | |
641 } | |
642 } | |
643 if ($gene =~ /trna-\w\w\w/ and length($geneSeq) > 100) | |
644 { | |
645 printf "\t\t\tRejected: length of tRNA too long (%s bp); indicates parsing failure\n", length($geneSeq) if ($debug); | |
646 next; | |
647 } | |
648 | |
649 my $breakPoint = 79; | |
650 until ($breakPoint > length($geneSeq)) | |
651 { | |
652 my $replaceString = "\n" . substr($geneSeq, $breakPoint, 1); | |
653 substr($geneSeq, $breakPoint, 1) = $replaceString; | |
654 $breakPoint += 80; | |
655 } | |
656 | |
657 # Append sequence to file | |
658 $geneStatus{$gene} = "stripped"; | |
659 $sequenceCount{$gene}++; | |
660 $sequenceCount{$species{$accNum}}++; | |
661 $sequenceCount{"all"}++; | |
662 my $fastaFile = $gene."_gbs.fasta.txt"; | |
663 | |
664 my $IOicon = ">"; | |
665 $IOicon .= ">" if ($sequenceCount{$gene} > 1); | |
666 | |
667 unless ($debug) | |
668 | |
669 | |
670 { | |
671 open (GENE, $IOicon, $fastaFile) or die "Cannot open file $fastaFile to write sequence to."; | |
672 print GENE ">$species{$accNum} $accNum\n"; | |
673 print GENE "$geneSeq\n"; | |
674 close GENE; | |
675 } | |
676 | |
677 # Update various statistical counters | |
678 unless ($globalGenePresent{$gene}) # Gene counter | |
679 { | |
680 push @globalGeneList, $gene; | |
681 $globalGenePresent{$gene} = 1; | |
682 } | |
683 | |
684 unless ($speciesPresent{$species{$accNum}}) # Species counter | |
685 { | |
686 push @speciesList, $species{$accNum}; | |
687 $speciesPresent{$species{$accNum}} = 1; | |
688 } | |
689 | |
690 $speciesGenePresent{$gene}{$species{$accNum}}++; # Species-gene counter | |
691 $speciesCount{$gene}++ if ($speciesGenePresent{$gene}{$species{$accNum}} == 1); | |
692 | |
693 # Print out summary stats for gene | |
694 if ($debug) | |
695 { | |
696 print "\t\t\tGene: $gene\n"; | |
697 printf "\t\t\t\tGene boundaries (%s bp)", length($geneSeq) - ($geneSeq =~ tr/\n//); | |
698 print " (complemented)" if ($compStatus{$gene}); | |
699 print ":\n"; | |
700 for (my $i = 0; $i < scalar(@ { $geneStart{$gene} }); $i++) | |
701 { | |
702 print "\t\t\t\t\t$geneStart{$gene}[$i]\t$geneStop{$gene}[$i]\n"; | |
703 } | |
704 print "\t\t\t\t\t$geneSeq\n"; | |
705 } | |
706 } | |
707 } | |
708 } | |
709 close DATA; | |
710 | |
711 # Print out summary stats | |
712 print "\n\tSummary stats:\n"; | |
713 printf "\t\tTotal number of accessions processed: %s\n", scalar(@allAccNum); | |
714 printf "\t\tTotal number of unique species: %s\n", scalar(@speciesList); | |
715 printf "\t\tTotal number of unique sequences: %s\n", $sequenceCount{"all"}; | |
716 printf "\t\tTotal number of unique genes: %s\n", scalar(@globalGeneList); | |
717 | |
718 my $stripTime = time - $stripZero; | |
719 print "\n\t\tTime taken to process file: $stripTime seconds\n"; | |
720 | |
721 if (not @globalGeneList) | |
722 { | |
723 print "\nNOTE: No genes stripped from file $gbFile; exiting\n"; | |
724 exit(0); | |
725 } | |
726 | |
727 # Reprocess results to 1) recheck whether stripped genes actually fall below user-set thresholds (geneCount procedure is a rough maximum), 2) pare down to required number of sequences per species, and 3) produce appropriate output files | |
728 print "\nPostprocessing results ...\n"; | |
729 my $postZero = time; | |
730 | |
731 @globalGeneList = sort(@globalGeneList); | |
732 $stripCount = scalar(@globalGeneList); | |
733 | |
734 foreach my $gene (@globalGeneList) | |
735 { | |
736 next if ($debug); | |
737 print "\tProcessing gene $gene ($sequenceCount{$gene} sequences for $speciesCount{$gene} species)\n" if ($verbose); | |
738 if ($sequenceCount{$gene} < $seqThreshold or $speciesCount{$gene} < $speciesThreshold) # Gene does not meet threshold requirements; reject | |
739 { | |
740 $geneStatus{$gene} = "rejected"; | |
741 push @rejectList, $gene; | |
742 $stripCount--; | |
743 print "\t\tRejected; does not meet threshold requirements\n" if ($verbose); | |
744 | |
745 # Remove associated file | |
746 my $rmString = $gene."_gbs.fasta.txt"; | |
747 $rmString =~ s/\s/\\ /g; | |
748 $rmString =~ s/\(/\\(/g; | |
749 $rmString =~ s/\)/\\)/g; | |
750 $rmString =~ s/\'/\\'/g; | |
751 system ("rm $rmString"); | |
752 } | |
753 else | |
754 { | |
755 my (%infLength, %lengthList, %criticalLength); | |
756 undef @speciesList; | |
757 $geneStatus{$gene} = "stripped"; | |
758 if ($verbose) | |
759 { | |
760 print "\t\tStripped"; | |
761 print "; meets threshold requirements" if ($seqThreshold or $speciesThreshold); | |
762 print "\n"; | |
763 } | |
764 | |
765 # Reload sequences from disk | |
766 my $inputFile = $gene."_gbs.fasta.txt"; | |
767 setLineBreak($inputFile); | |
768 open (FASTA, "<$inputFile") or die "Cannot open file $inputFile to read data for gene $gene\n"; | |
769 my (@accList, %speciesName, %fastaSeq, $species, $fastaAcc); | |
770 | |
771 undef %speciesPresent; | |
772 while (<FASTA>) | |
773 { | |
774 chomp; | |
775 if ($_ =~ "^>") | |
776 { | |
777 ($species, $fastaAcc) = split; | |
778 $species =~ s/^>//; | |
779 $fastaAcc =~ s/^\(//; | |
780 $fastaAcc =~ s/\)$//; | |
781 | |
782 | |
783 push @accList, $fastaAcc; | |
784 $speciesName{$fastaAcc} = $species; | |
785 $speciesPresent{$species}++; | |
786 push @speciesList, $species if ($speciesPresent{$species} == 1); | |
787 } | |
788 else | |
789 { | |
790 $fastaSeq{$fastaAcc} .= $_; | |
791 } | |
792 } | |
793 close FASTA; | |
794 @accList = sort { $speciesName{$a} cmp $speciesName{$b} } keys %speciesName; | |
795 | |
796 # Pare down to desired number of sequences as needed | |
797 if ($keepGene) | |
798 { | |
799 undef %lengthList; | |
800 print "\t\tParing down to $keepGene best lengths for each of $speciesCount{$gene} species ...\n" if ($verbose); | |
801 # Get informative length for each sequence | |
802 foreach my $entry (@accList) | |
803 { | |
804 $infLength{$entry} = ($fastaSeq{$entry} =~ tr/ACGT//); | |
805 push @ { $lengthList{$speciesName{$entry}} }, $infLength{$entry}; | |
806 } | |
807 | |
808 # Determine critical length and correct sequence count for number of deleted species | |
809 foreach my $species (@speciesList) | |
810 { | |
811 print "\t\t\tSpecies: $species ($speciesPresent{$species} sequences)\n" if ($debug); | |
812 next if ($speciesPresent{$species} <= $keepGene); # Only process species for which gene has been sampled more than threshold | |
813 @ { $lengthList{$species} } = sort {$b <=> $a} @ { $lengthList{$species} }; # Sort in descending order and get critical length | |
814 $criticalLength{$species} = $lengthList{$species}[$keepGene-1]; | |
815 | |
816 # Correct sequence count | |
817 my $i = $keepGene; | |
818 $i++ until ($i >= $speciesPresent{$species} or $lengthList{$species}[$i] < $criticalLength{$species}); | |
819 $sequenceCount{$gene} -= (scalar(@ { $lengthList{$species} }) - $i); | |
820 | |
821 if ($debug) | |
822 { | |
823 print "\t\t\t\tCritical length: $criticalLength{$species}\n"; | |
824 printf "\t\t\t\tNumber of sequences deleted: %s\n", scalar(@ { $lengthList{$species} }) - $i; | |
825 print "\t\t\t\tTotal number of sequences for gene remaining: $sequenceCount{$gene}\n"; | |
826 } | |
827 } | |
828 | |
829 print "\t\t\tFinal count: $sequenceCount{$gene} sequences\n" if ($verbose); | |
830 | |
831 # Recheck if paring has dropped sequence below sequence threshold | |
832 if ($sequenceCount{$gene} < $seqThreshold) | |
833 { | |
834 print "\t\t\tNOTE: Gene ($sequenceCount{$gene} sequences) now falls below threshold of $seqThreshold sequences; no file created\n" if ($verbose); | |
835 $geneStatus{$gene} = "rejected"; | |
836 push @rejectList, $gene; | |
837 $stripCount--; | |
838 | |
839 # Remove associated file | |
840 my $rmString = $gene."_gbs.fasta.txt"; | |
841 $rmString =~ s/\s/\\ /g; | |
842 $rmString =~ s/\(/\\(/g; | |
843 $rmString =~ s/\)/\\)/g; | |
844 $rmString =~ s/\'/\\'/g; | |
845 system ("rm $rmString"); | |
846 | |
847 next; | |
848 } | |
849 } | |
850 | |
851 # Produce appropriate output files | |
852 print "\t\tSaving output ...\n" if ($verbose); | |
853 my %printCount; | |
854 | |
855 # Open files as needed | |
856 my $sealFile = $gene."_gbs.seal"; | |
857 open (SEAL, ">$sealFile") or die "Cannot open Se-Al formatted file $sealFile for writing\n" if ($sealPrint); | |
858 my $nexusFile = $gene."_gbs.nex"; | |
859 open (NEX, ">$nexusFile") or die "Cannot open nexus-formatted file $nexusFile for writing\n" if ($nexusPrint); | |
860 # my $phytabFile = $gene."_gbs.phytab"; | |
861 # open (PHYTAB, ">$phytabFile") or die "Cannot open nexus-formatted file $phytabFile for writing\n" if ($phytabPrint); | |
862 #Currently assumes that only 1 file will be written for ease of Galaxy implementation | |
863 my $phytabFile = "osiris_gbs.phytab"; | |
864 open (PHYTAB, ">$phytabFile") or die "Cannot open nexus-formatted file $phytabFile for writing\n" if ($phytabPrint); | |
865 my $fastaFile = $gene."_gbs.fasta.txt"; | |
866 $fastaFile = $gene."_gbs.new.fasta.txt" if ($debug); | |
867 open (FASTA, ">$fastaFile") or die "Cannot open fasta-formatted file $fastaFile for writing\n"; | |
868 | |
869 # Print headers | |
870 if ($sealPrint) | |
871 { | |
872 print SEAL "Database={\n"; | |
873 print SEAL "\tID='MLst';\n"; | |
874 print SEAL "\tOwner=null;\n"; | |
875 print SEAL "\tName=null;\n"; | |
876 print SEAL "\tDescription=null;\n"; | |
877 print SEAL "\tFlags=0;\n"; | |
878 print SEAL "\tCount=2;\n"; | |
879 print SEAL "\t{\n\t\t{\n"; | |
880 | |
881 print SEAL "\t\t\tID='PAli';\n"; | |
882 print SEAL "\t\t\tOwner=1;\n"; | |
883 printf SEAL "\t\t\tName=\"%sgbs\";\n", $gene; | |
884 print SEAL "\t\t\tDescription=null;\n"; | |
885 print SEAL "\t\t\tFlags=0;\n"; | |
886 print SEAL "\t\t\tNumSites=$maxLength{$gene};\n"; | |
887 print SEAL "\t\t\tType=\"Nucleotide\";\n"; | |
888 print SEAL "\t\t\tFeatures=null;\n"; | |
889 print SEAL "\t\t\tColourMode=1;\n"; | |
890 print SEAL "\t\t\tLabelMode=0;\n"; | |
891 print SEAL "\t\t\ttriplets=false;\n"; | |
892 print SEAL "\t\t\tinverse=true;\n"; | |
893 printf SEAL "\t\t\tCount=%s;\n", $sequenceCount{$gene}; | |
894 print SEAL "\t\t\t{\n"; | |
895 } | |
896 | |
897 if ($nexusPrint) | |
898 { | |
899 print NEX "#NEXUS\n\n"; | |
900 print NEX "Begin data;\n"; | |
901 printf NEX "\tDimensions ntax=%s nchar=%s;\n", $sequenceCount{$gene}, $maxLength{$gene}; | |
902 print NEX "\tFormat datatype=nucleotide gap=-;\n\n"; | |
903 print NEX "\tmatrix\n\n"; | |
904 } | |
905 | |
906 # Print sequence data | |
907 my $i = 0; | |
908 foreach my $entry (@accList) | |
909 { | |
910 next if (defined $criticalLength{$speciesName{$entry}} and $infLength{$entry} < $criticalLength{$speciesName{$entry}}); | |
911 | |
912 $printCount{$speciesName{$entry}}++; | |
913 $speciesName{$entry} .= "_$printCount{$speciesName{$entry}}" if ($printCount{$speciesName{$entry}} > 1); | |
914 | |
915 if ($sealPrint) | |
916 { | |
917 $i++; | |
918 print SEAL "\t\t\t\t{\n"; | |
919 print SEAL "\t\t\t\t\tID='PSeq';\n"; | |
920 print SEAL "\t\t\t\t\tOwner=2;\n"; | |
921 print SEAL "\t\t\t\t\tName=\"$speciesName{$entry}\";\n"; | |
922 print SEAL "\t\t\t\t\tDescription=null;\n"; | |
923 print SEAL "\t\t\t\t\tFlags=0;\n"; | |
924 print SEAL "\t\t\t\t\tAccession=\"$entry\";\n"; | |
925 print SEAL "\t\t\t\t\tType=\"DNA\";\n"; | |
926 printf SEAL "\t\t\t\t\tLength=%s;\n", length($fastaSeq{$entry}); | |
927 print SEAL "\t\t\t\t\tSequence=\"$fastaSeq{$entry}\";\n"; | |
928 print SEAL "\t\t\t\t\tGeneticCode=1;\n"; | |
929 print SEAL "\t\t\t\t\tCodeTable=null;\n"; | |
930 print SEAL "\t\t\t\t\tFrame=1;\n"; | |
931 print SEAL "\t\t\t\t\tFeatures=null;\n"; | |
932 print SEAL "\t\t\t\t\tParent=null;\n"; | |
933 print SEAL "\t\t\t\t\tComplemented=false;\n"; | |
934 print SEAL "\t\t\t\t\tReversed=false;\n"; | |
935 print SEAL "\t\t\t\t}"; | |
936 print SEAL "," unless ($i == $sequenceCount{$gene}); | |
937 print SEAL "\n"; | |
938 } | |
939 | |
940 if ($nexusPrint) | |
941 { | |
942 my $gaps = "-" x ($maxLength{$gene} - length($fastaSeq{$entry})); | |
943 print NEX "$speciesName{$entry}\t$fastaSeq{$entry}$gaps\n"; | |
944 } | |
945 if($phytabPrint) | |
946 { | |
947 print PHYTAB "$speciesName{$entry} \t$gene \t $entry \t $fastaSeq{$entry} \n" | |
948 } | |
949 | |
950 my $breakPoint = 79; | |
951 until ($breakPoint > length($fastaSeq{$entry})) | |
952 { | |
953 my $replaceString = "\n" . substr($fastaSeq{$entry}, $breakPoint, 1); | |
954 substr($fastaSeq{$entry}, $breakPoint, 1) = $replaceString; | |
955 $breakPoint += 80; | |
956 } | |
957 if ($accessionFirst) #ADDED BY THO | |
958 { | |
959 print FASTA ">$entry", "\___", "$speciesName{$entry}\n"; | |
960 print FASTA "$fastaSeq{$entry}\n"; | |
961 } | |
962 else | |
963 { | |
964 print FASTA ">$speciesName{$entry} ($entry)\n"; | |
965 print FASTA "$fastaSeq{$entry}\n"; | |
966 } | |
967 } | |
968 # Print footers | |
969 if ($sealPrint) | |
970 { | |
971 print SEAL "\t\t\t};\n"; | |
972 print SEAL "\t\t},\n"; | |
973 print SEAL "\t\t{\n"; | |
974 print SEAL "\t\t\tID='MCoL';\n"; | |
975 print SEAL "\t\t\tOwner=1;\n"; | |
976 print SEAL "\t\t\tName=\"Genetic Codes\";\n"; | |
977 print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n"; | |
978 print SEAL "\t\t\tFlags=0;\n"; | |
979 print SEAL "\t\t\tCount=0;\n"; | |
980 print SEAL "\t\t}\n"; | |
981 print SEAL "\t};\n"; | |
982 print SEAL "};\n"; | |
983 } | |
984 if ($nexusPrint) | |
985 { | |
986 print NEX "\t;\nend;\n"; | |
987 } | |
988 | |
989 # Close files as appropriate | |
990 close SEAL if ($sealPrint); | |
991 close NEX if ($nexusPrint); | |
992 close FASTA if ($keepGene); | |
993 } | |
994 } | |
995 | |
996 my $postTime = time - $postZero; | |
997 print "\n" if ($verbose); | |
998 print "\tTime taken: $postTime seconds\n"; | |
999 | |
1000 # Print out final summary stats / files | |
1001 print "\nFinal summary statistics\n"; | |
1002 | |
1003 # Print file of stripped genes | |
1004 open (STRIP, ">$stripFile") or die "Cannot write summary of stripped genes to $stripFile.\n"; | |
1005 print STRIP "The following $stripCount genes were stripped successfully from file $gbFile\n"; | |
1006 print STRIP "\nGene\tNo. of sequences\tNo. of species\n\n"; | |
1007 | |
1008 foreach my $gene (@globalGeneList) | |
1009 { | |
1010 print STRIP "$gene\t$sequenceCount{$gene}\t$speciesCount{$gene}\n" unless ($geneStatus{$gene} eq "rejected"); | |
1011 } | |
1012 close STRIP; | |
1013 | |
1014 print "\tSummaries of $stripCount genes that were stripped from $gbFile have been written to $stripFile\n"; | |
1015 | |
1016 # Print file of rejected genes | |
1017 if (@rejectList) | |
1018 { | |
1019 @rejectList = sort @rejectList; | |
1020 open (REJECT, ">$rejectFile") or die "Cannot write summary of rejected genes to $rejectFile.\n"; | |
1021 printf REJECT "The following %s genes do not meet user-defined threshold(s) of", scalar(@rejectList); | |
1022 print REJECT " $seqThreshold sequences" if ($seqThreshold); | |
1023 print REJECT " or" if ($seqThreshold and $speciesThreshold); | |
1024 print REJECT " $speciesThreshold species" if ($speciesThreshold); | |
1025 print REJECT "; an asterisk indicates that values given are rough maximums\n"; | |
1026 print REJECT "\nGene\tNo. of sequences\tNo. of species\n\n"; | |
1027 | |
1028 foreach my $gene (@rejectList) | |
1029 { | |
1030 print REJECT "$gene\t"; | |
1031 if (defined $sequenceCount{$gene}) # Need to do alternative versions depending on whether gene was properly counted or not | |
1032 { | |
1033 print REJECT "$sequenceCount{$gene}\t"; | |
1034 } | |
1035 else | |
1036 { | |
1037 print REJECT "$quickSequenceCount{$gene} \*\t"; | |
1038 } | |
1039 if (defined $speciesCount{$gene}) | |
1040 { | |
1041 print REJECT "$speciesCount{$gene}\n"; | |
1042 } | |
1043 else | |
1044 { | |
1045 print REJECT "$quickSpeciesCount{$gene} \*\n"; | |
1046 } | |
1047 } | |
1048 close REJECT; | |
1049 | |
1050 printf "\tSummaries of %s genes that did not meet threshold(s) have been written to $rejectFile\n", scalar(@rejectList); | |
1051 } | |
1052 | |
1053 if ($debug) | |
1054 { | |
1055 print "\n\tSummary stats for each processed gene:\n"; | |
1056 foreach my $gene (@globalGeneList) | |
1057 { | |
1058 print "\t\tGene: $gene ($geneStatus{$gene})\n"; | |
1059 print "\t\t\tTotal number of sequences: $sequenceCount{$gene}\n"; | |
1060 print "\t\t\tTotal number of unique species: $speciesCount{$gene}\n"; | |
1061 } | |
1062 } | |
1063 | |
1064 exit(0); | |
1065 | |
1066 # Subroutines used in the script | |
1067 | |
1068 sub setLineBreak # Check line breaks of input files and set input record separator accordingly | |
1069 { | |
1070 my $gbFile = shift; | |
1071 $/ ="\n"; | |
1072 open (IN, "<$gbFile") or die "Cannot open $gbFile to check form of line breaks.\n"; | |
1073 while (<IN>) | |
1074 { | |
1075 if ($_ =~ /\r\n/) | |
1076 { | |
1077 print "\tDOS line breaks detected ...\n" if ($debug); | |
1078 $/ ="\r\n"; | |
1079 last; | |
1080 } | |
1081 elsif ($_ =~ /\r/) | |
1082 { | |
1083 print "\tMac line breaks detected ...\n" if ($debug); | |
1084 $/ ="\r"; | |
1085 last; | |
1086 } | |
1087 else | |
1088 { | |
1089 print "\tUnix line breaks detected ...\n" if ($debug); | |
1090 $/ ="\n"; | |
1091 last; | |
1092 } | |
1093 } | |
1094 close IN; | |
1095 } | |
1096 | |
1097 sub complement # Outputs complementary sequence to one provided | |
1098 { | |
1099 my $tempSeq = shift; | |
1100 | |
1101 my $compSeq; | |
1102 for (my $nt = 0; $nt < length($tempSeq); $nt++) | |
1103 { | |
1104 if (not defined $complement{substr($tempSeq, $nt, 1)}) | |
1105 { | |
1106 $compSeq .= "?"; | |
1107 } | |
1108 else | |
1109 { | |
1110 $compSeq .= $complement{substr($tempSeq, $nt, 1)}; | |
1111 } | |
1112 } | |
1113 my $revCompSeq = reverse($compSeq); | |
1114 return $revCompSeq; | |
1115 } | |
1116 | |
1117 sub geneSynonyms # Define gene synonyms; originally compiled by Robin Beck | |
1118 { | |
1119 #Opsin gene added by THO | |
1120 $synonym{$_} = "opsin" foreach ("opsin", "anceropsin", "blop", "blue-sensitive_opsin", "blue-sensitive_opsin_precursor", "blue-sensitive_rhodopsin", "blue-sensitive_visual_pigment", "blue_opsin", "bluerh", "boceropsin", "buvops", "compound_eye_opsin_bcrh1", "compound_eye_opsin_bcrh2", "lateral_eye_opsin", "locus_opsin_1", "locust_opsin_2", "long-wavelenght_opsin", "long-wavelength_like_opsin", "long-wavelength_opsin", "long-wavelength_rhodopsin", "long-wavelength_sensitive_opsin_1", "long-wavelength_sensitive_opsin_2", "long_wave_opsin", "long_wavelength-sensitive_opsin", "long_wavelength-sensitive_rhodopsin", "long_wavelength-sensitive_visual_pigment", "long_wavelength_opsin", "long_wavelength_sensitive_opsin_1", "long_wavelength_sensitive_opsin_2", "lop2", "lw_opsin", | |
1121 "ocellar_opsin", "opsin", "opsin_2", "opsin_bcrh1", "opsin_bcrh2", | |
1122 "opsin_rh1", "opsin_rh3", "opsin_rh4", "piceropsin", "pteropsin", | |
1123 "rh1_opsin", "rh2_opsin", "rh3_opsin", "rh4_opsin", "rh6_rhodopsin", | |
1124 "rhodopsin", "rhodopsin_1", "rhodopsin_2_cg16740-pa", "rhodopsin_3", | |
1125 "rhodopsin_3_cg10888-pa", "rhodopsin_4", "rhodopsin_4_cg9668-pa", | |
1126 "rhodopsin_5", "rhodopsin_5_cg5279-pa", "rhodopsin_6", | |
1127 "rhodopsin_6_cg5192-pb", "rhodopsin_7_cg5638-pa", | |
1128 "rhodopsin_long-wavelength", "short_wavelength-sensitive_opsin", | |
1129 "ultraviolet-sensitive_opsin", "ultraviolet-sensitive_rhodopsin", | |
1130 "ultraviolet-sensitive_visual_pigment", "ultraviolet_sensitive_opsin", | |
1131 "uv-sensitive_opsin", "uv-wavelength_like_opsin", "uv_opsin", "uvop", | |
1132 "uvrh", "uvrh1", "amblop", "amuvop", | |
1133 "lwrh", "lwrh1", "lwrh2", "lw", "lw-rh", "lw_rh", "ninae", "ninae-pa", | |
1134 "op", "ops", "ops1", "opsin_1", "opsin_3", "rh", "rh1", "rh2", "rh3", | |
1135 "rh2-pa", "rh3-pa", "rh4", "rh4-pa", "rh5", "rh6", "rh7", "rho", | |
1136 "visual_pigment", "long-wavelength_rodopsin", | |
1137 "long_wavelength_rhodopsin", "short-wavelength_rhodopsin"); | |
1138 | |
1139 #Chloroplast genes added by THO*** | |
1140 $synonym{$_} = "rbcl" foreach ("rbcl", "large_subunit_of_riblose-15-bisphosphate_carboxylase-oxygenase","larger_subunit_of_rubisco", | |
1141 "rubisco_large_subunit", "ribulose_15-bisphosphate_carboxylase_large_subunit", "ribulosebiphosphate_carboxylase_large_subunit"); | |
1142 $synonym{$_} = "matk" foreach ("matk", "maturase_k", "maturase"); | |
1143 | |
1144 # Mitochondrial genes | |
1145 $synonym{$_} = "mtatp6" foreach ("mtatp6", "atp synthase 6", "atp synthase a chain", "atp6", "mt-atp6", | |
1146 "atpase subunit 6", "atpase6", "atpase 6", "atpase6 protein", "atp6", | |
1147 "atp synthase f0 subunit 6", "atp synthase a chain protein 6", | |
1148 "atp synthase subunit 6", "atpase6", "atp6", "atp sythase subunit 6", | |
1149 "atpase subunit 6, atpase6", "f0-atpase subunit 6", "f1atpase subunit 6", | |
1150 "f1-atpase subunit 6", "atpase 6", "atpase 6 gene", "atpase subunit 6 (atp6)", | |
1151 "atpase subunit 6 (atpase6)"); | |
1152 $synonym{$_} = "mtatp8" foreach ("mtatp8", "atp synthase 8", "atp synthase protein 8", "atpase subunit 8", | |
1153 "a6l", "atp8", "mt-atp8", "atpase subunit 8", "atpase8", "atpase 8", | |
1154 "atpase8 protein", "atp8", "atp synthase f0 subunit 8", "atp synthase protein 8", | |
1155 "atpase-8", "atp synthase subunit 8", "atpase8", "atp8", "atp sythase subunit 8", | |
1156 "atpase subunit8", "f0-atpase subunit 8", "f1 atpase subunit 8", "f1-atpase subunit 8", | |
1157 "protein a6l", "atpase 8", "atpase subunit 8 (atp8)", "atpase subunit 8 (atpase8)", | |
1158 "atpase 8 gene", "atpase subunit 8 (atp8)", "atpase subunit 8 (atpase8)"); | |
1159 $synonym{$_} = "mtco1" foreach ("mtco1", "cytochrome c oxidase i", "coi", "mt-co1", "co i", "ccoi", "cox 1", "cox i", | |
1160 "coi", "cytochrome c oxidase subunit i", "cytochrome oxidase subunit i", "cox1", | |
1161 "cytochrome c oxidase subunit 1", "cytochrome oxidase subunit 1", "co1", | |
1162 "cytochrome c oxidase polypeptide i", "cytochrome oxidase i", "cox-1", | |
1163 "cytochrome oxidase c subunit 1", "cox1", "co i", "co1", "cox 1", "coxi", | |
1164 "cytochrome c oxidase polypeptide i", "cytochrome c-oxidase subunit 1", | |
1165 "cytochrome oxidase c subunit i", "cytochrome-c oxidase i", "cytochrome c1 mrna", | |
1166 "cytochrome c1", "cytochrome c oxidase subunit 1 (coxi)", "cytochrome c oxidase chain i", | |
1167 "cytochrome c1 (aa 1-241)", "cytochrome c oxidase polypeptide 1", | |
1168 "cytochrome c oxidase subunit 1 (coi)", "cytochrome c-1"); | |
1169 $synonym{$_} = "mtco2" foreach ("mtco2", "cytochrome c oxidase ii", "cytochrome c oxidase polypeptide ii", "coii", | |
1170 "mt-co2", "coii", "cytochrome c oxidase subunit ii", "cytochrome oxidase subunit ii", | |
1171 "cytochrome oxidase subunit 2", "cytochrome c oxidase subunit 2", | |
1172 "cytochrome c oxidase ii", "cox2", "co2", "cytochrome c oxidase polypeptide ii", | |
1173 "cytochrome oxidase ii", "cox2", "cytochrome oxidase c subunit 2", | |
1174 "cytochrome-c oxidase", "co ii", "co2", "cox 2", "cytochrome c oxidase polypeptide ii", | |
1175 "cytochrome c-oxidase subunit 2", "cytochrome oxidase c subunit ii", | |
1176 "cytochrome oxidase subunit2", "cytochrome-c oxidase ii", "cytochrome c oxidase subunit 2 (coxii)", | |
1177 "cytochrome c oxidase subunit 2 (coii)", "cytochrome c oxidase chain ii", | |
1178 "cytochrome c oxidase polypeptide 2", "cytochrome c oxidase ii subunit", | |
1179 "cytochrome c oxidase ii mrna"); | |
1180 $synonym{$_} = "mtco3" foreach ("mtco3", "cytochrome c oxidase iii", "coiii", "mt-co3", "co iii", "ccoiii", "cox3", | |
1181 "cox iii", "cytochrome oxidase subunit iii", "coiii", "cox iii", | |
1182 "cytochrome c oxidase subunit iii", "cytochrome oxidase subunit 3", "cox3", | |
1183 "cytochrome c oxidase subunit 3", "co3", "cytochrome c oxidase polypeptide iii", | |
1184 "cox3", "cytochrome oxidase c subunit 3", "cytochrome oxidase iii", | |
1185 "cytochrome c oxidase polypeptide iii", "co iii", "coiii", "coiii protein", "coxiii", | |
1186 "cytochrome c oxidase subunit iii, coiii", "cytochrome c-oxidase subunit 3", | |
1187 "cytochrome c-oxidase subunit three", "cytochrome oxidase c subunit iii", | |
1188 "cytochrome-c oxidase iii", "cytochrme c oxidase iii", "cytochrome c oxidase subunit 3 (coiii)", | |
1189 "cytochrome oxidase subunit iii type 2"); | |
1190 $synonym{$_} = "mtnd1" foreach ("mtnd1", "nd1", "nadh dehydrogenase 1", "nadh dehydrogenase, subunit 1 (complex i)", | |
1191 "mt-nd1", "nadh-ubiquinone oxidoreductase chain 1", "nd1", "nadh1", "nd1", | |
1192 "nadh-ubiquinone oxidoreductase chain 1", "nadh-ubiquinone oxidoreductase subunit 1", | |
1193 "nadh dehydrogenase i", "nadh dehydrogenase 1", "nadh subunit 1", | |
1194 "nadh dehydrogenase subnuit 1", "nadh1", "nadh1 protein", | |
1195 "nadh-ubiquinone oxidoreductase subunit i", "nadh dehydrogenase subunit 1", | |
1196 "nadh dehydrogenase subunit 1 (nd1)", "nadh dehydrogenase subunit 1 type 2", | |
1197 "nadh dehydrogenase subunit 1 type 1", "nadh dehydrogenase subunit i", "nadh dehydrogenase i", | |
1198 "nadh dehydrogenase subnit 1", "nadh dehydrogenase ubiquinone 1 alpha", | |
1199 "nadh dehydrogenase subunit i"); | |
1200 $synonym{$_} = "mtnd2" foreach ("ndh-u1", "mtnd2", "nadh dehydrogenase 2", "nadh dehydrogenase, subunit 2 (complex i)", | |
1201 "nadh-ubiquinone oxidoreductase chain 2", "nd2", "mt-nd2", "urf2", | |
1202 "nadh dehydrogenase subunit 2", "nd2", "nadh2", | |
1203 "nadh-ubiquinone oxidoreductase chain 2", "nadh dehydrogenase 2", "nadh subunit 2", | |
1204 "nadh-ubiquinone oxidoreductase subunit 2", "nadh dehydrogenase subnuit 2", | |
1205 "nadh deydrogenase subunit 2", "nadh2", "nadh-ubiquinone oxidoreductase subunit ii", | |
1206 "nd2", "nadh2 protein", "nadh dehydrogenase subunit 2 (nd2)", "nadh dehydrogenase subunit ii", | |
1207 "nadh dehydrogenase ii", "nadh dehydrogense 2", "nadh dehydrogenase subunit ii"); | |
1208 $synonym{$_} = "mtnd3" foreach ("mtnd3", "ndh-u3", "nadh dehydrogenase 3", "nd3", "nadh3", | |
1209 "nadh-ubiquinone oxidoreductase chain 3", | |
1210 "nadh-ubiquinone/plastoquinone oxidoreductase chain 3", "mt-md3", "urf3", | |
1211 "nadh dehydrogenase subunit 3", "nd3", "nadh3", | |
1212 "nadh-ubiquinone oxidoreductase chain 3", "nadh subunit 3", "nadh3", "nadh3 protein", | |
1213 "nadh-ubiquinone oxidoreductase subunit 3", "nadh dehydrogenase subnuit 3", | |
1214 "nadh-ubiquinone oxidoreductase subunit iii", "nadh3 protein", | |
1215 "nadh dehydrogenase subunit 3 (nd3)", "nadh dehydrogenase subunit iii", "nadh dehydrogenase iii", | |
1216 "nadh dehydrogenase subunit iii"); | |
1217 $synonym{$_} = "mtnd4" foreach ("mtnd4", "ndh-u4", "nadh dehydrogenase 4", "nadh-ubiquinone oxidoreductase chain 4", "mt-nd4", | |
1218 "nd4", "urf4", "nadh:ubiquinone oxidoreductase subunit 4 (chain m)", | |
1219 "nadh dehydrogenase subunit 4", "nd4", "nadh4", | |
1220 "nadh-ubiquinone oxidoreductase chain 4", "nadh subunit 4", "nadh4", | |
1221 "nadh-ubiquinone oxidoreductase subunit 4", "nadh dehydrogenase subunit4", | |
1222 "nadh-ubiquinone oxidoreductase subunit iv", "nadh4 protein", | |
1223 "nadh dehydrogenase subunit 4 (nd4)", "nadh dehydrogenase subunit iv", "nadh dehydrogenase iv", | |
1224 "nadh dehydrogenase subunit iv"); | |
1225 $synonym{$_} = "mtnd4l" foreach ("mtnd4l", "ndh-u4l", "nadh dehydrogenase 4l", "nadh dehydrogenase, subunit 4l (complex i)", | |
1226 "nadh-ubiquinone oxidoreductase chain 4l", "nd4l", "mt-md4l", "urf4l", | |
1227 "nadh dehydrogenase subunit 4l", "nd4l", "nadh4l", "nadh-ubiquinone oxidoreductase chain 4l", | |
1228 "nadh subunit 4l", "nadh4l", "nadh-ubiquinone oxidoreductase subunit 4l", | |
1229 "nadh dehydrogenase subunit 4 l", "nadh4l protein", "nadh dehydrogenase subunit 4l (nd4l)", | |
1230 "nadh dehydrogenase subunit ivl", "nadh dehydrogenase ivl", "nadh dehydrogenase subunit ivl"); | |
1231 $synonym{$_} = "mtnd5" foreach ("mtnd5", "ndh-u5", "nadh dehydrogenase 5", "nd5", "nadh-ubiquinone oxidoreductase chain 5", | |
1232 "mt-nd5", "urf5", "nadh dehydrogenase subunit 5", "nadh5", "nd5", | |
1233 "nadh dehydrogenase-5", "nadh-ubiquinone oxidoreductase chain 5", | |
1234 "nadh dehydrogenase 5", "nadh 5", "nadh subunit 5", | |
1235 "nadh-ubiquinone oxidoreductase subunit 5", "nadh5", "nadh-dehydrogenase subunit 5", | |
1236 "nadh-dehydrogenase subunit 5, nd5", "nadh-ubiquinone oxidoreductase subunit v", | |
1237 "nadh5 protein", "nadh dehydrogenase subunit 5 (nd5)", "nadh dehydrogenase subunit v", | |
1238 "nadh dehydrogenase v", "nadh dehydrogenase subunit v"); | |
1239 $synonym{$_} = "mtnd6" foreach ("mtnd6", "ndh-u6", "nadh dehydrogenase 6", "nd6", "nadh6", | |
1240 "nadh-ubiquinone oxidoreductase chain 6", | |
1241 "nadh-ubiquinone/plastoquinone oxidoreductase chain 6", "mt-md6", "urf6", | |
1242 "nadh dehydrogenase subunit 6", "nd6", "nadh6", | |
1243 "nadh-ubiquinone oxidoreductase chain 6", "nadh subunit 6", | |
1244 "nadh-ubiquinone oxidoreductase subunit 6", "nadh dehydrogenase 6", "nadh6", | |
1245 "nadh-ubiquinone oxidoreductase subunit vi", "nadh6 protein", | |
1246 "nadh dehydrogenase subunit 6 (nd6)", "nadh dehydrogenase subunit vi", "nadh dehydrogenase vi", | |
1247 "nadh dehydrogenase subunit vi"); | |
1248 $synonym{$_} = "mtrnr1" foreach ("mtrnr1", "mt-rnr1", "12s rna", "12s rrna", "12s ribosomal rna", "12s rrna", | |
1249 "small subunit ribosomal rna", "12 rrna", "12 s ribosomal rna", "12s rrna, mtssu rrna", | |
1250 | |
1251 "12s rrna gene", "12s small subunit ribosomal rna", | |
1252 "mitochondrial ssu ribosomal rna", "rrns", | |
1253 "ssu ribosomal rna", "12s","12s_ribosmal_rna", "s-rrna", "srrna", "ssu"); | |
1254 $synonym{$_} = "mtrnr2" foreach ("16s", "16s_ribosomal_rna","l-rrna","lrrna","lsu", "mtrnr2", "mt-rnr2", "16s rrna", "16s rna", "16srrna", "16s ribosomal rna", "16s rrna", "rrna large subunit", | |
1255 "large subunit ribosomal rna", "16 s ribosomal rna", "16s mitochondrial ribosomal rna", | |
1256 | |
1257 "mitochondrial 16s ribosomal rna", "16s large subunit ribosomal | |
1258 rna", "16s_ribosmal_rna","rrnl"); | |
1259 | |
1260 # Nuclear genes | |
1261 $synonym{$_} = "adora3" foreach ("adora3", "adenosine a3 receptor", "a3ar", "ara3", "gpcr2", "adora3", "adenosine a3 receptor", | |
1262 "a3 adenosine receptor", "adenosine-3 receptor"); | |
1263 $synonym{$_} = "adra2b" foreach ("adra2b", "adrenergic, alpha-2b-, receptor", "adra2l1", "adrarl1", "adra2rl1", | |
1264 "alpha-2b adrenergic receptor", "alpha-2b adrenoceptor", "subtype c2", "[a]2b", "adra-2b", | |
1265 "alpha2-c2", "alpha2b", "subtype alpha2-c2", "alpha-2-adrenergic receptor-like 1", | |
1266 "alpha adrenergic receptor 2b", "adra2b", "alpha 2b adrenergic receptor", "adra2b", | |
1267 "alpha adrenergic receptor subtype 2b", "alpha-2b-adrenergic receptor", | |
1268 "alpha adrenergic receptor, subtype 2b", "alpha-2b adrenergic receptor", "alpha-2b adrenoceptor"); | |
1269 $synonym{$_} = "adrb2" foreach ("adrb2", "adrenergic, beta-2-, receptor, surface", "adrb2r", "adrbr", "beta-2 adrenergic receptor", | |
1270 "b2ar", "bar", "beta-2 adrenoceptor", "catecholamine receptor", "adrb-2", "badm", "beta 2-ar", | |
1271 "gpcr7", "adrb2", "beta-2 adrenergic receptor", "beta-2-adrenergic receptor", | |
1272 "adrenergic receptor beta 2", "beta 2 adrenergic receptor", "beta2-adrenergic receptor", | |
1273 "beta2 adrenergic receptor"); | |
1274 $synonym{$_} = "apob" foreach ("apob", "apolipoprotein b", "ag(x) antigen", "apolipoprotein b-100 precursor", "apo b-100", | |
1275 "apolipoproteinb-48", "apo b-48", "fldb", "apob-100", "apolipoprotein b", "apolipoprotein b 100", | |
1276 "apob", "apolipoprotein b-100", "apob"); | |
1277 $synonym{$_} = "app" foreach ("app", "amyloid beta (a4) precursor protein", "protease nexin-ii, alzheimer disease", "ad1", | |
1278 "human mrna for amyloid a4 precursor of alzheimer's disease", "appi", "beta-amyloid protein", | |
1279 "beta-app", "a-beta", "a4", "cvap", "aaa", "abeta", "amyloid beta-peptide", | |
1280 "amyloid beta (a4) precursor protein", "adap", "appican", "betaapp", "app", | |
1281 "amyloid beta precursor protein", "amyloid precursor protein", "amyloid beta precursor b1", | |
1282 "beta amyloid protein precursor", "beta-a4"); | |
1283 $synonym{$_} = "atp7a" foreach ("atp7a", "atpase, cu++ transporting, alpha polypeptide (menkes syndrome)", "mnk", | |
1284 "copper-transporting atpase 1", "copper pump 1", "menkes disease-associated protein", "mc1", | |
1285 "copper binding p-type atpase 7a", "blo", "blotchy", "br", "brindled", "menkes protein", "mo", | |
1286 "mottled", "mk", "ohs", "atp7a", "copper transporting atpase", | |
1287 "atpase, cu++ transporting, alpha polypeptide", "menkes syndrome protein"); | |
1288 $synonym{$_} = "bdnf" foreach ("bdnf", "brain-derived neurotrophic factor", "brain-derived neurotrophic factor precursor", "bdnf", | |
1289 "brain-derived neurotrophic factor", "brain derived neurotrophic factor", | |
1290 "brain-derived neurotrophic factor mature peptide", "bdnf"); | |
1291 $synonym{$_} = "bmi1" foreach ("bmi1", "b lymphoma mo-mlv insertion region (mouse)", | |
1292 "murine leukemia viral (bmi-1) oncogene homolog", "polycomb complex protein bmi-1", "rnf51", | |
1293 "mgc12685", "oncogene bmi-1", "bmi1", "bmi-1", "bmi-1 protein", "oncoprotein bmi-1"); | |
1294 $synonym{$_} = "brca1" foreach ("brca1", "breast cancer 1, early onset", "pscp", "papillary serous carcinoma of the peritoneum", | |
1295 "breast cancer type 1 susceptibility protein", "breast and ovarian cancer susceptibility gene", | |
1296 "rnf53", "breast-ovarian cancer, included", "brca1", "brca1", | |
1297 "breast and ovarian cancer susceptibility protein", "brca1 protein", | |
1298 "breast and ovarian cancer susceptibility"); | |
1299 $synonym{$_} = "chrna1" foreach ("chrna1", "cholinergic receptor, nicotinic, alpha polypeptide 1 (muscle)", "chrna", | |
1300 "acetylcholine receptor protein, alpha chain precursor", "achra", "achr-1", "acra", "chrna1", | |
1301 "nicotinic cholinergic receptor alpha polypeptide", "chrna1", "achr", "achr prepeptide", "chrna", | |
1302 "neuronal nicotinic acetylcholine receptor alpha", "nicotinic acetylcholine recepter alpha-subunit"); | |
1303 $synonym{$_} = "cftr" foreach ("cftr", "cystic fibrosis transmembrane conductance regulator, atp-binding cassette (sub-family c, | |
1304 member 7)", "mrp7", "cf", "abcc7", "abc35", "cftr", "cystic fibrosis transmembrane conductance", | |
1305 "cftr chloride channel"); | |
1306 $synonym{$_} = "cnr1" foreach ("cnr1", "cannabinoid receptor 1 (brain)", "cnr", "cb1", "cb-r", "cann6", "cb>1<", "cb1k5", "cb1a", | |
1307 "central cannabinoid receptor", "cnr1", "cannabinoid receptor 1", "cb1", "cb1 cannabinoid receptor", | |
1308 "cb1 cannabinoid receptor", "cbr"); | |
1309 $synonym{$_} = "crem" foreach ("crem", "camp responsive modulator", "crea", "camp-responsive element modulator, alpha isoform", | |
1310 "crem", "camp responsive element moderator", "camp responsive element modulator", | |
1311 "camp-responsive element moderator"); | |
1312 $synonym{$_} = "edg1" foreach ("edg1", "endothelial differentiation, sphingolipid g-protein-coupled receptor, 1", "d1s3362", "edg-1", | |
1313 "1pb1", "s1p1", "ecgf1", "chedg1", "sphingosine 1-phosphate receptor edg1", | |
1314 "g protein-coupled sphingolipid receptor", "edg1"); | |
1315 $synonym{$_} = "ghr" foreach ("ghr", "growth hormone receptor", "growth hormone receptor precursor", "gh receptor", | |
1316 "serum binding protein", "growth hormone receptor", "ghr", "growth hormone receptor precursor", | |
1317 "ghr", "bovine growth hormone receptor", "mature growth hormone receptor"); | |
1318 $synonym{$_} = "pgk1" foreach ("pgk1", "phosphoglycerate kinase 1", "pgk-1", "primer recognition protein 2", "prp 2", "pgka", | |
1319 "pgk-1", "pgk-1", "phosphoglycerate kinase 1"); | |
1320 $synonym{$_} = "plcb4" foreach ("plcb4", "phospholipase c, beta 4", | |
1321 "1-phosphatidylinositol-4,5-bisphosphate phosphodiesterase beta 4", "plc-beta-4", | |
1322 "phospholipase c-beta-4", "plcb4", "phospholipase c beta 4"); | |
1323 $synonym{$_} = "pnoc" foreach ("pnoc", "prepronociceptin", "propronociceptin", "nociceptin precursor", "orphanin fq", "ppnoc", "ofq", | |
1324 "n/ofq", "n23k", "npnc1", "ofq/n", "proorphanin", "pnoc", "prepronociceptin", | |
1325 "nociceptin/orphanin fq precursor"); | |
1326 $synonym{$_} = "prkci" foreach ("prkci", "protein kinase c, iota", "pkci", "dxs1179e", "npkc-iota", "protein kinase c iota", | |
1327 "prkci"); | |
1328 $synonym{$_} = "prnp" foreach ("prnp", "(prion protein (p27-30) (creutzfeld-jakob disease, gerstmann-strausler-scheinker syndrome, fatal familial insomnia)", | |
1329 "cjd", "major prion protein precursor", "prp", "prp27-30", "prp33-35c", "ascr", "prip", "gss", | |
1330 "prn-i", "prn-p", "prpc", "prpsc", "sinc", "mgc26679", "cd230 antigen", "prion-related protein", | |
1331 "prion protein", "prp", "prnp", "prion protein precursor", "prion protein", "prnp", "prp", | |
1332 "prion protein prp", "prion protein precursor prp", "prp", "prp", "prp gene", "greater kudu prp", | |
1333 "major prion protein", "prion protein variant 110p", "prion protein variant 143r", | |
1334 "prion protein variant 240s", "prion protein variant 37v", "prion protein variant 37v/240s", | |
1335 "prion protein, prp", "prnp", "prmp"); | |
1336 $synonym{$_} = "rag1" foreach ("rag1", "recombination activating gene 1", "v(d)j recombination activating protein 1", "rag-1", | |
1337 "recombination activating protein 1", "rag1", "rag-1", "recombination activating gene-1", | |
1338 "recombination activating gene 1", "recombination-activating gene 1", "rag1", "rag 1", "rag1"); | |
1339 $synonym{$_} = "rag2" foreach ("rag2", "recombination activator protein 2", "recombination activating protein 2", "rag-2", "rag 2", | |
1340 "recombination activating protein", "recombination activating gene-2", "rag2", | |
1341 "recombination activating protein 2", "rag2", "rag2", "rag2", "rag-2", "rag-2 protein", | |
1342 "recombinase activating gene 2", "recombination activating protein 2"); | |
1343 $synonym{$_} = "rbp3" foreach ("rbp3", "retinol-binding protein 3, interstitial", "interphotoreceptor retinoid-binding protein precursor", | |
1344 "irbp", "interstitial retinol-binding protein", "rbp-3", "interphotoreceptor retinoid binding protein", | |
1345 "irbp", "irbp", "interphotoreceptor retinoid-binding protein", "interphotorecepter retinoid binding protein", | |
1346 "interphotoreceptor retinoid binding protein (irbp)", "irbp mrna"); | |
1347 $synonym{$_} = "tnf" foreach ("tnf", "tumor necrosis factor (tnf superfamily, member 2)", "tnfa", "dif", "tnfsf2", | |
1348 "tumor necrosis factor (cachectin)", "tumor necrosis factor, alpha (cachectin)", | |
1349 "tumor necrosis factor", "tnf-alpha", "cachectin", "tnfsf1a", "apc1 protein", | |
1350 "tnf, monocyte-derived", "tnf, macrophage-derived", "tnf superfamily, member 2", | |
1351 "tumor necrosis factor-alpha", "tumor necrosis factor alpha", "tnfa", "tnf-alpha", | |
1352 "tumor necrosis factor alpha precursor", "tnfa", "tnfa", "tnfalpha", "tumour necrosis factor alpha", | |
1353 "tnf alpha", "tnf-a", "tumor necrosis factor alpha, tnf-alpha", "bovine tumor necrosis factor alpha", | |
1354 "tumor necrosis factor alpha (cachetin)", "tumor necrosis factor-alpha precursor"); | |
1355 $synonym{$_} = "tp53" foreach ("tp53", "tumor protein p53 (li-fraumeni syndrome)", "p53", "cellular tumor antigen p53", | |
1356 "phosphoprotein p53", "trp53", "transformation related protein 53", "p53", "p53 protein", "p53", | |
1357 "tp53", "tumor suppressor p53", "53 kda phosphoprotein", "insulin recptor substrate p53 short form", | |
1358 "p53 gene product", "p53 tumor suppressor gene", "p53 tumor suppressor protein", | |
1359 "tumor suppressor p53 phosphoprotein"); | |
1360 $synonym{$_} = "ttr" foreach ("ttr", "transthyretin (prealbumin, amyloidosis type i)", "palb", "tthy", "tbpa", "attr", | |
1361 "transthyretin", "transthyretin precursor", "transthyretin subunit", "ttr", "ttr"); | |
1362 $synonym{$_} = "tyr" foreach ("tyr", "tyrosinase (oculocutaneous albinism ia)", "ocaia", "monophenol monooxygenase", | |
1363 "tumor rejection antigen ab", "sk29-ab", "lb24-ab", "albino", "c", "skc35", "oca1a", "tyrosinase", | |
1364 "tyr", "tyrosinase precursor", "truncated tyrosinase", "tyr", "tyr"); | |
1365 $synonym{$_} = "vwf" foreach ("vwf", "von willebrand factor", "f8vwf", "coagulation factor viii", "von willebrand factor", "vwf", | |
1366 "von willebrand factor", "vwf", "vwf", "vwf", "von willebrand factor, vwf", "von willebrand factor precursor", | |
1367 "von willebrand factor precursor", "wf"); | |
1368 $synonym{$_} = "zfx" foreach ("zfx", "zinc finger protein, x-linked", "zinc finger x-chromosomal protein", "zfx", "zinc finger protein zfx", | |
1369 "zfx", "zinc finger protein zfx", "x-linked zinc finger protein zfx", "zfx", "x-linked zinc finger protein", | |
1370 "zinc finger x-linked protein 1", "zinc finger x-linked protein 2", "zinc finger protein x linked", | |
1371 "zfx1", "zfx-1", "zfx2", "zfx-2","zfx protein mrna", "zinc finger protein zfx isoform 4", | |
1372 "zfx product, isoform 1", "zfx product, isoform 2", "zfx product, isoform 3", | |
1373 "zfx product, isoform 4", "zfx protein"); | |
1374 $synonym{$_} = "zfy" foreach ("zfy", "zinc finger protein, y-linked", "zinc finger y-chromosomal protein", "zfy", "zfy", | |
1375 "zinc finger protein zfy", "zinc finger protein zfy", "y-linked zinc finger protein", | |
1376 "y-linked zinc finger protein zfy", "zinc finger protein y linked", | |
1377 "y-linked zinc finger protein 2", "y-linked zinc finger protein 1", "zfy1", "zfy-1", "zfy2", | |
1378 "zfy-2"); | |
1379 | |
1380 # Additional genes | |
1381 $synonym{$_} = "18s_rrna" foreach ("18s rrna", "18s_rrna", "18s ribosomal rna", | |
1382 "18 rrna", "18 s ribosomal rna", "18 s ribosomal rna", "18s small subunit ribosomal rna", | |
1383 "18s_ribsomal_rna", "18s_ribosomal_rna_a-type", "18s_rna_gene", "18s_rrna", "nuclear_18s_ribosomal_rna"); | |
1384 | |
1385 $synonym{$_} = "28s_rrna" foreach ("28s_large_subunit_ribosomal_rna", "28s rrna", "28s_rrna", "28s ribosomal rna", "28 rrna", "put. 28S ribosomal RNA", "28 s ribosomal rna", "28 s ribosomal rna", | |
1386 "rrna-28s_rrna-related", "28s ribosomal rna v region"); | |
1387 $synonym{$_} = "5.8s" foreach ("5.8s_ribosomal_rna", "5.8s_rrna", "5s_ribosomal_rna","5s_rrna","5.8s_rna_gene"); | |
1388 $synonym{$_} = "mtcyb" foreach ("cob", "mtcyb", "cyt b", "cytb", "cytochrome b", "cytochrome b", "cytb", "cyt b", "cytb", "cytb", | |
1389 "'cytochrome b'", "cytochrome-b", "skeletal muscle cytochrome b", "cyt-b", "cyt.b", "cyto b", | |
1390 "cytob", "cytb1", "cytb2", "cytb3", "cytb4", "cytochrome b light strand", "cyt-b", "cytob", | |
1391 "cyt.b", "cytb", "cytochrome b", "mitochondrial cytochrome b", "cytochrome b protein", | |
1392 "duodenal cytochrome b"); | |
1393 $synonym{$_} = "alpha_lactalbumin" foreach ("alpha lactalbumin", "alpha_lactalbumin", "alpha-lactalbumin", "alpla lactalbumin", | |
1394 "alpah lactalbumin", "a-lacta", "a-lacta", "lactalbumin, alpha-", "lactalbumin, alpha", | |
1395 "mature alpha-lactalbumin"); | |
1396 $synonym{$_} = "alpha-s1-casein" foreach ("alpha-s1-casein", "as1-casein", "alpha s1 casein", "alpha s1-casein b", | |
1397 "alpha s1 casein", "alpha s1-casein", "alpha(s1) casein", "alpha-s1 casein", "alpha-s1 casein", | |
1398 "alpha-s1-casein", "alpha-s1-casein mrna", "alphas1-casein", "as1-casein", "alfas1-casein", | |
1399 "alpha-sl casein", "casein alpha s1"); | |
1400 $synonym{$_} = "alpha-s2-casein" foreach ("alpha-s2-casein", "as2-casein", "alpha s2 casein", "alpha s2-casein b", | |
1401 "alpha s2 casein", "alpha s2-casein", "alpha(s2) casein", "alpha-s2 casein", "alpha-s2 casein", | |
1402 "alpha-s2-casein", "alpha-s2-casein mrna", "alphas2-casein", "as2-casein", | |
1403 "alpha s2a casein (aa 1 to 165)", "alpha s2a casein (aa 1 to 167)", "alph as2-casein", | |
1404 "alpha(s2)-casein"); | |
1405 $synonym{$_} = "b-casein" foreach ("b-casein", "beta casein", "beta-casein", "beta-casein (aa 1 - 213)", "beta-casein a3", | |
1406 "beta-casein precursor", "beta casein precursor", "beta-casein variant a2", | |
1407 "beta-casein variant i", "mat. beta-casein", "casein beta", "casein b", "casein b (aa 1-183) pgpk48", | |
1408 "mature beta-casein (aa 1 to 226)"); | |
1409 $synonym{$_} = "k-casein" foreach ("k-casein", "kappa casein", "kappa-casein", "kappa-casein precursor", "kappa casein precursor", | |
1410 "casein kappa", "kappa-cas", "kappa-casein long form", "kappa-casein mature peptide", | |
1411 "kappa-casein short form"); | |
1412 $synonym{$_} = "sry" foreach ("sry", "sex determining factor sry", "sex determining region y protein", "sex-determining factor sry", | |
1413 "sex-determining region y", "sry gene", "sry protein", "sex region of y chromosome"); | |
1414 $synonym{$_} = "c-mos" foreach ("c-mos", "oocyte maturation factor mos", "mos", "mos protein"); | |
1415 $synonym{$_} = "cryaa" foreach ("cryaa", "crya1", "alpha a-crystallin","alpha a-crystallin", "alphaa-crystallin", | |
1416 "alpha-a crystallin chain", "alpha-a-crystallin", "crystallin, alpha a", "alphaa-crystallin (crya1)", | |
1417 "alpha a crystallin"); | |
1418 $synonym{$_} = "cryab" foreach ("cryab", "crya2", "alpha b-crystallin","alpha b-crystallin", "alphab-crystallin", | |
1419 "alpha-b crystallin chain", "alpha-b-crystallin", "crystallin, alpha b", "alphab-crystallin (crya2)", | |
1420 "alpha b crystallin"); | |
1421 $synonym{$_} = "t-cell_receptor_beta" foreach ("t-cell receptor beta", "t cell receptor beta", "t-cell receptor beta chain", | |
1422 "t cell receptor beta chain", "t-cell receptor beta chain variable region", | |
1423 "t cell receptor beta chain variable region", "t-cell receptor beta chain variable", | |
1424 "t cell receptor beta chain variable", "t-cell receptor variable beta chain", | |
1425 "t cell receptor variable beta chain", "t-cell receptor beta-chain", | |
1426 "t cell receptor beta-chain", "t-cell receptor beta chain vj region", | |
1427 "t cell receptor beta chain vj region", "t-cell receptor beta chain v-d-j region", | |
1428 "t cell receptor beta chain v-d-j region", "t-cell receptor beta chain variable segment", | |
1429 "t cell receptor beta chain variable segment", "t-cell receptor beta chain constant region", | |
1430 "t cell receptor beta chain constant region", "t-cell receptor v beta gene", | |
1431 "t cell receptor v beta gene", "t-cell receptor-beta chain", "t cell receptor-beta chain", | |
1432 "t cell receptor bata chain", "t-cell receptor beta-chain", "t cell receptor beta-chain", | |
1433 "t-cell receptor beta chain (v-d-j-c)", "t cell receptor beta chain (v-d-j-c)", | |
1434 "t-cell receptor beta-chain v region", "t cell receptor beta-chain v region", | |
1435 "t-cell receptor v region beta-chain", "t cell receptor v region beta-chain", | |
1436 "t-cell receptor beta chain vdj region", "t cell receptor beta chain vdj region", | |
1437 "t-cell receptor beta chain v-region", "t cell receptor beta chain v-region", | |
1438 "t-cell receptor v-beta", "t cell receptor v-beta", "t-cell_receptor_beta"); | |
1439 $synonym{$_} = "t-cell_receptor_alpha" foreach ("t-cell receptor alpha", "t cell receptor alpha", "t-cell receptor alpha chain", | |
1440 "t cell receptor alpha chain", "t-cell receptor alpha chain variable region", | |
1441 "t cell receptor alpha chain variable region", "t-cell receptor alpha chain variable", | |
1442 "t cell receptor alpha chain variable", "t-cell receptor variable alpha chain", | |
1443 "t cell receptor variable alpha chain", "t-cell receptor alpha-chain", | |
1444 "t cell receptor alpha-chain", "t-cell receptor alpha chain vj region", | |
1445 "t cell receptor alpha chain vj region", "t-cell receptor alpha chain v-d-j region", | |
1446 "t cell receptor alpha chain v-d-j region", "t-cell receptor alpha chain variable segment", | |
1447 "t cell receptor alpha chain variable segment", "t-cell receptor alpha chain constant region", | |
1448 "t cell receptor alpha chain constant region", "t-cell receptor v alpha gene", | |
1449 "t cell receptor v alpha gene", "t-cell receptor-alpha chain", "t cell receptor-alpha chain", | |
1450 "t cell receptor bata chain", "t-cell receptor alpha-chain", "t cell receptor alpha-chain", | |
1451 "t-cell receptor alpha chain (v-d-j-c)", "t cell receptor alpha chain (v-d-j-c)", | |
1452 "t-cell receptor alpha-chain v region", "t cell receptor alpha-chain v region", | |
1453 "t-cell receptor v region alpha-chain", "t cell receptor v region alpha-chain", | |
1454 "t-cell receptor alpha chain vdj region", "t cell receptor alpha chain vdj region", | |
1455 "t-cell receptor alpha chain v-region", "t cell receptor alpha chain v-region", | |
1456 "t-cell receptor v-alpha", "t cell receptor v-alpha", "t-cell_receptor_alpha"); | |
1457 | |
1458 # tRNAs | |
1459 my @aminoAcids = qw(gly ala val leu ile met phe trp pro ser thr cys tyr asn gln asp glu lys arg his); | |
1460 my %AAcodes = ('glycine' => 'gly', 'alanine' => 'ala', 'valine' => 'val', 'leucine' => 'leu', 'isoleucine' => 'ile', | |
1461 'methionine' => 'met', 'phenylalanine' => 'phe', 'tryptophan' => 'trp', 'proline' => 'pro', | |
1462 'serine' => 'ser', 'threonine' => 'thr', 'cysteine' => 'cys', 'tyrosine' => 'tyr', 'asparagine' => 'asn', | |
1463 'glutamine' => 'gln', 'aspartic acid' => 'asp', 'glutamic acid' => 'glu', 'lysine' => 'lys', | |
1464 'arginine' => 'arg', 'histidine' => 'his'); | |
1465 my %fullName = ('gly' => 'glycine', 'ala' => 'alanine', 'val' => 'valine', 'leu' => 'leucine', 'ile' => 'isoleucine', | |
1466 'met' => 'methionine', 'phe' => 'phenylalanine', 'trp' => 'tryptophan', 'pro' => 'proline', | |
1467 'ser' => 'serine', 'thr' => 'threonine', 'cys' => 'cysteine', 'tyr' => 'tyrosine', 'asn' => 'asparagine', | |
1468 'gln' => 'glutamine', 'asp' => 'aspartic acid', 'glu' => 'glutamic acid', 'lys' => 'lysine', | |
1469 'arg' => 'arginine', 'his' => 'histidine'); | |
1470 | |
1471 foreach my $aa (@aminoAcids) | |
1472 { | |
1473 # Abbreviations | |
1474 $synonym{"trna $aa"} = "trna-$aa"; | |
1475 $synonym{"transfer rna $aa"} = "trna-$aa"; | |
1476 $synonym{"transfer rna-$aa"} = "trna-$aa"; | |
1477 $synonym{"trna($aa)"} = "trna-$aa"; | |
1478 $synonym{"trna $aa gene"} = "trna-$aa"; | |
1479 $synonym{"trna-$aa gene"} = "trna-$aa"; | |
1480 $synonym{"$aa trna"} = "trna-$aa"; | |
1481 | |
1482 my $dashedName = $aa."-trna"; | |
1483 $synonym{"$dashedName"} = "trna-$aa"; | |
1484 | |
1485 # And again for full names | |
1486 $synonym{"trna $fullName{$aa}"} = "trna-$aa"; | |
1487 $synonym{"transfer rna $fullName{$aa}"} = "trna-$aa"; | |
1488 $synonym{"transfer rna-$fullName{$aa}"} = "trna-$aa"; | |
1489 $synonym{"trna-$fullName{$aa}"} = "trna-$aa"; | |
1490 $synonym{"trna($fullName{$aa})"} = "trna-$aa"; | |
1491 $synonym{"trna $fullName{$aa} gene"} = "trna-$aa"; | |
1492 $synonym{"trna-$fullName{$aa} gene"} = "trna-$aa"; | |
1493 $synonym{"$fullName{$aa} trna"} = "trna-$aa"; | |
1494 | |
1495 $dashedName = $fullName{$aa}."-trna"; | |
1496 $synonym{"$dashedName"} = "trna-$aa"; | |
1497 } | |
1498 } | |
1499 | |
1500 sub geneClean | |
1501 { | |
1502 my $dirtyGene = shift; | |
1503 | |
1504 # Initial synonymizing | |
1505 $dirtyGene = lc($dirtyGene); # Only work with lower-case for synonymizing and file names | |
1506 $dirtyGene = $synonym{$dirtyGene} if (defined $synonym{$dirtyGene}); | |
1507 | |
1508 # Clean end of gene name | |
1509 $dirtyGene =~ s/\"$//g; | |
1510 $dirtyGene =~ s/\s+$//g; | |
1511 $dirtyGene =~ s/^\s+//g; | |
1512 | |
1513 # Remove punctuation that will confound file saving | |
1514 $dirtyGene =~ s/^'//g; | |
1515 $dirtyGene =~ s/'$//; | |
1516 $dirtyGene =~ s/:/ /g; | |
1517 $dirtyGene =~ s/, / /g; | |
1518 $dirtyGene =~ s/,//g; | |
1519 $dirtyGene =~ s/\*/-/g; | |
1520 $dirtyGene =~ s/\$/-/g; | |
1521 $dirtyGene =~ s/\#/-/g; | |
1522 $dirtyGene =~ s/\&/and/g; | |
1523 $dirtyGene =~ s/\//-/g; | |
1524 $dirtyGene =~ s/\\//g; | |
1525 $dirtyGene =~ s/\|/-/g; | |
1526 $dirtyGene =~ s/;//g; | |
1527 $dirtyGene =~ s/\<//g; | |
1528 $dirtyGene =~ s/\>//g; | |
1529 $dirtyGene =~ s/-+/-/g; | |
1530 $dirtyGene =~ s/\s+-/-/g; | |
1531 $dirtyGene =~ s/-\s+/-/g; | |
1532 $dirtyGene =~ s/^-+//; | |
1533 $dirtyGene =~ s/`/'/; | |
1534 | |
1535 # Collapse multiple whitespace | |
1536 $dirtyGene =~ s/\s+/_/g; | |
1537 | |
1538 # Clean up some tRNA variants (easier than specifying explicit synonyms for each tRNA) | |
1539 if ($dirtyGene =~ /tRNA/i) | |
1540 { | |
1541 $dirtyGene =~ s/_\d+$//; | |
1542 $dirtyGene =~ s/ \d+$//; | |
1543 } | |
1544 | |
1545 # Final synonymizing | |
1546 $dirtyGene = $synonym{$dirtyGene} if (defined $synonym{$dirtyGene}); # Recheck for synonym | |
1547 $dirtyGene = lc($dirtyGene); # Ensure that gene names are in lower case for further synonymizing and file saving (final safety) | |
1548 | |
1549 return $dirtyGene; | |
1550 } | |
1551 | |
1552 sub geneCount | |
1553 { | |
1554 # Set local parameters | |
1555 my $gbFile = shift; | |
1556 | |
1557 my (%iGene, @geneList, $speciesNum, %quickSpeciesGenePresent, %speciesCounter); | |
1558 my $wordBlock = 1; | |
1559 my $modelBlock = 0; | |
1560 my %modelSpecies; | |
1561 my $stripCount = 0; | |
1562 | |
1563 print "\nCounting gene occurrences in file $gbFile to establish genes that do not meet user-defined threshold(s)\n"; | |
1564 | |
1565 # Search input file | |
1566 setLineBreak($gbFile); | |
1567 open (GENE, "<$gbFile") or die "Cannot open GenBank output file, $gbFile.\n"; | |
1568 my $accCount = 0; | |
1569 my $modelFlag = 0; | |
1570 my $speciesFlag = 0; | |
1571 my $nameFlag = 0; | |
1572 my $countZero = time; | |
1573 my ($organism, $gene); | |
1574 while (<GENE>) | |
1575 { | |
1576 next unless ($_ =~ /LOCUS/ or $_ =~ /^\s*(\/gene)|(\/product)=\"/ or $_ =~ /^\s*\/organism=\"/ or $nameFlag == 1); | |
1577 chomp; | |
1578 my $gbLine = $_; | |
1579 | |
1580 if ($gbLine =~ /LOCUS/) | |
1581 { | |
1582 undef %iGene; | |
1583 undef $organism; | |
1584 $modelFlag = $speciesFlag = $nameFlag = 0; | |
1585 $accCount++; | |
1586 print "\t$accCount sequences read in\n" if ($accCount == int($accCount/10000)*10000 and $verbose); | |
1587 next; | |
1588 } | |
1589 | |
1590 # Get organism name | |
1591 if ($gbLine =~ /^\s*\/organism=\"(.+)\"/) | |
1592 { | |
1593 $organism = $1; | |
1594 $organism =~ s/\s/_/g; | |
1595 $modelFlag = 1 if (defined $modelSpecies{$organism}); | |
1596 $speciesFlag = 1 if ($organism =~ /sp\./ or $organism =~ /cf\./); | |
1597 unless ($speciesFlag or ($modelFlag and $modelBlock)) | |
1598 { | |
1599 $speciesCounter{$organism}++; | |
1600 $speciesNum++ if ($speciesCounter{$organism} == 1); | |
1601 } | |
1602 } | |
1603 next if ($modelFlag and $modelBlock); # Entry pertains to model organism; skip parsing rest of entry | |
1604 | |
1605 # Get gene name | |
1606 if ($gbLine =~ /^\s*(\/gene)|(\/product)=\"(.+)/) # Get start of gene name | |
1607 { | |
1608 $gene = $1 if ($gbLine =~ /=\"(.+)/); | |
1609 | |
1610 # Check whether name wraps onto a new line | |
1611 if (substr($gbLine, -1) ne "\"") | |
1612 { | |
1613 $nameFlag = 1; | |
1614 next; | |
1615 } | |
1616 else | |
1617 { | |
1618 $nameFlag = 2; | |
1619 $gene =~ s/\"$// if ($gene =~ /\"$/); | |
1620 } | |
1621 } | |
1622 | |
1623 if ($nameFlag == 1) # Get continuation / end of gene name | |
1624 { | |
1625 $gene .= $gbLine; | |
1626 $nameFlag = 2 if ($gbLine =~ /\"$/) # Gene name is complete | |
1627 } | |
1628 | |
1629 if ($gene and $nameFlag == 2) # Process complete gene | |
1630 { | |
1631 next if (not defined $organism); | |
1632 unless ($wordBlock and ($gene =~ /hypothetical/i or $gene =~ /open reading frame/i or $gene =~ /similar/i or $gene =~ /homolog/i or $gene =~ /putative/i or $gene =~ /unknown/i or $gene =~ /unnamed/i or $gene =~ /\d+rik/i or $gene =~ /possible/i or $gene =~ /pseudo/i)) | |
1633 { | |
1634 $gene = geneClean($gene); # Clean up and synonymize gene name | |
1635 | |
1636 # Increment counters | |
1637 $iGene{$gene}++; # Sequence counter | |
1638 $quickSequenceCount{$gene}++ if ($iGene{$gene} == 1); | |
1639 | |
1640 $quickSpeciesGenePresent{$gene}{$organism}++; # Species-gene counter | |
1641 $quickSpeciesCount{$gene}++ if ($quickSpeciesGenePresent{$gene}{$organism} == 1); | |
1642 } | |
1643 $nameFlag = 0; | |
1644 undef $gene; | |
1645 } | |
1646 } | |
1647 close GENE; | |
1648 print "\n" if ($verbose); | |
1649 my $countTime = time - $countZero; | |
1650 | |
1651 # Print results and block genes as appropriate | |
1652 open (OUT, ">$countFile") or die "Cannot open results file for quick gene count, $countFile.\n"; | |
1653 if ($speciesThreshold) | |
1654 { | |
1655 @geneList = sort {$quickSpeciesCount{$b} <=> $quickSpeciesCount{$a} } keys %quickSpeciesCount; | |
1656 } | |
1657 else | |
1658 { | |
1659 @geneList = sort {$quickSequenceCount{$b} <=> $quickSequenceCount{$a} } keys %quickSequenceCount; | |
1660 } | |
1661 | |
1662 printf OUT "Searching $accCount sequences took $countTime seconds with %s distinct genes found\n", scalar(@geneList); | |
1663 print OUT "\tModel organisms were excluded from search\n" if ($modelBlock); | |
1664 print OUT "\tCommon words were excluded from search\n" if ($wordBlock); | |
1665 print OUT "\nGene\tNo. of sequences\tNo. of species\n\n"; | |
1666 | |
1667 print "\tGene counts:\n" if ($debug); | |
1668 foreach my $entry (@geneList) | |
1669 { | |
1670 print OUT "$entry\t$quickSequenceCount{$entry}\t$quickSpeciesCount{$entry}\n"; | |
1671 print "\t$entry\t$quickSequenceCount{$entry}\t$quickSpeciesCount{$entry}\n" if ($debug); | |
1672 if ($quickSequenceCount{$entry} < $seqThreshold or $quickSpeciesCount{$entry} < $speciesThreshold) | |
1673 { | |
1674 $geneStatus{$entry} = "rejected"; | |
1675 push @rejectList, $entry; | |
1676 } | |
1677 else | |
1678 { | |
1679 $stripCount++; | |
1680 } | |
1681 } | |
1682 close OUT; | |
1683 | |
1684 printf "\tSearching $accCount sequences took $countTime seconds with %s distinct genes found; results written to $countFile\n", scalar(@geneList); | |
1685 if (not $stripCount) | |
1686 { | |
1687 print "No genes were present more than the threshold value(s)\n"; | |
1688 exit(0); | |
1689 } | |
1690 } | |
1691 | |
1692 # Version history | |
1693 # | |
1694 # v2.0 (July 25, 2005) | |
1695 # - improved and more complete parsing of GenBank files (esp. of gene names or CDs covering multiple lines, | |
1696 # join statements and other multi-segment entries, detection of pseudogenes) | |
1697 # - added ability to mine input file for all gene content | |
1698 # - added ability to strip only those genes present for 1) a minimum number of sequences and/or species and | |
1699 # 2) species with valid names only | |
1700 # - added (incomplete) gene synonymy list (initially compiled by Robin Beck) | |
1701 # - minor bug fixes | |
1702 # | |
1703 # v1.0.1a (April 29, 2004) | |
1704 # - added automatic detection of line breaks | |
1705 # - minor bug fixes | |
1706 # | |
1707 # v1.0 (April 30, 2002) | |
1708 # - initial release |