Mercurial > repos > jesse-erdmann > tapdance
diff lib/util.pl @ 0:1437a2df99c0
Uploaded
author | jesse-erdmann |
---|---|
date | Fri, 09 Dec 2011 11:56:56 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/util.pl Fri Dec 09 11:56:56 2011 -0500 @@ -0,0 +1,125 @@ +#!/project/bioperl/perl-5.10.1-sles11/bin/perl -w + +my $dos_end = chr(13); + +my %chrom_map = ( + "gi|149288852|ref|NC_000067.5|NC_000067" => "chr1", + "gi|149288869|ref|NC_000076.5|NC_000076" => "chr10", + "gi|149288871|ref|NC_000077.5|NC_000077" => "chr11", + "gi|149292731|ref|NC_000078.5|NC_000078" => "chr12", + "gi|149292733|ref|NC_000079.5|NC_000079" => "chr13", + "gi|149292735|ref|NC_000080.5|NC_000080" => "chr14", + "gi|149301884|ref|NC_000081.5|NC_000081" => "chr15", + "gi|149304713|ref|NC_000082.5|NC_000082" => "chr16", + "gi|149313536|ref|NC_000083.5|NC_000083" => "chr17", + "gi|149321426|ref|NC_000084.5|NC_000084" => "chr18", + "gi|149323268|ref|NC_000085.5|NC_000085" => "chr19", + "gi|149338249|ref|NC_000068.6|NC_000068" => "chr2", + "gi|149352351|ref|NC_000069.5|NC_000069" => "chr3", + "gi|149354223|ref|NC_000070.5|NC_000070" => "chr4", + "gi|149354224|ref|NC_000071.5|NC_000071" => "chr5", + "gi|149361431|ref|NC_000072.5|NC_000072" => "chr6", + "gi|149361432|ref|NC_000073.5|NC_000073" => "chr7", + "gi|149361523|ref|NC_000074.5|NC_000074" => "chr8", + "gi|149361524|ref|NC_000075.5|NC_000075" => "chr9", + "gi|149361525|ref|NC_000086.6|NC_000086" => "chrX", + "gi|149361526|ref|NC_000087.6|NC_000087" => "chrY" +); + +return 1; + +sub get_chrom_map { + return \%chrom_map; +} + +sub sanitize_project { + my ($project) = @_; + $project =~ s/@/_/g; + $project =~ s/\./_/g; + $project =~ s/\s/_/g; + return $project; +} + +sub process_fasta { + my $fn = shift @_; + my $process = shift @_; + my $curr_seq_id = ""; + my $curr_seq = ""; + my $entries = 0; + push (my @opts, @_); + open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n"; + while (<FILE>) { + chomp; + $_ =~ s/$dos_end//g; + if ($_=~/^>/) { + if (length($curr_seq_id) > 0 && length($curr_seq) > 0) { + #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n")); + &$process(\$curr_seq_id, \$curr_seq, \@opts); + $entries++; + } + if (($entries % 100000) == 0) { + my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time); + print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries); + } + $curr_seq_id = $_; + $curr_seq_id =~ s/>//g; + $curr_seq = ""; + } + elsif (length($_) > 0) { $curr_seq = join("", ($curr_seq, $_)); } + } + if (length($curr_seq_id) > 0 && length($curr_seq) > 0) { + &$process(\$curr_seq_id, \$curr_seq, \@opts); + $entries++; + #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed + } + close (FILE); + return $entries; +} + +sub process_fastq { + my $fn = shift @_; + my $process = shift @_; + my $curr_seq_id = ""; + my $curr_str = ""; + my $curr_seq = ""; + my $entries = 0; + push (my @opts, @_); + open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n"; + while (<FILE>) { + chomp; + $_ =~ s/$dos_end//g; + if ($_=~/^@/) { + if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) { + #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n")); + &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts); + $entries++; + } + if (($entries % 100000) == 0) { + my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time); + print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries); + } + $curr_seq_id = $_; + $curr_seq_id =~ s/@//g; + $curr_seq = ""; + $curr_str = ""; + } + elsif ($_=~/^\+/) { + $curr_seq = $curr_str; + $curr_str = $_; + $curr_str =~ s/\+//g; + if ($curr_seq_id ne $curr_str) { + warn "Invalid FASTQ file, sequence id and quality score id mismatch: $curr_seq_id, $curr_str\n"; + return -1; + } + $curr_str = ""; + } + elsif (length($_) > 0) { $curr_str = join("", ($curr_str, $_)); } + } + if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) { + &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts); + $entries++; + #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed + } + close (FILE); + return $entries; +}