Mercurial > repos > jesse-erdmann > tapdance
view util.pl @ 3:17ce4f3bffa2 default tip
Uploaded
author | jesse-erdmann |
---|---|
date | Tue, 24 Jan 2012 18:33:41 -0500 |
parents | 1437a2df99c0 |
children |
line wrap: on
line source
#!/project/bioperl/perl-5.10.1-sles11/bin/perl -w my $dos_end = chr(13); my %chrom_map = ( "gi|149288852|ref|NC_000067.5|NC_000067" => "chr1", "gi|149288869|ref|NC_000076.5|NC_000076" => "chr10", "gi|149288871|ref|NC_000077.5|NC_000077" => "chr11", "gi|149292731|ref|NC_000078.5|NC_000078" => "chr12", "gi|149292733|ref|NC_000079.5|NC_000079" => "chr13", "gi|149292735|ref|NC_000080.5|NC_000080" => "chr14", "gi|149301884|ref|NC_000081.5|NC_000081" => "chr15", "gi|149304713|ref|NC_000082.5|NC_000082" => "chr16", "gi|149313536|ref|NC_000083.5|NC_000083" => "chr17", "gi|149321426|ref|NC_000084.5|NC_000084" => "chr18", "gi|149323268|ref|NC_000085.5|NC_000085" => "chr19", "gi|149338249|ref|NC_000068.6|NC_000068" => "chr2", "gi|149352351|ref|NC_000069.5|NC_000069" => "chr3", "gi|149354223|ref|NC_000070.5|NC_000070" => "chr4", "gi|149354224|ref|NC_000071.5|NC_000071" => "chr5", "gi|149361431|ref|NC_000072.5|NC_000072" => "chr6", "gi|149361432|ref|NC_000073.5|NC_000073" => "chr7", "gi|149361523|ref|NC_000074.5|NC_000074" => "chr8", "gi|149361524|ref|NC_000075.5|NC_000075" => "chr9", "gi|149361525|ref|NC_000086.6|NC_000086" => "chrX", "gi|149361526|ref|NC_000087.6|NC_000087" => "chrY" ); return 1; sub get_chrom_map { return \%chrom_map; } sub sanitize_project { my ($project) = @_; $project =~ s/@/_/g; $project =~ s/\./_/g; $project =~ s/\s/_/g; return $project; } sub process_fasta { my $fn = shift @_; my $process = shift @_; my $curr_seq_id = ""; my $curr_seq = ""; my $entries = 0; push (my @opts, @_); open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n"; while (<FILE>) { chomp; $_ =~ s/$dos_end//g; if ($_=~/^>/) { if (length($curr_seq_id) > 0 && length($curr_seq) > 0) { #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n")); &$process(\$curr_seq_id, \$curr_seq, \@opts); $entries++; } #if (($entries % 100000) == 0) { # my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time); # print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries); #} $curr_seq_id = $_; $curr_seq_id =~ s/>//g; $curr_seq = ""; } elsif (length($_) > 0) { $curr_seq = join("", ($curr_seq, $_)); } } if (length($curr_seq_id) > 0 && length($curr_seq) > 0) { &$process(\$curr_seq_id, \$curr_seq, \@opts); $entries++; #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed } close (FILE); return $entries; } sub process_fastq { my $fn = shift @_; my $process = shift @_; my $curr_seq_id = ""; my $curr_str = ""; my $curr_seq = ""; my $entries = 0; push (my @opts, @_); open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n"; while (<FILE>) { chomp; $_ =~ s/$dos_end//g; if ($_=~/^@/) { if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) { #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n")); &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts); $entries++; } if (($entries % 100000) == 0) { my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time); print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries); } $curr_seq_id = $_; $curr_seq_id =~ s/@//g; $curr_seq = ""; $curr_str = ""; } elsif ($_=~/^\+/) { $curr_seq = $curr_str; $curr_str = $_; $curr_str =~ s/\+//g; if ($curr_seq_id ne $curr_str) { warn "Invalid FASTQ file, sequence id and quality score id mismatch: $curr_seq_id, $curr_str\n"; return -1; } $curr_str = ""; } elsif (length($_) > 0) { $curr_str = join("", ($curr_str, $_)); } } if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) { &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts); $entries++; #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed } close (FILE); return $entries; }