view util.pl @ 3:17ce4f3bffa2 default tip

Uploaded
author jesse-erdmann
date Tue, 24 Jan 2012 18:33:41 -0500
parents 1437a2df99c0
children
line wrap: on
line source

#!/project/bioperl/perl-5.10.1-sles11/bin/perl -w

my $dos_end = chr(13);

my %chrom_map = ( 
    "gi|149288852|ref|NC_000067.5|NC_000067" => "chr1",
    "gi|149288869|ref|NC_000076.5|NC_000076" => "chr10",
    "gi|149288871|ref|NC_000077.5|NC_000077" => "chr11",
    "gi|149292731|ref|NC_000078.5|NC_000078" => "chr12",
    "gi|149292733|ref|NC_000079.5|NC_000079" => "chr13",
    "gi|149292735|ref|NC_000080.5|NC_000080" => "chr14",
    "gi|149301884|ref|NC_000081.5|NC_000081" => "chr15",
    "gi|149304713|ref|NC_000082.5|NC_000082" => "chr16",
    "gi|149313536|ref|NC_000083.5|NC_000083" => "chr17",
    "gi|149321426|ref|NC_000084.5|NC_000084" => "chr18",
    "gi|149323268|ref|NC_000085.5|NC_000085" => "chr19",
    "gi|149338249|ref|NC_000068.6|NC_000068" => "chr2",
    "gi|149352351|ref|NC_000069.5|NC_000069" => "chr3",
    "gi|149354223|ref|NC_000070.5|NC_000070" => "chr4",
    "gi|149354224|ref|NC_000071.5|NC_000071" => "chr5",
    "gi|149361431|ref|NC_000072.5|NC_000072" => "chr6",
    "gi|149361432|ref|NC_000073.5|NC_000073" => "chr7",
    "gi|149361523|ref|NC_000074.5|NC_000074" => "chr8",
    "gi|149361524|ref|NC_000075.5|NC_000075" => "chr9",
    "gi|149361525|ref|NC_000086.6|NC_000086" => "chrX",
    "gi|149361526|ref|NC_000087.6|NC_000087" => "chrY"
);

return 1;

sub get_chrom_map {
    return \%chrom_map;
}

sub sanitize_project {
    my ($project) = @_;
    $project =~ s/@/_/g;
    $project =~ s/\./_/g;
    $project =~ s/\s/_/g;
    return $project;
}

sub process_fasta {
    my $fn = shift @_;
    my $process = shift @_;
    my $curr_seq_id = "";
    my $curr_seq = "";
    my $entries = 0;
    push (my @opts, @_);
    open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n";
    while (<FILE>) {
        chomp;
        $_ =~ s/$dos_end//g;
        if ($_=~/^>/) {
            if (length($curr_seq_id) > 0 && length($curr_seq) > 0) {
                #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n"));                                                                                                              
                &$process(\$curr_seq_id, \$curr_seq, \@opts);
                $entries++;
            }
            #if (($entries % 100000) == 0) {
            #    my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time);
            #    print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries);
            #}
            $curr_seq_id = $_;
            $curr_seq_id =~ s/>//g;
            $curr_seq = "";
        }
        elsif (length($_) > 0) { $curr_seq = join("", ($curr_seq, $_)); }
    }
    if (length($curr_seq_id) > 0 && length($curr_seq) > 0) {
        &$process(\$curr_seq_id, \$curr_seq, \@opts);
        $entries++;
        #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed                                                                                      
    }
    close (FILE);
    return $entries;
}

sub process_fastq {
    my $fn = shift @_;
    my $process = shift @_;
    my $curr_seq_id = "";
    my $curr_str = "";
    my $curr_seq = "";
    my $entries = 0;
    push (my @opts, @_);
    open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n";
    while (<FILE>) {
        chomp;
        $_ =~ s/$dos_end//g;
        if ($_=~/^@/) {
            if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) {
                #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n"));                                                                                                              
                &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts);
                $entries++;
            }
            if (($entries % 100000) == 0) {
                my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time);
                print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries);
            }
            $curr_seq_id = $_;
            $curr_seq_id =~ s/@//g;
            $curr_seq = "";
            $curr_str = "";
        }
        elsif ($_=~/^\+/) {
            $curr_seq = $curr_str;
            $curr_str = $_;
            $curr_str =~ s/\+//g;
            if ($curr_seq_id ne $curr_str) {
                warn "Invalid FASTQ file, sequence id and quality score id mismatch: $curr_seq_id, $curr_str\n";
                return -1;
            }
            $curr_str = "";
        }
        elsif (length($_) > 0) { $curr_str = join("", ($curr_str, $_)); }
    }
    if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) {
        &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts);
        $entries++;
        #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed                                                                                      
    }
    close (FILE);
    return $entries;
}