annotate lib/util.pl @ 3:17ce4f3bffa2 default tip

Uploaded
author jesse-erdmann
date Tue, 24 Jan 2012 18:33:41 -0500
parents 1437a2df99c0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
1 #!/project/bioperl/perl-5.10.1-sles11/bin/perl -w
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
2
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
# Carriage return (character code 13): the byte that DOS/Windows line
# endings leave behind once chomp has removed the trailing newline.
my $dos_end = "\x0D";

# Lookup table from full sequence header ("gi|...|ref|...|...") to a short
# chromosome label (chr1..chr19, chrX, chrY).
# NOTE(review): the NC_000067..NC_000087 accessions look like one reference
# assembly build -- confirm which build before adding or changing entries.
my %chrom_map = (
    'gi|149288852|ref|NC_000067.5|NC_000067' => 'chr1',
    'gi|149338249|ref|NC_000068.6|NC_000068' => 'chr2',
    'gi|149352351|ref|NC_000069.5|NC_000069' => 'chr3',
    'gi|149354223|ref|NC_000070.5|NC_000070' => 'chr4',
    'gi|149354224|ref|NC_000071.5|NC_000071' => 'chr5',
    'gi|149361431|ref|NC_000072.5|NC_000072' => 'chr6',
    'gi|149361432|ref|NC_000073.5|NC_000073' => 'chr7',
    'gi|149361523|ref|NC_000074.5|NC_000074' => 'chr8',
    'gi|149361524|ref|NC_000075.5|NC_000075' => 'chr9',
    'gi|149288869|ref|NC_000076.5|NC_000076' => 'chr10',
    'gi|149288871|ref|NC_000077.5|NC_000077' => 'chr11',
    'gi|149292731|ref|NC_000078.5|NC_000078' => 'chr12',
    'gi|149292733|ref|NC_000079.5|NC_000079' => 'chr13',
    'gi|149292735|ref|NC_000080.5|NC_000080' => 'chr14',
    'gi|149301884|ref|NC_000081.5|NC_000081' => 'chr15',
    'gi|149304713|ref|NC_000082.5|NC_000082' => 'chr16',
    'gi|149313536|ref|NC_000083.5|NC_000083' => 'chr17',
    'gi|149321426|ref|NC_000084.5|NC_000084' => 'chr18',
    'gi|149323268|ref|NC_000085.5|NC_000085' => 'chr19',
    'gi|149361525|ref|NC_000086.6|NC_000086' => 'chrX',
    'gi|149361526|ref|NC_000087.6|NC_000087' => 'chrY',
);

# File-level return: hands a true value back to whoever loads this file
# with require/do. The named subs that follow are still available because
# sub definitions are installed at compile time, before this line runs.
return 1;
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
30
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
# get_chrom_map()
#
# Accessor for the file-level header-to-chromosome table.
#
# Returns: a reference to the %chrom_map hash itself (not a copy), so any
# change made through the reference is seen by every later caller.
sub get_chrom_map {
    my $chrom_map_ref = \%chrom_map;
    return $chrom_map_ref;
}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
34
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
# sanitize_project($project)
#
# Returns a copy of $project in which every '@', every '.' and every
# whitespace character has been replaced by an underscore. The caller's
# variable is not modified, because the work is done on a private copy.
sub sanitize_project {
    my $clean_name = $_[0];

    # One pass over the string: the character class covers the same three
    # kinds of character that are being replaced.
    $clean_name =~ s/[\@.\s]/_/g;

    return $clean_name;
}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
42
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
# process_fasta(\$filename, \&callback, @opts)
#
# Streams a FASTA file and invokes the callback once per record that has
# both a non-empty header and a non-empty sequence.
#
# Parameters:
#   $fn      - reference to a scalar holding the path of the file to read.
#   $process - code reference, called as
#                  $process->(\$id, \$sequence, \@opts)
#              $id is the header line without its leading '>'; $sequence is
#              all sequence lines of the record concatenated. Both are
#              references to buffers that are reused for the next record,
#              so copy the values if they must outlive the call.
#   @opts    - any further arguments; passed through to the callback
#              untouched as an array reference.
#
# Returns: the number of records handed to the callback.
# Dies:    if the file cannot be opened.
#
# Side effect: prints a timestamped progress line to STDOUT at the first
# header and whenever the record count reaches a multiple of 100000
# (each count is reported once).
sub process_fasta {
    my ($fn, $process, @opts) = @_;
    my $curr_seq_id   = "";
    my $curr_seq      = "";
    my $entries       = 0;
    my $last_reported = -1;    # last count a progress line was printed for

    # Three-argument open on a lexical handle: the path is never parsed
    # for mode characters, and the handle is private to this call, so a
    # callback that itself reads another file cannot disturb this loop.
    open(my $in, '<', ${$fn}) or die "Unable to open ${$fn}, $!\n";

    # Read into a lexical rather than $_ so the callback cannot clobber
    # the current line (it runs before the new header is stored).
    while (my $line = <$in>) {
        chomp $line;
        $line =~ tr/\x0D//d;    # drop carriage returns from DOS line endings

        if ($line =~ /^>/) {
            # A new header closes the previous record, if it was complete.
            if (length($curr_seq_id) > 0 && length($curr_seq) > 0) {
                $process->(\$curr_seq_id, \$curr_seq, \@opts);
                $entries++;
            }
            if (($entries % 100000) == 0 && $entries != $last_reported) {
                my ($sec, $min, $hour, $day, $mon) = localtime(time);
                print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries);
                $last_reported = $entries;
            }
            # Strip only the leading marker; a '>' inside the description
            # is part of the header text and is kept.
            $curr_seq_id = substr($line, 1);
            $curr_seq    = "";
        }
        elsif (length($line) > 0) {
            $curr_seq .= $line;
        }
    }

    # The last record has no following header to trigger it.
    if (length($curr_seq_id) > 0 && length($curr_seq) > 0) {
        $process->(\$curr_seq_id, \$curr_seq, \@opts);
        $entries++;
    }

    close($in);
    return $entries;
}
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
78
1437a2df99c0 Uploaded
jesse-erdmann
parents:
diff changeset
# process_fastq(\$filename, \&callback, @opts)
#
# Streams a FASTQ file and invokes the callback once per record that has a
# non-empty id, sequence and quality string.
#
# Parameters:
#   $fn      - reference to a scalar holding the path of the file to read.
#   $process - code reference, called as
#                  $process->(\$id, \$sequence, \$quality, \@opts)
#              $id is the '@' line without its leading '@'. All three are
#              references to buffers that are reused for the next record,
#              so copy the values if they must outlive the call.
#   @opts    - any further arguments; passed through to the callback
#              untouched as an array reference.
#
# Returns: the number of records handed to the callback, or -1 (after a
#          warning) when a '+' line repeats an id that differs from the
#          record's '@' id. A bare '+' line is accepted.
# Dies:    if the file cannot be opened.
#
# Parsing rules:
#   * Lines between the '@' line and the '+' line are sequence; lines after
#     the '+' line are quality. Either part may span several lines.
#   * The first quality line of a record is taken as quality data whatever
#     its first character, because '@' and '+' are valid quality symbols.
#   * After that first quality line, a line starting with '@' begins the
#     next record. A wrapped quality continuation line that happens to
#     start with '@' is therefore still ambiguous.
#
# Side effect: prints a timestamped progress line to STDOUT at the first
# header and whenever the record count reaches a multiple of 100000
# (each count is reported once).
sub process_fastq {
    my ($fn, $process, @opts) = @_;
    my $curr_seq_id   = "";
    my $curr_seq      = "";
    my $curr_qual     = "";
    my $in_quality    = 0;     # true once the record's '+' line was seen
    my $entries       = 0;
    my $last_reported = -1;    # last count a progress line was printed for

    # Three-argument open on a lexical handle: the path is never parsed
    # for mode characters, and the handle is private to this call, so a
    # callback that itself reads another file cannot disturb this loop.
    open(my $in, '<', ${$fn}) or die "Unable to open ${$fn}, $!\n";

    # Read into a lexical rather than $_ so the callback cannot clobber
    # the current line (it runs before the new header is stored).
    while (my $line = <$in>) {
        chomp $line;
        $line =~ tr/\x0D//d;    # drop carriage returns from DOS line endings
        next if length($line) == 0;

        if ($in_quality && length($curr_qual) == 0 && length($curr_seq) > 0) {
            # First quality line: never a header or separator, even when it
            # starts with '@' or '+'.
            $curr_qual = $line;
        }
        elsif ($line =~ /^@/) {
            # A new header closes the previous record, if it was complete.
            if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_qual) > 0) {
                $process->(\$curr_seq_id, \$curr_seq, \$curr_qual, \@opts);
                $entries++;
            }
            if (($entries % 100000) == 0 && $entries != $last_reported) {
                my ($sec, $min, $hour, $day, $mon) = localtime(time);
                print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries);
                $last_reported = $entries;
            }
            # Strip only the leading marker so ids containing '@' survive.
            $curr_seq_id = substr($line, 1);
            $curr_seq    = "";
            $curr_qual   = "";
            $in_quality  = 0;
        }
        elsif (!$in_quality && $line =~ /^\+/) {
            # Separator line: the id after '+' is optional, but when it is
            # present it must match the record's '@' id.
            my $repeat_id = substr($line, 1);
            if (length($repeat_id) > 0 && $curr_seq_id ne $repeat_id) {
                warn "Invalid FASTQ file, sequence id and quality score id mismatch: $curr_seq_id, $repeat_id\n";
                close($in);
                return -1;
            }
            $in_quality = 1;
        }
        elsif ($in_quality) {
            $curr_qual .= $line;    # wrapped quality continuation
        }
        else {
            $curr_seq .= $line;     # sequence line (possibly wrapped)
        }
    }

    # The last record has no following header to trigger it.
    if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_qual) > 0) {
        $process->(\$curr_seq_id, \$curr_seq, \$curr_qual, \@opts);
        $entries++;
    }

    close($in);
    return $entries;
}