comparison util.pl @ 0:1437a2df99c0

Uploaded
author jesse-erdmann
date Fri, 09 Dec 2011 11:56:56 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1437a2df99c0
1 #!/project/bioperl/perl-5.10.1-sles11/bin/perl -w
2
3 my $dos_end = chr(13);
4
5 my %chrom_map = (
6 "gi|149288852|ref|NC_000067.5|NC_000067" => "chr1",
7 "gi|149288869|ref|NC_000076.5|NC_000076" => "chr10",
8 "gi|149288871|ref|NC_000077.5|NC_000077" => "chr11",
9 "gi|149292731|ref|NC_000078.5|NC_000078" => "chr12",
10 "gi|149292733|ref|NC_000079.5|NC_000079" => "chr13",
11 "gi|149292735|ref|NC_000080.5|NC_000080" => "chr14",
12 "gi|149301884|ref|NC_000081.5|NC_000081" => "chr15",
13 "gi|149304713|ref|NC_000082.5|NC_000082" => "chr16",
14 "gi|149313536|ref|NC_000083.5|NC_000083" => "chr17",
15 "gi|149321426|ref|NC_000084.5|NC_000084" => "chr18",
16 "gi|149323268|ref|NC_000085.5|NC_000085" => "chr19",
17 "gi|149338249|ref|NC_000068.6|NC_000068" => "chr2",
18 "gi|149352351|ref|NC_000069.5|NC_000069" => "chr3",
19 "gi|149354223|ref|NC_000070.5|NC_000070" => "chr4",
20 "gi|149354224|ref|NC_000071.5|NC_000071" => "chr5",
21 "gi|149361431|ref|NC_000072.5|NC_000072" => "chr6",
22 "gi|149361432|ref|NC_000073.5|NC_000073" => "chr7",
23 "gi|149361523|ref|NC_000074.5|NC_000074" => "chr8",
24 "gi|149361524|ref|NC_000075.5|NC_000075" => "chr9",
25 "gi|149361525|ref|NC_000086.6|NC_000086" => "chrX",
26 "gi|149361526|ref|NC_000087.6|NC_000087" => "chrY"
27 );
28
29 return 1;
30
31 sub get_chrom_map {
32 return \%chrom_map;
33 }
34
35 sub sanitize_project {
36 my ($project) = @_;
37 $project =~ s/@/_/g;
38 $project =~ s/\./_/g;
39 $project =~ s/\s/_/g;
40 return $project;
41 }
42
43 sub process_fasta {
44 my $fn = shift @_;
45 my $process = shift @_;
46 my $curr_seq_id = "";
47 my $curr_seq = "";
48 my $entries = 0;
49 push (my @opts, @_);
50 open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n";
51 while (<FILE>) {
52 chomp;
53 $_ =~ s/$dos_end//g;
54 if ($_=~/^>/) {
55 if (length($curr_seq_id) > 0 && length($curr_seq) > 0) {
56 #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n"));
57 &$process(\$curr_seq_id, \$curr_seq, \@opts);
58 $entries++;
59 }
60 #if (($entries % 100000) == 0) {
61 # my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time);
62 # print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries);
63 #}
64 $curr_seq_id = $_;
65 $curr_seq_id =~ s/>//g;
66 $curr_seq = "";
67 }
68 elsif (length($_) > 0) { $curr_seq = join("", ($curr_seq, $_)); }
69 }
70 if (length($curr_seq_id) > 0 && length($curr_seq) > 0) {
71 &$process(\$curr_seq_id, \$curr_seq, \@opts);
72 $entries++;
73 #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed
74 }
75 close (FILE);
76 return $entries;
77 }
78
79 sub process_fastq {
80 my $fn = shift @_;
81 my $process = shift @_;
82 my $curr_seq_id = "";
83 my $curr_str = "";
84 my $curr_seq = "";
85 my $entries = 0;
86 push (my @opts, @_);
87 open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n";
88 while (<FILE>) {
89 chomp;
90 $_ =~ s/$dos_end//g;
91 if ($_=~/^@/) {
92 if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) {
93 #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n"));
94 &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts);
95 $entries++;
96 }
97 if (($entries % 100000) == 0) {
98 my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time);
99 print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries);
100 }
101 $curr_seq_id = $_;
102 $curr_seq_id =~ s/@//g;
103 $curr_seq = "";
104 $curr_str = "";
105 }
106 elsif ($_=~/^\+/) {
107 $curr_seq = $curr_str;
108 $curr_str = $_;
109 $curr_str =~ s/\+//g;
110 if ($curr_seq_id ne $curr_str) {
111 warn "Invalid FASTQ file, sequence id and quality score id mismatch: $curr_seq_id, $curr_str\n";
112 return -1;
113 }
114 $curr_str = "";
115 }
116 elsif (length($_) > 0) { $curr_str = join("", ($curr_str, $_)); }
117 }
118 if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) {
119 &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts);
120 $entries++;
121 #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed
122 }
123 close (FILE);
124 return $entries;
125 }