Mercurial > repos > jesse-erdmann > tapdance
comparison util.pl @ 0:1437a2df99c0
Uploaded
author | jesse-erdmann |
---|---|
date | Fri, 09 Dec 2011 11:56:56 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1437a2df99c0 |
---|---|
1 #!/project/bioperl/perl-5.10.1-sles11/bin/perl -w | |
2 | |
3 my $dos_end = chr(13); | |
4 | |
5 my %chrom_map = ( | |
6 "gi|149288852|ref|NC_000067.5|NC_000067" => "chr1", | |
7 "gi|149288869|ref|NC_000076.5|NC_000076" => "chr10", | |
8 "gi|149288871|ref|NC_000077.5|NC_000077" => "chr11", | |
9 "gi|149292731|ref|NC_000078.5|NC_000078" => "chr12", | |
10 "gi|149292733|ref|NC_000079.5|NC_000079" => "chr13", | |
11 "gi|149292735|ref|NC_000080.5|NC_000080" => "chr14", | |
12 "gi|149301884|ref|NC_000081.5|NC_000081" => "chr15", | |
13 "gi|149304713|ref|NC_000082.5|NC_000082" => "chr16", | |
14 "gi|149313536|ref|NC_000083.5|NC_000083" => "chr17", | |
15 "gi|149321426|ref|NC_000084.5|NC_000084" => "chr18", | |
16 "gi|149323268|ref|NC_000085.5|NC_000085" => "chr19", | |
17 "gi|149338249|ref|NC_000068.6|NC_000068" => "chr2", | |
18 "gi|149352351|ref|NC_000069.5|NC_000069" => "chr3", | |
19 "gi|149354223|ref|NC_000070.5|NC_000070" => "chr4", | |
20 "gi|149354224|ref|NC_000071.5|NC_000071" => "chr5", | |
21 "gi|149361431|ref|NC_000072.5|NC_000072" => "chr6", | |
22 "gi|149361432|ref|NC_000073.5|NC_000073" => "chr7", | |
23 "gi|149361523|ref|NC_000074.5|NC_000074" => "chr8", | |
24 "gi|149361524|ref|NC_000075.5|NC_000075" => "chr9", | |
25 "gi|149361525|ref|NC_000086.6|NC_000086" => "chrX", | |
26 "gi|149361526|ref|NC_000087.6|NC_000087" => "chrY" | |
27 ); | |
28 | |
29 return 1; | |
30 | |
31 sub get_chrom_map { | |
32 return \%chrom_map; | |
33 } | |
34 | |
35 sub sanitize_project { | |
36 my ($project) = @_; | |
37 $project =~ s/@/_/g; | |
38 $project =~ s/\./_/g; | |
39 $project =~ s/\s/_/g; | |
40 return $project; | |
41 } | |
42 | |
43 sub process_fasta { | |
44 my $fn = shift @_; | |
45 my $process = shift @_; | |
46 my $curr_seq_id = ""; | |
47 my $curr_seq = ""; | |
48 my $entries = 0; | |
49 push (my @opts, @_); | |
50 open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n"; | |
51 while (<FILE>) { | |
52 chomp; | |
53 $_ =~ s/$dos_end//g; | |
54 if ($_=~/^>/) { | |
55 if (length($curr_seq_id) > 0 && length($curr_seq) > 0) { | |
56 #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n")); | |
57 &$process(\$curr_seq_id, \$curr_seq, \@opts); | |
58 $entries++; | |
59 } | |
60 #if (($entries % 100000) == 0) { | |
61 # my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time); | |
62 # print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries); | |
63 #} | |
64 $curr_seq_id = $_; | |
65 $curr_seq_id =~ s/>//g; | |
66 $curr_seq = ""; | |
67 } | |
68 elsif (length($_) > 0) { $curr_seq = join("", ($curr_seq, $_)); } | |
69 } | |
70 if (length($curr_seq_id) > 0 && length($curr_seq) > 0) { | |
71 &$process(\$curr_seq_id, \$curr_seq, \@opts); | |
72 $entries++; | |
73 #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed | |
74 } | |
75 close (FILE); | |
76 return $entries; | |
77 } | |
78 | |
79 sub process_fastq { | |
80 my $fn = shift @_; | |
81 my $process = shift @_; | |
82 my $curr_seq_id = ""; | |
83 my $curr_str = ""; | |
84 my $curr_seq = ""; | |
85 my $entries = 0; | |
86 push (my @opts, @_); | |
87 open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n"; | |
88 while (<FILE>) { | |
89 chomp; | |
90 $_ =~ s/$dos_end//g; | |
91 if ($_=~/^@/) { | |
92 if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) { | |
93 #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n")); | |
94 &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts); | |
95 $entries++; | |
96 } | |
97 if (($entries % 100000) == 0) { | |
98 my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time); | |
99 print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries); | |
100 } | |
101 $curr_seq_id = $_; | |
102 $curr_seq_id =~ s/@//g; | |
103 $curr_seq = ""; | |
104 $curr_str = ""; | |
105 } | |
106 elsif ($_=~/^\+/) { | |
107 $curr_seq = $curr_str; | |
108 $curr_str = $_; | |
109 $curr_str =~ s/\+//g; | |
110 if ($curr_seq_id ne $curr_str) { | |
111 warn "Invalid FASTQ file, sequence id and quality score id mismatch: $curr_seq_id, $curr_str\n"; | |
112 return -1; | |
113 } | |
114 $curr_str = ""; | |
115 } | |
116 elsif (length($_) > 0) { $curr_str = join("", ($curr_str, $_)); } | |
117 } | |
118 if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) { | |
119 &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts); | |
120 $entries++; | |
121 #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed | |
122 } | |
123 close (FILE); | |
124 return $entries; | |
125 } |