0
|
1 #!/project/bioperl/perl-5.10.1-sles11/bin/perl -w
|
|
2
|
|
3 my $dos_end = chr(13);
|
|
4
|
|
5 my %chrom_map = (
|
|
6 "gi|149288852|ref|NC_000067.5|NC_000067" => "chr1",
|
|
7 "gi|149288869|ref|NC_000076.5|NC_000076" => "chr10",
|
|
8 "gi|149288871|ref|NC_000077.5|NC_000077" => "chr11",
|
|
9 "gi|149292731|ref|NC_000078.5|NC_000078" => "chr12",
|
|
10 "gi|149292733|ref|NC_000079.5|NC_000079" => "chr13",
|
|
11 "gi|149292735|ref|NC_000080.5|NC_000080" => "chr14",
|
|
12 "gi|149301884|ref|NC_000081.5|NC_000081" => "chr15",
|
|
13 "gi|149304713|ref|NC_000082.5|NC_000082" => "chr16",
|
|
14 "gi|149313536|ref|NC_000083.5|NC_000083" => "chr17",
|
|
15 "gi|149321426|ref|NC_000084.5|NC_000084" => "chr18",
|
|
16 "gi|149323268|ref|NC_000085.5|NC_000085" => "chr19",
|
|
17 "gi|149338249|ref|NC_000068.6|NC_000068" => "chr2",
|
|
18 "gi|149352351|ref|NC_000069.5|NC_000069" => "chr3",
|
|
19 "gi|149354223|ref|NC_000070.5|NC_000070" => "chr4",
|
|
20 "gi|149354224|ref|NC_000071.5|NC_000071" => "chr5",
|
|
21 "gi|149361431|ref|NC_000072.5|NC_000072" => "chr6",
|
|
22 "gi|149361432|ref|NC_000073.5|NC_000073" => "chr7",
|
|
23 "gi|149361523|ref|NC_000074.5|NC_000074" => "chr8",
|
|
24 "gi|149361524|ref|NC_000075.5|NC_000075" => "chr9",
|
|
25 "gi|149361525|ref|NC_000086.6|NC_000086" => "chrX",
|
|
26 "gi|149361526|ref|NC_000087.6|NC_000087" => "chrY"
|
|
27 );
|
|
28
|
|
29 return 1;
|
|
30
|
|
31 sub get_chrom_map {
|
|
32 return \%chrom_map;
|
|
33 }
|
|
34
|
|
35 sub sanitize_project {
|
|
36 my ($project) = @_;
|
|
37 $project =~ s/@/_/g;
|
|
38 $project =~ s/\./_/g;
|
|
39 $project =~ s/\s/_/g;
|
|
40 return $project;
|
|
41 }
|
|
42
|
|
43 sub process_fasta {
|
|
44 my $fn = shift @_;
|
|
45 my $process = shift @_;
|
|
46 my $curr_seq_id = "";
|
|
47 my $curr_seq = "";
|
|
48 my $entries = 0;
|
|
49 push (my @opts, @_);
|
|
50 open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n";
|
|
51 while (<FILE>) {
|
|
52 chomp;
|
|
53 $_ =~ s/$dos_end//g;
|
|
54 if ($_=~/^>/) {
|
|
55 if (length($curr_seq_id) > 0 && length($curr_seq) > 0) {
|
|
56 #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n"));
|
|
57 &$process(\$curr_seq_id, \$curr_seq, \@opts);
|
|
58 $entries++;
|
|
59 }
|
|
60 if (($entries % 100000) == 0) {
|
|
61 my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time);
|
|
62 print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries);
|
|
63 }
|
|
64 $curr_seq_id = $_;
|
|
65 $curr_seq_id =~ s/>//g;
|
|
66 $curr_seq = "";
|
|
67 }
|
|
68 elsif (length($_) > 0) { $curr_seq = join("", ($curr_seq, $_)); }
|
|
69 }
|
|
70 if (length($curr_seq_id) > 0 && length($curr_seq) > 0) {
|
|
71 &$process(\$curr_seq_id, \$curr_seq, \@opts);
|
|
72 $entries++;
|
|
73 #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed
|
|
74 }
|
|
75 close (FILE);
|
|
76 return $entries;
|
|
77 }
|
|
78
|
|
79 sub process_fastq {
|
|
80 my $fn = shift @_;
|
|
81 my $process = shift @_;
|
|
82 my $curr_seq_id = "";
|
|
83 my $curr_str = "";
|
|
84 my $curr_seq = "";
|
|
85 my $entries = 0;
|
|
86 push (my @opts, @_);
|
|
87 open (FILE, "<${$fn}") || die "Unable to open ${$fn}, $!\n";
|
|
88 while (<FILE>) {
|
|
89 chomp;
|
|
90 $_ =~ s/$dos_end//g;
|
|
91 if ($_=~/^@/) {
|
|
92 if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) {
|
|
93 #print join("\t", ("Joined:", $curr_seq_id, $curr_seq, @opts, "\n"));
|
|
94 &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts);
|
|
95 $entries++;
|
|
96 }
|
|
97 if (($entries % 100000) == 0) {
|
|
98 my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time);
|
|
99 print sprintf("%02d-%02d %02d:%02d:%02d Entries Processed: %9s\n", $mon+1, $day, $hour, $min, $sec, $entries);
|
|
100 }
|
|
101 $curr_seq_id = $_;
|
|
102 $curr_seq_id =~ s/@//g;
|
|
103 $curr_seq = "";
|
|
104 $curr_str = "";
|
|
105 }
|
|
106 elsif ($_=~/^\+/) {
|
|
107 $curr_seq = $curr_str;
|
|
108 $curr_str = $_;
|
|
109 $curr_str =~ s/\+//g;
|
|
110 if ($curr_seq_id ne $curr_str) {
|
|
111 warn "Invalid FASTQ file, sequence id and quality score id mismatch: $curr_seq_id, $curr_str\n";
|
|
112 return -1;
|
|
113 }
|
|
114 $curr_str = "";
|
|
115 }
|
|
116 elsif (length($_) > 0) { $curr_str = join("", ($curr_str, $_)); }
|
|
117 }
|
|
118 if (length($curr_seq_id) > 0 && length($curr_seq) > 0 && length($curr_str) > 0) {
|
|
119 &$process(\$curr_seq_id, \$curr_seq, \$curr_str, \@opts);
|
|
120 $entries++;
|
|
121 #$entries = 1; #for performance, returning 1 as true indicating there is at least one entry processed
|
|
122 }
|
|
123 close (FILE);
|
|
124 return $entries;
|
|
125 }
|