0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # load modules
|
|
4 use Getopt::Std;
|
|
5 use File::Basename;
|
|
6
|
|
7 ##########
|
|
8 ## opts ##
|
|
9 ##########
|
|
10 ## input files
|
|
11 # i : path to (i)nput fastq file
|
|
12 ## output files
|
|
13 # o : (o)utput file
|
|
14 ## optional options
|
|
15 # f : (f)ormat of input fastq file, default : detect automatically.
|
|
16 getopts('i:o:f:', \%opts) ;
|
|
17
|
|
18 ## check existence of input file
|
|
19 if (!exists($opts{'i'}) || !-e $opts{'i'}) {
|
|
20 die('input file not found');
|
|
21 }
|
|
22
|
|
23
|
|
24
|
|
25 ############################
|
|
26 ## Construct Quality hash ##
|
|
27 ############################
|
|
28 my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~';
|
|
29 my @asci = ('') x 33;
|
|
30 my @qvalues = split('',$qstring);
|
|
31 push(@asci, @qvalues);
|
|
32
|
|
33 #############################
|
|
34 ## INPUT FILE SCORE FORMAT ##
|
|
35 #############################
|
|
36 my $informat = '';
|
|
37 my $validate = 0;
|
|
38 if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') {
|
|
39 $informat = $opts{'f'};
|
|
40 }
|
|
41 else {
|
|
42 $validate = 1; ## track during conversion to make sure.
|
|
43 my %tracker = ();
|
|
44 my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger.
|
|
45 ## scan input file until clear.
|
|
46 open IN, $opts{'i'};
|
|
47 $try = 0;
|
|
48 while ($try < 15000) {
|
|
49 $try ++;
|
|
50 my $name = <IN>;
|
|
51 my $seq = <IN>;
|
|
52 my $dump = <IN>;
|
|
53 my $qual = <IN>;
|
|
54 chomp($qual);
|
|
55 for ($i = 0; $i < length($qual); $i++) {
|
|
56 my $char = quotemeta(substr($qual,$i,1));
|
|
57 if ($sanger =~ m/$char/) {
|
|
58 $informat = 'sanger';
|
|
59 $try = 50000;
|
|
60 print "Item ($char) found in :\n $qual\n";
|
|
61 last;
|
|
62 }
|
|
63 $tracker{$char} = 1;
|
|
64 }
|
|
65 }
|
|
66 close IN;
|
|
67 if ($informat eq '' ) {
|
|
68 ## no sanger (or ALL reads should be completely above qual=26)
|
|
69 ## check solexa items
|
|
70 foreach (qw/;<=>?/) {
|
|
71 if (exists($tracker{$_})) {
|
|
72 $informat = 'solexa';
|
|
73 last;
|
|
74 }
|
|
75 }
|
|
76 }
|
|
77 if ($informat eq '') {
|
|
78 ## both illumina 1.3 and 1.5 are phred+64.
|
|
79 $informat = 'illumina';
|
|
80 }
|
|
81 }
|
|
82
|
|
83
|
|
84 print "Input format detected : $informat\n";
|
|
85
|
|
86 ## SET CONVERSION RULE to sanger
|
|
87 my @conv_table;
|
|
88 if ($informat eq 'solexa') {
|
|
89 ## advanced conversion table
|
|
90 foreach (@qvalues) {
|
|
91 my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred
|
|
92 $conv_table[ord($_)] = $asci[$Q + 33]; # phred + 33 = sanger
|
|
93 }
|
|
94 }
|
|
95 elsif ($informat ne 'sanger') {
|
|
96 ## easy conversion table
|
|
97 foreach (@qvalues) {
|
|
98 $conv_table[ord($_)] = $asci[ord($_) - 31]; # base64 - 31 = sanger
|
|
99 }
|
|
100 }
|
|
101 else {
|
|
102 foreach (@qvalues) {
|
|
103 $conv_table[ord($_)] = $asci[ord($_)]; # sanger = sanger
|
|
104 }
|
|
105 }
|
|
106
|
|
107 if ($informat eq 'sanger') {
|
|
108 system("cp -f ".$opts{'i'}." ".$opts{'o'});
|
|
109 exit;
|
|
110 }
|
|
111 ######################
|
|
112 ## CONVERT THE FILE ##
|
|
113 ######################
|
|
114 open IN, $opts{'i'};
|
|
115 open OUT, ">".$opts{'o'};
|
|
116 my $counter = 0;
|
|
117 my $outstring = '';
|
|
118 while ($readname = <IN>) {
|
|
119 $counter++;
|
|
120 my $seq = <IN>;
|
|
121 my $readnamebis = <IN>;
|
|
122 my $qstring = <IN>;
|
|
123 chomp($qstring);
|
|
124 my $outqual = '';
|
|
125 foreach $ascval (unpack("C*",$qstring)) { # => returns the asci value of each item in string
|
|
126 $outqual .= $conv_table[$ascval];
|
|
127 }
|
|
128 $outstring .= "$readname$seq$readnamebis$outqual\n";
|
|
129 if ($counter > 500000) { # print per 500K reads.
|
|
130 print OUT $outstring;
|
|
131 $outstring = '';
|
|
132 $counter = 0;
|
|
133 }
|
|
134
|
|
135 }
|
|
136 print OUT $outstring;
|
|
137 close IN;
|
|
138 close OUT;
|