Mercurial > repos > geert-vandeweyer > fastq_all_to_sanger
view FastQ_QualConverter.pl @ 1:3990d6b37e2d draft default tip
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:24:43 -0500 |
parents | c450731486c8 |
children |
line wrap: on
line source
#!/usr/bin/perl # load modules use Getopt::Std; use File::Basename; ########## ## opts ## ########## ## input files # i : path to (i)nput fastq file ## output files # o : (o)utput file ## optional options # f : (f)ormat of input fastq file, default : detect automatically. getopts('i:o:f:', \%opts) ; ## check existence of input file if (!exists($opts{'i'}) || !-e $opts{'i'}) { die('input file not found'); } ############################ ## Construct Quality hash ## ############################ my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'; my @asci = ('') x 33; my @qvalues = split('',$qstring); push(@asci, @qvalues); ############################# ## INPUT FILE SCORE FORMAT ## ############################# my $informat = ''; my $validate = 0; if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') { $informat = $opts{'f'}; } else { $validate = 1; ## track during conversion to make sure. my %tracker = (); my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger. ## scan input file until clear. open IN, $opts{'i'}; $try = 0; while ($try < 15000) { $try ++; my $name = <IN>; my $seq = <IN>; my $dump = <IN>; my $qual = <IN>; chomp($qual); for ($i = 0; $i < length($qual); $i++) { my $char = quotemeta(substr($qual,$i,1)); if ($sanger =~ m/$char/) { $informat = 'sanger'; $try = 50000; print "Item ($char) found in :\n $qual\n"; last; } $tracker{$char} = 1; } } close IN; if ($informat eq '' ) { ## no sanger (or ALL reads should be completely above qual=26) ## check solexa items foreach (qw/;<=>?/) { if (exists($tracker{$_})) { $informat = 'solexa'; last; } } } if ($informat eq '') { ## both illumina 1.3 and 1.5 are phred+64. $informat = 'illumina'; } } print "Input format detected : $informat\n"; ## SET CONVERSION RULE to sanger my @conv_table; if ($informat eq 'solexa') { ## advanced conversion table foreach (@qvalues) { my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred $conv_table[ord($_)] = $asci[$Q + 33]; # phred + 33 = sanger } } elsif ($informat ne 'sanger') { ## easy conversion table foreach (@qvalues) { $conv_table[ord($_)] = $asci[ord($_) - 31]; # base64 - 31 = sanger } } else { foreach (@qvalues) { $conv_table[ord($_)] = $asci[ord($_)]; # sanger = sanger } } if ($informat eq 'sanger') { system("cp -f ".$opts{'i'}." ".$opts{'o'}); exit; } ###################### ## CONVERT THE FILE ## ###################### open IN, $opts{'i'}; open OUT, ">".$opts{'o'}; my $counter = 0; my $outstring = ''; while ($readname = <IN>) { $counter++; my $seq = <IN>; my $readnamebis = <IN>; my $qstring = <IN>; chomp($qstring); my $outqual = ''; foreach $ascval (unpack("C*",$qstring)) { # => returns the asci value of each item in string $outqual .= $conv_table[$ascval]; } $outstring .= "$readname$seq$readnamebis$outqual\n"; if ($counter > 500000) { # print per 500K reads. print OUT $outstring; $outstring = ''; $counter = 0; } } print OUT $outstring; close IN; close OUT;