Mercurial > repos > geert-vandeweyer > fastq_all_to_sanger
changeset 0:c450731486c8 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:24:33 -0500 |
parents | |
children | 3990d6b37e2d |
files | FastQ_QualConverter.pl |
diffstat | 1 files changed, 138 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/FastQ_QualConverter.pl Thu Feb 13 08:24:33 2014 -0500 @@ -0,0 +1,138 @@ +#!/usr/bin/perl + +# load modules +use Getopt::Std; +use File::Basename; + +########## +## opts ## +########## +## input files +# i : path to (i)nput fastq file +## output files +# o : (o)utput file +## optional options +# f : (f)ormat of input fastq file, default : detect automatically. +getopts('i:o:f:', \%opts) ; + +## check existence of input file +if (!exists($opts{'i'}) || !-e $opts{'i'}) { + die('input file not found'); +} + + + +############################ +## Construct Quality hash ## +############################ +my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'; +my @asci = ('') x 33; +my @qvalues = split('',$qstring); +push(@asci, @qvalues); + +############################# +## INPUT FILE SCORE FORMAT ## +############################# +my $informat = ''; +my $validate = 0; +if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') { + $informat = $opts{'f'}; +} +else { + $validate = 1; ## track during conversion to make sure. + my %tracker = (); + my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger. + ## scan input file until clear. + open IN, $opts{'i'}; + $try = 0; + while ($try < 15000) { + $try ++; + my $name = <IN>; + my $seq = <IN>; + my $dump = <IN>; + my $qual = <IN>; + chomp($qual); + for ($i = 0; $i < length($qual); $i++) { + my $char = quotemeta(substr($qual,$i,1)); + if ($sanger =~ m/$char/) { + $informat = 'sanger'; + $try = 50000; + print "Item ($char) found in :\n $qual\n"; + last; + } + $tracker{$char} = 1; + } + } + close IN; + if ($informat eq '' ) { + ## no sanger (or ALL reads should be completely above qual=26) + ## check solexa items + foreach (qw/;<=>?/) { + if (exists($tracker{$_})) { + $informat = 'solexa'; + last; + } + } + } + if ($informat eq '') { + ## both illumina 1.3 and 1.5 are phred+64. + $informat = 'illumina'; + } +} + + +print "Input format detected : $informat\n"; + +## SET CONVERSION RULE to sanger +my @conv_table; +if ($informat eq 'solexa') { + ## advanced conversion table + foreach (@qvalues) { + my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred + $conv_table[ord($_)] = $asci[$Q + 33]; # phred + 33 = sanger + } +} +elsif ($informat ne 'sanger') { + ## easy conversion table + foreach (@qvalues) { + $conv_table[ord($_)] = $asci[ord($_) - 31]; # base64 - 31 = sanger + } +} +else { + foreach (@qvalues) { + $conv_table[ord($_)] = $asci[ord($_)]; # sanger = sanger + } +} + +if ($informat eq 'sanger') { + system("cp -f ".$opts{'i'}." ".$opts{'o'}); + exit; +} +###################### +## CONVERT THE FILE ## +###################### +open IN, $opts{'i'}; +open OUT, ">".$opts{'o'}; +my $counter = 0; +my $outstring = ''; +while ($readname = <IN>) { + $counter++; + my $seq = <IN>; + my $readnamebis = <IN>; + my $qstring = <IN>; + chomp($qstring); + my $outqual = ''; + foreach $ascval (unpack("C*",$qstring)) { # => returns the asci value of each item in string + $outqual .= $conv_table[$ascval]; + } + $outstring .= "$readname$seq$readnamebis$outqual\n"; + if ($counter > 500000) { # print per 500K reads. + print OUT $outstring; + $outstring = ''; + $counter = 0; + } + +} +print OUT $outstring; +close IN; +close OUT;