Mercurial > repos > jjohnson > fastq_sync
view pe_sync_2_files.pl @ 1:b0ab279b5add default tip
Add test cases
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Wed, 20 Mar 2013 08:26:48 -0500 |
parents | 751f4938cf0d |
children |
line wrap: on
line source
#!/usr/bin/perl -w ############################################### # pe-sync-2-files.pl # this is a stripped-down version of pe-sync-directory.pl # John Garbe # February 2012 # # Paired-end read file synchronization script: checks if left and right # reads are in the same order. # This script checks the two files specificd on the command line. # The "quick" option will not report the percent of out-of-sync reads # for many failed files, but will run much faster. # # This script should run fine on linux systems or solaris # (which is what the project space fileserver runs) # # This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids. # Other types of read ID formats will cause a terminal error. # ############################################### die "Usage: pe-sync-directory.pl [quick] left_fastqfile right_fastqfile\n" unless ($#ARGV > 0); # determine if we're running on solaris in order to change system calls $output = `perl -v`; $solaris = 0; $solaris = 1 if ($output =~ /solaris/); if ($solaris) { # print STDERR "Running on Solaris\n"; } # handle "quick" option $quick = 0; # 1 for quickcheck, 0 for thorough check if (defined($ARGV[0])) { if ($ARGV[0] eq "quick") { $quick = 1; shift @ARGV; } } print STDERR "QUICK CHECK enabled\n" if ($quick); $filename_r1 = $ARGV[0]; $filename_r2 = $ARGV[1]; # identify read id format type if ($solaris) { $readid = `head -1 $filename_r1`; } else { $readid = `head -n 1 $filename_r1`; } chomp $readid; if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line $id_type = 1; print STDERR "Casava 1.7 read id style\n"; # TESTING } elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace $id_type = 2; print STDERR "Casava 1.8 read id style\n"; # TESTING } else { print STDERR "Cannot determine read id style\n"; print STDOUT "Unknwon id style: $readid\n"; exit 1; } chomp $filename_r1; chomp $filename_r2; if ($quick) { $failure = &quickcheck($filename_r1, $filename_r2); # quickie check print STDERR "FAILED quickcheck\n" if ($failure); } # If we aren't doing the quick check, or the quick check passed, do the slow check if (!($quick) || !($failure)) { $failure = &percent($filename_r1, $filename_r2); # slow thorough check } exit; ######################## subs ######################### # do a quick check to see if the files are bad (many bad files will pass this quick check) sub quickcheck { my ($file1, $file2) = @_; if ($solaris) { $result1 = `tail -10000l $file1 | head -1`; # grab the line 10,000 lines from the EOF $result2 = `tail -10000l $file2 | head -1`; } else { $result1 = `tail -n 10000 $file1 | head -n 1`; $result2 = `tail -n 10000 $file2 | head -n 1`; } # process the ids depending on id style if ($id_type == 1) { # 1 or 2 at end of id chomp $result1; chomp $result2; chop $result1; chop $result2; } else { # first word is a unique read id, second word is left/right-specific ($result1) = split ' ', $result1; ($result2) = split ' ', $result2; } if ($result1 eq $result2) { # print "pass\n"; return 0; } else { # print "fail\n"; return 1; } } # compare every read id from each file and determine what percent of reads are # out of sync sub percent { my($file1, $file2) = @_; open R1, '<', $file1 or die "Couldn't open $file1 for reading:$!"; open R2, '<', $file2 or die "Couldn't open $file2 for reading:$!"; my $readCount = 0; my $failcount = 0; while (!eof(R1) and !eof(R2)) { $id1 = <R1>; $id2 = <R2>; next unless ($. % 4 == 1); # check every sequence (4th line in the file) # process the ids depending on id style if ($id_type == 1) { # 1 or 2 at end of id chomp $id1; # newline chop $id1; # last char of the id (should be the "1" or "2") chomp $id2; chop $id2; } else { ($id1) = split ' ', $id1; ($id2) = split ' ', $id2; } $failcount++ unless ($id1 eq $id2); $readCount++; } my $percent = $failcount / $readCount; $prettypercent = $percent * 100; # make it percent $prettypercent = sprintf("%.2f", $prettypercent); if ($failcount > 0) { print STDERR "FAILED full check\t$prettypercent% of reads are out-of-sync\n"; $exitStatus=1; } else { print STDERR "PASSED full check\n"; $exitStatus=0; } return $exitStatus; }