Mercurial > repos > jjohnson > fastq_sync
comparison pe_sync_2_files.pl @ 0:751f4938cf0d
Uploaded
| author | jjohnson |
|---|---|
| date | Tue, 05 Feb 2013 15:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:751f4938cf0d |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 ############################################### | |
| 4 # pe-sync-2-files.pl # this is a stripped-down version of pe-sync-directory.pl | |
| 5 # John Garbe | |
| 6 # February 2012 | |
| 7 # | |
| 8 # Paired-end read file synchronization script: checks if left and right | |
| 9 # reads are in the same order. | |
| 10 # This script checks the two files specificd on the command line. | |
| 11 # The "quick" option will not report the percent of out-of-sync reads | |
| 12 # for many failed files, but will run much faster. | |
| 13 # | |
| 14 # This script should run fine on linux systems or solaris | |
| 15 # (which is what the project space fileserver runs) | |
| 16 # | |
| 17 # This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids. | |
| 18 # Other types of read ID formats will cause a terminal error. | |
| 19 # | |
| 20 ############################################### | |
| 21 | |
| 22 die "Usage: pe-sync-directory.pl [quick] left_fastqfile right_fastqfile\n" unless ($#ARGV > 0); | |
| 23 | |
| 24 # determine if we're running on solaris in order to change system calls | |
| 25 $output = `perl -v`; | |
| 26 $solaris = 0; | |
| 27 $solaris = 1 if ($output =~ /solaris/); | |
| 28 if ($solaris) { | |
| 29 # print STDERR "Running on Solaris\n"; | |
| 30 } | |
| 31 | |
| 32 # handle "quick" option | |
| 33 $quick = 0; # 1 for quickcheck, 0 for thorough check | |
| 34 if (defined($ARGV[0])) { | |
| 35 if ($ARGV[0] eq "quick") { | |
| 36 $quick = 1; | |
| 37 shift @ARGV; | |
| 38 } | |
| 39 } | |
| 40 print STDERR "QUICK CHECK enabled\n" if ($quick); | |
| 41 | |
| 42 $filename_r1 = $ARGV[0]; | |
| 43 $filename_r2 = $ARGV[1]; | |
| 44 | |
| 45 # identify read id format type | |
| 46 if ($solaris) { | |
| 47 $readid = `head -1 $filename_r1`; | |
| 48 } else { | |
| 49 $readid = `head -n 1 $filename_r1`; | |
| 50 } | |
| 51 chomp $readid; | |
| 52 if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line | |
| 53 $id_type = 1; | |
| 54 print STDERR "Casava 1.7 read id style\n"; # TESTING | |
| 55 } elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace | |
| 56 $id_type = 2; | |
| 57 print STDERR "Casava 1.8 read id style\n"; # TESTING | |
| 58 } else { | |
| 59 print STDERR "Cannot determine read id style\n"; | |
| 60 print STDOUT "Unknwon id style: $readid\n"; | |
| 61 exit 1; | |
| 62 } | |
| 63 | |
| 64 chomp $filename_r1; | |
| 65 chomp $filename_r2; | |
| 66 | |
| 67 if ($quick) { | |
| 68 $failure = &quickcheck($filename_r1, $filename_r2); # quickie check | |
| 69 print STDERR "FAILED quickcheck\n" if ($failure); | |
| 70 } | |
| 71 # If we aren't doing the quick check, or the quick check passed, do the slow check | |
| 72 if (!($quick) || !($failure)) { | |
| 73 $failure = &percent($filename_r1, $filename_r2); # slow thorough check | |
| 74 } | |
| 75 | |
| 76 exit; | |
| 77 | |
| 78 ######################## subs ######################### | |
| 79 # do a quick check to see if the files are bad (many bad files will pass this quick check) | |
| 80 sub quickcheck { | |
| 81 my ($file1, $file2) = @_; | |
| 82 if ($solaris) { | |
| 83 $result1 = `tail -10000l $file1 | head -1`; # grab the line 10,000 lines from the EOF | |
| 84 $result2 = `tail -10000l $file2 | head -1`; | |
| 85 } else { | |
| 86 $result1 = `tail -n 10000 $file1 | head -n 1`; | |
| 87 $result2 = `tail -n 10000 $file2 | head -n 1`; | |
| 88 } | |
| 89 # process the ids depending on id style | |
| 90 if ($id_type == 1) { # 1 or 2 at end of id | |
| 91 chomp $result1; | |
| 92 chomp $result2; | |
| 93 chop $result1; | |
| 94 chop $result2; | |
| 95 } else { # first word is a unique read id, second word is left/right-specific | |
| 96 ($result1) = split ' ', $result1; | |
| 97 ($result2) = split ' ', $result2; | |
| 98 } | |
| 99 if ($result1 eq $result2) { | |
| 100 # print "pass\n"; | |
| 101 return 0; | |
| 102 } else { | |
| 103 # print "fail\n"; | |
| 104 return 1; | |
| 105 } | |
| 106 } | |
| 107 | |
| 108 | |
| 109 # compare every read id from each file and determine what percent of reads are | |
| 110 # out of sync | |
| 111 sub percent { | |
| 112 my($file1, $file2) = @_; | |
| 113 | |
| 114 open R1, '<', $file1 or die "Couldn't open $file1 for reading:$!"; | |
| 115 open R2, '<', $file2 or die "Couldn't open $file2 for reading:$!"; | |
| 116 | |
| 117 my $readCount = 0; | |
| 118 my $failcount = 0; | |
| 119 while (!eof(R1) and !eof(R2)) { | |
| 120 $id1 = <R1>; | |
| 121 $id2 = <R2>; | |
| 122 next unless ($. % 4 == 1); # check every sequence (4th line in the file) | |
| 123 # process the ids depending on id style | |
| 124 if ($id_type == 1) { # 1 or 2 at end of id | |
| 125 chomp $id1; # newline | |
| 126 chop $id1; # last char of the id (should be the "1" or "2") | |
| 127 chomp $id2; | |
| 128 chop $id2; | |
| 129 } else { | |
| 130 ($id1) = split ' ', $id1; | |
| 131 ($id2) = split ' ', $id2; | |
| 132 } | |
| 133 $failcount++ unless ($id1 eq $id2); | |
| 134 $readCount++; | |
| 135 } | |
| 136 my $percent = $failcount / $readCount; | |
| 137 $prettypercent = $percent * 100; # make it percent | |
| 138 $prettypercent = sprintf("%.2f", $prettypercent); | |
| 139 if ($failcount > 0) { | |
| 140 print STDERR "FAILED full check\t$prettypercent% of reads are out-of-sync\n"; | |
| 141 $exitStatus=1; | |
| 142 } else { | |
| 143 print STDERR "PASSED full check\n"; | |
| 144 $exitStatus=0; | |
| 145 } | |
| 146 | |
| 147 return $exitStatus; | |
| 148 } |
