fastq_sync: pe_sync_2_files.pl comparison

comparison pe_sync_2_files.pl @ 0:751f4938cf0d

Uploaded

author	jjohnson
date	Tue, 05 Feb 2013 15:23:18 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:751f4938cf0d
+#!/usr/bin/perl -w
+###############################################
+# pe-sync-2-files.pl # this is a stripped-down version of pe-sync-directory.pl
+# John Garbe
+# February 2012
+#
+# Paired-end read file synchronization script: checks if left and right
+# reads are in the same order.
+# This script checks the two files specificd on the command line.
+# The "quick" option will not report the percent of out-of-sync reads
+# for many failed files, but will run much faster.
+#
+# This script should run fine on linux systems or solaris
+# (which is what the project space fileserver runs)
+#
+# This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids.
+# Other types of read ID formats will cause a terminal error.
+#
+###############################################
+die "Usage: pe-sync-directory.pl [quick] left_fastqfile right_fastqfile\n" unless ($#ARGV > 0);
+# determine if we're running on solaris in order to change system calls
+$output = `perl -v`;
+$solaris = 0;
+$solaris = 1 if ($output =~ /solaris/);
+if ($solaris) {
+#    print STDERR "Running on Solaris\n";
+}
+# handle "quick" option
+$quick = 0; # 1 for quickcheck, 0 for thorough check
+if (defined($ARGV[0])) {
+if ($ARGV[0] eq "quick") {
+	$quick = 1;
+	shift @ARGV;
+}
+}
+print STDERR "QUICK CHECK enabled\n" if ($quick);
+$filename_r1 = $ARGV[0];
+$filename_r2 = $ARGV[1];
+# identify read id format type
+if ($solaris) {
+$readid = `head -1 $filename_r1`;
+} else {
+$readid = `head -n 1 $filename_r1`;
+}
+chomp $readid;
+if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line
+$id_type = 1;
+print STDERR "Casava 1.7 read id style\n"; # TESTING
+} elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace
+$id_type = 2;
+print STDERR "Casava 1.8 read id style\n"; # TESTING
+} else {
+print STDERR "Cannot determine read id style\n";
+print STDOUT "Unknwon id style: $readid\n";
+exit 1;
+}
+chomp $filename_r1;
+chomp $filename_r2;
+if ($quick) {
+$failure = &quickcheck($filename_r1, $filename_r2); # quickie check
+print STDERR "FAILED quickcheck\n" if ($failure);
+}
+# If we aren't doing the quick check, or the quick check passed, do the slow check
+if (!($quick) || !($failure)) {
+$failure = &percent($filename_r1, $filename_r2); # slow thorough check
+}
+exit;
+######################## subs #########################
+# do a quick check to see if the files are bad (many bad files will pass this quick check)
+sub quickcheck {
+my ($file1, $file2) = @_;
+if ($solaris) {
+	$result1 = `tail -10000l $file1 | head -1`; # grab the line 10,000 lines from the EOF
+	$result2 = `tail -10000l $file2 | head -1`;
+} else {
+	$result1 = `tail -n 10000 $file1 | head -n 1`;
+	$result2 = `tail -n 10000 $file2 | head -n 1`;
+}
+# process the ids depending on id style
+if ($id_type == 1) { # 1 or 2 at end of id
+	chomp $result1;
+	chomp $result2;
+	chop $result1;
+	chop $result2;
+} else { # first word is a unique read id, second word is left/right-specific
+	($result1) = split ' ', $result1;
+	($result2) = split ' ', $result2;
+}
+if ($result1 eq $result2) {
+#	print "pass\n";
+	return 0;
+} else {
+#	print "fail\n";
+	return 1;
+}
+}
+# compare every read id from each file and determine what percent of reads are
+# out of sync
+sub percent {
+my($file1, $file2) = @_;
+open R1, '<', $file1 or die "Couldn't open $file1 for reading:$!";
+open R2, '<', $file2 or die "Couldn't open $file2 for reading:$!";
+my $readCount = 0;
+my $failcount = 0;
+while (!eof(R1) and !eof(R2)) {
+	$id1 = <R1>;
+	$id2 = <R2>;
+	next unless ($. % 4 == 1); # check every sequence (4th line in the file)
+	# process the ids depending on id style
+	if ($id_type == 1) { # 1 or 2 at end of id
+	    chomp $id1;  # newline
+	    chop $id1;   # last char of the id (should be the "1" or "2")
+	    chomp $id2;
+	    chop $id2;
+	} else {
+	    ($id1) = split ' ', $id1;
+	    ($id2) = split ' ', $id2;
+	}
+	$failcount++ unless ($id1 eq $id2);
+	$readCount++;
+}
+my $percent = $failcount / $readCount;
+$prettypercent = $percent * 100; # make it percent
+$prettypercent = sprintf("%.2f", $prettypercent);
+if ($failcount > 0) {
+	print STDERR "FAILED full check\t$prettypercent% of reads are out-of-sync\n";
+	$exitStatus=1;
+} else {
+	print STDERR "PASSED full check\n";
+	$exitStatus=0;
+}
+return $exitStatus;
+}

Mercurial > repos > jjohnson > fastq_sync

comparison pe_sync_2_files.pl @ 0:751f4938cf0d