# HG changeset patch # User nilesh # Date 1373657779 18000 # Node ID 54ab39950f598e81b7503a92a2d214f7fdf8340f # Parent 177470e28e5f79f1cd4ac04128585dc5b0c92026 first commit diff -r 177470e28e5f -r 54ab39950f59 cmpfastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpfastq Fri Jul 12 14:36:19 2013 -0500 @@ -0,0 +1,221 @@ +use strict; +use warnings; +use Getopt::Std; +use File::Basename; + +## CONSTANTS ## +my $TRUE = 1; +my $FALSE = 0; +my $DEBUG = $FALSE; +my $EXITSTATUS = 0; + +# Default umask +umask 027; + +# Define variables + +# Get the options passed at the command prompt +GetOptions(); + +############################## +# M A I N P R O G R A M # +############################## + +# Check to see we received two files in the arguments + +my $fail = $FALSE; + +# Check to see if the files exist +foreach my $file (@ARGV) +{ + if(!-e $file) + { + print STDERR "File $file didn't exist\n"; + $fail = $TRUE; + } +} + +# If any of the files didn't exist, let's kill it +if($fail) +{ + exit(1); +} + +# Read the file names in from the command line +my $file1 = shift(@ARGV); +my $file2 = shift(@ARGV); + + +# Index the first file. +my %fastqIndex1 = %{IndexFastq($file1)}; + +# Compare the two files +CompareFastq($file1, $file2, \%fastqIndex1); + +exit($EXITSTATUS); + +# Subroutines +sub Usage +{ + my $base = basename($0); + print "Usage: $base [dh] file1 file2\n"; + print "\td:\tDebug mode on (default off)\n"; + print "\th:\tPrint this usage\n"; +} + +sub GetOptions +{ + # Get the options passed at the command prompt + my %options=(); + getopts("dh", \%options); + + if(defined($options{'d'})) + { + $DEBUG = $TRUE; + } + + if(defined($options{'h'})) + { + Usage(); + exit($EXITSTATUS); + } +} + +sub IndexFastq +{ + my $file = shift; + my %fastqIndex; + + open(IN, $file) or die("Could not open $file\n"); + my $pos = tell(IN); + my $lineCounter = 1; + while(my $line = ) + { + chomp($line); + + # Each block is going to be of 4 lines + # Let's get the seq ID from the sequence name + if($line =~ m/^@(.*)#.*/) + { + $fastqIndex{$1} = $pos; + # Skip the next 3 lines + for(my $i=0; $i<3; $i++) + { + ; + $lineCounter++; + } + } + elsif($line =~ m/^#/) + { + print STDERR "File: $file\[$lineCounter]: Skipping comment line: $line\n" if($DEBUG); + } + elsif($line =~ m/^$/) + { + print STDERR "File: $file\[$lineCounter]: Skipping empty line: $line\n" if($DEBUG); + } + else + { + print STDERR "File: $file\[$lineCounter]: Could not match the sequence ID from the name: $line\n" if($DEBUG); + } + $pos = tell(IN); + $lineCounter++; + } + close(IN); + + return \%fastqIndex; +} + +sub CompareFastq +{ + my $file1 = shift; + my $file2 = shift; + my $fastqIndex1Ref = shift; + my %fastqIndex1 = %{$fastqIndex1Ref}; + my %found1; + + # We don't want to have to open/close file handles each time, so let's open them here + open(F1COUT, ">$ARGV[0]"); + open(F2COUT, ">$ARGV[1]"); + open(F1UOUT, ">$ARGV[2]"); + open(F2UOUT, ">$ARGV[3]"); + + open(F1IN, $file1) or die("Could not open $file1\n"); + open(F2IN, $file2) or die("Could not open $file2\n"); + while(my $line = ) + { + chomp($line); + + # Skip empty lines or comments + if($line =~ m/^$/g or $line =~ m/^\s*#/) + { + next; + } + + # Each block is going to be of 4 lines + # Let's get the seq ID from the sequence name + if($line =~ m/^@(.*)#.*/) + { + my $seqId = $1; + + if(defined($fastqIndex1{$seqId})) + { + $found1{$seqId} = $TRUE; + + # Print out from file1 + seek(F1IN, $fastqIndex1{$seqId}, 0); + for(my $i=0;$i<4;$i++) + { + my $tmpLine = ; + print F1COUT $tmpLine; + } + + # Print out from file 2 + print F2COUT $line . "\n"; + for(my $i=0; $i<3; $i++) + { + my $tmpLine = ; + print F2COUT $tmpLine; + } + } + else + { + # Print out from file 2 + print F2UOUT $line . "\n"; + for(my $i=0; $i<3; $i++) + { + my $tmpLine = ; + print F2UOUT $tmpLine; + } + } + } + else + { + print STDERR "Could not match the sequence ID from the name: $line\n";; + next; + } + } + close(F1COUT); + close(F2COUT); + close(F2UOUT); + close(F2IN); + # Now let's worry about the sequences that weren't common in file 1 + + # File 1 + if(keys(%fastqIndex1) != keys(%found1)) + { + foreach my $seqId (keys %fastqIndex1) + { + if(!defined($found1{$seqId})) + { + seek(F1IN, $fastqIndex1{$seqId}, 0); + for(my $i=0;$i<4;$i++) + { + my $tmpLine = ; + print F1UOUT $tmpLine; + } + } + } + } + close(F1UOUT); + close(F1IN) +} \ No newline at end of file diff -r 177470e28e5f -r 54ab39950f59 cmpfastq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cmpfastq.xml Fri Jul 12 14:36:19 2013 -0500 @@ -0,0 +1,33 @@ + + Comparing two fastq files + + CFQ_SCRIPT_PATH + + \$CFQ_SCRIPT_PATH/cmpfastq $input1 $input2 $output1 $output2 $output3 $output4 + + + + + + + + + + + + + + + + + + + +This tool computes GC content from a FASTA file. + + + \ No newline at end of file