diff redup.pl @ 0:df1e7c7dd9cb draft default tip

Initial upload of files
author jgarbe
date Wed, 27 Nov 2013 14:39:56 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/redup.pl	Wed Nov 27 14:39:56 2013 -0500
@@ -0,0 +1,127 @@
+#!/usr/bin/perl -w
+
+###########################################################
+# redup.pl
+# John Garbe
+# August 2012
+#
+# Remove exact duplicate reads from paired-end fastq files, printing out
+# unique reads to .unique files. Prints the top N most duplicated sequences.
+#
+###########################################################
+
+=head1 NAME
+
+redup.pl - Remove exact duplicate reads from paired-end fastq files, writing unique reads to .unique files and printing the top N most duplicated sequences in fasta format
+
+=head1 SYNOPSIS
+
+redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq [out_R1.fastq out_R2.fastq] > topdups.fasta
+
+=head1 DESCRIPTION
+
+This script removes duplicate paired-end reads from the input files sample1_R1.fastq and sample1_R2.fastq, writing unique reads to the files sample1_R1.fastq.unique and sample1_R2.fastq.unique. If two additional file names are given as the third and fourth arguments, they are used as the output files instead. Both mates must have exactly the same sequence for a pair to be called a duplicate; quality scores are ignored. The top N (default 20) most duplicated sequences are printed to standard output in fasta format, making it convenient to identify them with BLAST.
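+
+For example (output file names in the four-argument form are illustrative), the two-argument form
+
+    redup.pl -n 10 sample1_R1.fastq sample1_R2.fastq > topdups.fasta
+
+writes sample1_R1.fastq.unique and sample1_R2.fastq.unique, while
+
+    redup.pl sample1_R1.fastq sample1_R2.fastq dedup_R1.fastq dedup_R2.fastq > topdups.fasta
+
+writes the unique reads to dedup_R1.fastq and dedup_R2.fastq.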
+
+=cut
+
+use Getopt::Std;
+
+# determine number of most duplicated sequences to return
+$opt_n = 20;
+$ok = getopts('n:');
+$k = $opt_n - 1; # subtract one to work with Perl's zero-based arrays
+
+die "USAGE: redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq\n" unless ((($#ARGV == 1) || ($#ARGV == 3)) && $ok);
+
+# open up input files
+open F1, "<$ARGV[0]" or die "cannot open $ARGV[0]\n";
+open F2, "<$ARGV[1]" or die "cannot open $ARGV[1]\n";
+
+# open up output files
+if ($#ARGV < 3) {
+    open O1, ">$ARGV[0].unique" or die "cannot open $ARGV[0].unique\n";
+    open O2, ">$ARGV[1].unique" or die "cannot open $ARGV[1].unique\n";
+} else {
+    open O1, ">$ARGV[2]" or die "cannot open $ARGV[2]\n";
+    open O2, ">$ARGV[3]" or die "cannot open $ARGV[3]\n";
+}
+
+# run through the input files
+$readcount = 0;
+$dupcount = 0;
+while ($f1line1 = <F1>) {
+    $readcount++;
+    if ($readcount % 1000000 == 0) { # print progress update
+	print STDERR "$readcount reads processed, $dupcount duplicates removed\n";
+    }
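+    # read the remaining three lines (sequence, "+" separator, quality) of the R1 record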
+    $f1line2 = <F1>;
+    $f1line3 = <F1>;
+    $f1line4 = <F1>;
+
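+    # read all four lines of the matching R2 record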
+    $f2line1 = <F2>;
+    $f2line2 = <F2>;
+    $f2line3 = <F2>;
+    $f2line4 = <F2>;
+
+    $combo = $f1line2 . $f2line2; # concatenate seqs from reads 1 and 2
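+    # note: %seen keeps one key per unique sequence pair in memory, so RAM use grows with library complexity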
+    if (defined($seen{$combo})) { # if seq is a dup skip it
+	$seen{$combo}++;
+	$dupcount++;
+	next;
+    } else { # seq is not a dup, print reads back out
+	$seen{$combo} = 1;
+	print O1 "$f1line1";
+	print O1 "$f1line2";
+	print O1 "$f1line3";
+	print O1 "$f1line4";
+
+	print O2 "$f2line1";
+	print O2 "$f2line2";
+	print O2 "$f2line3";
+	print O2 "$f2line4";
+    }
+}
+
+close O1;
+close O2;
+print STDERR "$readcount reads processed, $dupcount duplicates removed\n";
+
+print STDERR "Identifying most frequent sequences (kill program to skip this step)\n";
+
+### print out the top k most duplicated reads in fasta format
+# It would be better to use a proper partial sort algorithm here
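+# (each qualifying key is pushed, the array re-sorted descending by count, and
+# truncated back to k+1 entries; since the array never exceeds k+2 elements,
+# each re-sort is cheap for small N)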
+
+# initialize first k elements
+for $i (0..$k) {
+    $result[$i][0] = "seed";
+    $result[$i][1] = -1;
+}
+
+# find reads with more duplicates than least duplicated sequence in results
+for $key (keys %seen) {
+
+    if ($seen{$key} > $result[$#result][1]) {
+	push @result, [ ( $key, $seen{$key} ) ];
+	@result = sort { $b->[1] <=> $a->[1] } @result;
+	$#result = $k;
+    }
+}
+
+# print out the top k most duplicated reads in fasta format
+for $i (0..$k) {
+    next if ($result[$i][1] < 0);
+    $rank = $i + 1;
+    print ">Rank:$rank:Count:$result[$i][1]\n";
+    ($r1, $r2) = split /\n/, $result[$i][0];
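+    # join the two mates with a single "N" so each pair prints as one fasta record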
+    print $r1 . "N" . "$r2\n";
+}
+
+# print out total again in case it was missed
+print STDERR "$readcount reads processed, $dupcount duplicates removed\n";
+
+exit;