diff redup.pl @ 0:df1e7c7dd9cb draft default tip
Initial upload of files
author | jgarbe
---|---
date | Wed, 27 Nov 2013 14:39:56 -0500
parents |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/redup.pl	Wed Nov 27 14:39:56 2013 -0500
@@ -0,0 +1,127 @@

#!/usr/bin/perl -w

###########################################################
# redup.pl
# John Garbe
# August 2012
#
# Remove exact duplicate reads from paired-end fastq files, printing out
# unique reads to .unique files. Prints out the top N most duplicated
# sequences.
#
###########################################################

=head1 NAME

redup.pl - Remove exact duplicate reads from paired-end fastq files, printing out unique reads to .unique files. Prints out the top N most duplicated sequences in fasta format

=head1 SYNOPSIS

redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq > topdups.fasta

=head1 DESCRIPTION

This script removes duplicate paired-end reads from the input files sample1_R1.fastq and sample1_R2.fastq and prints the unique reads to the files sample1_R1.fastq.unique and sample1_R2.fastq.unique. Output filenames may optionally be given as third and fourth arguments. Reads must have exactly the same sequence to be called duplicates; quality scores are ignored. The top N (default 20) most duplicated sequences are printed to standard output in fasta format, making it convenient to identify them with BLAST.

=cut

use Getopt::Std;

# determine number of most duplicated sequences to return
$opt_n = 20;
$ok = getopts('n:');
$k = $opt_n - 1; # subtract one to work with Perl's zero-based arrays

die "USAGE: redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq [out_R1 out_R2]\n" unless ((($#ARGV == 1) || ($#ARGV == 3)) && $ok);

# open up input files
open F1, "<$ARGV[0]" or die "cannot open $ARGV[0]\n";
open F2, "<$ARGV[1]" or die "cannot open $ARGV[1]\n";

# open up output files: default to INPUT.unique, or use the optional
# third and fourth arguments as output filenames
if ($#ARGV < 3) {
    open O1, ">$ARGV[0].unique" or die "cannot open $ARGV[0].unique\n";
    open O2, ">$ARGV[1].unique" or die "cannot open $ARGV[1].unique\n";
} else {
    open O1, ">$ARGV[2]" or die "cannot open $ARGV[2]\n";
    open O2, ">$ARGV[3]" or die "cannot open $ARGV[3]\n";
}

# run through the input files
$readcount = 0;
$dupcount = 0;
while ($f1line1 = <F1>) {
    $readcount++;
    if ($readcount % 1000000 == 0) { # print progress update
        print STDERR "$readcount reads processed, $dupcount duplicates removed\n";
    }
    $f1line2 = <F1>;
    $f1line3 = <F1>;
    $f1line4 = <F1>;

    $f2line1 = <F2>;
    $f2line2 = <F2>;
    $f2line3 = <F2>;
    $f2line4 = <F2>;

    $combo = $f1line2 . $f2line2; # concatenate seqs from reads 1 and 2
    if (defined($seen{$combo})) { # if seq is a dup skip it
        $seen{$combo}++;
        $dupcount++;
        next;
    } else { # seq is not a dup, print reads back out
        $seen{$combo} = 1;
        print O1 "$f1line1";
        print O1 "$f1line2";
        print O1 "$f1line3";
        print O1 "$f1line4";

        print O2 "$f2line1";
        print O2 "$f2line2";
        print O2 "$f2line3";
        print O2 "$f2line4";
    }
}

close O1;
close O2;
print STDERR "$readcount reads processed, $dupcount duplicates removed\n";

print STDERR "Identifying most frequent sequences (kill program to skip this step)\n";

### print out the top k most duplicated reads in fasta format
# It would be better to use a proper partial sort algorithm here

# initialize first k elements
for $i (0..$k) {
    $result[$i][0] = "seed";
    $result[$i][1] = -1;
}

# find reads with more duplicates than the least-duplicated sequence
# currently in the results
for $key (keys %seen) {
    if ($seen{$key} > $result[$#result][1]) {
        push @result, [ $key, $seen{$key} ];
        @result = sort { $b->[1] <=> $a->[1] } @result;
        $#result = $k; # truncate, keeping the top N entries
    }
}

# print out the top k most duplicated reads in fasta format
for $i (0..$k) {
    next if ($result[$i][1] < 0);
    $rank = $i + 1;
    print ">Rank:$rank:Count:$result[$i][1]\n";
    ($r1, $r2) = split /\n/, $result[$i][0];
    print $r1 . "N" . "$r2\n"; # join the read pair with an N spacer
}

# print out total again in case it was missed
print STDERR "$readcount reads processed, $dupcount duplicates removed\n";

exit;
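The script's own comment notes that a proper partial sort would beat re-sorting @result on every qualifying hit. Below is a minimal sketch of one bounded top-k pass, assuming a hashref shaped like the script's %seen (sequence => count); the top_k helper name and the demo data are hypothetical, not part of redup.pl. It keeps the candidate list sorted descending by count and splices each qualifying entry into place, so the full list is never re-sorted.

#!/usr/bin/perl
use strict;
use warnings;

# Return up to $k [sequence, count] pairs, sorted by count descending,
# without sorting the whole candidate list on every insertion.
sub top_k {
    my ($counts, $k) = @_;   # $counts: hashref of sequence => count
    my @top;                 # kept sorted by count, largest first
    for my $seq (keys %$counts) {
        my $count = $counts->{$seq};
        next if @top == $k && $count <= $top[-1][1]; # cannot make the cut
        my $i = @top;                                # walk up from the tail
        $i-- while $i > 0 && $top[$i - 1][1] < $count;
        splice @top, $i, 0, [ $seq, $count ];        # insert in place
        pop @top if @top > $k;                       # drop the smallest
    }
    return @top;
}

# tiny demonstration with made-up counts
my %seen = ("AAAA\nCCCC\n" => 5, "CCCC\nGGGG\n" => 2, "GGGG\nTTTT\n" => 9);
my @top = top_k(\%seen, 2);
printf "%d\n", $_->[1] for @top;    # prints 9 then 5

Dropped in place of the @result loop above, it would be called as my @top = top_k(\%seen, $opt_n); with the fasta-printing loop iterating over @top instead of @result.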
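Separately, because %seen keys are the full concatenated read-pair sequences, memory grows with the total unique sequence volume. If the top-N report can be forgone, a digest-keyed variant is one way to bound the key size. This is an assumption-laden alternative, not part of redup.pl: the is_duplicate helper is hypothetical, Digest::MD5 ships with core Perl, and the fixed 16-byte binary keys make collisions astronomically unlikely but not impossible. The original sequences are no longer available for the fasta report.

#!/usr/bin/perl
use strict;
use warnings;
use Digest::MD5 qw(md5);

my %seen; # md5(seq1 . seq2) => times seen

# True if this read pair's sequences have been seen before.
sub is_duplicate {
    my ($seq1, $seq2) = @_;       # raw sequence lines of a read pair
    my $key = md5($seq1 . $seq2); # fixed 16-byte binary key
    return $seen{$key}++ ? 1 : 0; # post-increment: false on first sighting
}

# tiny demonstration
print is_duplicate("ACGT\n", "TTTT\n") ? "dup\n" : "new\n"; # new
print is_duplicate("ACGT\n", "TTTT\n") ? "dup\n" : "new\n"; # dup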