0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 ###########################################################
|
|
4 # redup.pl
|
|
5 # John Garbe
|
|
6 # August 2012
|
|
7 #
|
|
8 # Remove exact duplicate reads from paired-end fastq files, printing out
|
|
9 # unique reads to .unique files. Prints out top N most duplicated sequences
|
|
10 #
|
|
11 ###########################################################
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 redup.pl - Remove exact duplicate reads from paired-end fastq files, printing out unique reads to .unique files. Prints out top N most duplicated sequences in fasta format
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq > topdups.fasta
|
|
20
|
|
21 =head1 DESCRIPTION
|
|
22
|
|
23 This script removes duplicate paired-end reads from the input files sample1_R1.fastq and sample1_R2.fastq and prints out unique reads to the files sample1_R1.fastq.unique and sample2_R2.fastq.unique. Reads must have the exact same sequence to be called duplicates, quality scores are ignored. The top N (default 20) most duplicated sequences are printed out in fasta format, making it convenient for using BLAST to identify them.
|
|
24
|
|
25 =cut
|
|
26
|
|
27 use Getopt::Std;
|
|
28
|
|
29 # determine number of most duplicated sequences to return
|
|
30 $opt_n = 20;
|
|
31 $ok = getopts('n:');
|
|
32 $k = $opt_n - 1; # subtract one to work with Perl's zero-based arrays
|
|
33
|
|
34 die "USAGE: redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq\n" unless ((($#ARGV == 1) || ($#ARGV == 3)) && $ok);
|
|
35
|
|
36 # open up input files
|
|
37 open F1, "<$ARGV[0]" or die "cannot open $ARGV[0]\n";
|
|
38 open F2, "<$ARGV[1]" or die "cannot open $ARGV[1]\n";
|
|
39
|
|
40 # open up output files
|
|
41 open O1, ">$ARGV[0].unique" or die "cannot open $ARGV[0].unique\n";
|
|
42 open O2, ">$ARGV[1].unique" or die "cannot open $ARGV[1].unique\n";
|
|
43
|
|
44 # open up output files
|
|
45 if ($#ARGV < 3) {
|
|
46 open O1, ">$ARGV[0].unique" or die "cannot open $ARGV[0].unique\n";
|
|
47 open O2, ">$ARGV[1].unique" or die "cannot open $ARGV[1].unique\n";
|
|
48 } else {
|
|
49 open O1, ">$ARGV[2]" or die "cannot open $ARGV[2]\n";
|
|
50 open O2, ">$ARGV[3]" or die "cannot open $ARGV[3]\n";
|
|
51 }
|
|
52
|
|
53
|
|
54 # run through the input files
|
|
55 $readcount = 0;
|
|
56 $dupcount = 0;
|
|
57 while ($f1line1 = <F1>) {
|
|
58 $readcount++;
|
|
59 if ($readcount % 1000000 == 0) { # print progress update
|
|
60 print STDERR "$readcount reads processed, $dupcount duplicates removed\n";
|
|
61 }
|
|
62 $f1line2 = <F1>;
|
|
63 $f1line3 = <F1>;
|
|
64 $f1line4 = <F1>;
|
|
65
|
|
66 $f2line1 = <F2>;
|
|
67 $f2line2 = <F2>;
|
|
68 $f2line3 = <F2>;
|
|
69 $f2line4 = <F2>;
|
|
70
|
|
71 $combo = $f1line2 . $f2line2; # concatenate seqs from reads 1 and 2
|
|
72 if (defined($seen{$combo})) { # if seq is a dup skip it
|
|
73 $seen{$combo}++;
|
|
74 $dupcount++;
|
|
75 next;
|
|
76 } else { # seq is not a dup, print reads back out
|
|
77 $seen{$combo} = 1;
|
|
78 print O1 "$f1line1";
|
|
79 print O1 "$f1line2";
|
|
80 print O1 "$f1line3";
|
|
81 print O1 "$f1line4";
|
|
82
|
|
83 print O2 "$f2line1";
|
|
84 print O2 "$f2line2";
|
|
85 print O2 "$f2line3";
|
|
86 print O2 "$f2line4";
|
|
87 }
|
|
88 }
|
|
89
|
|
90 close O1;
|
|
91 close O2;
|
|
92 print STDERR "$readcount reads processed, $dupcount duplicates removed\n";
|
|
93
|
|
94 print STDERR "Identifying most frequent sequences (kill program to skip this step)\n";
|
|
95
|
|
96 ### print out the top k most duplicated reads in fasta format
|
|
97 # It would be better to use a proper partial sort algorithm here
|
|
98
|
|
99 # initialize first k elements
|
|
100 for $i (0..$k) {
|
|
101 $result[$i][0] = "seed";
|
|
102 $result[$i][1] = -1;
|
|
103 }
|
|
104
|
|
105 # find reads with more duplicates than least duplicated sequence in results
|
|
106 for $key (keys %seen) {
|
|
107
|
|
108 if ($seen{$key} > $result[$#result][1]) {
|
|
109 push @result, [ ( $key, $seen{$key} ) ];
|
|
110 @result = sort { $b->[1] <=> $a->[1] } @result;
|
|
111 $#result = $k;
|
|
112 }
|
|
113 }
|
|
114
|
|
115 # print out the top k most duplicated reads in fasta format
|
|
116 for $i (0..$k) {
|
|
117 next if ($result[$i][1] < 0);
|
|
118 $rank = $i + 1;
|
|
119 print ">Rank:$rank:Count:$result[$i][1]\n";
|
|
120 ($r1, $r2) = split /\n/, $result[$i][0];
|
|
121 print $r1 . "N" . "$r2\n";
|
|
122 }
|
|
123
|
|
124 # print out total again in case it was missed
|
|
125 print STDERR "$readcount reads processed, $dupcount duplicates removed\n";
|
|
126
|
|
127 exit;
|