comparison redup.pl @ 0:df1e7c7dd9cb draft default tip

Initial upload of files
author jgarbe
date Wed, 27 Nov 2013 14:39:56 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:df1e7c7dd9cb
1 #!/usr/bin/perl -w
2
3 ###########################################################
4 # redup.pl
5 # John Garbe
6 # August 2012
7 #
8 # Remove exact duplicate reads from paired-end fastq files, printing out
9 # unique reads to .unique files. Prints out top N most duplicated sequences
10 #
11 ###########################################################
12
13 =head1 NAME
14
15 redup.pl - Remove exact duplicate reads from paired-end fastq files, printing out unique reads to .unique files. Prints out top N most duplicated sequences in fasta format
16
17 =head1 SYNOPSIS
18
19 redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq > topdups.fasta
20
21 =head1 DESCRIPTION
22
23 This script removes duplicate paired-end reads from the input files sample1_R1.fastq and sample1_R2.fastq and prints out unique reads to the files sample1_R1.fastq.unique and sample1_R2.fastq.unique. Reads must have the exact same sequence to be called duplicates; quality scores are ignored. The top N (default 20) most duplicated sequences are printed out in fasta format, making it convenient for using BLAST to identify them.
24
25 =cut
26
use Getopt::Std;

# Number of most-duplicated sequences to report; -n N on the command
# line overrides this default.
$opt_n = 20;
$ok = getopts('n:');
$k = $opt_n - 1; # subtract one to work with Perl's zero-based arrays

# Accept either two filenames (outputs default to <input>.unique) or
# four filenames (explicit R1/R2 output names as arguments 3 and 4).
# The usage string previously omitted the supported four-argument form.
die "USAGE: redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq [out_R1.fastq out_R2.fastq]\n"
    unless (($#ARGV == 1 || $#ARGV == 3) && $ok);
35
# Open the input FASTQ files (R1 and R2 of the read pair).
# Three-argument open avoids mode injection through the filename.
open F1, '<', $ARGV[0] or die "cannot open $ARGV[0]\n";
open F2, '<', $ARGV[1] or die "cannot open $ARGV[1]\n";

# Open the output files. With two arguments the outputs default to
# <input>.unique; with four arguments the caller named them explicitly.
# The earlier unconditional opens of the .unique files were redundant
# and, in the four-argument case, left stray empty .unique files behind,
# so only the conditional opens remain.
if ($#ARGV < 3) {
    open O1, '>', "$ARGV[0].unique" or die "cannot open $ARGV[0].unique\n";
    open O2, '>', "$ARGV[1].unique" or die "cannot open $ARGV[1].unique\n";
} else {
    open O1, '>', $ARGV[2] or die "cannot open $ARGV[2]\n";
    open O2, '>', $ARGV[3] or die "cannot open $ARGV[3]\n";
}
52
# Stream both FASTQ files in lockstep, one 4-line record at a time,
# keeping only the first occurrence of each R1+R2 sequence pair.
# Headers and quality lines play no part in the duplicate test.
# NOTE(review): assumes both files are well-formed FASTQ of equal
# length — a truncated mate file would yield undef record lines.
$readcount = 0;
$dupcount  = 0;
while (my $hdr1 = <F1>) {
    $readcount++;
    # Periodic progress report on stderr.
    print STDERR "$readcount reads processed, $dupcount duplicates removed\n"
        if $readcount % 1000000 == 0;

    my $seq1  = <F1>;   # R1 sequence
    my $sep1  = <F1>;   # R1 '+' line
    my $qual1 = <F1>;   # R1 quality

    my $hdr2  = <F2>;   # R2 header
    my $seq2  = <F2>;   # R2 sequence
    my $sep2  = <F2>;   # R2 '+' line
    my $qual2 = <F2>;   # R2 quality

    # Duplicate key: the concatenated sequence lines of both mates.
    my $pair = $seq1 . $seq2;
    if (defined $seen{$pair}) {    # seen before: count it and drop the pair
        $seen{$pair}++;
        $dupcount++;
        next;
    }

    # First sighting: record it and pass both mates through unchanged.
    $seen{$pair} = 1;
    print O1 $hdr1, $seq1, $sep1, $qual1;
    print O2 $hdr2, $seq2, $sep2, $qual2;
}

close O1;
close O2;
print STDERR "$readcount reads processed, $dupcount duplicates removed\n";

print STDERR "Identifying most frequent sequences (kill program to skip this step)\n";
95
### print out the top k most duplicated reads in fasta format
# It would be better to use a proper partial sort algorithm here

# Seed the leaderboard with $k+1 placeholder entries (count -1) so the
# comparison against the weakest slot is always defined.
my @leaders = map { [ 'seed', -1 ] } 0 .. $k;

# Maintain a running top-(k+1) list over all observed sequences: any
# sequence beating the current weakest leader is pushed in, the list
# re-sorted by descending count, and truncated back to k+1 entries.
for my $seq (keys %seen) {
    next unless $seen{$seq} > $leaders[-1][1];
    push @leaders, [ $seq, $seen{$seq} ];
    @leaders = sort { $b->[1] <=> $a->[1] } @leaders;
    $#leaders = $k;    # keep only slots 0..$k
}

# Emit the leaders in FASTA form, skipping untouched placeholder slots.
# Each key holds "seq1\nseq2\n"; the mates are rejoined with an 'N'.
for my $i (0 .. $k) {
    next if $leaders[$i][1] < 0;
    my $rank = $i + 1;
    print ">Rank:$rank:Count:$leaders[$i][1]\n";
    my ($mate1, $mate2) = split /\n/, $leaders[$i][0];
    print $mate1 . "N" . "$mate2\n";
}

# print out total again in case it was missed
print STDERR "$readcount reads processed, $dupcount duplicates removed\n";

exit;