redup.pl @ 0:df1e7c7dd9cb (draft, default, tip)

Initial upload of files

author: jgarbe
date:   Wed, 27 Nov 2013 14:39:56 -0500
#!/usr/bin/perl -w

###########################################################
# redup.pl
# John Garbe
# August 2012
#
# Remove exact duplicate reads from paired-end fastq files, printing out
# unique reads to .unique files. Prints out top N most duplicated sequences
#
###########################################################

=head1 NAME

redup.pl - Remove exact duplicate reads from paired-end fastq files, printing out unique reads to .unique files. Prints out the top N most duplicated sequences in fasta format

=head1 SYNOPSIS

redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq > topdups.fasta

redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq out_R1.fastq out_R2.fastq > topdups.fasta

=head1 DESCRIPTION

This script removes duplicate paired-end reads from the input files sample1_R1.fastq and sample1_R2.fastq and prints the unique reads to the files sample1_R1.fastq.unique and sample1_R2.fastq.unique, or to the two output files named on the command line if they are given. Reads must have exactly the same sequence to be called duplicates; quality scores are ignored. The top N (default 20) most duplicated sequences are printed to stdout in fasta format, making it convenient to identify them with BLAST.
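
In the fasta output the two mate sequences of each duplicate pair are joined by a single N, so a record looks like the following (the count and sequence shown here are illustrative, not real output):

 >Rank:1:Count:5023
 ACGTACGTACGTACGTNTGCATGCATGCATGCA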

=cut

use Getopt::Std;

# determine number of most duplicated sequences to return
$opt_n = 20;
$ok = getopts('n:');
$k = $opt_n - 1; # subtract one to work with Perl's zero-based arrays

die "USAGE: redup.pl [-n N] sample1_R1.fastq sample1_R2.fastq [out_R1.fastq out_R2.fastq]\n" unless ((($#ARGV == 1) || ($#ARGV == 3)) && $ok);

# open up input files
open F1, "<$ARGV[0]" or die "cannot open $ARGV[0]\n";
open F2, "<$ARGV[1]" or die "cannot open $ARGV[1]\n";

# open up output files: write to <input>.unique by default, or to the two
# output filenames given on the command line
if ($#ARGV < 3) {
    open O1, ">$ARGV[0].unique" or die "cannot open $ARGV[0].unique\n";
    open O2, ">$ARGV[1].unique" or die "cannot open $ARGV[1].unique\n";
} else {
    open O1, ">$ARGV[2]" or die "cannot open $ARGV[2]\n";
    open O2, ">$ARGV[3]" or die "cannot open $ARGV[3]\n";
}

# run through the input files
$readcount = 0;
$dupcount = 0;
while ($f1line1 = <F1>) {
    $readcount++;
    if ($readcount % 1000000 == 0) { # print progress update
        print STDERR "$readcount reads processed, $dupcount duplicates removed\n";
    }
    # read the rest of the read 1 record and the entire read 2 record
    # (each fastq record is 4 lines)
    $f1line2 = <F1>;
    $f1line3 = <F1>;
    $f1line4 = <F1>;

    $f2line1 = <F2>;
    $f2line2 = <F2>;
    $f2line3 = <F2>;
    $f2line4 = <F2>;

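    # the two sequence lines (trailing newlines included) are joined into a
    # single hash key, so a pair is a duplicate only if BOTH mates match a
    # previously seen pair exactly; note that %seen holds every unique
    # sequence pair in memory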
    $combo = $f1line2 . $f2line2; # concatenate seqs from reads 1 and 2
    if (defined($seen{$combo})) { # if seq is a dup skip it
        $seen{$combo}++;
        $dupcount++;
        next;
    } else { # seq is not a dup, print reads back out
        $seen{$combo} = 1;
        print O1 "$f1line1";
        print O1 "$f1line2";
        print O1 "$f1line3";
        print O1 "$f1line4";

        print O2 "$f2line1";
        print O2 "$f2line2";
        print O2 "$f2line3";
        print O2 "$f2line4";
    }
}

close O1;
close O2;
print STDERR "$readcount reads processed, $dupcount duplicates removed\n";

print STDERR "Identifying most frequent sequences (kill program to skip this step)\n";

### print out the top N most duplicated reads in fasta format
# It would be better to use a proper partial sort algorithm here
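# A minimal sketch of one alternative, assuming all unique pairs already
# fit in memory (they must, since %seen holds them): sort the keys once,
# descending by count, and take a slice of the first k+1 entries:
#
#   my @top = (sort { $seen{$b} <=> $seen{$a} } keys %seen)[0 .. $k];
#
# This is a single full sort rather than a true partial sort, but it is
# simpler than the incremental re-sorting done below.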

# seed the result array with N placeholder entries (indices 0..$k)
for $i (0..$k) {
    $result[$i][0] = "seed";
    $result[$i][1] = -1;
}

# find reads with more duplicates than the least duplicated sequence in the results
for $key (keys %seen) {
    if ($seen{$key} > $result[$#result][1]) {
        push @result, [ ( $key, $seen{$key} ) ];
        @result = sort { $b->[1] <=> $a->[1] } @result;
        $#result = $k; # truncate the array back to the top N entries
    }
}

# print out the top N most duplicated reads in fasta format
for $i (0..$k) {
    next if ($result[$i][1] < 0); # skip unused placeholder entries
    $rank = $i + 1;
    print ">Rank:$rank:Count:$result[$i][1]\n";
    ($r1, $r2) = split /\n/, $result[$i][0];
    print $r1 . "N" . "$r2\n"; # join the mates with a single N
}

# print out total again in case it was missed
print STDERR "$readcount reads processed, $dupcount duplicates removed\n";

exit;