annotate 1_Filter_by_truncation.pl @ 0:c9680c78151b draft default tip

Uploaded
author caitlin-potter
date Mon, 23 May 2016 11:05:49 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
1 #!/usr/bin/perl
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
2 #Name : 1_Filter_by_truncation.pl
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
3 #Author : Morgan, Matthew
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
4 #Created : 02/2013
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
5 #Modified : 08/2013
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
6 #Purpose : Filters raw pyrosequencing reads, trims primers and barcodes, truncates at user-defined length, bins by sequence, removes poor sequences
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
7 #Syntax : perl 1_Filter_by_truncation.pl [Raw sequence file] [gasket file] [FMID file] [RMID file] [truncation length]
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
8 #Further info : Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review)
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
9 #Acknowledgment : Code for converting between tab-delimited text and fasta formats was borrowed and modified from http://sysbio.harvard.edu/csb/resources/computational/scriptome/
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
10 #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
11
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
12 #########################################################################################################################################################
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
13 # #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
14 #CSIRO Open Source Software License Agreement (GPLv3) #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
15 # #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
16 #Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230. #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
17 # #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
18 #All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3 #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
19 # as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material. #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
20 #The following additional terms apply under clause 7 of that license: #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
21 # #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
22 #EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
23 #CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
24 #WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
25 #NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
26 # #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
27 #TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
28 #LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED. #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
29 #WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
30 #DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
31 #OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
32 #OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
33 #SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY. #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
34 # #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
35 #APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
36 #OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
37 #MODIFIED "CONSUMER GUARANTEES". IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
38 #EXTENT PERMITTED BY THE APPLICABLE LEGISLATION. WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
39 #THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM: #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
40 #1. THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
41 #2. THE REPAIR OF THE SOFTWARE; #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
42 #3. THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN, #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
43 # OR HAVING THE SOFTWARE REPAIRED. #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
44 # #
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
45 #########################################################################################################################################################
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
46
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
47 use warnings;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
48 use strict;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
49
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
50 my $rawdata = $ARGV[0];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
51 my (%FORMID, %REVMID);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
52
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
53 #Store FMIDs and RMIDs
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
54
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
55 if ($ARGV[2]){my $midfile = $ARGV[2];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
56 open (INFILE, "<$midfile")||die("Can't open MID file");
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
57 while (<INFILE>){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
58 s/\r?\n//;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
59 chomp $_;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
60 my @parts = split(/\t/, $_);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
61 my $midname = $parts[0];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
62 my $midsequence = $parts[1];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
63 $FORMID{$midname} = $midsequence;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
64 print "$midname\t$midsequence\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
65 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
66 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
67 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
68 $FORMID{'NOFMID'} = ();
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
69 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
70 close (INFILE);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
71
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
72 if ($ARGV[3]) {my $rmidfile = $ARGV[3];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
73 open (RMIDS, "<$rmidfile")||die("Can't open RMID file");
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
74 while (<RMIDS>){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
75 s/\r?\n//;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
76 chomp;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
77 my @parts = split(/\t/, $_);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
78 my $midname = $parts[0];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
79 my $midsequence = $parts[1];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
80 $REVMID{$midname} = $midsequence;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
81 print "$midname\t$midsequence\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
82 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
83 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
84 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
85 $REVMID{'NORMID'} = ();
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
86 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
87 my $n;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
88 #read truncation length
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
89 if ($ARGV[4]){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
90 $n = $ARGV[4];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
91 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
92 close (IN);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
93
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
94 #add gasket file
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
95 my @garray;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
96 if ($ARGV[1]){my $gasketin = $ARGV[1];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
97 open (G, "<$gasketin")||die("Can't open gasket file");
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
98 while (<G>){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
99 chomp;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
100 push @garray, $_;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
101 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
102 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
103 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
104 push @garray, "NOGASKET";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
105 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
106
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
107 #open fasta file, read sequence name, sequence, assign to MID group, remove MID and primer sequence.
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
108
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
109 #convert fasta to tabular format
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
110 my $count = 0;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
111 open (IN, "<$rawdata") || die ("Can't open sequence data");
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
112 open OUT, ">Sequence_data.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
113 while (<IN>) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
114 s/\r?\n//;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
115 s/\t/ /g;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
116 if (s/^>//) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
117 if ($. != 1) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
118 print OUT "\n"
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
119 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
120 $count++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
121 s/ |$/\t/;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
122 $_ .= "\t";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
123 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
124 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
125 s/ //g;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
126 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
127 print OUT $_;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
128 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
129 print "Converted $count raw sequences in fasta format to tab-delimited text\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
130 close (IN);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
131 close (OUT);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
132
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
133 # read sequence name, sequence, assign to MID group, remove MID and primer sequence.
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
134
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
135 my ($seqname, $sequence, %seen, $partition, $gasket, $formidname, $revmidname, %tophash, @observed_mids, %hash, $midcombination, %totalreads, %totalinvalidreads, %rep);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
136 my $accepted=0; my $multiple_fmids=0; my $multiple_rmids=0; my $no_mids=0; my $no_rmid=0;my $no_fmid=0; my $unused_mids=0; my $tooshort=0;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
137 open (INFILE, "<Sequence_data.txt") || die ("Can't open infile");
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
138 open OUT, ">Read_status.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
139 open OUT1, ">Read_status_summary.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
140 open OUTFILE1, ">Reads_with_forward_and_reverse_MIDs.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
141 open OUTFILE2, ">Filtered_unique_sequences.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
142 open READS, ">Reads_by_sequence.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
143 while (<INFILE>){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
144 my (%fmid, %rmid, $f, $r);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
145 my $fmid_count = 0;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
146 my $rmid_count = 0;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
147 chomp $_;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
148 my @nameparts = split(/\t/, $_);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
149 $seqname = $nameparts[0];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
150 $sequence = $nameparts[2];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
151 if ($garray[0] eq "NOGASKET"){$partition = "NOGASKET";}
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
152 foreach my $formidname (sort keys %FORMID)
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
153 {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
154 my $formidsequence = $FORMID{$formidname};
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
155 if ( $sequence =~ /$formidsequence/ ) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
156 $fmid_count++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
157 $fmid{$formidname}=1;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
158 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
159 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
160 $fmid{$formidname}=0;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
161 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
162 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
163 if ($fmid_count>1){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
164 $multiple_fmids++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
165 print OUT "$seqname\tREJECTED\tMULTIPLE_FORWARD_MIDS\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
166 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
167 elsif ($fmid_count==0){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
168 $no_mids++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
169 print OUT "$seqname\tREJECTED\tNO_FORWARD_MID\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
170 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
171 elsif ($fmid_count==1){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
172 for my $formid (keys %fmid){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
173 if ($fmid{$formid}==1){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
174 $f=$formid;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
175 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
176 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
177 my $partition = substr( $seqname, 0, 9 );
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
178 my $comb = $partition . $f;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
179 my $formidsequence = $FORMID{$f};
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
180 $accepted++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
181 print OUT "$seqname\tACCEPTED\t$comb\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
182 $sequence =~ s/(.*)$formidsequence//;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
183 if (length($sequence)<$n){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
184 $tooshort++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
185 next;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
186 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
187 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
188 my $fragment=substr($sequence, 0, $n);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
189 $sequence=$fragment;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
190 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
191 #want count of how many times seen this sequence with these mids
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
192 if (defined($hash{$sequence}{$comb})){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
193 $hash{$sequence}{$comb} ++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
194 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
195 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
196 $hash{$sequence}{$comb} = 1;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
197 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
198 #get total read count for sequence
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
199 if (defined($totalreads{$sequence})){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
200 $totalreads{$sequence} ++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
201 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
202 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
203 $totalreads{$sequence} = 1;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
204 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
205 if (defined($rep{$sequence})){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
206 my @rep_array = @{$rep{$sequence}};
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
207 push @rep_array, $seqname;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
208 $rep{$sequence} = \@rep_array;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
209 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
210 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
211 my @new_rep;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
212 push @new_rep, $seqname;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
213 $rep{$sequence} = \@new_rep;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
214 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
215
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
216 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
217
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
218 }#closes while loop
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
219 close (OUT);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
220 my $rej = ($multiple_fmids + $multiple_rmids + $no_mids + $no_rmid + $no_fmid + $unused_mids);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
221 print OUT1 "Reads accepted:\t$accepted\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
222 print OUT1 "Reads rejected:\t$rej\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
223 print OUT1 "\tneither MID:\t$no_mids\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
224 print OUT1 "\tno forward MID:\t$no_fmid\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
225 print OUT1 "\tno reverse MID:\t$no_rmid\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
226 print OUT1 "\tunused mid combination:\t$unused_mids\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
227 print OUT1 "\tmultiple forward mids:\t$multiple_fmids\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
228 print OUT1 "\tmultiple reverse mids:\t$multiple_rmids\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
229 close (OUT1);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
230 #enters zeroes for all unobserved MID combinations in submitted txt file
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
231
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
232 foreach my $gasket (@garray) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
233 foreach my $formidname (sort keys %FORMID) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
234 my $comb = $gasket . $formidname;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
235 foreach $sequence (sort keys %hash) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
236 if (defined( $hash{$sequence}{$comb} )) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
237 next;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
238 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
239
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
240 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
241 $hash{$sequence}{$comb} = 0;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
242 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
243 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
244
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
245 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
246 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
247
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
248 #print out tab delimited text file: headers are Sequence name (arbitrary); Sequence; total reads; MID reads
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
249
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
250 my $seqcount;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
251 foreach my $sequence (sort keys %hash) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
252 my $independenttotal = $totalreads{$sequence};
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
253 my $l = length($sequence);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
254 $seqcount++;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
255 my $mids = $hash{$sequence};
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
256 local $" = "\t";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
257 my @sorted_keys = sort keys %$mids;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
258 my @values = @$mids{@sorted_keys};
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
259
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
260 my @rep_reads = sort(@{$rep{$sequence}});
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
261 my $check_total = scalar(@rep_reads);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
262 if ($check_total != $independenttotal){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
263 print "Oops: read totals don't match for $sequence\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
264 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
265 my $rep_read = $rep_reads[0];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
266
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
267 if ( $seqcount == 1 ) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
268 print OUTFILE1 "name\tseq_length\trep_read\tsequence\ttotal_reads\t@sorted_keys\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
269 printf OUTFILE1 "$rawdata\_%07d\t", $seqcount;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
270 print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
271 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
272 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
273 printf OUTFILE1 "$rawdata\_%07d\t", $seqcount;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
274 print OUTFILE1 "$l\t$rep_read\t$sequence\t$independenttotal\t@values\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
275 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
276 print READS "$sequence\t$check_total\t@rep_reads\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
277 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
278
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
279 close (INFILE);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
280 close (OUTFILE1);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
281
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
282 open (IN, "<Reads_with_forward_and_reverse_MIDs.txt")||die("can't open Reads by MID file");
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
283 open OUT, ">Rejected_reads.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
284 while (<IN>){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
285 chomp;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
286 my @t = split(/\t/,$_);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
287 if ($_ =~ /^name/){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
288 print OUT "$_\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
289 print OUTFILE2 "$_\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
290 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
291 elsif ($_ =~ /UNUSED_MIDS/){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
292 print OUT "$_\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
293 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
294 elsif ($t[3]=~/N/){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
295 print OUT "$_\tAMBIGUOUS_BASE_CALLS\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
296 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
297 elsif (length($t[3])<=40){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
298 print OUT "$_\tTOO_SHORT\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
299 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
300 elsif ($t[4] == 1){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
301 print OUT "$_\tSINGLETON\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
302 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
303 else {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
304 print OUTFILE2 "$_\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
305 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
306 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
307 close (IN);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
308 close (OUT);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
309 close (OUTFILE2);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
310 my (@F, $s);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
311
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
312 #tab back to fasta
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
313
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
314 open IN, "<Reads_with_forward_and_reverse_MIDs.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
315 open OUT, ">Reads_with_forward_and_reverse_MIDs.fasta";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
316 while (<IN>) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
317 unless ($_ =~ /^name/){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
318 chomp;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
319 my @A = split(/\t/,$_);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
320 my $name = $A[0];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
321 my $sequence = $A[1];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
322 print OUT ">$name\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
323 print OUT "$sequence\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
324 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
325 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
326 print "Converted $. sequences from MID-filtered reads to FASTA format\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
327 close (IN);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
328 close (OUT);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
329
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
330 open IN, "<Filtered_unique_sequences.txt";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
331 open OUT, ">Filtered_unique_sequences.fasta";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
332 while (<IN>) {
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
333 unless ($_ =~ /^name/){
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
334 chomp;
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
335 my @A = split(/\t/,$_);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
336 my $name = $A[0];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
337 my $sequence = $A[3];
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
338 print OUT ">$name\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
339 print OUT "$sequence\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
340 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
341 }
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
342 print "Converted $. sequences from valid reads to FASTA format\n\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
343 close (IN);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
344 close (OUT);
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
345 print "Reads rejected as $n: $tooshort\n";
c9680c78151b Uploaded
caitlin-potter
parents:
diff changeset
346 print time - $^T . "\n";