Mercurial > repos > jjohnson > fastq_sync
comparison pe_sync_2_files.pl @ 0:751f4938cf0d
Uploaded
author | jjohnson |
---|---|
date | Tue, 05 Feb 2013 15:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:751f4938cf0d |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 ############################################### | |
4 # pe-sync-2-files.pl # this is a stripped-down version of pe-sync-directory.pl | |
5 # John Garbe | |
6 # February 2012 | |
7 # | |
8 # Paired-end read file synchronization script: checks if left and right | |
9 # reads are in the same order. | |
10 # This script checks the two files specificd on the command line. | |
11 # The "quick" option will not report the percent of out-of-sync reads | |
12 # for many failed files, but will run much faster. | |
13 # | |
14 # This script should run fine on linux systems or solaris | |
15 # (which is what the project space fileserver runs) | |
16 # | |
17 # This script can handle Casava 1.8.0 style read IDs, and pre 1.8.0 style ids. | |
18 # Other types of read ID formats will cause a terminal error. | |
19 # | |
20 ############################################### | |
21 | |
22 die "Usage: pe-sync-directory.pl [quick] left_fastqfile right_fastqfile\n" unless ($#ARGV > 0); | |
23 | |
24 # determine if we're running on solaris in order to change system calls | |
25 $output = `perl -v`; | |
26 $solaris = 0; | |
27 $solaris = 1 if ($output =~ /solaris/); | |
28 if ($solaris) { | |
29 # print STDERR "Running on Solaris\n"; | |
30 } | |
31 | |
32 # handle "quick" option | |
33 $quick = 0; # 1 for quickcheck, 0 for thorough check | |
34 if (defined($ARGV[0])) { | |
35 if ($ARGV[0] eq "quick") { | |
36 $quick = 1; | |
37 shift @ARGV; | |
38 } | |
39 } | |
40 print STDERR "QUICK CHECK enabled\n" if ($quick); | |
41 | |
42 $filename_r1 = $ARGV[0]; | |
43 $filename_r2 = $ARGV[1]; | |
44 | |
45 # identify read id format type | |
46 if ($solaris) { | |
47 $readid = `head -1 $filename_r1`; | |
48 } else { | |
49 $readid = `head -n 1 $filename_r1`; | |
50 } | |
51 chomp $readid; | |
52 if ($readid =~ /^@\S+\/[12]$/) { # @ at the start of the line followed by non-whitespace, a /, a 1 or 2, the end of the line | |
53 $id_type = 1; | |
54 print STDERR "Casava 1.7 read id style\n"; # TESTING | |
55 } elsif ($readid =~ /^@\S+\W[12]\S+$/) { # @ at the start of the line followed by non-whitspace, a space, a 1 or 2, non-whitespace | |
56 $id_type = 2; | |
57 print STDERR "Casava 1.8 read id style\n"; # TESTING | |
58 } else { | |
59 print STDERR "Cannot determine read id style\n"; | |
60 print STDOUT "Unknwon id style: $readid\n"; | |
61 exit 1; | |
62 } | |
63 | |
64 chomp $filename_r1; | |
65 chomp $filename_r2; | |
66 | |
67 if ($quick) { | |
68 $failure = &quickcheck($filename_r1, $filename_r2); # quickie check | |
69 print STDERR "FAILED quickcheck\n" if ($failure); | |
70 } | |
71 # If we aren't doing the quick check, or the quick check passed, do the slow check | |
72 if (!($quick) || !($failure)) { | |
73 $failure = &percent($filename_r1, $filename_r2); # slow thorough check | |
74 } | |
75 | |
76 exit; | |
77 | |
78 ######################## subs ######################### | |
79 # do a quick check to see if the files are bad (many bad files will pass this quick check) | |
80 sub quickcheck { | |
81 my ($file1, $file2) = @_; | |
82 if ($solaris) { | |
83 $result1 = `tail -10000l $file1 | head -1`; # grab the line 10,000 lines from the EOF | |
84 $result2 = `tail -10000l $file2 | head -1`; | |
85 } else { | |
86 $result1 = `tail -n 10000 $file1 | head -n 1`; | |
87 $result2 = `tail -n 10000 $file2 | head -n 1`; | |
88 } | |
89 # process the ids depending on id style | |
90 if ($id_type == 1) { # 1 or 2 at end of id | |
91 chomp $result1; | |
92 chomp $result2; | |
93 chop $result1; | |
94 chop $result2; | |
95 } else { # first word is a unique read id, second word is left/right-specific | |
96 ($result1) = split ' ', $result1; | |
97 ($result2) = split ' ', $result2; | |
98 } | |
99 if ($result1 eq $result2) { | |
100 # print "pass\n"; | |
101 return 0; | |
102 } else { | |
103 # print "fail\n"; | |
104 return 1; | |
105 } | |
106 } | |
107 | |
108 | |
109 # compare every read id from each file and determine what percent of reads are | |
110 # out of sync | |
111 sub percent { | |
112 my($file1, $file2) = @_; | |
113 | |
114 open R1, '<', $file1 or die "Couldn't open $file1 for reading:$!"; | |
115 open R2, '<', $file2 or die "Couldn't open $file2 for reading:$!"; | |
116 | |
117 my $readCount = 0; | |
118 my $failcount = 0; | |
119 while (!eof(R1) and !eof(R2)) { | |
120 $id1 = <R1>; | |
121 $id2 = <R2>; | |
122 next unless ($. % 4 == 1); # check every sequence (4th line in the file) | |
123 # process the ids depending on id style | |
124 if ($id_type == 1) { # 1 or 2 at end of id | |
125 chomp $id1; # newline | |
126 chop $id1; # last char of the id (should be the "1" or "2") | |
127 chomp $id2; | |
128 chop $id2; | |
129 } else { | |
130 ($id1) = split ' ', $id1; | |
131 ($id2) = split ' ', $id2; | |
132 } | |
133 $failcount++ unless ($id1 eq $id2); | |
134 $readCount++; | |
135 } | |
136 my $percent = $failcount / $readCount; | |
137 $prettypercent = $percent * 100; # make it percent | |
138 $prettypercent = sprintf("%.2f", $prettypercent); | |
139 if ($failcount > 0) { | |
140 print STDERR "FAILED full check\t$prettypercent% of reads are out-of-sync\n"; | |
141 $exitStatus=1; | |
142 } else { | |
143 print STDERR "PASSED full check\n"; | |
144 $exitStatus=0; | |
145 } | |
146 | |
147 return $exitStatus; | |
148 } |