annotate tools/next_gen_conversion/bwa_solid2fastq_modified.pl @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/perl -w
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 # Author: lh3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 # Note: Ideally, this script should be written in C. It is a bit slow at present.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 use strict;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 use warnings;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 use Getopt::Std;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 my %opts;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 my $version = '0.1.3';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 my $usage = qq{
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 Usage: solid2fastq.pl <paired> <outfile1> <outfile2> <F3.csfasta> <F3.qual> <R3.csfasta> <R3.qual>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 Note: <in.title> is the string showed in the `# Title:' line of a
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 ".csfasta" read file. Then <in.title>F3.csfasta is read sequence
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 file and <in.title>F3_QV.qual is the quality file. If
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 <in.title>R3.csfasta is present, this script assumes reads are
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 paired; otherwise reads will be regarded as single-end.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 tag and `2' for F3. Usually you may want to use short <out.prefix>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 to save diskspace. Long <out.prefix> also causes troubles to maq.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 };
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 getopts('', \%opts);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 die($usage) if (@ARGV != 7);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 my (@fhr, @fhw);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 my $fn = '';
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 if ($is_paired eq "yes") { # paired end
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 for (0 .. 3) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 $fn = $fn_suff[$_];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 open($fhw[0], "|gzip >$outfile2") || die;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 open($fhw[1], "|gzip >$outfile1") || die;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 my (@df, @dr);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 @df = &read1(1); @dr = &read1(2);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 while (@df && @dr) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 if ($df[0] eq $dr[0]) { # mate pair
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 @df = &read1(1); @dr = &read1(2);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 close($fhr[$_]) for (0 .. $#fhr);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 close($fhw[$_]) for (0 .. $#fhw);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 } else { # single end
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 for (0 .. 1) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 my $fn = "$fn_suff[$_]";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 open($fhw[2], "|gzip >$outfile1") || die;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 my @df;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 while (@df = &read1(1, $fhr[0], $fhr[1])) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 print {$fhw[2]} $df[1];
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 close($fhr[$_]) for (0 .. $#fhr);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 close($fhw[2]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 sub read1 {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 my $i = shift(@_);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 my $j = ($i-1)<<1;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 my ($key, $seq);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 while (<$fhs>) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 my $t = <$fhq>;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) {
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 #print $key;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 my $name = "$1_$2_$3/$i";
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 $_ = substr(<$fhs>, 2);
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 tr/0123./ACGTN/;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 my $s = $_;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 $_ = <$fhq>;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 s/^(\d+)\s*//;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 s/(\d+)\s*/chr($1+33)/eg;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 $seq = qq/\@$name\n$s+\n$_\n/;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 last;
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 }
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 return defined($seq)? ($key, $seq) : ();
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 }