comparison summarize_run.pl @ 0:b42eb9ff6a8c draft

planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9260aa02a5f703bce63d2db5b69003df9be371ac
author jvolkening
date Fri, 08 Mar 2024 00:47:53 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b42eb9ff6a8c
1 #!/usr/bin/perl
2
3 use strict;
4 use warnings;
5
6 use Getopt::Long;
7 use BioX::Seq::Stream;
8 use List::Util qw/sum/;
9
10 my $fn_raw1;
11 my $fn_raw2;
12 my $fn_filt1;
13 my $fn_filt2;
14 my $fn_bedgraph;
15 my $fn_qc1;
16 my $fn_qc2;
17 my $fn_consensus;
18 my $fn_out;
19 my $n_threads = 1;
20
21 GetOptions(
22 'raw_1=s' => \$fn_raw1,
23 'raw_2=s' => \$fn_raw2,
24 'filt_1=s' => \$fn_filt1,
25 'filt_2=s' => \$fn_filt2,
26 'bedgraph=s' => \$fn_bedgraph,
27 'fastqc_1=s' => \$fn_qc1,
28 'fastqc_2=s' => \$fn_qc2,
29 'consensus=s' => \$fn_consensus,
30 'out=s' => \$fn_out,
31 'threads=i' => \$n_threads,
32 );
33
34 my @counts;
35 for ($fn_raw1, $fn_raw2, $fn_filt1, $fn_filt2) {
36 open my $in, '-|', 'wc', '-l', $_;
37 my $ret = <$in>;
38 close $in;
39 chomp $ret;
40 my ($count, $fn) = split ' ', $ret;
41 die "line length not multiple of four for $_\n"
42 if ($count % 4);
43 push @counts, $count/4;
44 }
45
46
47 die "raw pair count mismatch\n" if ($counts[0] != $counts[1]);
48 die "filtered pair count mismatch\n" if ($counts[2] != $counts[3]);
49
50 # read fragment stats from STDIN
51 my @lens;
52 while (<STDIN>) {
53 chomp $_;
54 push @lens, $_;
55 }
56
57 my $frag_mean = int( sum(@lens)/scalar(@lens)+0.5 );
58 my $frag_sd = int( sqrt( sum( map {($_ - $frag_mean)**2} @lens)/(scalar(@lens)-1) )+0.5 );
59
60 # extract FastQC data
61 #warn "extracting FastQC stats...\n";
62
63 my @five_nums;
64 for my $fn ($fn_qc1, $fn_qc2) {
65 open my $in, '<', $fn;
66
67 my $in_data = 0;
68 my @data;
69 LINE:
70 while (my $line = <$in>) {
71 chomp $line;
72 if ($in_data) {
73 if ($line =~ /^>>END_MODULE/) {
74 last LINE;
75 }
76 next if ($line =~ /^#/);
77 my ($score, $count) = split ' ', $line;
78 push @data, [$score,$count];
79 }
80 elsif ($line =~ /^>>Per sequence quality scores/) {
81 $in_data = 1;
82 }
83 }
84
85 push @five_nums, data_to_5( @data );
86 }
87
88 # Count contigs
89
90 my $p = BioX::Seq::Stream->new($fn_consensus);
91 my %n_contigs;
92 my @names;
93 while (my $seq = $p->next_seq) {
94
95 my $id = $seq->id;
96 push @names, $id;
97 while ($seq =~ /[^Nn]+/g) {
98 ++$n_contigs{$id};
99 }
100 }
101
102 # Parse assembly depth info
103 #warn "calculating coverage stats...\n";
104
105 open my $in, '<', $fn_bedgraph;
106
107 my %cov_5nums;
108 my %counts;
109 my $last_end;
110 my $last_contig;
111 my $head = <$in>;
112 while (my $line = <$in>) {
113 chomp $line;
114 my ($contig,$start,$end,$depth) = split "\t", $line;
115 $last_contig //= $contig;
116 if ($contig ne $last_contig) {
117
118 my @depths = sort {$a <=> $b} keys %counts;
119 my @data;
120 for (@depths) {
121 push @data, [$_, $counts{$_}];
122 }
123 $cov_5nums{$last_contig} = data_to_5( @data );
124 $last_contig = $contig;
125 %counts = ();
126 $last_end = undef;
127 }
128
129 if (defined($last_end) && $last_end < $start) {
130 $counts{0} += $start - $last_end;
131 }
132 $counts{$depth} += $end - $start;
133 $last_end = $end;
134 }
135 my @depths = sort {$a <=> $b} keys %counts;
136 my @data;
137 for (@depths) {
138 push @data, [$_, $counts{$_}];
139 }
140 $cov_5nums{$last_contig} = data_to_5( @data );
141
142 open my $out, '>', $fn_out;
143 print {$out} join("\t",
144 '#id',
145 'raw_read_pairs',
146 'filt_read_pairs',
147 'frag_len_mean',
148 'frag_len_sd',
149 'forward_qual',
150 'reverse_qual',
151 'n_contigs',
152 'coverage_depth',
153 ), "\n";
154 for my $id (@names) {
155 print {$out} join("\t",
156 $id,
157 $counts[0],
158 $counts[2],
159 $frag_mean,
160 $frag_sd,
161 $five_nums[0],
162 $five_nums[1],
163 $n_contigs{$id},
164 $cov_5nums{$id},
165 ), "\n";
166 }
167 close $out;
168
169 exit;
170
171
172 sub data_to_5 {
173
174 my (@data) = @_;
175 my $total = sum map {$_->[1]} @data;
176 my @five_num;
177 my $curr = 0;
178 for my $i (0..$#data) {
179 $curr += $data[$i]->[1];
180 for my $j (0..4) {
181 next if (defined $five_num[$j]);
182 my $quant = $j/4;
183 if ($curr/$total > $quant) {
184 $five_num[$j] = $data[$i]->[0];
185 }
186 elsif ($curr/$total == $quant) {
187 $five_num[$j] = $i < $#data
188 ? int(($data[$i]->[0]+$data[$i+1]->[0])/2)
189 : $data[$i]->[0];
190 }
191 }
192 }
193 return join('|',@five_num);
194
195 }