Mercurial > repos > jvolkening > b2b_summarize_run
comparison summarize_run.pl @ 0:9e0453df1745 draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 54eef6140a5086e3373b2406efb2e18dbfae1336-dirty
author | jvolkening |
---|---|
date | Sat, 02 Mar 2024 07:43:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9e0453df1745 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 use Getopt::Long; | |
7 use BioX::Seq::Stream; | |
8 use List::Util qw/sum/; | |
9 | |
10 my $fn_raw1; | |
11 my $fn_raw2; | |
12 my $fn_filt1; | |
13 my $fn_filt2; | |
14 my $fn_bedgraph; | |
15 my $fn_qc1; | |
16 my $fn_qc2; | |
17 my $fn_consensus; | |
18 my $fn_out; | |
19 my $n_threads = 1; | |
20 | |
21 GetOptions( | |
22 'raw_1=s' => \$fn_raw1, | |
23 'raw_2=s' => \$fn_raw2, | |
24 'filt_1=s' => \$fn_filt1, | |
25 'filt_2=s' => \$fn_filt2, | |
26 'bedgraph=s' => \$fn_bedgraph, | |
27 'fastqc_1=s' => \$fn_qc1, | |
28 'fastqc_2=s' => \$fn_qc2, | |
29 'consensus=s' => \$fn_consensus, | |
30 'out=s' => \$fn_out, | |
31 'threads=i' => \$n_threads, | |
32 ); | |
33 | |
34 my @counts; | |
35 for ($fn_raw1, $fn_raw2, $fn_filt1, $fn_filt2) { | |
36 open my $in, '-|', 'wc', '-l', $_; | |
37 my $ret = <$in>; | |
38 close $in; | |
39 chomp $ret; | |
40 my ($count, $fn) = split ' ', $ret; | |
41 die "line length not multiple of four for $_\n" | |
42 if ($count % 4); | |
43 push @counts, $count/4; | |
44 } | |
45 | |
46 | |
47 die "raw pair count mismatch\n" if ($counts[0] != $counts[1]); | |
48 die "filtered pair count mismatch\n" if ($counts[2] != $counts[3]); | |
49 | |
50 # read fragment stats from STDIN | |
51 my @lens; | |
52 while (<STDIN>) { | |
53 chomp $_; | |
54 push @lens, $_; | |
55 } | |
56 | |
57 my $frag_mean = int( sum(@lens)/scalar(@lens)+0.5 ); | |
58 my $frag_sd = int( sqrt( sum( map {($_ - $frag_mean)**2} @lens)/(scalar(@lens)-1) )+0.5 ); | |
59 | |
60 # extract FastQC data | |
61 #warn "extracting FastQC stats...\n"; | |
62 | |
63 my @five_nums; | |
64 for my $fn ($fn_qc1, $fn_qc2) { | |
65 open my $in, '<', $fn; | |
66 | |
67 my $in_data = 0; | |
68 my @data; | |
69 LINE: | |
70 while (my $line = <$in>) { | |
71 chomp $line; | |
72 if ($in_data) { | |
73 if ($line =~ /^>>END_MODULE/) { | |
74 last LINE; | |
75 } | |
76 next if ($line =~ /^#/); | |
77 my ($score, $count) = split ' ', $line; | |
78 push @data, [$score,$count]; | |
79 } | |
80 elsif ($line =~ /^>>Per sequence quality scores/) { | |
81 $in_data = 1; | |
82 } | |
83 } | |
84 | |
85 push @five_nums, data_to_5( @data ); | |
86 } | |
87 | |
88 # Count contigs | |
89 | |
90 my $p = BioX::Seq::Stream->new($fn_consensus); | |
91 my %n_contigs; | |
92 my @names; | |
93 while (my $seq = $p->next_seq) { | |
94 | |
95 my $id = $seq->id; | |
96 push @names, $id; | |
97 while ($seq =~ /[^Nn]+/g) { | |
98 ++$n_contigs{$id}; | |
99 } | |
100 } | |
101 | |
102 # Parse assembly depth info | |
103 #warn "calculating coverage stats...\n"; | |
104 | |
105 open my $in, '<', $fn_bedgraph; | |
106 | |
107 my %cov_5nums; | |
108 my %counts; | |
109 my $last_end; | |
110 my $last_contig; | |
111 my $head = <$in>; | |
112 while (my $line = <$in>) { | |
113 chomp $line; | |
114 my ($contig,$start,$end,$depth) = split "\t", $line; | |
115 $last_contig //= $contig; | |
116 if ($contig ne $last_contig) { | |
117 | |
118 my @depths = sort {$a <=> $b} keys %counts; | |
119 my @data; | |
120 for (@depths) { | |
121 push @data, [$_, $counts{$_}]; | |
122 } | |
123 $cov_5nums{$last_contig} = data_to_5( @data ); | |
124 $last_contig = $contig; | |
125 %counts = (); | |
126 $last_end = undef; | |
127 } | |
128 | |
129 if (defined($last_end) && $last_end < $start) { | |
130 $counts{0} += $start - $last_end; | |
131 } | |
132 $counts{$depth} += $end - $start; | |
133 $last_end = $end; | |
134 } | |
135 my @depths = sort {$a <=> $b} keys %counts; | |
136 my @data; | |
137 for (@depths) { | |
138 push @data, [$_, $counts{$_}]; | |
139 } | |
140 $cov_5nums{$last_contig} = data_to_5( @data ); | |
141 | |
142 open my $out, '>', $fn_out; | |
143 print {$out} join("\t", | |
144 '#id', | |
145 'raw_read_pairs', | |
146 'filt_read_pairs', | |
147 'frag_len_mean', | |
148 'frag_len_sd', | |
149 'forward_qual', | |
150 'reverse_qual', | |
151 'n_contigs', | |
152 'coverage_depth', | |
153 ), "\n"; | |
154 for my $id (@names) { | |
155 print {$out} join("\t", | |
156 $id, | |
157 $counts[0], | |
158 $counts[2], | |
159 $frag_mean, | |
160 $frag_sd, | |
161 $five_nums[0], | |
162 $five_nums[1], | |
163 $n_contigs{$id}, | |
164 $cov_5nums{$id}, | |
165 ), "\n"; | |
166 } | |
167 close $out; | |
168 | |
169 exit; | |
170 | |
171 | |
172 sub data_to_5 { | |
173 | |
174 my (@data) = @_; | |
175 my $total = sum map {$_->[1]} @data; | |
176 my @five_num; | |
177 my $curr = 0; | |
178 for my $i (0..$#data) { | |
179 $curr += $data[$i]->[1]; | |
180 for my $j (0..4) { | |
181 next if (defined $five_num[$j]); | |
182 my $quant = $j/4; | |
183 if ($curr/$total > $quant) { | |
184 $five_num[$j] = $data[$i]->[0]; | |
185 } | |
186 elsif ($curr/$total == $quant) { | |
187 $five_num[$j] = $i < $#data | |
188 ? int(($data[$i]->[0]+$data[$i+1]->[0])/2) | |
189 : $data[$i]->[0]; | |
190 } | |
191 } | |
192 } | |
193 return join('|',@five_num); | |
194 | |
195 } |