Mercurial > repos > jvolkening > b2b_summarize_assembly
comparison summarize_run.pl @ 0:1995630ef409 draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 54eef6140a5086e3373b2406efb2e18dbfae1336-dirty
| author | jvolkening |
|---|---|
| date | Sat, 02 Mar 2024 07:44:53 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1995630ef409 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 | |
| 6 use Getopt::Long; | |
| 7 use BioX::Seq::Stream; | |
| 8 use List::Util qw/sum/; | |
| 9 | |
| 10 my $fn_raw1; | |
| 11 my $fn_raw2; | |
| 12 my $fn_filt1; | |
| 13 my $fn_filt2; | |
| 14 my $fn_bedgraph; | |
| 15 my $fn_qc1; | |
| 16 my $fn_qc2; | |
| 17 my $fn_consensus; | |
| 18 my $fn_out; | |
| 19 my $n_threads = 1; | |
| 20 | |
| 21 GetOptions( | |
| 22 'raw_1=s' => \$fn_raw1, | |
| 23 'raw_2=s' => \$fn_raw2, | |
| 24 'filt_1=s' => \$fn_filt1, | |
| 25 'filt_2=s' => \$fn_filt2, | |
| 26 'bedgraph=s' => \$fn_bedgraph, | |
| 27 'fastqc_1=s' => \$fn_qc1, | |
| 28 'fastqc_2=s' => \$fn_qc2, | |
| 29 'consensus=s' => \$fn_consensus, | |
| 30 'out=s' => \$fn_out, | |
| 31 'threads=i' => \$n_threads, | |
| 32 ); | |
| 33 | |
| 34 my @counts; | |
| 35 for ($fn_raw1, $fn_raw2, $fn_filt1, $fn_filt2) { | |
| 36 open my $in, '-|', 'wc', '-l', $_; | |
| 37 my $ret = <$in>; | |
| 38 close $in; | |
| 39 chomp $ret; | |
| 40 my ($count, $fn) = split ' ', $ret; | |
| 41 die "line length not multiple of four for $_\n" | |
| 42 if ($count % 4); | |
| 43 push @counts, $count/4; | |
| 44 } | |
| 45 | |
| 46 | |
| 47 die "raw pair count mismatch\n" if ($counts[0] != $counts[1]); | |
| 48 die "filtered pair count mismatch\n" if ($counts[2] != $counts[3]); | |
| 49 | |
| 50 # read fragment stats from STDIN | |
| 51 my @lens; | |
| 52 while (<STDIN>) { | |
| 53 chomp $_; | |
| 54 push @lens, $_; | |
| 55 } | |
| 56 | |
| 57 my $frag_mean = int( sum(@lens)/scalar(@lens)+0.5 ); | |
| 58 my $frag_sd = int( sqrt( sum( map {($_ - $frag_mean)**2} @lens)/(scalar(@lens)-1) )+0.5 ); | |
| 59 | |
| 60 # extract FastQC data | |
| 61 #warn "extracting FastQC stats...\n"; | |
| 62 | |
| 63 my @five_nums; | |
| 64 for my $fn ($fn_qc1, $fn_qc2) { | |
| 65 open my $in, '<', $fn; | |
| 66 | |
| 67 my $in_data = 0; | |
| 68 my @data; | |
| 69 LINE: | |
| 70 while (my $line = <$in>) { | |
| 71 chomp $line; | |
| 72 if ($in_data) { | |
| 73 if ($line =~ /^>>END_MODULE/) { | |
| 74 last LINE; | |
| 75 } | |
| 76 next if ($line =~ /^#/); | |
| 77 my ($score, $count) = split ' ', $line; | |
| 78 push @data, [$score,$count]; | |
| 79 } | |
| 80 elsif ($line =~ /^>>Per sequence quality scores/) { | |
| 81 $in_data = 1; | |
| 82 } | |
| 83 } | |
| 84 | |
| 85 push @five_nums, data_to_5( @data ); | |
| 86 } | |
| 87 | |
| 88 # Count contigs | |
| 89 | |
| 90 my $p = BioX::Seq::Stream->new($fn_consensus); | |
| 91 my %n_contigs; | |
| 92 my @names; | |
| 93 while (my $seq = $p->next_seq) { | |
| 94 | |
| 95 my $id = $seq->id; | |
| 96 push @names, $id; | |
| 97 while ($seq =~ /[^Nn]+/g) { | |
| 98 ++$n_contigs{$id}; | |
| 99 } | |
| 100 } | |
| 101 | |
| 102 # Parse assembly depth info | |
| 103 #warn "calculating coverage stats...\n"; | |
| 104 | |
| 105 open my $in, '<', $fn_bedgraph; | |
| 106 | |
| 107 my %cov_5nums; | |
| 108 my %counts; | |
| 109 my $last_end; | |
| 110 my $last_contig; | |
| 111 my $head = <$in>; | |
| 112 while (my $line = <$in>) { | |
| 113 chomp $line; | |
| 114 my ($contig,$start,$end,$depth) = split "\t", $line; | |
| 115 $last_contig //= $contig; | |
| 116 if ($contig ne $last_contig) { | |
| 117 | |
| 118 my @depths = sort {$a <=> $b} keys %counts; | |
| 119 my @data; | |
| 120 for (@depths) { | |
| 121 push @data, [$_, $counts{$_}]; | |
| 122 } | |
| 123 $cov_5nums{$last_contig} = data_to_5( @data ); | |
| 124 $last_contig = $contig; | |
| 125 %counts = (); | |
| 126 $last_end = undef; | |
| 127 } | |
| 128 | |
| 129 if (defined($last_end) && $last_end < $start) { | |
| 130 $counts{0} += $start - $last_end; | |
| 131 } | |
| 132 $counts{$depth} += $end - $start; | |
| 133 $last_end = $end; | |
| 134 } | |
| 135 my @depths = sort {$a <=> $b} keys %counts; | |
| 136 my @data; | |
| 137 for (@depths) { | |
| 138 push @data, [$_, $counts{$_}]; | |
| 139 } | |
| 140 $cov_5nums{$last_contig} = data_to_5( @data ); | |
| 141 | |
| 142 open my $out, '>', $fn_out; | |
| 143 print {$out} join("\t", | |
| 144 '#id', | |
| 145 'raw_read_pairs', | |
| 146 'filt_read_pairs', | |
| 147 'frag_len_mean', | |
| 148 'frag_len_sd', | |
| 149 'forward_qual', | |
| 150 'reverse_qual', | |
| 151 'n_contigs', | |
| 152 'coverage_depth', | |
| 153 ), "\n"; | |
| 154 for my $id (@names) { | |
| 155 print {$out} join("\t", | |
| 156 $id, | |
| 157 $counts[0], | |
| 158 $counts[2], | |
| 159 $frag_mean, | |
| 160 $frag_sd, | |
| 161 $five_nums[0], | |
| 162 $five_nums[1], | |
| 163 $n_contigs{$id}, | |
| 164 $cov_5nums{$id}, | |
| 165 ), "\n"; | |
| 166 } | |
| 167 close $out; | |
| 168 | |
| 169 exit; | |
| 170 | |
| 171 | |
| 172 sub data_to_5 { | |
| 173 | |
| 174 my (@data) = @_; | |
| 175 my $total = sum map {$_->[1]} @data; | |
| 176 my @five_num; | |
| 177 my $curr = 0; | |
| 178 for my $i (0..$#data) { | |
| 179 $curr += $data[$i]->[1]; | |
| 180 for my $j (0..4) { | |
| 181 next if (defined $five_num[$j]); | |
| 182 my $quant = $j/4; | |
| 183 if ($curr/$total > $quant) { | |
| 184 $five_num[$j] = $data[$i]->[0]; | |
| 185 } | |
| 186 elsif ($curr/$total == $quant) { | |
| 187 $five_num[$j] = $i < $#data | |
| 188 ? int(($data[$i]->[0]+$data[$i+1]->[0])/2) | |
| 189 : $data[$i]->[0]; | |
| 190 } | |
| 191 } | |
| 192 } | |
| 193 return join('|',@five_num); | |
| 194 | |
| 195 } |
