| 0 | 1 package resize; | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use warnings; | 
|  | 5 | 
|  | 6 use FindBin; | 
|  | 7 use lib $FindBin::Bin; | 
|  | 8 use Rcall qw ( histogram ); | 
|  | 9 | 
|  | 10 use Exporter; | 
|  | 11 our @ISA = qw( Exporter ); | 
|  | 12 our @EXPORT_OK = qw( &size_distribution ); | 
|  | 13 | 
|  | 14 sub size_distribution | 
|  | 15 { | 
|  | 16   my ( $fastq, $fastq_out, $dir, $min, $max ) = @_; | 
|  | 17 | 
|  | 18   my ( %fragments_size, %duplicates ) ; | 
|  | 19   my $num = size($min, $max, $fastq, $fastq_out, \%fragments_size, \%duplicates); | 
|  | 20 | 
|  | 21   my $png = $dir.'histogram.png'; | 
|  | 22    histogram(\%fragments_size, $png, $num); | 
|  | 23 | 
|  | 24   my $size = $dir.'reads_size.txt'; | 
|  | 25 | 
|  | 26 | 
|  | 27   my $pourcentage; | 
|  | 28   open  my $o, '>', $size || die "cannot open $size $!\n"; | 
|  | 29   print $o "size\tnumber\tpercentage\n"; | 
|  | 30   foreach my  $k (sort { $a <=> $b } keys %fragments_size ) | 
|  | 31   { | 
|  | 32     $pourcentage = $fragments_size{$k} / $num * 100; | 
|  | 33 | 
|  | 34     print $o "$k\t$fragments_size{$k}\t"; | 
|  | 35     printf $o "%.2f\n",$pourcentage; | 
|  | 36   } | 
|  | 37   close $o; | 
|  | 38 | 
|  | 39   my $dup = $dir.'duplicates.txt' ; | 
|  | 40   open $o, '>', $dup || die "cannot open $size $!\n"; | 
|  | 41   print $o "size\tnumber\n"; | 
|  | 42   foreach my  $k (sort { $duplicates{$b} <=> $duplicates{$a} } keys %duplicates ) | 
|  | 43   { | 
|  | 44     print $o "$k\t$duplicates{$k}\n"; | 
|  | 45   } | 
|  | 46   close $o; | 
|  | 47 } | 
|  | 48 | 
|  | 49 sub size | 
|  | 50 { | 
|  | 51   my ($min, $max, $in_file, $out_file, $sizeHashR, $duplicateHashR) = @_; | 
|  | 52   my ($numreads, $size, $cmp, $ok, $line) = (0, 0, 0, 0); | 
|  | 53   my @fastq; | 
|  | 54   open (my $in, $in_file) || die "cannot open $in_file $!\n"; | 
|  | 55   open (my $out, ">".$out_file)  || die "cannot create $out_file $!\n"; | 
|  | 56   while(<$in>) | 
|  | 57   { | 
|  | 58     chomp $_; | 
|  | 59     $cmp++; $line++; | 
|  | 60     if ($cmp == 1) | 
|  | 61     { | 
|  | 62       die "file do not contain a @ at line $line\n" unless ($_ =~ /^\@/ ); | 
|  | 63       $ok = 0; @fastq = (); | 
|  | 64       push(@fastq,$_); | 
|  | 65     } | 
|  | 66     elsif ($cmp == 2) | 
|  | 67     { | 
|  | 68       #die "unrecognized symbol at line $line\n" unless ($_ =~ /[atcgATCGnN]+/ || $_ =~ /^$/ ); | 
|  | 69       push(@fastq,$_); | 
|  | 70       $size = length($_); | 
|  | 71       if ($size >= $min && $size <= $max) | 
|  | 72       { | 
|  | 73         $numreads++; | 
|  | 74         ${$sizeHashR}{$size}+=1; | 
|  | 75         ${$duplicateHashR}{$_}+=1 if (defined($duplicateHashR)); | 
|  | 76         $ok = 1; | 
|  | 77       } | 
|  | 78     } | 
|  | 79     elsif ($cmp == 3 ) | 
|  | 80     { | 
|  | 81       die "file do not contain a + at line $line\n" unless $_ =~ /^\+/; | 
|  | 82       push(@fastq,$_); | 
|  | 83     } | 
|  | 84     elsif ($cmp == 4 ) | 
|  | 85     { | 
|  | 86       push(@fastq,$_); | 
|  | 87       $cmp = 0; | 
|  | 88       if ($ok == 1) | 
|  | 89       { | 
|  | 90         foreach my $t (@fastq) | 
|  | 91         { | 
|  | 92           print $out $t."\n"; | 
|  | 93         } | 
|  | 94       } | 
|  | 95     } | 
|  | 96   } | 
|  | 97   close $in; close $out; | 
|  | 98   return $numreads; | 
|  | 99 } | 
|  | 100 | 
|  | 101 1; |