| 
0
 | 
     1 package subgroups;
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 use Exporter;
 | 
| 
 | 
     6 our @ISA = qw( Exporter );
 | 
| 
 | 
     7 our @EXPORT_OK = qw( &subgroups );
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 use POSIX;
 | 
| 
 | 
    10 use FindBin;
 | 
| 
 | 
    11 use lib $FindBin::Bin;
 | 
| 
 | 
    12 use align qw ( get_hash_alignment );
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 sub subgroups
 | 
| 
 | 
    15 {
 | 
| 
 | 
    16 	my ($fin, $dir, $mis, $mis_TE, $proc, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $exons, $TE, $min_si, $max_si, $min_pi, $max_pi, $report ) = @_;
 | 
| 
 | 
    17 	my (@files, $sum, $pie, $repar, %ismapped, %isjunk, %repartition, @junk_ref, @all_ref );
 | 
| 
 | 
    18 
 | 
| 
20
 | 
    19 	srand();
 | 
| 
0
 | 
    20 	print $report "----------------------------\n";
 | 
| 
 | 
    21 	print $report "Create subgroups:\nfastq_in: $fin\ndirectory_out: $dir\nmismatches: $mis\nmismatches TE: $mis_TE\n";
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 	mkdir $dir;
 | 
| 
 | 
    24 	$dir = $dir.'/' unless $dir =~ /(.*)\/$/;
 | 
| 
 | 
    25 
 | 
| 
 | 
    26 	my $accept_miRNas = $dir.'miRNAs.fastq'; 
 | 
| 
 | 
    27 	my $reject_miRNAs = $dir.'miRNAs_rejected.fastq';
 | 
| 
 | 
    28 	my $sam_miRNAs = $dir.'miRNAs.sam'; 
 | 
| 
 | 
    29 	my @tmp = get_hash_alignment($miRNAs, $mis, 1, 1, $accept_miRNas, $reject_miRNAs, $fin, $proc, 'miRNAs',$sam_miRNAs, $report);
 | 
| 
 | 
    30 	my $mi = $tmp[0];
 | 
| 
 | 
    31 	$repartition{'miRNAs'} = $mi;
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 	my $sam = new String::Random;
 | 
| 
 | 
    34 	$sam = $sam->randpattern("CCcccccc");
 | 
| 
 | 
    35 	my $reject_rRNAs = $dir.'rRNAs_rejected.fastq';
 | 
| 
 | 
    36 	@tmp = get_hash_alignment($rRNAs, $mis, 0, 1, 'NA', $reject_rRNAs, $reject_miRNAs, $proc, 'rRNAs', $sam, $report);
 | 
| 
 | 
    37 	$repartition{'rRNAs'} = $tmp[0];
 | 
| 
 | 
    38 	unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
 | 
| 
 | 
    39 
 | 
| 
 | 
    40 	$sam = new String::Random;
 | 
| 
 | 
    41 	$sam = $sam->randpattern("CCcccccc");
 | 
| 
 | 
    42 	my $reject_tRNAs = $dir.'tRNAs_rejected.fastq';
 | 
| 
 | 
    43 	@tmp = get_hash_alignment($tRNAs, $mis, 0, 1, 'NA', $reject_tRNAs, $reject_rRNAs, $proc, 'tRNAs', $sam, $report);
 | 
| 
 | 
    44 	$repartition{'tRNAs'} = $tmp[0];
 | 
| 
 | 
    45 	unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 	$sam = new String::Random;
 | 
| 
 | 
    48 	$sam = $sam->randpattern("CCcccccc");
 | 
| 
 | 
    49 	my $bonafide = $dir.'bonafide_reads.fastq';
 | 
| 
 | 
    50 	@tmp = get_hash_alignment($snRNAs, $mis, 0, 1, 'NA', $bonafide, $reject_tRNAs, $proc, 'snRNAs', $sam, $report);
 | 
| 
 | 
    51 	$repartition{'snRNAs'} = $tmp[0];
 | 
| 
 | 
    52 	my $bo = $tmp[1];
 | 
| 
 | 
    53 	unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
 | 
| 
 | 
    54 
 | 
| 
 | 
    55 	my $sam_exons = $dir.'exons.sam'; 
 | 
| 
 | 
    56 	my $reject_exons = $dir.'rejected_exons.fastq';
 | 
| 
 | 
    57 	@tmp = get_hash_alignment($exons, $mis, 0, 1, 'NA', $reject_exons, $bonafide, $proc, 'exons', $sam_exons, $report, $dir.'exons.fai');
 | 
| 
 | 
    58 	$repartition{'exons'} = $tmp[0];
 | 
| 
 | 
    59 
 | 
| 
 | 
    60 	
 | 
| 
 | 
    61 	my $sam_TEs = $dir.'TEs.sam';
 | 
| 
 | 
    62 	my $reject_TEs = $dir.'rejected.fastq';
 | 
| 
 | 
    63 	@tmp = get_hash_alignment($TE, $mis_TE, 0, 1, 'NA', $reject_TEs, $reject_exons, $proc, 'TEs', $sam_TEs, $report, $dir.'TEs.fai' );
 | 
| 
 | 
    64 	$repartition{'TEs'} = $tmp[0] ; $repartition{'others'} = $tmp[1];
 | 
| 
 | 
    65 	unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
 | 
| 
 | 
    66 	unlink $reject_exons;
 | 
| 
 | 
    67 	unlink $reject_rRNAs;
 | 
| 
 | 
    68 	unlink $reject_miRNAs;
 | 
| 
 | 
    69 	unlink $reject_tRNAs;
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 	#create repartition
 | 
| 
 | 
    72 	my $pi = fastqSubgroups($bonafide, $dir, $min_si, $max_si, $min_pi, $max_pi );
 | 
| 
 | 
    73 
 | 
| 
 | 
    74 	open (my $re, '>'.$dir.'repartition.txt') || die "cannot open $dir repartition.txt $!\n";
 | 
| 
 | 
    75 	print $re "type\tnumber\tpercentage\n";
 | 
| 
 | 
    76 	$sum += $_ foreach values %repartition;
 | 
| 
 | 
    77 	foreach my $k  ( sort keys  %repartition ) 
 | 
| 
 | 
    78 	{
 | 
| 
 | 
    79 		my $prct = 0;
 | 
| 
 | 
    80 		$prct = $repartition{$k} / $sum * 100 if $sum != 0;
 | 
| 
 | 
    81 		print $re "$k\t$repartition{$k}\t"; printf $re "%.2f\n",$prct;
 | 
| 
 | 
    82 	}
 | 
| 
 | 
    83 	return ( $bo, $mi, $pi);
 | 
| 
 | 
    84 }
 | 
| 
 | 
    85 
 | 
| 
 | 
    86 sub fastqSubgroups
 | 
| 
 | 
    87 {
 | 
| 
 | 
    88   my ( $fastq, $output_directory, $min_si, $max_si, $min_pi, $max_pi ) = @_;
 | 
| 
 | 
    89   my $fastq_siRNA = $output_directory."siRNAs.fastq";
 | 
| 
 | 
    90   my $fastq_piRNA = $output_directory."piRNAs.fastq";
 | 
| 
 | 
    91 
 | 
| 
 | 
    92   open my $fic, '<', $fastq || die "cannot open input file $! \n";
 | 
| 
 | 
    93   open my $si, '>', $fastq_siRNA || die "cannot open siRNA.fastq $! \n";
 | 
| 
 | 
    94   open my $pi, '>', $fastq_piRNA || die "cannot open piRNA.fastq $! \n";
 | 
| 
 | 
    95   
 | 
| 
 | 
    96   my ($length, $cmp, $type, $siRNA_number, $miRNA_h_number, $piRNA_number, $not_pi_number) = (0,0,0,0,0,0,0);
 | 
| 
 | 
    97   my (@fastq) =(); my $seq_name;
 | 
| 
 | 
    98   my $out;
 | 
| 
 | 
    99   while(<$fic>)
 | 
| 
 | 
   100   {
 | 
| 
 | 
   101     chomp $_;
 | 
| 
 | 
   102     $cmp++;
 | 
| 
 | 
   103     if ($cmp == 1)
 | 
| 
 | 
   104     {
 | 
| 
 | 
   105       die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ );
 | 
| 
 | 
   106       $type = 0; @fastq = ();
 | 
| 
 | 
   107       if ($_ =~ /^\@(.*)\s.*/) { $seq_name = $1;}
 | 
| 
 | 
   108       elsif ($_ =~ /^\@(.*)/) {$seq_name = $1;}
 | 
| 
 | 
   109       push(@fastq,$_);
 | 
| 
 | 
   110     }
 | 
| 
 | 
   111     elsif ($cmp == 2)
 | 
| 
 | 
   112     {
 | 
| 
 | 
   113       #die "unrecognized symbol at line $cmp\n" unless $_ =~ /[atcgATCGnN]+/;
 | 
| 
 | 
   114       push(@fastq,$_);
 | 
| 
 | 
   115       $length = length($_);
 | 
| 
 | 
   116       $type = 0;
 | 
| 
 | 
   117       if ( $length >= $min_si  && $length <= $max_si )
 | 
| 
 | 
   118       {
 | 
| 
 | 
   119          $type = 2;
 | 
| 
 | 
   120          $siRNA_number++;
 | 
| 
 | 
   121       }
 | 
| 
 | 
   122       if ($length >= $min_pi  && $length <= $max_pi )
 | 
| 
 | 
   123       {
 | 
| 
 | 
   124         $type += 4;
 | 
| 
 | 
   125         $piRNA_number++;
 | 
| 
 | 
   126       }      
 | 
| 
 | 
   127     }
 | 
| 
 | 
   128     elsif ($cmp == 3 )
 | 
| 
 | 
   129     {
 | 
| 
 | 
   130       die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
 | 
| 
 | 
   131       push(@fastq,$_);
 | 
| 
 | 
   132     }
 | 
| 
 | 
   133     elsif ($cmp == 4 )
 | 
| 
 | 
   134     {
 | 
| 
 | 
   135       push(@fastq,$_);
 | 
| 
 | 
   136       $cmp = 0;
 | 
| 
 | 
   137       if ($type != 0)
 | 
| 
 | 
   138       {
 | 
| 
 | 
   139         if ($type & 4 ) { foreach my $t (@fastq) { print $pi $t."\n";} }
 | 
| 
 | 
   140         if ($type & 2 ) { foreach my $t (@fastq) { print $si $t."\n";} }        
 | 
| 
 | 
   141       }
 | 
| 
 | 
   142     }
 | 
| 
 | 
   143   }
 | 
| 
 | 
   144   close $fic;
 | 
| 
 | 
   145   close $si; close $pi;
 | 
| 
 | 
   146   return ($piRNA_number);
 | 
| 
 | 
   147 }
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 1;
 |