| 0 | 1 package subgroups; | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use warnings; | 
|  | 5 use Exporter; | 
|  | 6 our @ISA = qw( Exporter ); | 
|  | 7 our @EXPORT_OK = qw( &subgroups ); | 
|  | 8 | 
|  | 9 use POSIX; | 
|  | 10 use FindBin; | 
|  | 11 use lib $FindBin::Bin; | 
|  | 12 use align qw ( get_hash_alignment ); | 
|  | 13 | 
|  | 14 sub subgroups | 
|  | 15 { | 
|  | 16 	my ($fin, $dir, $mis, $mis_TE, $proc, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $exons, $TE, $min_si, $max_si, $min_pi, $max_pi, $report ) = @_; | 
|  | 17 	my (@files, $sum, $pie, $repar, %ismapped, %isjunk, %repartition, @junk_ref, @all_ref ); | 
|  | 18 | 
| 20 | 19 	srand(); | 
| 0 | 20 	print $report "----------------------------\n"; | 
|  | 21 	print $report "Create subgroups:\nfastq_in: $fin\ndirectory_out: $dir\nmismatches: $mis\nmismatches TE: $mis_TE\n"; | 
|  | 22 | 
|  | 23 	mkdir $dir; | 
|  | 24 	$dir = $dir.'/' unless $dir =~ /(.*)\/$/; | 
|  | 25 | 
|  | 26 	my $accept_miRNas = $dir.'miRNAs.fastq'; | 
|  | 27 	my $reject_miRNAs = $dir.'miRNAs_rejected.fastq'; | 
|  | 28 	my $sam_miRNAs = $dir.'miRNAs.sam'; | 
|  | 29 	my @tmp = get_hash_alignment($miRNAs, $mis, 1, 1, $accept_miRNas, $reject_miRNAs, $fin, $proc, 'miRNAs',$sam_miRNAs, $report); | 
|  | 30 	my $mi = $tmp[0]; | 
|  | 31 	$repartition{'miRNAs'} = $mi; | 
|  | 32 | 
|  | 33 	my $sam = new String::Random; | 
|  | 34 	$sam = $sam->randpattern("CCcccccc"); | 
|  | 35 	my $reject_rRNAs = $dir.'rRNAs_rejected.fastq'; | 
|  | 36 	@tmp = get_hash_alignment($rRNAs, $mis, 0, 1, 'NA', $reject_rRNAs, $reject_miRNAs, $proc, 'rRNAs', $sam, $report); | 
|  | 37 	$repartition{'rRNAs'} = $tmp[0]; | 
|  | 38 	unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; | 
|  | 39 | 
|  | 40 	$sam = new String::Random; | 
|  | 41 	$sam = $sam->randpattern("CCcccccc"); | 
|  | 42 	my $reject_tRNAs = $dir.'tRNAs_rejected.fastq'; | 
|  | 43 	@tmp = get_hash_alignment($tRNAs, $mis, 0, 1, 'NA', $reject_tRNAs, $reject_rRNAs, $proc, 'tRNAs', $sam, $report); | 
|  | 44 	$repartition{'tRNAs'} = $tmp[0]; | 
|  | 45 	unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; | 
|  | 46 | 
|  | 47 	$sam = new String::Random; | 
|  | 48 	$sam = $sam->randpattern("CCcccccc"); | 
|  | 49 	my $bonafide = $dir.'bonafide_reads.fastq'; | 
|  | 50 	@tmp = get_hash_alignment($snRNAs, $mis, 0, 1, 'NA', $bonafide, $reject_tRNAs, $proc, 'snRNAs', $sam, $report); | 
|  | 51 	$repartition{'snRNAs'} = $tmp[0]; | 
|  | 52 	my $bo = $tmp[1]; | 
|  | 53 	unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; | 
|  | 54 | 
|  | 55 	my $sam_exons = $dir.'exons.sam'; | 
|  | 56 	my $reject_exons = $dir.'rejected_exons.fastq'; | 
|  | 57 	@tmp = get_hash_alignment($exons, $mis, 0, 1, 'NA', $reject_exons, $bonafide, $proc, 'exons', $sam_exons, $report, $dir.'exons.fai'); | 
|  | 58 	$repartition{'exons'} = $tmp[0]; | 
|  | 59 | 
|  | 60 | 
|  | 61 	my $sam_TEs = $dir.'TEs.sam'; | 
|  | 62 	my $reject_TEs = $dir.'rejected.fastq'; | 
|  | 63 	@tmp = get_hash_alignment($TE, $mis_TE, 0, 1, 'NA', $reject_TEs, $reject_exons, $proc, 'TEs', $sam_TEs, $report, $dir.'TEs.fai' ); | 
|  | 64 	$repartition{'TEs'} = $tmp[0] ; $repartition{'others'} = $tmp[1]; | 
|  | 65 	unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; | 
|  | 66 	unlink $reject_exons; | 
|  | 67 	unlink $reject_rRNAs; | 
|  | 68 	unlink $reject_miRNAs; | 
|  | 69 	unlink $reject_tRNAs; | 
|  | 70 | 
|  | 71 	#create repartition | 
|  | 72 	my $pi = fastqSubgroups($bonafide, $dir, $min_si, $max_si, $min_pi, $max_pi ); | 
|  | 73 | 
|  | 74 	open (my $re, '>'.$dir.'repartition.txt') || die "cannot open $dir repartition.txt $!\n"; | 
|  | 75 	print $re "type\tnumber\tpercentage\n"; | 
|  | 76 	$sum += $_ foreach values %repartition; | 
|  | 77 	foreach my $k  ( sort keys  %repartition ) | 
|  | 78 	{ | 
|  | 79 		my $prct = 0; | 
|  | 80 		$prct = $repartition{$k} / $sum * 100 if $sum != 0; | 
|  | 81 		print $re "$k\t$repartition{$k}\t"; printf $re "%.2f\n",$prct; | 
|  | 82 	} | 
|  | 83 	return ( $bo, $mi, $pi); | 
|  | 84 } | 
|  | 85 | 
|  | 86 sub fastqSubgroups | 
|  | 87 { | 
|  | 88   my ( $fastq, $output_directory, $min_si, $max_si, $min_pi, $max_pi ) = @_; | 
|  | 89   my $fastq_siRNA = $output_directory."siRNAs.fastq"; | 
|  | 90   my $fastq_piRNA = $output_directory."piRNAs.fastq"; | 
|  | 91 | 
|  | 92   open my $fic, '<', $fastq || die "cannot open input file $! \n"; | 
|  | 93   open my $si, '>', $fastq_siRNA || die "cannot open siRNA.fastq $! \n"; | 
|  | 94   open my $pi, '>', $fastq_piRNA || die "cannot open piRNA.fastq $! \n"; | 
|  | 95 | 
|  | 96   my ($length, $cmp, $type, $siRNA_number, $miRNA_h_number, $piRNA_number, $not_pi_number) = (0,0,0,0,0,0,0); | 
|  | 97   my (@fastq) =(); my $seq_name; | 
|  | 98   my $out; | 
|  | 99   while(<$fic>) | 
|  | 100   { | 
|  | 101     chomp $_; | 
|  | 102     $cmp++; | 
|  | 103     if ($cmp == 1) | 
|  | 104     { | 
|  | 105       die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); | 
|  | 106       $type = 0; @fastq = (); | 
|  | 107       if ($_ =~ /^\@(.*)\s.*/) { $seq_name = $1;} | 
|  | 108       elsif ($_ =~ /^\@(.*)/) {$seq_name = $1;} | 
|  | 109       push(@fastq,$_); | 
|  | 110     } | 
|  | 111     elsif ($cmp == 2) | 
|  | 112     { | 
|  | 113       #die "unrecognized symbol at line $cmp\n" unless $_ =~ /[atcgATCGnN]+/; | 
|  | 114       push(@fastq,$_); | 
|  | 115       $length = length($_); | 
|  | 116       $type = 0; | 
|  | 117       if ( $length >= $min_si  && $length <= $max_si ) | 
|  | 118       { | 
|  | 119          $type = 2; | 
|  | 120          $siRNA_number++; | 
|  | 121       } | 
|  | 122       if ($length >= $min_pi  && $length <= $max_pi ) | 
|  | 123       { | 
|  | 124         $type += 4; | 
|  | 125         $piRNA_number++; | 
|  | 126       } | 
|  | 127     } | 
|  | 128     elsif ($cmp == 3 ) | 
|  | 129     { | 
|  | 130       die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/; | 
|  | 131       push(@fastq,$_); | 
|  | 132     } | 
|  | 133     elsif ($cmp == 4 ) | 
|  | 134     { | 
|  | 135       push(@fastq,$_); | 
|  | 136       $cmp = 0; | 
|  | 137       if ($type != 0) | 
|  | 138       { | 
|  | 139         if ($type & 4 ) { foreach my $t (@fastq) { print $pi $t."\n";} } | 
|  | 140         if ($type & 2 ) { foreach my $t (@fastq) { print $si $t."\n";} } | 
|  | 141       } | 
|  | 142     } | 
|  | 143   } | 
|  | 144   close $fic; | 
|  | 145   close $si; close $pi; | 
|  | 146   return ($piRNA_number); | 
|  | 147 } | 
|  | 148 | 
|  | 149 1; |