1
|
1 package subgroups;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use Exporter;
|
|
6 our @ISA = qw( Exporter );
|
|
7 our @EXPORT_OK = qw( &subgroups );
|
|
8
|
|
9 use POSIX;
|
|
10 use FindBin;
|
|
11 use lib $FindBin::Bin;
|
|
12 use align qw ( get_hash_alignment );
|
|
13
|
|
14 sub subgroups
|
|
15 {
|
|
16 my ($fin, $dir, $mis, $mis_TE, $proc, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $transcripts, $TE, $min_si, $max_si, $min_pi, $max_pi, $report ) = @_;
|
|
17 my (@files, $sum, $pie, $repar, %ismapped, %isjunk, %repartition, @junk_ref, @all_ref );
|
|
18
|
|
19 srand();
|
|
20 print $report "----------------------------\n";
|
|
21 print $report "Create subgroups:\nfastq_in: $fin\ndirectory_out: $dir\nmismatches: $mis\nmismatches TE: $mis_TE\n";
|
|
22
|
|
23 mkdir $dir;
|
|
24 $dir = $dir.'/' unless $dir =~ /(.*)\/$/;
|
|
25
|
|
26 my $accept_miRNas = $dir.'miRNAs.fastq';
|
|
27 my $reject_miRNAs = $dir.'miRNAs_rejected.fastq';
|
|
28 my $sam_miRNAs = $dir.'miRNAs.sam';
|
|
29 my @tmp = get_hash_alignment($miRNAs, $mis, 1, 1, $accept_miRNas, $reject_miRNAs, $fin, $proc, 'miRNAs',$sam_miRNAs, $report);
|
|
30 my $mi = $tmp[0];
|
|
31 $repartition{'miRNAs'} = $mi;
|
|
32
|
|
33 my $sam = new String::Random;
|
|
34 $sam = $sam->randpattern("CCcccccc");
|
|
35 my $reject_rRNAs = $dir.'rRNAs_rejected.fastq';
|
|
36 @tmp = get_hash_alignment($rRNAs, $mis, 0, 1, 'NA', $reject_rRNAs, $reject_miRNAs, $proc, 'rRNAs', $sam, $report);
|
|
37 $repartition{'rRNAs'} = $tmp[0];
|
|
38 unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
|
|
39
|
|
40 $sam = new String::Random;
|
|
41 $sam = $sam->randpattern("CCcccccc");
|
|
42 my $reject_tRNAs = $dir.'tRNAs_rejected.fastq';
|
|
43 @tmp = get_hash_alignment($tRNAs, $mis, 0, 1, 'NA', $reject_tRNAs, $reject_rRNAs, $proc, 'tRNAs', $sam, $report);
|
|
44 $repartition{'tRNAs'} = $tmp[0];
|
|
45 unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
|
|
46
|
|
47 $sam = new String::Random;
|
|
48 $sam = $sam->randpattern("CCcccccc");
|
|
49 my $bonafide = $dir.'bonafide_reads.fastq';
|
|
50 @tmp = get_hash_alignment($snRNAs, $mis, 0, 1, 'NA', $bonafide, $reject_tRNAs, $proc, 'snRNAs', $sam, $report);
|
|
51 $repartition{'snRNAs'} = $tmp[0];
|
|
52 my $bo = $tmp[1];
|
|
53 unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
|
|
54
|
|
55 my $sam_transcripts = $dir.'transcripts.sam';
|
|
56 my $reject_transcripts = $dir.'rejected_transcripts.fastq';
|
|
57 @tmp = get_hash_alignment($transcripts, $mis, 0, 1, 'NA', $reject_transcripts, $bonafide, $proc, 'transcripts', $sam_transcripts, $report, $dir.'transcripts.fai');
|
|
58 $repartition{'transcripts'} = $tmp[0];
|
|
59
|
|
60
|
|
61 my $sam_TEs = $dir.'TEs.sam';
|
|
62 my $reject_TEs = $dir.'rejected.fastq';
|
|
63 @tmp = get_hash_alignment($TE, $mis_TE, 0, 1, 'NA', $reject_TEs, $reject_transcripts, $proc, 'TEs', $sam_TEs, $report, $dir.'TEs.fai' );
|
|
64 $repartition{'TEs'} = $tmp[0] ; $repartition{'others'} = $tmp[1];
|
|
65 unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
|
|
66 unlink $reject_transcripts;
|
|
67 unlink $reject_rRNAs;
|
|
68 unlink $reject_miRNAs;
|
|
69 unlink $reject_tRNAs;
|
|
70
|
|
71 #create repartition
|
|
72 my $pi = fastqSubgroups($bonafide, $dir, $min_si, $max_si, $min_pi, $max_pi );
|
|
73
|
|
74 open (my $re, '>'.$dir.'repartition.txt') || die "cannot open $dir repartition.txt $!\n";
|
|
75 print $re "type\tnumber\tpercentage\n";
|
|
76 $sum += $_ foreach values %repartition;
|
|
77 foreach my $k ( sort keys %repartition )
|
|
78 {
|
|
79 my $prct = 0;
|
|
80 $prct = $repartition{$k} / $sum * 100 if $sum != 0;
|
|
81 print $re "$k\t$repartition{$k}\t"; printf $re "%.2f\n",$prct;
|
|
82 }
|
|
83 return ( $bo, $mi, $pi);
|
|
84 }
|
|
85
|
|
86 sub fastqSubgroups
|
|
87 {
|
|
88 my ( $fastq, $output_directory, $min_si, $max_si, $min_pi, $max_pi ) = @_;
|
|
89 my $fastq_siRNA = $output_directory."siRNAs.fastq";
|
|
90 my $fastq_piRNA = $output_directory."piRNAs.fastq";
|
|
91
|
|
92 open my $fic, '<', $fastq || die "cannot open input file $! \n";
|
|
93 open my $si, '>', $fastq_siRNA || die "cannot open siRNA.fastq $! \n";
|
|
94 open my $pi, '>', $fastq_piRNA || die "cannot open piRNA.fastq $! \n";
|
|
95
|
|
96 my ($length, $cmp, $type, $siRNA_number, $miRNA_h_number, $piRNA_number, $not_pi_number) = (0,0,0,0,0,0,0);
|
|
97 my (@fastq) =(); my $seq_name;
|
|
98 my $out;
|
|
99 while(<$fic>)
|
|
100 {
|
|
101 chomp $_;
|
|
102 $cmp++;
|
|
103 if ($cmp == 1)
|
|
104 {
|
|
105 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ );
|
|
106 $type = 0; @fastq = ();
|
|
107 if ($_ =~ /^\@(.*)\s.*/) { $seq_name = $1;}
|
|
108 elsif ($_ =~ /^\@(.*)/) {$seq_name = $1;}
|
|
109 push(@fastq,$_);
|
|
110 }
|
|
111 elsif ($cmp == 2)
|
|
112 {
|
|
113 #die "unrecognized symbol at line $cmp\n" unless $_ =~ /[atcgATCGnN]+/;
|
|
114 push(@fastq,$_);
|
|
115 $length = length($_);
|
|
116 $type = 0;
|
|
117 if ( $length >= $min_si && $length <= $max_si )
|
|
118 {
|
|
119 $type = 2;
|
|
120 $siRNA_number++;
|
|
121 }
|
|
122 if ($length >= $min_pi && $length <= $max_pi )
|
|
123 {
|
|
124 $type += 4;
|
|
125 $piRNA_number++;
|
|
126 }
|
|
127 }
|
|
128 elsif ($cmp == 3 )
|
|
129 {
|
|
130 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
|
|
131 push(@fastq,$_);
|
|
132 }
|
|
133 elsif ($cmp == 4 )
|
|
134 {
|
|
135 push(@fastq,$_);
|
|
136 $cmp = 0;
|
|
137 if ($type != 0)
|
|
138 {
|
|
139 if ($type & 4 ) { foreach my $t (@fastq) { print $pi $t."\n";} }
|
|
140 if ($type & 2 ) { foreach my $t (@fastq) { print $si $t."\n";} }
|
|
141 }
|
|
142 }
|
|
143 }
|
|
144 close $fic;
|
|
145 close $si; close $pi;
|
|
146 return ($piRNA_number);
|
|
147 }
|
|
148
|
|
149 1;
|