Mercurial > repos > brasset_jensen > srnapipe
comparison bin/subgroups.pm @ 61:9185ca0a7b43 draft
Updated package according to recommendations.
author | pierre.pouchin |
---|---|
date | Wed, 16 Jan 2019 08:18:13 -0500 |
parents | 4bc00caa60b4 |
children |
comparison
equal
deleted
inserted
replaced
60:9645d995fb3c | 61:9185ca0a7b43 |
---|---|
12 use lib $FindBin::Bin; | 12 use lib $FindBin::Bin; |
13 use align qw ( get_hash_alignment ); | 13 use align qw ( get_hash_alignment ); |
14 | 14 |
15 sub subgroups | 15 sub subgroups |
16 { | 16 { |
17 my ($fin, $dir, $mis, $mis_TE, $proc, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $transcripts, $TE, $min_si, $max_si, $min_pi, $max_pi, $report ) = @_; | 17 my ($fin, $dir, $mis, $mis_TE, $proc, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $transcripts, $TE, $min_si, $max_si, $min_pi, $max_pi, $report ) = @_; |
18 my (@files, $sum, $pie, $repar, %ismapped, %isjunk, %repartition, @junk_ref, @all_ref ); | 18 my (@files, $sum, $pie, $repar, %ismapped, %isjunk, %repartition, @junk_ref, @all_ref ); |
19 | 19 |
20 srand(); | 20 srand(); |
21 print $report "----------------------------\n"; | 21 print $report "----------------------------\n"; |
22 print $report "Create subgroups:\nfastq_in: $fin\ndirectory_out: $dir\nmismatches: $mis\nmismatches TE: $mis_TE\n"; | 22 print $report "Create subgroups:\nfastq_in: $fin\ndirectory_out: $dir\nmismatches: $mis\nmismatches TE: $mis_TE\n"; |
23 | 23 |
24 mkdir $dir; | 24 mkdir $dir; |
25 $dir = $dir.'/' unless $dir =~ /(.*)\/$/; | 25 $dir = $dir.'/' unless $dir =~ /(.*)\/$/; |
26 | 26 |
27 my $accept_miRNas = $dir.'miRNAs.fastq'; | 27 my $accept_miRNas = $dir.'miRNAs.fastq'; |
28 my $reject_miRNAs = $dir.'miRNAs_rejected.fastq'; | 28 my $reject_miRNAs = $dir.'miRNAs_rejected.fastq'; |
29 my $sam_miRNAs = $dir.'miRNAs.sam'; | 29 my $sam_miRNAs = $dir.'miRNAs.sam'; |
30 my @tmp = get_hash_alignment($miRNAs, $mis, 1, 1, $accept_miRNas, $reject_miRNAs, $fin, $proc, 'miRNAs',$sam_miRNAs, $report); | 30 my @tmp = get_hash_alignment($miRNAs, $mis, 1, 1, $accept_miRNas, $reject_miRNAs, $fin, $proc, 'miRNAs',$sam_miRNAs, $report); |
31 my $mi = $tmp[0]; my $sam = ''; | 31 my $mi = $tmp[0]; my $sam = ''; |
32 $repartition{'miRNAs'} = $mi; | 32 $repartition{'miRNAs'} = $mi; |
33 | 33 |
34 | 34 |
35 my $reject_rRNAs = $dir.'rRNAs_rejected.fastq'; | 35 my $reject_rRNAs = $dir.'rRNAs_rejected.fastq'; |
36 if ( $rRNAs eq 'None') | 36 if ( $rRNAs eq 'None') |
37 { | 37 { |
38 move($reject_miRNAs,$reject_rRNAs); | 38 move($reject_miRNAs,$reject_rRNAs); |
39 } | 39 } |
40 else | 40 else |
41 { | 41 { |
42 $sam = new String::Random; | 42 $sam = new String::Random; |
43 $sam = $sam->randpattern("CCcccccc"); | 43 $sam = $sam->randpattern("CCcccccc"); |
44 @tmp = get_hash_alignment($rRNAs, $mis, 0, 1, 'NA', $reject_rRNAs, $reject_miRNAs, $proc, 'rRNAs', $sam, $report); | 44 @tmp = get_hash_alignment($rRNAs, $mis, 0, 1, 'NA', $reject_rRNAs, $reject_miRNAs, $proc, 'rRNAs', $sam, $report); |
45 $repartition{'rRNAs'} = $tmp[0]; | 45 $repartition{'rRNAs'} = $tmp[0]; |
46 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; | 46 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; |
47 } | 47 } |
48 | 48 |
49 my $reject_tRNAs = $dir.'rRNAs_rejected.fastq'; | 49 my $reject_tRNAs = $dir.'rRNAs_rejected.fastq'; |
50 if ( $rRNAs eq 'None') | 50 if ( $rRNAs eq 'None') |
51 { | 51 { |
52 move($reject_rRNAs,$reject_tRNAs); | 52 move($reject_rRNAs,$reject_tRNAs); |
53 } | 53 } |
54 else | 54 else |
55 { | 55 { |
56 $sam = new String::Random; | 56 $sam = new String::Random; |
57 $sam = $sam->randpattern("CCcccccc"); | 57 $sam = $sam->randpattern("CCcccccc"); |
58 @tmp = get_hash_alignment($tRNAs, $mis, 0, 1, 'NA', $reject_tRNAs, $reject_rRNAs, $proc, 'tRNAs', $sam, $report); | 58 @tmp = get_hash_alignment($tRNAs, $mis, 0, 1, 'NA', $reject_tRNAs, $reject_rRNAs, $proc, 'tRNAs', $sam, $report); |
59 $repartition{'tRNAs'} = $tmp[0]; | 59 $repartition{'tRNAs'} = $tmp[0]; |
60 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; | 60 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; |
61 } | 61 } |
62 | 62 |
63 | 63 |
64 my $bonafide = $dir.'bonafide_reads.fastq'; | 64 my $bonafide = $dir.'bonafide_reads.fastq'; |
65 if ( $rRNAs eq 'None') | 65 if ( $rRNAs eq 'None') |
66 { | 66 { |
67 move($reject_tRNAs,$bonafide); | 67 move($reject_tRNAs,$bonafide); |
68 } | 68 } |
69 else | 69 else |
70 { | 70 { |
71 $sam = new String::Random; | 71 $sam = new String::Random; |
72 $sam = $sam->randpattern("CCcccccc"); | 72 $sam = $sam->randpattern("CCcccccc"); |
73 @tmp = get_hash_alignment($snRNAs, $mis, 0, 1, 'NA', $bonafide, $reject_tRNAs, $proc, 'snRNAs', $sam, $report); | 73 @tmp = get_hash_alignment($snRNAs, $mis, 0, 1, 'NA', $bonafide, $reject_tRNAs, $proc, 'snRNAs', $sam, $report); |
74 $repartition{'snRNAs'} = $tmp[0]; | 74 $repartition{'snRNAs'} = $tmp[0]; |
75 | 75 |
76 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; | 76 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; |
77 } | 77 } |
78 my $bo = $tmp[1]; | 78 my $bo = $tmp[1]; |
79 | 79 |
80 my $sam_transcripts = $dir.'transcripts.sam'; | 80 my $sam_transcripts = $dir.'transcripts.sam'; |
81 my $reject_transcripts = $dir.'rejected_transcripts.fastq'; | 81 my $reject_transcripts = $dir.'rejected_transcripts.fastq'; |
82 @tmp = get_hash_alignment($transcripts, $mis, 0, 1, 'NA', $reject_transcripts, $bonafide, $proc, 'transcripts', $sam_transcripts, $report, $dir.'transcripts.fai'); | 82 @tmp = get_hash_alignment($transcripts, $mis, 0, 1, 'NA', $reject_transcripts, $bonafide, $proc, 'transcripts', $sam_transcripts, $report, $dir.'transcripts.fai'); |
83 $repartition{'transcripts'} = $tmp[0]; | 83 $repartition{'transcripts'} = $tmp[0]; |
84 | 84 |
85 | 85 |
86 my $sam_TEs = $dir.'TEs.sam'; | 86 my $sam_TEs = $dir.'TEs.sam'; |
87 my $reject_TEs = $dir.'rejected.fastq'; | 87 my $reject_TEs = $dir.'rejected.fastq'; |
88 @tmp = get_hash_alignment($TE, $mis_TE, 0, 1, 'NA', $reject_TEs, $reject_transcripts, $proc, 'TEs', $sam_TEs, $report, $dir.'TEs.fai' ); | 88 @tmp = get_hash_alignment($TE, $mis_TE, 0, 1, 'NA', $reject_TEs, $reject_transcripts, $proc, 'TEs', $sam_TEs, $report, $dir.'TEs.fai' ); |
89 $repartition{'TEs'} = $tmp[0] ; $repartition{'others'} = $tmp[1]; | 89 $repartition{'TEs'} = $tmp[0] ; $repartition{'others'} = $tmp[1]; |
90 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; | 90 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; |
91 unlink $reject_transcripts; | 91 unlink $reject_transcripts; |
92 unlink $reject_rRNAs; | 92 unlink $reject_rRNAs; |
93 unlink $reject_miRNAs; | 93 unlink $reject_miRNAs; |
94 unlink $reject_tRNAs; | 94 unlink $reject_tRNAs; |
95 | 95 |
96 #create repartition | 96 #create repartition |
97 my $pi = fastqSubgroups($bonafide, $dir, $min_si, $max_si, $min_pi, $max_pi ); | 97 my $pi = fastqSubgroups($bonafide, $dir, $min_si, $max_si, $min_pi, $max_pi ); |
98 | 98 |
99 open (my $re, '>'.$dir.'repartition.txt') || die "cannot open $dir repartition.txt $!\n"; | 99 open (my $re, '>'.$dir.'repartition.txt') || die "cannot open $dir repartition.txt $!\n"; |
100 print $re "type\tnumber\tpercentage\n"; | 100 print $re "type\tnumber\tpercentage\n"; |
101 $sum += $_ foreach values %repartition; | 101 $sum += $_ foreach values %repartition; |
102 foreach my $k ( sort keys %repartition ) | 102 foreach my $k ( sort keys %repartition ) |
103 { | 103 { |
104 my $prct = 0; | 104 my $prct = 0; |
105 $prct = $repartition{$k} / $sum * 100 if $sum != 0; | 105 $prct = $repartition{$k} / $sum * 100 if $sum != 0; |
106 print $re "$k\t$repartition{$k}\t"; printf $re "%.2f\n",$prct; | 106 print $re "$k\t$repartition{$k}\t"; printf $re "%.2f\n",$prct; |
107 } | 107 } |
108 return ( $bo, $mi, $pi); | 108 return ( $bo, $mi, $pi); |
109 } | 109 } |
110 | 110 |
111 sub fastqSubgroups | 111 sub fastqSubgroups |
112 { | 112 { |
113 my ( $fastq, $output_directory, $min_si, $max_si, $min_pi, $max_pi ) = @_; | 113 my ( $fastq, $output_directory, $min_si, $max_si, $min_pi, $max_pi ) = @_; |
114 my $fastq_siRNA = $output_directory."siRNAs.fastq"; | 114 my $fastq_siRNA = $output_directory."siRNAs.fastq"; |
115 my $fastq_piRNA = $output_directory."piRNAs.fastq"; | 115 my $fastq_piRNA = $output_directory."piRNAs.fastq"; |
116 | 116 |
117 open my $fic, '<', $fastq || die "cannot open input file $! \n"; | 117 open my $fic, '<', $fastq || die "cannot open input file $! \n"; |
118 open my $si, '>', $fastq_siRNA || die "cannot open siRNA.fastq $! \n"; | 118 open my $si, '>', $fastq_siRNA || die "cannot open siRNA.fastq $! \n"; |
119 open my $pi, '>', $fastq_piRNA || die "cannot open piRNA.fastq $! \n"; | 119 open my $pi, '>', $fastq_piRNA || die "cannot open piRNA.fastq $! \n"; |
120 | 120 |
121 my ($length, $cmp, $type, $siRNA_number, $miRNA_h_number, $piRNA_number, $not_pi_number) = (0,0,0,0,0,0,0); | 121 my ($length, $cmp, $type, $siRNA_number, $miRNA_h_number, $piRNA_number, $not_pi_number) = (0,0,0,0,0,0,0); |
122 my (@fastq) =(); my $seq_name; | 122 my (@fastq) =(); my $seq_name; |
123 my $out; | 123 my $out; |
124 while(<$fic>) | 124 while(<$fic>) |
125 { | |
126 chomp $_; | |
127 $cmp++; | |
128 if ($cmp == 1) | |
129 { | 125 { |
130 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); | 126 chomp $_; |
131 $type = 0; @fastq = (); | 127 $cmp++; |
132 if ($_ =~ /^\@(.*)\s.*/) { $seq_name = $1;} | 128 if ($cmp == 1) |
133 elsif ($_ =~ /^\@(.*)/) {$seq_name = $1;} | 129 { |
134 push(@fastq,$_); | 130 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); |
131 $type = 0; @fastq = (); | |
132 if ($_ =~ /^\@(.*)\s.*/) { $seq_name = $1;} | |
133 elsif ($_ =~ /^\@(.*)/) {$seq_name = $1;} | |
134 push(@fastq,$_); | |
135 } | |
136 elsif ($cmp == 2) | |
137 { | |
138 #die "unrecognized symbol at line $cmp\n" unless $_ =~ /[atcgATCGnN]+/; | |
139 push(@fastq,$_); | |
140 $length = length($_); | |
141 $type = 0; | |
142 if ( $length >= $min_si && $length <= $max_si ) | |
143 { | |
144 $type = 2; | |
145 $siRNA_number++; | |
146 } | |
147 if ($length >= $min_pi && $length <= $max_pi ) | |
148 { | |
149 $type += 4; | |
150 $piRNA_number++; | |
151 } | |
152 } | |
153 elsif ($cmp == 3 ) | |
154 { | |
155 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/; | |
156 push(@fastq,$_); | |
157 } | |
158 elsif ($cmp == 4 ) | |
159 { | |
160 push(@fastq,$_); | |
161 $cmp = 0; | |
162 if ($type != 0) | |
163 { | |
164 if ($type & 4 ) { foreach my $t (@fastq) { print $pi $t."\n";} } | |
165 if ($type & 2 ) { foreach my $t (@fastq) { print $si $t."\n";} } | |
166 } | |
167 } | |
135 } | 168 } |
136 elsif ($cmp == 2) | 169 close $fic; |
137 { | 170 close $si; close $pi; |
138 #die "unrecognized symbol at line $cmp\n" unless $_ =~ /[atcgATCGnN]+/; | 171 return ($piRNA_number); |
139 push(@fastq,$_); | |
140 $length = length($_); | |
141 $type = 0; | |
142 if ( $length >= $min_si && $length <= $max_si ) | |
143 { | |
144 $type = 2; | |
145 $siRNA_number++; | |
146 } | |
147 if ($length >= $min_pi && $length <= $max_pi ) | |
148 { | |
149 $type += 4; | |
150 $piRNA_number++; | |
151 } | |
152 } | |
153 elsif ($cmp == 3 ) | |
154 { | |
155 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/; | |
156 push(@fastq,$_); | |
157 } | |
158 elsif ($cmp == 4 ) | |
159 { | |
160 push(@fastq,$_); | |
161 $cmp = 0; | |
162 if ($type != 0) | |
163 { | |
164 if ($type & 4 ) { foreach my $t (@fastq) { print $pi $t."\n";} } | |
165 if ($type & 2 ) { foreach my $t (@fastq) { print $si $t."\n";} } | |
166 } | |
167 } | |
168 } | |
169 close $fic; | |
170 close $si; close $pi; | |
171 return ($piRNA_number); | |
172 } | 172 } |
173 | 173 |
174 1; | 174 1; |