comparison bin/subgroups.pm @ 61:9185ca0a7b43 draft

Updated package according to recommendations.
author pierre.pouchin
date Wed, 16 Jan 2019 08:18:13 -0500
parents 4bc00caa60b4
children
comparison
equal deleted inserted replaced
60:9645d995fb3c 61:9185ca0a7b43
12 use lib $FindBin::Bin; 12 use lib $FindBin::Bin;
13 use align qw ( get_hash_alignment ); 13 use align qw ( get_hash_alignment );
14 14
15 sub subgroups 15 sub subgroups
16 { 16 {
17 my ($fin, $dir, $mis, $mis_TE, $proc, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $transcripts, $TE, $min_si, $max_si, $min_pi, $max_pi, $report ) = @_; 17 my ($fin, $dir, $mis, $mis_TE, $proc, $tRNAs, $rRNAs, $snRNAs, $miRNAs, $transcripts, $TE, $min_si, $max_si, $min_pi, $max_pi, $report ) = @_;
18 my (@files, $sum, $pie, $repar, %ismapped, %isjunk, %repartition, @junk_ref, @all_ref ); 18 my (@files, $sum, $pie, $repar, %ismapped, %isjunk, %repartition, @junk_ref, @all_ref );
19 19
20 srand(); 20 srand();
21 print $report "----------------------------\n"; 21 print $report "----------------------------\n";
22 print $report "Create subgroups:\nfastq_in: $fin\ndirectory_out: $dir\nmismatches: $mis\nmismatches TE: $mis_TE\n"; 22 print $report "Create subgroups:\nfastq_in: $fin\ndirectory_out: $dir\nmismatches: $mis\nmismatches TE: $mis_TE\n";
23 23
24 mkdir $dir; 24 mkdir $dir;
25 $dir = $dir.'/' unless $dir =~ /(.*)\/$/; 25 $dir = $dir.'/' unless $dir =~ /(.*)\/$/;
26 26
27 my $accept_miRNas = $dir.'miRNAs.fastq'; 27 my $accept_miRNas = $dir.'miRNAs.fastq';
28 my $reject_miRNAs = $dir.'miRNAs_rejected.fastq'; 28 my $reject_miRNAs = $dir.'miRNAs_rejected.fastq';
29 my $sam_miRNAs = $dir.'miRNAs.sam'; 29 my $sam_miRNAs = $dir.'miRNAs.sam';
30 my @tmp = get_hash_alignment($miRNAs, $mis, 1, 1, $accept_miRNas, $reject_miRNAs, $fin, $proc, 'miRNAs',$sam_miRNAs, $report); 30 my @tmp = get_hash_alignment($miRNAs, $mis, 1, 1, $accept_miRNas, $reject_miRNAs, $fin, $proc, 'miRNAs',$sam_miRNAs, $report);
31 my $mi = $tmp[0]; my $sam = ''; 31 my $mi = $tmp[0]; my $sam = '';
32 $repartition{'miRNAs'} = $mi; 32 $repartition{'miRNAs'} = $mi;
33 33
34 34
35 my $reject_rRNAs = $dir.'rRNAs_rejected.fastq'; 35 my $reject_rRNAs = $dir.'rRNAs_rejected.fastq';
36 if ( $rRNAs eq 'None') 36 if ( $rRNAs eq 'None')
37 { 37 {
38 move($reject_miRNAs,$reject_rRNAs); 38 move($reject_miRNAs,$reject_rRNAs);
39 } 39 }
40 else 40 else
41 { 41 {
42 $sam = new String::Random; 42 $sam = new String::Random;
43 $sam = $sam->randpattern("CCcccccc"); 43 $sam = $sam->randpattern("CCcccccc");
44 @tmp = get_hash_alignment($rRNAs, $mis, 0, 1, 'NA', $reject_rRNAs, $reject_miRNAs, $proc, 'rRNAs', $sam, $report); 44 @tmp = get_hash_alignment($rRNAs, $mis, 0, 1, 'NA', $reject_rRNAs, $reject_miRNAs, $proc, 'rRNAs', $sam, $report);
45 $repartition{'rRNAs'} = $tmp[0]; 45 $repartition{'rRNAs'} = $tmp[0];
46 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; 46 unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
47 } 47 }
48 48
49 my $reject_tRNAs = $dir.'rRNAs_rejected.fastq'; 49 my $reject_tRNAs = $dir.'rRNAs_rejected.fastq';
50 if ( $rRNAs eq 'None') 50 if ( $rRNAs eq 'None')
51 { 51 {
52 move($reject_rRNAs,$reject_tRNAs); 52 move($reject_rRNAs,$reject_tRNAs);
53 } 53 }
54 else 54 else
55 { 55 {
56 $sam = new String::Random; 56 $sam = new String::Random;
57 $sam = $sam->randpattern("CCcccccc"); 57 $sam = $sam->randpattern("CCcccccc");
58 @tmp = get_hash_alignment($tRNAs, $mis, 0, 1, 'NA', $reject_tRNAs, $reject_rRNAs, $proc, 'tRNAs', $sam, $report); 58 @tmp = get_hash_alignment($tRNAs, $mis, 0, 1, 'NA', $reject_tRNAs, $reject_rRNAs, $proc, 'tRNAs', $sam, $report);
59 $repartition{'tRNAs'} = $tmp[0]; 59 $repartition{'tRNAs'} = $tmp[0];
60 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; 60 unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
61 } 61 }
62 62
63 63
64 my $bonafide = $dir.'bonafide_reads.fastq'; 64 my $bonafide = $dir.'bonafide_reads.fastq';
65 if ( $rRNAs eq 'None') 65 if ( $rRNAs eq 'None')
66 { 66 {
67 move($reject_tRNAs,$bonafide); 67 move($reject_tRNAs,$bonafide);
68 } 68 }
69 else 69 else
70 { 70 {
71 $sam = new String::Random; 71 $sam = new String::Random;
72 $sam = $sam->randpattern("CCcccccc"); 72 $sam = $sam->randpattern("CCcccccc");
73 @tmp = get_hash_alignment($snRNAs, $mis, 0, 1, 'NA', $bonafide, $reject_tRNAs, $proc, 'snRNAs', $sam, $report); 73 @tmp = get_hash_alignment($snRNAs, $mis, 0, 1, 'NA', $bonafide, $reject_tRNAs, $proc, 'snRNAs', $sam, $report);
74 $repartition{'snRNAs'} = $tmp[0]; 74 $repartition{'snRNAs'} = $tmp[0];
75 75
76 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; 76 unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
77 } 77 }
78 my $bo = $tmp[1]; 78 my $bo = $tmp[1];
79 79
80 my $sam_transcripts = $dir.'transcripts.sam'; 80 my $sam_transcripts = $dir.'transcripts.sam';
81 my $reject_transcripts = $dir.'rejected_transcripts.fastq'; 81 my $reject_transcripts = $dir.'rejected_transcripts.fastq';
82 @tmp = get_hash_alignment($transcripts, $mis, 0, 1, 'NA', $reject_transcripts, $bonafide, $proc, 'transcripts', $sam_transcripts, $report, $dir.'transcripts.fai'); 82 @tmp = get_hash_alignment($transcripts, $mis, 0, 1, 'NA', $reject_transcripts, $bonafide, $proc, 'transcripts', $sam_transcripts, $report, $dir.'transcripts.fai');
83 $repartition{'transcripts'} = $tmp[0]; 83 $repartition{'transcripts'} = $tmp[0];
84 84
85 85
86 my $sam_TEs = $dir.'TEs.sam'; 86 my $sam_TEs = $dir.'TEs.sam';
87 my $reject_TEs = $dir.'rejected.fastq'; 87 my $reject_TEs = $dir.'rejected.fastq';
88 @tmp = get_hash_alignment($TE, $mis_TE, 0, 1, 'NA', $reject_TEs, $reject_transcripts, $proc, 'TEs', $sam_TEs, $report, $dir.'TEs.fai' ); 88 @tmp = get_hash_alignment($TE, $mis_TE, 0, 1, 'NA', $reject_TEs, $reject_transcripts, $proc, 'TEs', $sam_TEs, $report, $dir.'TEs.fai' );
89 $repartition{'TEs'} = $tmp[0] ; $repartition{'others'} = $tmp[1]; 89 $repartition{'TEs'} = $tmp[0] ; $repartition{'others'} = $tmp[1];
90 unlink $sam, $sam.'_aln.err', $sam.'_samse.err'; 90 unlink $sam, $sam.'_aln.err', $sam.'_samse.err';
91 unlink $reject_transcripts; 91 unlink $reject_transcripts;
92 unlink $reject_rRNAs; 92 unlink $reject_rRNAs;
93 unlink $reject_miRNAs; 93 unlink $reject_miRNAs;
94 unlink $reject_tRNAs; 94 unlink $reject_tRNAs;
95 95
96 #create repartition 96 #create repartition
97 my $pi = fastqSubgroups($bonafide, $dir, $min_si, $max_si, $min_pi, $max_pi ); 97 my $pi = fastqSubgroups($bonafide, $dir, $min_si, $max_si, $min_pi, $max_pi );
98 98
99 open (my $re, '>'.$dir.'repartition.txt') || die "cannot open $dir repartition.txt $!\n"; 99 open (my $re, '>'.$dir.'repartition.txt') || die "cannot open $dir repartition.txt $!\n";
100 print $re "type\tnumber\tpercentage\n"; 100 print $re "type\tnumber\tpercentage\n";
101 $sum += $_ foreach values %repartition; 101 $sum += $_ foreach values %repartition;
102 foreach my $k ( sort keys %repartition ) 102 foreach my $k ( sort keys %repartition )
103 { 103 {
104 my $prct = 0; 104 my $prct = 0;
105 $prct = $repartition{$k} / $sum * 100 if $sum != 0; 105 $prct = $repartition{$k} / $sum * 100 if $sum != 0;
106 print $re "$k\t$repartition{$k}\t"; printf $re "%.2f\n",$prct; 106 print $re "$k\t$repartition{$k}\t"; printf $re "%.2f\n",$prct;
107 } 107 }
108 return ( $bo, $mi, $pi); 108 return ( $bo, $mi, $pi);
109 } 109 }
110 110
111 sub fastqSubgroups 111 sub fastqSubgroups
112 { 112 {
113 my ( $fastq, $output_directory, $min_si, $max_si, $min_pi, $max_pi ) = @_; 113 my ( $fastq, $output_directory, $min_si, $max_si, $min_pi, $max_pi ) = @_;
114 my $fastq_siRNA = $output_directory."siRNAs.fastq"; 114 my $fastq_siRNA = $output_directory."siRNAs.fastq";
115 my $fastq_piRNA = $output_directory."piRNAs.fastq"; 115 my $fastq_piRNA = $output_directory."piRNAs.fastq";
116 116
117 open my $fic, '<', $fastq || die "cannot open input file $! \n"; 117 open my $fic, '<', $fastq || die "cannot open input file $! \n";
118 open my $si, '>', $fastq_siRNA || die "cannot open siRNA.fastq $! \n"; 118 open my $si, '>', $fastq_siRNA || die "cannot open siRNA.fastq $! \n";
119 open my $pi, '>', $fastq_piRNA || die "cannot open piRNA.fastq $! \n"; 119 open my $pi, '>', $fastq_piRNA || die "cannot open piRNA.fastq $! \n";
120 120
121 my ($length, $cmp, $type, $siRNA_number, $miRNA_h_number, $piRNA_number, $not_pi_number) = (0,0,0,0,0,0,0); 121 my ($length, $cmp, $type, $siRNA_number, $miRNA_h_number, $piRNA_number, $not_pi_number) = (0,0,0,0,0,0,0);
122 my (@fastq) =(); my $seq_name; 122 my (@fastq) =(); my $seq_name;
123 my $out; 123 my $out;
124 while(<$fic>) 124 while(<$fic>)
125 {
126 chomp $_;
127 $cmp++;
128 if ($cmp == 1)
129 { 125 {
130 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); 126 chomp $_;
131 $type = 0; @fastq = (); 127 $cmp++;
132 if ($_ =~ /^\@(.*)\s.*/) { $seq_name = $1;} 128 if ($cmp == 1)
133 elsif ($_ =~ /^\@(.*)/) {$seq_name = $1;} 129 {
134 push(@fastq,$_); 130 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ );
131 $type = 0; @fastq = ();
132 if ($_ =~ /^\@(.*)\s.*/) { $seq_name = $1;}
133 elsif ($_ =~ /^\@(.*)/) {$seq_name = $1;}
134 push(@fastq,$_);
135 }
136 elsif ($cmp == 2)
137 {
138 #die "unrecognized symbol at line $cmp\n" unless $_ =~ /[atcgATCGnN]+/;
139 push(@fastq,$_);
140 $length = length($_);
141 $type = 0;
142 if ( $length >= $min_si && $length <= $max_si )
143 {
144 $type = 2;
145 $siRNA_number++;
146 }
147 if ($length >= $min_pi && $length <= $max_pi )
148 {
149 $type += 4;
150 $piRNA_number++;
151 }
152 }
153 elsif ($cmp == 3 )
154 {
155 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
156 push(@fastq,$_);
157 }
158 elsif ($cmp == 4 )
159 {
160 push(@fastq,$_);
161 $cmp = 0;
162 if ($type != 0)
163 {
164 if ($type & 4 ) { foreach my $t (@fastq) { print $pi $t."\n";} }
165 if ($type & 2 ) { foreach my $t (@fastq) { print $si $t."\n";} }
166 }
167 }
135 } 168 }
136 elsif ($cmp == 2) 169 close $fic;
137 { 170 close $si; close $pi;
138 #die "unrecognized symbol at line $cmp\n" unless $_ =~ /[atcgATCGnN]+/; 171 return ($piRNA_number);
139 push(@fastq,$_);
140 $length = length($_);
141 $type = 0;
142 if ( $length >= $min_si && $length <= $max_si )
143 {
144 $type = 2;
145 $siRNA_number++;
146 }
147 if ($length >= $min_pi && $length <= $max_pi )
148 {
149 $type += 4;
150 $piRNA_number++;
151 }
152 }
153 elsif ($cmp == 3 )
154 {
155 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
156 push(@fastq,$_);
157 }
158 elsif ($cmp == 4 )
159 {
160 push(@fastq,$_);
161 $cmp = 0;
162 if ($type != 0)
163 {
164 if ($type & 4 ) { foreach my $t (@fastq) { print $pi $t."\n";} }
165 if ($type & 2 ) { foreach my $t (@fastq) { print $si $t."\n";} }
166 }
167 }
168 }
169 close $fic;
170 close $si; close $pi;
171 return ($piRNA_number);
172 } 172 }
173 173
174 1; 174 1;