comparison script/CLIFinder.pl @ 15:6d7caeea1e74 draft

"planemo upload for repository https://github.com/GReD-Clermont/CLIFinder/ commit 2a3a263fc5a2123e6988872d23057f8797231a76"
author clifinder
date Wed, 12 Feb 2020 05:32:38 -0500
parents feecd33c8390
children 40695c10cfbd
comparison
equal deleted inserted replaced
14:feecd33c8390 15:6d7caeea1e74
43 "size_read:i" => \(my $size_reads = 100), 43 "size_read:i" => \(my $size_reads = 100),
44 "BDir:i" => \(my $Bdir = 0), 44 "BDir:i" => \(my $Bdir = 0),
45 "min_L1:i" => \(my $min_L1 = 50), 45 "min_L1:i" => \(my $min_L1 = 50),
46 "mis_L1:i" => \(my $mis_L1 = 2), 46 "mis_L1:i" => \(my $mis_L1 = 2),
47 "threads:i" => \(my $threads = 1), 47 "threads:i" => \(my $threads = 1),
48 'help' => sub { HelpMessage(0); }, 48 "help" => sub { HelpMessage(0); },
49 'version' => sub { VersionMessage(0); }, 49 "version" => sub { VersionMessage(0); },
50 ) or HelpMessage(1); 50 ) or HelpMessage(1);
51 51
52 HelpMessage(1) unless @fastq1 && @fastq2 && @name && defined($TE) && defined($ref) && defined($rmsk_source) && defined($refseq) && defined($html) && defined($html_repertory); 52 HelpMessage(1) unless @fastq1 && @fastq2 && @name && defined($TE) && defined($ref) && defined($rmsk_source) && defined($refseq) && defined($html) && defined($html_repertory);
53 53
54 my $iprct = 100 - (($prct / $size_reads)*100) ; 54 my $iprct = 100 - (($prct / $size_reads)*100) ;
93 { 93 {
94 ################################################### 94 ###################################################
95 # Paired end mapping against L1 promoter sequences# 95 # Paired end mapping against L1 promoter sequences#
96 ################################################### 96 ###################################################
97 97
98 print STDOUT "Alignement of $name[$tabR] to L1\n"; 98 print STDOUT "Alignment of $name[$tabR] to L1\n";
99 my $sam = $html_repertory.'/'.$name[$tabR]."_L1.sam"; push(@garbage, $sam); 99 my $sam = $html_repertory.'/'.$name[$tabR]."_L1.sam"; push(@garbage, $sam);
100 align_paired( $TE, $fastq1[$tabR], $fastq2[$tabR], $sam, $threads, $mis_auth); 100 align_paired( $TE, $fastq1[$tabR], $fastq2[$tabR], $sam, $threads, $mis_auth);
101 print STDOUT "Alignement done\n"; 101 print STDOUT "Alignment done\n";
102 102
103 ################################################## 103 ##################################################
104 # Creation of two fastq for paired halfed mapped:# 104 # Creation of two fastq for paired halfed mapped:#
105 # - _1 correspond to sequences mapped to L1 # 105 # - _1 correspond to sequences mapped to L1 #
106 # - _2 correspond to sequences unmapped to L1 # 106 # - _2 correspond to sequences unmapped to L1 #
107 ################################################## 107 ##################################################
108 108
109 print STDOUT "Getting pairs with one mate matched to L1 and the other mate undetected by repeatmasker as a repeat sequence\n"; 109 print STDOUT "Getting pairs with one mate matched to L1 and the other mate undetected by repeatmasker as a repeat sequence\n";
110
111 my $out_ASP_1 = $html_repertory.'/'.$name[$tabR]."_1.fastq"; push(@garbage, $out_ASP_1); 110 my $out_ASP_1 = $html_repertory.'/'.$name[$tabR]."_1.fastq"; push(@garbage, $out_ASP_1);
112 my $out_ASP_2 = $html_repertory.'/'.$name[$tabR]."_2.fastq"; push(@garbage, $out_ASP_2); 111 my $out_ASP_2 = $html_repertory.'/'.$name[$tabR]."_2.fastq"; push(@garbage, $out_ASP_2);
113 112
114 ##split mate that matched to L1 and others## 113 ##split mate that matched to L1 and others##
115 my ($ASP_readsHashR, $half_num_out) = get_half($sam, $mis_L1, $min_L1, $Bdir); 114 my ($ASP_readsHashR, $half_num_out) = get_half($sam, $mis_L1, $min_L1, $Bdir);
116 # $ASP_reads{$line[0]}[0] mapped - $ASP_reads{$line[0]}[1] unmapped 115 print STDOUT "Number of half mapped pairs: $half_num_out\n";
117 116
118 ##pairs obtained after repeatmasker on the other mate## 117 ##pairs obtained after repeatmasker on the other mate##
119 my $left = sort_out($threads, $out_ASP_1, $out_ASP_2, $dprct, $eprct, $ASP_readsHashR, $html_repertory); 118 my $left = sort_out($threads, $out_ASP_1, $out_ASP_2, $dprct, $eprct, $ASP_readsHashR, $html_repertory);
120
121 print STDOUT "Number of half mapped pairs : $half_num_out\n";
122 print STDOUT "Number of pairs after repeatmasker: $left\n"; 119 print STDOUT "Number of pairs after repeatmasker: $left\n";
123 120
124 ################################################## 121 ##################################################
125 # Alignment of halfed mapped pairs on genome # 122 # Alignment of halfed mapped pairs on genome #
126 ################################################## 123 ##################################################
166 163
167 my (%frag_uni, @second_R, @second_exp, @results); 164 my (%frag_uni, @second_R, @second_exp, @results);
168 my $merge_target = $html_repertory.'/target_merged.bed'; push(@garbage, $merge_target); 165 my $merge_target = $html_repertory.'/target_merged.bed'; push(@garbage, $merge_target);
169 my $merge = $html_repertory.'/merged.bed'; push(@garbage, $merge); 166 my $merge = $html_repertory.'/merged.bed'; push(@garbage, $merge);
170 167
171 open (my $mT, ">".$merge_target) || die "cannot open $merge_target\n"; 168 open (my $mT, ">".$merge_target) || die "Cannot open $merge_target\n";
172 open (my $m, ">".$merge) || die "cannot open $merge\n"; 169 open (my $m, ">".$merge) || die "Cannot open $merge\n";
173 open (my $in, $repMsecond) || die "cannot open secondM\n"; 170 open (my $in, $repMsecond) || die "Cannot open $repMsecond\n";
174 my $cmp = 0; 171 my $cmp = 0;
175 while (<$in>) 172 while (<$in>)
176 { 173 {
177 chomp $_; 174 chomp $_;
178 my @tmp = (0) x scalar(@fastq1); 175 my @tmp = (0) x scalar(@fastq1);
183 $cmp++; 180 $cmp++;
184 push @second_R, [$line[0],$line[1],$line[2],$line[3]]; 181 push @second_R, [$line[0],$line[1],$line[2],$line[3]];
185 } 182 }
186 183
187 $cmp = 0; 184 $cmp = 0;
188 open ($in, $repMfirst) || die "cannot open firstM\n"; 185 open ($in, $repMfirst) || die "Cannot open $repMfirst\n";
189 while (<$in>) 186 while (<$in>)
190 { 187 {
191 chomp $_; 188 chomp $_;
192 my %sec; 189 my %sec;
193 my @line = split /\t/, $_; 190 my @line = split /\t/, $_;
311 ############################################################ 308 ############################################################
312 309
313 sub filter_convert_rmsk 310 sub filter_convert_rmsk
314 { 311 {
315 my ($source, $bed, $line_only) = @_; 312 my ($source, $bed, $line_only) = @_;
316 open(my $input, $source) || die "cannot open rmsk file! $!\n"; ## Open source file 313 open(my $input, $source) || die "Cannot open rmsk file! $!\n"; ## Open source file
317 open(my $bedfile, ">".$bed) || die "cannot open output bed file for rmsk! $!\n"; ## Open bed file 314 open(my $bedfile, ">".$bed) || die "Cannot open output bed file for rmsk! $!\n"; ## Open bed file
318 open(my $linefile, ">".$line_only) || die "cannot open output LINE-only file for rmsk! $!\n"; ## Open line_only file 315 open(my $linefile, ">".$line_only) || die "Cannot open output LINE-only file for rmsk! $!\n"; ## Open line_only file
319 my @headers; 316 my @headers;
320 my %indices; 317 my %indices;
321 318
322 print $linefile "#filter: rmsk.repClass = 'LINE'\n"; 319 print $linefile "#filter: rmsk.repClass = 'LINE'\n";
323 320
489 ## store name of file 486 ## store name of file
490 my $sam = shift; 487 my $sam = shift;
491 my $mis_L1 = shift; 488 my $mis_L1 = shift;
492 my $min_L1 = shift; 489 my $min_L1 = shift;
493 my $Bdir = shift; 490 my $Bdir = shift;
494 open(my $fic, $sam) || die "cannot open sam file! $!\n"; ## Open file 491 open(my $fic, $sam) || die "Cannot open sam file! $!\n"; ## Open file
495 my (%ASP_reads); my $cmp = 0; ## Declare variables for 492 my (%ASP_reads); my $cmp = 0; ## Declare variables for
496 my $sequence = ''; 493 my $sequence = '';
497 my $score = ''; 494 my $score = '';
498 495
499 ##read file## 496 ##read file##
592 mkdir $repout; 589 mkdir $repout;
593 my %notLine; 590 my %notLine;
594 591
595 ##Write on file containing of readssHashTabR 592 ##Write on file containing of readssHashTabR
596 593
597 open(my $tmp, ">".$second) || die "cannot open temp file $second\n"; 594 open(my $tmp, ">".$second) || die "Cannot open temp file $second\n";
598 while ( my ($k,$v) = each %{$readsHashTabR} ) 595 while ( my ($k,$v) = each %{$readsHashTabR} )
599 { 596 {
600 print $tmp ${$v}[1] if defined(${$v}[1]); 597 print $tmp ${$v}[1] if defined(${$v}[1]);
601 } 598 }
602 close $tmp; 599 close $tmp;
607 604
608 ##Launch RepeatMasker on fasta file 605 ##Launch RepeatMasker on fasta file
609 606
610 `RepeatMasker -s -pa $threads -dir $repout -engine hmmer -species human $fa`; 607 `RepeatMasker -s -pa $threads -dir $repout -engine hmmer -species human $fa`;
611 my $repfile = $repout.$name.".fa.out"; 608 my $repfile = $repout.$name.".fa.out";
612 open (my $rep, $repfile) || die "cannot open $repfile $!\n"; 609 open (my $rep, $repfile) || die "Cannot open $repfile $!\n";
613 while(<$rep>) 610 while(<$rep>)
614 { 611 {
615 chomp; 612 chomp;
616 ## test the percent of repeats ## 613 ## test the percent of repeats ##
617 my $string = $_; 614 my $string = $_;
633 { 630 {
634 $notLine{$k} = 1 unless ($v->[0] > $dprct || $v->[1] < $eprct); 631 $notLine{$k} = 1 unless ($v->[0] > $dprct || $v->[1] < $eprct);
635 } 632 }
636 633
637 ##write resulting reads in both files for paired ## 634 ##write resulting reads in both files for paired ##
638 open(my $accepted_1, ">".$out1 ) || die "cannot open $out1 file $!\n"; 635 open(my $accepted_1, ">".$out1 ) || die "Cannot open $out1 file $!\n";
639 open(my $accepted_2, ">".$out2 ) || die "cannot open $out2 file $!\n"; 636 open(my $accepted_2, ">".$out2 ) || die "Cannot open $out2 file $!\n";
640 while ( my ($k,$v) = each %{$readsHashTabR} ) 637 while ( my ($k,$v) = each %{$readsHashTabR} )
641 { 638 {
642 if ( defined (${$v}[0]) && defined (${$v}[1]) ) 639 if ( defined (${$v}[0]) && defined (${$v}[1]) )
643 { 640 {
644 unless (defined ($notLine{$k}) && $notLine{$k} == 1) 641 unless (defined ($notLine{$k}) && $notLine{$k} == 1)
750 $IdCov{$1} = $split_cov[-1] if $split_cov[-1] > $IdCov{$1}; 747 $IdCov{$1} = $split_cov[-1] if $split_cov[-1] > $IdCov{$1};
751 } 748 }
752 } 749 }
753 750
754 ## get only first mate that have less tant $iprct repeats ## 751 ## get only first mate that have less tant $iprct repeats ##
755 open (my $tmp_fi, 'temp_name_first') || die "cannot open $namefirst!\n"; 752 open (my $tmp_fi, 'temp_name_first') || die "Cannot open $namefirst!\n";
756 open (my $nam_fi, ">".$namefirst) || die "cannot open $namefirst!\n"; 753 open (my $nam_fi, ">".$namefirst) || die "Cannot open $namefirst!\n";
757 while (<$tmp_fi>) 754 while (<$tmp_fi>)
758 { 755 {
759 my @line = split /\t/, $_; 756 my @line = split /\t/, $_;
760 $line[3] =~ /(.*?)\/[12]/; 757 $line[3] =~ /(.*?)\/[12]/;
761 758
769 close $tmp_fi; close $nam_fi; 766 close $tmp_fi; close $nam_fi;
770 767
771 768
772 ## get only second mate that have less than $iprct repeats ## 769 ## get only second mate that have less than $iprct repeats ##
773 770
774 open (my $tmp_sec, 'temp_name_second') || die "cannot open $namesecond!\n"; 771 open (my $tmp_sec, 'temp_name_second') || die "Cannot open $namesecond!\n";
775 open (my $nam_sec, ">".$namesecond) || die "cannot open $namesecond!\n"; 772 open (my $nam_sec, ">".$namesecond) || die "Cannot open $namesecond!\n";
776 while (<$tmp_sec>) 773 while (<$tmp_sec>)
777 { 774 {
778 my @line = split /\t/, $_; 775 my @line = split /\t/, $_;
779 $line[3] =~ /(.*?)\/[12]/; 776 $line[3] =~ /(.*?)\/[12]/;
780 if ($IdCov{$1} <= $iprct/100) 777 if ($IdCov{$1} <= $iprct/100)
782 print $nam_sec $_; 779 print $nam_sec $_;
783 } 780 }
784 } 781 }
785 close $tmp_sec; close $nam_sec; 782 close $tmp_sec; close $nam_sec;
786 } 783 }
787
788 #sub results
789 #{
790 # my ($out_repertory, $file, $name, $hashRef,$ps) = @_;
791 # my $namefirst = $out_repertory.'/'.$name.'-first.bed'; push(@garbage, $namefirst);
792 # my $namesecond = $out_repertory.'/'.$name.'-second.bed'; push(@garbage, $namesecond);
793 # `samtools view -Sb -f66 $file | bedtools bamtobed -i /dev/stdin > $namefirst`;
794 # `samtools view -Sb -f130 $file | bedtools bamtobed -i /dev/stdin > $namesecond`;
795 # open( my $in, $out_repertory.'/'.$name.'-first.bed') || die "cannot open first read bed\n";
796 # while (<$in>)
797 # {
798 # my @line = split /\t/, $_;
799 # $line[3] =~ /(.*?)\/1/;
800 # ${$hashRef}{$1}= $ps;
801 # }
802 #}
803 784
804 785
805 ############################################################ 786 ############################################################
806 ##Function blast: blast nucleotide sequences on ref ### 787 ##Function blast: blast nucleotide sequences on ref ###
807 ############################################################ 788 ############################################################
831 812
832 sub extract_blast 813 sub extract_blast
833 { 814 {
834 my $file = shift; 815 my $file = shift;
835 my %hash = (); 816 my %hash = ();
836 open (my $f, $file) || die "cannot open $file\n"; 817 open (my $f, $file) || die "Cannot open $file\n";
837 while (<$f>) 818 while (<$f>)
838 { 819 {
839 chomp $_; 820 chomp $_;
840 my ($seq,$id) = split /\t/,$_; 821 my ($seq,$id) = split /\t/,$_;
841 $seq = $1 if ($seq =~ /(\d+)-(.*?)-(\d+)-(\d+)/); 822 $seq = $1 if ($seq =~ /(\d+)-(.*?)-(\d+)-(\d+)/);
908 File::Copy::Recursive::dircopy "$Bin/js/", "$out/js" or die "Copy failed: $!"; 889 File::Copy::Recursive::dircopy "$Bin/js/", "$out/js" or die "Copy failed: $!";
909 File::Copy::Recursive::dircopy "$Bin/static/", "$out/static" or die "Copy failed: $!"; 890 File::Copy::Recursive::dircopy "$Bin/static/", "$out/static" or die "Copy failed: $!";
910 891
911 my $chimOut = $html; 892 my $chimOut = $html;
912 893
913 open(my $tab, ">".$chimOut) || die "cannot open $chimOut"; 894 open(my $tab, ">".$chimOut) || die "Cannot open $chimOut";
914 print_header($tab,"Chimerae"); 895 print_header($tab,"Chimerae");
915 print $tab "\t\t<tr>\n\t\t\t<th>L1 chromosome</th>\n\t\t\t<th>L1 start</th>\n\t\t\t<th>L1 end</th>\n\t\t\t<th>L1 strand</th>\n"; 896 print $tab "\t\t<tr>\n\t\t\t<th>L1 chromosome</th>\n\t\t\t<th>L1 start</th>\n\t\t\t<th>L1 end</th>\n\t\t\t<th>L1 strand</th>\n";
916 for my $i (0..$#fastq1) 897 for my $i (0..$#fastq1)
917 { 898 {
918 print $tab "\t\t\t<th>$name[$i] read #</th>\n"; 899 print $tab "\t\t\t<th>$name[$i] read #</th>\n";
1020 my $out3= $out.'/final_result_chimerae.txt'; 1001 my $out3= $out.'/final_result_chimerae.txt';
1021 1002
1022 # save result in csv file ## 1003 # save result in csv file ##
1023 1004
1024 my $filed = $out1; 1005 my $filed = $out1;
1025 open(my $tab, ">".$filed) || die "cannot open $filed"; 1006 open(my $tab, ">".$filed) || die "Cannot open $filed";
1026 print $tab "L1 chromosome \t L1 start \t L1 end \t L1 strand";; 1007 print $tab "L1 chromosome \t L1 start \t L1 end \t L1 strand";;
1027 for my $i (0..$#fastq1) 1008 for my $i (0..$#fastq1)
1028 { 1009 {
1029 print $tab "\t $name[$i] read #"; 1010 print $tab "\t $name[$i] read #";
1030 } 1011 }