Mercurial > repos > clifinder > clifinder
comparison script/CLIFinder.pl @ 15:6d7caeea1e74 draft
"planemo upload for repository https://github.com/GReD-Clermont/CLIFinder/ commit 2a3a263fc5a2123e6988872d23057f8797231a76"
author | clifinder |
---|---|
date | Wed, 12 Feb 2020 05:32:38 -0500 |
parents | feecd33c8390 |
children | 40695c10cfbd |
comparison
equal
deleted
inserted
replaced
14:feecd33c8390 | 15:6d7caeea1e74 |
---|---|
43 "size_read:i" => \(my $size_reads = 100), | 43 "size_read:i" => \(my $size_reads = 100), |
44 "BDir:i" => \(my $Bdir = 0), | 44 "BDir:i" => \(my $Bdir = 0), |
45 "min_L1:i" => \(my $min_L1 = 50), | 45 "min_L1:i" => \(my $min_L1 = 50), |
46 "mis_L1:i" => \(my $mis_L1 = 2), | 46 "mis_L1:i" => \(my $mis_L1 = 2), |
47 "threads:i" => \(my $threads = 1), | 47 "threads:i" => \(my $threads = 1), |
48 'help' => sub { HelpMessage(0); }, | 48 "help" => sub { HelpMessage(0); }, |
49 'version' => sub { VersionMessage(0); }, | 49 "version" => sub { VersionMessage(0); }, |
50 ) or HelpMessage(1); | 50 ) or HelpMessage(1); |
51 | 51 |
52 HelpMessage(1) unless @fastq1 && @fastq2 && @name && defined($TE) && defined($ref) && defined($rmsk_source) && defined($refseq) && defined($html) && defined($html_repertory); | 52 HelpMessage(1) unless @fastq1 && @fastq2 && @name && defined($TE) && defined($ref) && defined($rmsk_source) && defined($refseq) && defined($html) && defined($html_repertory); |
53 | 53 |
54 my $iprct = 100 - (($prct / $size_reads)*100) ; | 54 my $iprct = 100 - (($prct / $size_reads)*100) ; |
93 { | 93 { |
94 ################################################### | 94 ################################################### |
95 # Paired end mapping against L1 promoter sequences# | 95 # Paired end mapping against L1 promoter sequences# |
96 ################################################### | 96 ################################################### |
97 | 97 |
98 print STDOUT "Alignement of $name[$tabR] to L1\n"; | 98 print STDOUT "Alignment of $name[$tabR] to L1\n"; |
99 my $sam = $html_repertory.'/'.$name[$tabR]."_L1.sam"; push(@garbage, $sam); | 99 my $sam = $html_repertory.'/'.$name[$tabR]."_L1.sam"; push(@garbage, $sam); |
100 align_paired( $TE, $fastq1[$tabR], $fastq2[$tabR], $sam, $threads, $mis_auth); | 100 align_paired( $TE, $fastq1[$tabR], $fastq2[$tabR], $sam, $threads, $mis_auth); |
101 print STDOUT "Alignement done\n"; | 101 print STDOUT "Alignment done\n"; |
102 | 102 |
103 ################################################## | 103 ################################################## |
104 # Creation of two fastq for paired halfed mapped:# | 104 # Creation of two fastq for paired halfed mapped:# |
105 # - _1 correspond to sequences mapped to L1 # | 105 # - _1 correspond to sequences mapped to L1 # |
106 # - _2 correspond to sequences unmapped to L1 # | 106 # - _2 correspond to sequences unmapped to L1 # |
107 ################################################## | 107 ################################################## |
108 | 108 |
109 print STDOUT "Getting pairs with one mate matched to L1 and the other mate undetected by repeatmasker as a repeat sequence\n"; | 109 print STDOUT "Getting pairs with one mate matched to L1 and the other mate undetected by repeatmasker as a repeat sequence\n"; |
110 | |
111 my $out_ASP_1 = $html_repertory.'/'.$name[$tabR]."_1.fastq"; push(@garbage, $out_ASP_1); | 110 my $out_ASP_1 = $html_repertory.'/'.$name[$tabR]."_1.fastq"; push(@garbage, $out_ASP_1); |
112 my $out_ASP_2 = $html_repertory.'/'.$name[$tabR]."_2.fastq"; push(@garbage, $out_ASP_2); | 111 my $out_ASP_2 = $html_repertory.'/'.$name[$tabR]."_2.fastq"; push(@garbage, $out_ASP_2); |
113 | 112 |
114 ##split mate that matched to L1 and others## | 113 ##split mate that matched to L1 and others## |
115 my ($ASP_readsHashR, $half_num_out) = get_half($sam, $mis_L1, $min_L1, $Bdir); | 114 my ($ASP_readsHashR, $half_num_out) = get_half($sam, $mis_L1, $min_L1, $Bdir); |
116 # $ASP_reads{$line[0]}[0] mapped - $ASP_reads{$line[0]}[1] unmapped | 115 print STDOUT "Number of half mapped pairs: $half_num_out\n"; |
117 | 116 |
118 ##pairs obtained after repeatmasker on the other mate## | 117 ##pairs obtained after repeatmasker on the other mate## |
119 my $left = sort_out($threads, $out_ASP_1, $out_ASP_2, $dprct, $eprct, $ASP_readsHashR, $html_repertory); | 118 my $left = sort_out($threads, $out_ASP_1, $out_ASP_2, $dprct, $eprct, $ASP_readsHashR, $html_repertory); |
120 | |
121 print STDOUT "Number of half mapped pairs : $half_num_out\n"; | |
122 print STDOUT "Number of pairs after repeatmasker: $left\n"; | 119 print STDOUT "Number of pairs after repeatmasker: $left\n"; |
123 | 120 |
124 ################################################## | 121 ################################################## |
125 # Alignment of halfed mapped pairs on genome # | 122 # Alignment of halfed mapped pairs on genome # |
126 ################################################## | 123 ################################################## |
166 | 163 |
167 my (%frag_uni, @second_R, @second_exp, @results); | 164 my (%frag_uni, @second_R, @second_exp, @results); |
168 my $merge_target = $html_repertory.'/target_merged.bed'; push(@garbage, $merge_target); | 165 my $merge_target = $html_repertory.'/target_merged.bed'; push(@garbage, $merge_target); |
169 my $merge = $html_repertory.'/merged.bed'; push(@garbage, $merge); | 166 my $merge = $html_repertory.'/merged.bed'; push(@garbage, $merge); |
170 | 167 |
171 open (my $mT, ">".$merge_target) || die "cannot open $merge_target\n"; | 168 open (my $mT, ">".$merge_target) || die "Cannot open $merge_target\n"; |
172 open (my $m, ">".$merge) || die "cannot open $merge\n"; | 169 open (my $m, ">".$merge) || die "Cannot open $merge\n"; |
173 open (my $in, $repMsecond) || die "cannot open secondM\n"; | 170 open (my $in, $repMsecond) || die "Cannot open $repMsecond\n"; |
174 my $cmp = 0; | 171 my $cmp = 0; |
175 while (<$in>) | 172 while (<$in>) |
176 { | 173 { |
177 chomp $_; | 174 chomp $_; |
178 my @tmp = (0) x scalar(@fastq1); | 175 my @tmp = (0) x scalar(@fastq1); |
183 $cmp++; | 180 $cmp++; |
184 push @second_R, [$line[0],$line[1],$line[2],$line[3]]; | 181 push @second_R, [$line[0],$line[1],$line[2],$line[3]]; |
185 } | 182 } |
186 | 183 |
187 $cmp = 0; | 184 $cmp = 0; |
188 open ($in, $repMfirst) || die "cannot open firstM\n"; | 185 open ($in, $repMfirst) || die "Cannot open $repMfirst\n"; |
189 while (<$in>) | 186 while (<$in>) |
190 { | 187 { |
191 chomp $_; | 188 chomp $_; |
192 my %sec; | 189 my %sec; |
193 my @line = split /\t/, $_; | 190 my @line = split /\t/, $_; |
311 ############################################################ | 308 ############################################################ |
312 | 309 |
313 sub filter_convert_rmsk | 310 sub filter_convert_rmsk |
314 { | 311 { |
315 my ($source, $bed, $line_only) = @_; | 312 my ($source, $bed, $line_only) = @_; |
316 open(my $input, $source) || die "cannot open rmsk file! $!\n"; ## Open source file | 313 open(my $input, $source) || die "Cannot open rmsk file! $!\n"; ## Open source file |
317 open(my $bedfile, ">".$bed) || die "cannot open output bed file for rmsk! $!\n"; ## Open bed file | 314 open(my $bedfile, ">".$bed) || die "Cannot open output bed file for rmsk! $!\n"; ## Open bed file |
318 open(my $linefile, ">".$line_only) || die "cannot open output LINE-only file for rmsk! $!\n"; ## Open line_only file | 315 open(my $linefile, ">".$line_only) || die "Cannot open output LINE-only file for rmsk! $!\n"; ## Open line_only file |
319 my @headers; | 316 my @headers; |
320 my %indices; | 317 my %indices; |
321 | 318 |
322 print $linefile "#filter: rmsk.repClass = 'LINE'\n"; | 319 print $linefile "#filter: rmsk.repClass = 'LINE'\n"; |
323 | 320 |
489 ## store name of file | 486 ## store name of file |
490 my $sam = shift; | 487 my $sam = shift; |
491 my $mis_L1 = shift; | 488 my $mis_L1 = shift; |
492 my $min_L1 = shift; | 489 my $min_L1 = shift; |
493 my $Bdir = shift; | 490 my $Bdir = shift; |
494 open(my $fic, $sam) || die "cannot open sam file! $!\n"; ## Open file | 491 open(my $fic, $sam) || die "Cannot open sam file! $!\n"; ## Open file |
495 my (%ASP_reads); my $cmp = 0; ## Declare variables for | 492 my (%ASP_reads); my $cmp = 0; ## Declare variables for |
496 my $sequence = ''; | 493 my $sequence = ''; |
497 my $score = ''; | 494 my $score = ''; |
498 | 495 |
499 ##read file## | 496 ##read file## |
592 mkdir $repout; | 589 mkdir $repout; |
593 my %notLine; | 590 my %notLine; |
594 | 591 |
595 ##Write on file containing of readssHashTabR | 592 ##Write on file containing of readssHashTabR |
596 | 593 |
597 open(my $tmp, ">".$second) || die "cannot open temp file $second\n"; | 594 open(my $tmp, ">".$second) || die "Cannot open temp file $second\n"; |
598 while ( my ($k,$v) = each %{$readsHashTabR} ) | 595 while ( my ($k,$v) = each %{$readsHashTabR} ) |
599 { | 596 { |
600 print $tmp ${$v}[1] if defined(${$v}[1]); | 597 print $tmp ${$v}[1] if defined(${$v}[1]); |
601 } | 598 } |
602 close $tmp; | 599 close $tmp; |
607 | 604 |
608 ##Launch RepeatMasker on fasta file | 605 ##Launch RepeatMasker on fasta file |
609 | 606 |
610 `RepeatMasker -s -pa $threads -dir $repout -engine hmmer -species human $fa`; | 607 `RepeatMasker -s -pa $threads -dir $repout -engine hmmer -species human $fa`; |
611 my $repfile = $repout.$name.".fa.out"; | 608 my $repfile = $repout.$name.".fa.out"; |
612 open (my $rep, $repfile) || die "cannot open $repfile $!\n"; | 609 open (my $rep, $repfile) || die "Cannot open $repfile $!\n"; |
613 while(<$rep>) | 610 while(<$rep>) |
614 { | 611 { |
615 chomp; | 612 chomp; |
616 ## test the percent of repeats ## | 613 ## test the percent of repeats ## |
617 my $string = $_; | 614 my $string = $_; |
633 { | 630 { |
634 $notLine{$k} = 1 unless ($v->[0] > $dprct || $v->[1] < $eprct); | 631 $notLine{$k} = 1 unless ($v->[0] > $dprct || $v->[1] < $eprct); |
635 } | 632 } |
636 | 633 |
637 ##write resulting reads in both files for paired ## | 634 ##write resulting reads in both files for paired ## |
638 open(my $accepted_1, ">".$out1 ) || die "cannot open $out1 file $!\n"; | 635 open(my $accepted_1, ">".$out1 ) || die "Cannot open $out1 file $!\n"; |
639 open(my $accepted_2, ">".$out2 ) || die "cannot open $out2 file $!\n"; | 636 open(my $accepted_2, ">".$out2 ) || die "Cannot open $out2 file $!\n"; |
640 while ( my ($k,$v) = each %{$readsHashTabR} ) | 637 while ( my ($k,$v) = each %{$readsHashTabR} ) |
641 { | 638 { |
642 if ( defined (${$v}[0]) && defined (${$v}[1]) ) | 639 if ( defined (${$v}[0]) && defined (${$v}[1]) ) |
643 { | 640 { |
644 unless (defined ($notLine{$k}) && $notLine{$k} == 1) | 641 unless (defined ($notLine{$k}) && $notLine{$k} == 1) |
750 $IdCov{$1} = $split_cov[-1] if $split_cov[-1] > $IdCov{$1}; | 747 $IdCov{$1} = $split_cov[-1] if $split_cov[-1] > $IdCov{$1}; |
751 } | 748 } |
752 } | 749 } |
753 | 750 |
754 ## get only first mate that have less tant $iprct repeats ## | 751 ## get only first mate that have less tant $iprct repeats ## |
755 open (my $tmp_fi, 'temp_name_first') || die "cannot open $namefirst!\n"; | 752 open (my $tmp_fi, 'temp_name_first') || die "Cannot open $namefirst!\n"; |
756 open (my $nam_fi, ">".$namefirst) || die "cannot open $namefirst!\n"; | 753 open (my $nam_fi, ">".$namefirst) || die "Cannot open $namefirst!\n"; |
757 while (<$tmp_fi>) | 754 while (<$tmp_fi>) |
758 { | 755 { |
759 my @line = split /\t/, $_; | 756 my @line = split /\t/, $_; |
760 $line[3] =~ /(.*?)\/[12]/; | 757 $line[3] =~ /(.*?)\/[12]/; |
761 | 758 |
769 close $tmp_fi; close $nam_fi; | 766 close $tmp_fi; close $nam_fi; |
770 | 767 |
771 | 768 |
772 ## get only second mate that have less than $iprct repeats ## | 769 ## get only second mate that have less than $iprct repeats ## |
773 | 770 |
774 open (my $tmp_sec, 'temp_name_second') || die "cannot open $namesecond!\n"; | 771 open (my $tmp_sec, 'temp_name_second') || die "Cannot open $namesecond!\n"; |
775 open (my $nam_sec, ">".$namesecond) || die "cannot open $namesecond!\n"; | 772 open (my $nam_sec, ">".$namesecond) || die "Cannot open $namesecond!\n"; |
776 while (<$tmp_sec>) | 773 while (<$tmp_sec>) |
777 { | 774 { |
778 my @line = split /\t/, $_; | 775 my @line = split /\t/, $_; |
779 $line[3] =~ /(.*?)\/[12]/; | 776 $line[3] =~ /(.*?)\/[12]/; |
780 if ($IdCov{$1} <= $iprct/100) | 777 if ($IdCov{$1} <= $iprct/100) |
782 print $nam_sec $_; | 779 print $nam_sec $_; |
783 } | 780 } |
784 } | 781 } |
785 close $tmp_sec; close $nam_sec; | 782 close $tmp_sec; close $nam_sec; |
786 } | 783 } |
787 | |
788 #sub results | |
789 #{ | |
790 # my ($out_repertory, $file, $name, $hashRef,$ps) = @_; | |
791 # my $namefirst = $out_repertory.'/'.$name.'-first.bed'; push(@garbage, $namefirst); | |
792 # my $namesecond = $out_repertory.'/'.$name.'-second.bed'; push(@garbage, $namesecond); | |
793 # `samtools view -Sb -f66 $file | bedtools bamtobed -i /dev/stdin > $namefirst`; | |
794 # `samtools view -Sb -f130 $file | bedtools bamtobed -i /dev/stdin > $namesecond`; | |
795 # open( my $in, $out_repertory.'/'.$name.'-first.bed') || die "cannot open first read bed\n"; | |
796 # while (<$in>) | |
797 # { | |
798 # my @line = split /\t/, $_; | |
799 # $line[3] =~ /(.*?)\/1/; | |
800 # ${$hashRef}{$1}= $ps; | |
801 # } | |
802 #} | |
803 | 784 |
804 | 785 |
805 ############################################################ | 786 ############################################################ |
806 ##Function blast: blast nucleotide sequences on ref ### | 787 ##Function blast: blast nucleotide sequences on ref ### |
807 ############################################################ | 788 ############################################################ |
831 | 812 |
832 sub extract_blast | 813 sub extract_blast |
833 { | 814 { |
834 my $file = shift; | 815 my $file = shift; |
835 my %hash = (); | 816 my %hash = (); |
836 open (my $f, $file) || die "cannot open $file\n"; | 817 open (my $f, $file) || die "Cannot open $file\n"; |
837 while (<$f>) | 818 while (<$f>) |
838 { | 819 { |
839 chomp $_; | 820 chomp $_; |
840 my ($seq,$id) = split /\t/,$_; | 821 my ($seq,$id) = split /\t/,$_; |
841 $seq = $1 if ($seq =~ /(\d+)-(.*?)-(\d+)-(\d+)/); | 822 $seq = $1 if ($seq =~ /(\d+)-(.*?)-(\d+)-(\d+)/); |
908 File::Copy::Recursive::dircopy "$Bin/js/", "$out/js" or die "Copy failed: $!"; | 889 File::Copy::Recursive::dircopy "$Bin/js/", "$out/js" or die "Copy failed: $!"; |
909 File::Copy::Recursive::dircopy "$Bin/static/", "$out/static" or die "Copy failed: $!"; | 890 File::Copy::Recursive::dircopy "$Bin/static/", "$out/static" or die "Copy failed: $!"; |
910 | 891 |
911 my $chimOut = $html; | 892 my $chimOut = $html; |
912 | 893 |
913 open(my $tab, ">".$chimOut) || die "cannot open $chimOut"; | 894 open(my $tab, ">".$chimOut) || die "Cannot open $chimOut"; |
914 print_header($tab,"Chimerae"); | 895 print_header($tab,"Chimerae"); |
915 print $tab "\t\t<tr>\n\t\t\t<th>L1 chromosome</th>\n\t\t\t<th>L1 start</th>\n\t\t\t<th>L1 end</th>\n\t\t\t<th>L1 strand</th>\n"; | 896 print $tab "\t\t<tr>\n\t\t\t<th>L1 chromosome</th>\n\t\t\t<th>L1 start</th>\n\t\t\t<th>L1 end</th>\n\t\t\t<th>L1 strand</th>\n"; |
916 for my $i (0..$#fastq1) | 897 for my $i (0..$#fastq1) |
917 { | 898 { |
918 print $tab "\t\t\t<th>$name[$i] read #</th>\n"; | 899 print $tab "\t\t\t<th>$name[$i] read #</th>\n"; |
1020 my $out3= $out.'/final_result_chimerae.txt'; | 1001 my $out3= $out.'/final_result_chimerae.txt'; |
1021 | 1002 |
1022 # save result in csv file ## | 1003 # save result in csv file ## |
1023 | 1004 |
1024 my $filed = $out1; | 1005 my $filed = $out1; |
1025 open(my $tab, ">".$filed) || die "cannot open $filed"; | 1006 open(my $tab, ">".$filed) || die "Cannot open $filed"; |
1026 print $tab "L1 chromosome \t L1 start \t L1 end \t L1 strand";; | 1007 print $tab "L1 chromosome \t L1 start \t L1 end \t L1 strand";; |
1027 for my $i (0..$#fastq1) | 1008 for my $i (0..$#fastq1) |
1028 { | 1009 { |
1029 print $tab "\t $name[$i] read #"; | 1010 print $tab "\t $name[$i] read #"; |
1030 } | 1011 } |