Mercurial > repos > amadeo > amadeo
view Tools/First_version/rm_overlap_motifs_galaxy.pl @ 0:229d36377838 draft
Uploaded
author | amadeo |
---|---|
date | Mon, 05 Sep 2016 05:53:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w $|=1; use warnings; use strict; my $line; my @cols; my %hash; my %hash_negative; my $gene; my $TF; my @sequences; my $seq_len; my $OL; my @output_pos; my @output_neg; my $actual_pvalue; my $pvalue; my $pvalue_neg; #$ARGV[3]=<STDIN>; if(@ARGV < 4){ print "\nUsage: rm_overlap_motifs_posneg.pl fimo-test-sue.gff fimo-nol-pos.gff fimo-nol-neg.gff overlap_percentage\n\n"; exit(0); } open(FIMO, "<$ARGV[0]") || die "File '$ARGV[0]' not found\n"; open(POSITIVE, ">$ARGV[1]") || die "File '>$ARGV[1]' not found\n"; open(NEGATIVE, ">$ARGV[2]") || die "File '>$ARGV[2]' not found\n"; # Getting overlap value form user and testing to see if its 0-100 and # converting to 0-1 scale. if ($ARGV[3] >0.0 && $ARGV[3] <=100){ $OL=$ARGV[3]/100; } else{ print" ERROR: overlap is a value 0-100\n"; exit(0); } #print "OL is $OL\n"; while (<FIMO>) { $line=$_; chomp $line; @cols=split; my $pos1; my $pos2; my $scalar; my $decimal; my $e; my @list=(); if ($line=~/^#/){ printf POSITIVE"%s\n", $line; printf NEGATIVE"%s\n", $line; } elsif ($line!~/^##/ and $cols[6]eq"+") { @cols=split; $TF= substr $cols[8],5,8; $gene=substr $cols[0],0,21; $pos1 = $cols[3]; $pos2=$cols[4]; @list=(); @list=($pos1,$pos2); @sequences= split( "=", $cols[9]); $seq_len = int(length (substr $sequences[1],0,-1)); $decimal= substr $cols[8],-16,4; $e=substr $cols[8],-11,3; $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string. $actual_pvalue=$decimal*(10**$e); #it will take the p value of the current line if (not exists $hash{$gene}{$TF}) { #Every time that a block of a GENE-MOTIF starts, it will register #the GENE-MOTIF in a hash: GENE-MOTIF as a key and pos1 and pos2 as values. $hash{$gene}{$TF}=\@list; $pvalue=$actual_pvalue; #p value of the current line that it will be compared in the next loop push @output_pos, $line; #it saves the information of the gene motif in the array } elsif (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) {#if the gene exists and the motif is not overlaped #with the previous one #then it will take the line in the list and it will #consider the p value in the next loop $hash{$gene}{$TF}=\@list; $pvalue=$actual_pvalue; push @output_pos, $line; } elsif ( (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])and ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]) and (int($pos2-(@{$hash{$gene}{$TF}}[0]))/$seq_len)<$OL) ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the second position #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line. #It will store it in the array and its p value it will consider in the next loop. $hash{$gene}{$TF}=\@list; $pvalue=$actual_pvalue; push @output_pos, $line; #print $pvalue , "\n"; } elsif ( (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])and ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]) and (int($pos2-(@{$hash{$gene}{$TF}}[0]))/$seq_len)>$OL) and $actual_pvalue<$pvalue ) { #If the actual motif overlaps with the previous motif and the overlaping sequence includes the second #position and not the first one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif #with the lowest p value. This p value will consider in the next loop. pop @output_pos; $hash{$gene}{$TF}=\@list; $pvalue=$actual_pvalue; push @output_pos, $line; #print $pvalue , "\n"; } elsif ( ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and (int((@{$hash{$gene}{$TF}}[1])-$pos1)/$seq_len)<$OL ) and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the first position #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line. #It will store it in the array and its p value it will consider in the next loop. $hash{$gene}{$TF}=\@list; $pvalue=$actual_pvalue; push @output_pos, $line; } elsif ( ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and (int((@{$hash{$gene}{$TF}}[1])-$pos1)/$seq_len)>$OL ) and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) and $actual_pvalue<$pvalue #If the actual motif overlaps with the previous motif and the overlaping sequence includes the first #position and not the second one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif #with the lowest p value. This p value will consider in the next loop. ) { $hash{$gene}{$TF}=\@list; $pvalue=$actual_pvalue; pop @output_pos; push @output_pos, $line; } elsif ( ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) ) and ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) and $actual_pvalue<$pvalue ) { $hash{$gene}{$TF}=\@list; $pvalue=$actual_pvalue; pop @output_pos; push @output_pos, $line; } } elsif ($line!~/^##/ and $cols[6]eq"-") { #same strategy for the motifs located in the minus strand @cols=split; #$TF= substr $cols[8],5,8; $gene=substr $cols[0],0,21; $pos1 = $cols[3]; $pos2=$cols[4]; @list=(); @list=($pos1,$pos2); @sequences= split( "=", $cols[9]); $seq_len = int(length (substr $sequences[1],0,-1)); $decimal= substr $cols[8],-16,4; $e=substr $cols[8],-11,3; $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string. $actual_pvalue=$decimal*(10**$e); if (not exists $hash_negative{$gene}{$TF}) { $hash_negative{$gene}{$TF}=\@list; $pvalue_neg=$actual_pvalue; push @output_neg, $line; } elsif (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1])) { $pvalue_neg=$actual_pvalue; $hash_negative{$gene}{$TF}=\@list; push @output_neg, $line; } elsif ( (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1])and ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1]) and (int($pos2-(@{$hash_negative{$gene}{$TF}}[0]))/$seq_len)<$OL ) ) { $pvalue_neg=$actual_pvalue; $hash_negative{$gene}{$TF}=\@list; push @output_neg, $line; } elsif ( (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1]) and (int($pos2-(@{$hash_negative{$gene}{$TF}}[0]))/$seq_len)>$OL and $actual_pvalue<$pvalue_neg) ) { $pvalue=$actual_pvalue; $hash_negative{$gene}{$TF}=\@list; pop @output_neg; push @output_neg, $line; } elsif ( ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and (int((@{$hash_negative{$gene}{$TF}}[1])-$pos1)/$seq_len)<$OL ) and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )) ) { $pvalue_neg=$actual_pvalue; $hash_negative{$gene}{$TF}=\@list; push @output_neg, $line; } elsif ( ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and (int((@{$hash_negative{$gene}{$TF}}[1])-$pos1)/$seq_len)>$OL ) and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )and $actual_pvalue<$pvalue_neg) ) { $pvalue_neg=$actual_pvalue; $hash_negative{$gene}{$TF}=\@list; pop @output_neg; push @output_neg, $line; } elsif ( ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1])) and ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )and $actual_pvalue<$pvalue_neg) ) { $pvalue_neg=$actual_pvalue; $hash_negative{$gene}{$TF}=\@list; pop @output_neg; push @output_neg, $line; } } } foreach my $lines_pos (@output_pos){ printf POSITIVE"%s\n", $lines_pos; } foreach my $lines_neg (@output_neg){ printf NEGATIVE"%s\n", $lines_neg; }