# HG changeset patch # User amadeo # Date 1473069188 14400 # Node ID 229d36377838c54b9173f8eb89761ae5f231f64a Uploaded diff -r 000000000000 -r 229d36377838 Tools/CREF/Extract_matrix_subset_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/CREF/Extract_matrix_subset_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,83 @@ +#!/usr/bin/perl -w + + +$|=1; +use strict; #using this makes debugging your code much easier +use warnings; + + +#Script to take a list of JASPAR ids and extract a subset of matrix information for each of them from the JASPAR_CORE_2016.meme file + +#Checking to see if the user has provided 3 arguments + + + +if(@ARGV < 3){ +print "\nUsage: Extract_matrix_subset.pl list-motifs.txt JAPSAR_CORE_2016.meme JASPAR_AME_subset.meme\n\n"; +exit(0); +} + +my $motif_id; +my $line; +my @lines; +my %matrix=(); +my $header; +my $data; +my @header_list_motif; +my $line2; + + + +open (MOTIF, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n" ; + +open (JASPAR, "<$ARGV[1]") || + die "File '$ARGV[1]' not found\n" ; + +open (OUTPUT, ">$ARGV[2]") || + die "File '$ARGV[2]' not found\n" ; + + +@lines=; + + +#In this loop I "delete" the 9 first lines of the JASPAR-database from @lines and store them in a new array called @header_list_motif, +#that will be the header of the output file (I am doing this because I had errors on my hash because of the header lines). +for (my $i = 0; $i < 9; $i++) { + $line2 = shift @lines; + push (@header_list_motif, $line2); +} + +#Once I delete the first 9 lines, I create a hash with the motifs +#as keys and the data as values. +foreach $line(@lines){ + if ($line =~ /^MO/) { + $header = $line; + } + else { + push( @{$matrix{$header}}, $line); +} +} + +#I use this to test if the number of motifs of my motif list are the same of the motifs +#of my output file. +#my $counter =0; + +#Print the header. 
+foreach my $line3(@header_list_motif){ + printf OUTPUT $line3; +} + +#Print the motifs with the data +while (){ + chomp; + $motif_id = $_; + foreach my $motif_hash(keys %matrix){ + if ($motif_hash=~/$motif_id/) { + printf OUTPUT "$motif_hash @{$matrix{$motif_hash}}\n"; + #$counter = $counter +1; + } + + } +} +#print $counter; diff -r 000000000000 -r 229d36377838 Tools/CREF/Extract_motif_codes_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/CREF/Extract_motif_codes_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,54 @@ +#!/usr/bin/perl -w + +# The lines that start with # are comment lines that are not executed + + +$|=1; +use strict; +use warnings; + + +#Script to take output from AME (part of memesuite-org) and extract a +#list of the overrepresented motifs and print them to a new file +#called ame-motif-id.list + + +#Checking to see if the user has provided 1 argument - which is the +#name of the AME results file + +if(@ARGV < 2){ +print "\nUsage: Extract_motif_codes.pl ame.txt ame-shorted.txt\n\n"; +exit(0); +} + +#Declaring variables +my @cols; #an array variable +my $line; # a scalar varaible + +#Using a FIELHANDLE to open the input file +open (INPUT, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n" ; + +open (OUTPUT, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n" ; + +#looping through each line of the file + while (){ + #assigning line to variable $line + #$_ is a special default variable that here holds the line contents + $line = $_; + #match lines that have Ranksum + if ($line =~ /Ranksum/){ + printf OUTPUT "%s\n", $line; + #split the lines on white space, so each part of the line gets + #stored as an array element + @cols=split; + #Testing to see what line elements are stored in the array + #print "cols [0] is $cols[0] \n"; + #print "cols [2] is $cols[2] \n\n"; + + #Now see if you can print out the array elemnent that stores the + #motif ID to a new file called ame-motif-id.list. 
+ } + } + diff -r 000000000000 -r 229d36377838 Tools/CREF/extract_motifs_codes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/CREF/extract_motifs_codes.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,10 @@ + + Script to take output from AME (part of memesuite-org) and extract a list of the overrepresented motifs + Extract_motif_codes_galaxy.pl $input $output + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/CREF/extract_motifs_subset.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/CREF/extract_motifs_subset.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,12 @@ + + Script to take a list of JASPAR ids and extract a subset of matrix information for each of them from a meme file + Extract_matrix_subset_galaxy.pl $input $secondinput $output + + + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/CREF/list_motifs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/CREF/list_motifs.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,10 @@ + + Extract the motifs names from ame processed file + list_motifs_galaxy.pl $input $output + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/CREF/list_motifs_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/CREF/list_motifs_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,29 @@ +#!/usr/bin/perl -w + + +$|=1; +use strict; #using this makes debugging your code much easier +use warnings; + +my $line; +my @cols; + + +if(@ARGV < 2){ +print "\nUsage: list_motifs.pl ame-shorted.txt list-motifs.txt\n\n"; +exit(0); +} +open (INPUT, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n" ; + +open (OUTPUT, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n" ; + + while (){ + + $line = $_; + @cols=split; + if ($line =~ /MA/){ + printf OUTPUT "%s\n", $cols[5]; + } + } \ No newline at end of file diff -r 000000000000 -r 229d36377838 Tools/CREF/short-headers.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/CREF/short-headers.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,10 @@ 
+ + re-writes a fasta file with compatible headers for MEME suite tool. + shorten-headers-galaxy.pl $input $output + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/CREF/shorten-headers-galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/CREF/shorten-headers-galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,46 @@ +#!/usr/bin/perl -w + +$|=1; +use strict; #using this makes debugging your code much easier +use warnings; + +#Script to take a multiple fasta file and truncate the header lines to +#GeneIDs so the fasta file can be used as input to AME + +#Checking to see if the user has provided 2 arguments - which is the +#name of the promoter sequence file and an output file name + +if(@ARGV < 2){ +print "\nUsage:shorten-headers.pl promoters.fasta promoters-sh.fasta\n\n"; +exit(0); +} + +#Declaring variables +my $line; # a scalar varaible + +#Using a FIELHANDLE to open the input file +open (INPUT, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n" ; + +#Using a FIELHANDLE to open the input file +open (OUTPUT, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n" ; + +#looping through each line of the file + while (){ + #assigning line to variable $line + #$_ is a special default variable that here holds the line contents + $line = $_; + #match lines header lines + if ($line =~ /^>/){ + #printing header lines to file as a substring of x charaters + printf OUTPUT "%s\n", substr($line,0,21); #the third number is the x characters of the name of the header + } + else{ + #printing out sequence lines just as they are in the orginal file. 
+ printf OUTPUT "$line"; + } + } + +close (INPUT); +close(OUTPUT); diff -r 000000000000 -r 229d36377838 Tools/First_version/remove_motifs_overlaped.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/First_version/remove_motifs_overlaped.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,12 @@ + + Script to compare same contigous motifs in gff file and, in case that two overlap, remove the motif with the highest p value + rm_overlap_motifs_galaxy.pl $input $output $secondoutput $value + + + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/First_version/rm_overlap_motifs_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/First_version/rm_overlap_motifs_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,258 @@ +#!/usr/bin/perl -w + +$|=1; +use warnings; +use strict; + + +my $line; +my @cols; +my %hash; +my %hash_negative; +my $gene; +my $TF; +my @sequences; +my $seq_len; +my $OL; +my @output_pos; +my @output_neg; +my $actual_pvalue; +my $pvalue; +my $pvalue_neg; + + +#$ARGV[3]=; + +if(@ARGV < 4){ +print "\nUsage: rm_overlap_motifs_posneg.pl fimo-test-sue.gff fimo-nol-pos.gff fimo-nol-neg.gff overlap_percentage\n\n"; +exit(0); +} + + + +open(FIMO, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n"; +open(POSITIVE, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n"; +open(NEGATIVE, ">$ARGV[2]") || + die "File '>$ARGV[2]' not found\n"; + +# Getting overlap value form user and testing to see if its 0-100 and +# converting to 0-1 scale. 
+if ($ARGV[3] >0.0 && $ARGV[3] <=100){ + $OL=$ARGV[3]/100; +} +else{ + print" ERROR: overlap is a value 0-100\n"; + exit(0); +} +#print "OL is $OL\n"; + +while () { + $line=$_; + chomp $line; + @cols=split; + my $pos1; + my $pos2; + my $scalar; + my $decimal; + my $e; + + my @list=(); + if ($line=~/^#/){ + printf POSITIVE"%s\n", $line; + printf NEGATIVE"%s\n", $line; + } + elsif ($line!~/^##/ and $cols[6]eq"+") { + @cols=split; + $TF= substr $cols[8],5,8; + $gene=substr $cols[0],0,21; + $pos1 = $cols[3]; + $pos2=$cols[4]; + @list=(); + @list=($pos1,$pos2); + @sequences= split( "=", $cols[9]); + $seq_len = int(length (substr $sequences[1],0,-1)); + $decimal= substr $cols[8],-16,4; + $e=substr $cols[8],-11,3; + $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string. + $actual_pvalue=$decimal*(10**$e); #it will take the p value of the current line + + if (not exists $hash{$gene}{$TF}) { #Every time that a block of a GENE-MOTIF starts, it will register + #the GENE-MOTIF in a hash: GENE-MOTIF as a key and pos1 and pos2 as values. 
+ $hash{$gene}{$TF}=\@list; + $pvalue=$actual_pvalue; #p value of the current line that it will be compared in the next loop + push @output_pos, $line; #it saves the information of the gene motif in the array + } + + elsif (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) + and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) {#if the gene exists and the motif is not overlaped + #with the previous one + #then it will take the line in the list and it will + #consider the p value in the next loop + $hash{$gene}{$TF}=\@list; + $pvalue=$actual_pvalue; + push @output_pos, $line; + } + + + elsif ( + + (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])and + ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]) and (int($pos2-(@{$hash{$gene}{$TF}}[0]))/$seq_len)<$OL) + + ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the second position + #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line. + #It will store it in the array and its p value it will consider in the next loop. + $hash{$gene}{$TF}=\@list; + $pvalue=$actual_pvalue; + push @output_pos, $line; + #print $pvalue , "\n"; + } + elsif ( + + (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])and + ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]) and (int($pos2-(@{$hash{$gene}{$TF}}[0]))/$seq_len)>$OL) + and $actual_pvalue<$pvalue + + + ) { #If the actual motif overlaps with the previous motif and the overlaping sequence includes the second + #position and not the first one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value + #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif + #with the lowest p value. This p value will consider in the next loop. 
+ pop @output_pos; + $hash{$gene}{$TF}=\@list; + $pvalue=$actual_pvalue; + push @output_pos, $line; + #print $pvalue , "\n"; + } + elsif ( + + ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and (int((@{$hash{$gene}{$TF}}[1])-$pos1)/$seq_len)<$OL ) + and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) + + ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the first position + #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line. + #It will store it in the array and its p value it will consider in the next loop. + + $hash{$gene}{$TF}=\@list; + $pvalue=$actual_pvalue; + push @output_pos, $line; + } + elsif ( + + ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and (int((@{$hash{$gene}{$TF}}[1])-$pos1)/$seq_len)>$OL ) + and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) and $actual_pvalue<$pvalue + #If the actual motif overlaps with the previous motif and the overlaping sequence includes the first + #position and not the second one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value + #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif + #with the lowest p value. This p value will consider in the next loop. 
+ ) { + $hash{$gene}{$TF}=\@list; + $pvalue=$actual_pvalue; + pop @output_pos; + push @output_pos, $line; + } + elsif ( + + ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) ) + and ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) and $actual_pvalue<$pvalue + + ) { + $hash{$gene}{$TF}=\@list; + $pvalue=$actual_pvalue; + pop @output_pos; + push @output_pos, $line; + } + + + } + elsif ($line!~/^##/ and $cols[6]eq"-") { #same strategy for the motifs located in the minus strand + @cols=split; + #$TF= substr $cols[8],5,8; + $gene=substr $cols[0],0,21; + $pos1 = $cols[3]; + $pos2=$cols[4]; + @list=(); + @list=($pos1,$pos2); + @sequences= split( "=", $cols[9]); + $seq_len = int(length (substr $sequences[1],0,-1)); + $decimal= substr $cols[8],-16,4; + $e=substr $cols[8],-11,3; + $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string. + $actual_pvalue=$decimal*(10**$e); + + if (not exists $hash_negative{$gene}{$TF}) { + $hash_negative{$gene}{$TF}=\@list; + $pvalue_neg=$actual_pvalue; + push @output_neg, $line; + } + + elsif (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) + and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1])) { + $pvalue_neg=$actual_pvalue; + $hash_negative{$gene}{$TF}=\@list; + push @output_neg, $line; + } + + + elsif ( + + (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1])and + ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1]) and (int($pos2-(@{$hash_negative{$gene}{$TF}}[0]))/$seq_len)<$OL ) + ) { + $pvalue_neg=$actual_pvalue; + $hash_negative{$gene}{$TF}=\@list; + push @output_neg, $line; + } + elsif ( + + (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and + ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1]) and 
(int($pos2-(@{$hash_negative{$gene}{$TF}}[0]))/$seq_len)>$OL and + $actual_pvalue<$pvalue_neg) + ) { + $pvalue=$actual_pvalue; + $hash_negative{$gene}{$TF}=\@list; + pop @output_neg; + push @output_neg, $line; + } + elsif ( + ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and (int((@{$hash_negative{$gene}{$TF}}[1])-$pos1)/$seq_len)<$OL ) + and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )) + ) { + $pvalue_neg=$actual_pvalue; + $hash_negative{$gene}{$TF}=\@list; + push @output_neg, $line; + } + elsif ( + ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and + (int((@{$hash_negative{$gene}{$TF}}[1])-$pos1)/$seq_len)>$OL ) + and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )and + $actual_pvalue<$pvalue_neg) + ) { + $pvalue_neg=$actual_pvalue; + $hash_negative{$gene}{$TF}=\@list; + pop @output_neg; + push @output_neg, $line; + } + elsif ( + ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1])) + and ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )and + $actual_pvalue<$pvalue_neg) + ) { + $pvalue_neg=$actual_pvalue; + $hash_negative{$gene}{$TF}=\@list; + pop @output_neg; + push @output_neg, $line; + } + + + } +} +foreach my $lines_pos (@output_pos){ + printf POSITIVE"%s\n", $lines_pos; + +} +foreach my $lines_neg (@output_neg){ + printf NEGATIVE"%s\n", $lines_neg; +} \ No newline at end of file diff -r 000000000000 -r 229d36377838 Tools/First_version/sorting_first.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/First_version/sorting_first.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,10 @@ + + Script to take sort block of genes in a .gff file by the start position of the motifs + sorting_first_galaxy.pl $input $output + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/First_version/sorting_first_galaxy.pl --- 
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/First_version/sorting_first_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,88 @@ +#!/usr/bin/perl -w + +$|=1; +use warnings; +use strict; + +my $line; +my @cols; +my %hash1; +my %hash2; +my @list1; +my @list2; +my $gene; +my $pos1; +my $n; +my $index; +my $position; +my $scalar; +my $TF; + +if(@ARGV < 2){ +print "\nUsage: sorting_promoters_by_first_position.pl fimo.gff fimo-position-sorted.gff e\n\n"; +exit(0); +} + +my $counter=0; +open(FIMO, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n"; +open(OUTPUT, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n"; + +while () { + $line=$_; + chomp $line; + @cols=split; + + if ($line=~/^#/){ + printf OUTPUT "%s\n", $line; + $counter++; + } + else { + $gene=substr $cols[0],0,21; + $pos1 = $cols[3]; + $TF= substr $cols[8],5,8; + $scalar= scalar @list1; + if (not exists $hash1{$gene}{$TF} and not $scalar == 0) { + $n= scalar @list1; + my @list_pos_sorted= sort { $list1[$a] <=> $list1[$b] } 0..($n - 1); + for (my $i=0; $i <(scalar @list_pos_sorted); $i++){ + $index=$list_pos_sorted[$i]; + $position = $list1[$index]; + printf OUTPUT "%s\n", $hash2{$position}; + #print $list2[$index], "\n"; + $counter++; + } + } + if (not exists $hash1{$gene}{$TF}) { + %hash1=(); + %hash2=(); + @list1=(); + @list2=(); + $hash1{$gene}{$TF}=1; + $hash2{$pos1}=$line; + push @list1, $pos1; + push @list2, $line; + } + + elsif (exists $hash1{$gene}{$TF}) { + $hash2{$pos1}=$line; + push @list1, $pos1; + push @list2, $line; + } + + + } + +} + +$n= scalar @list1; +my @list_pos_sorted= sort { $list1[$a] <=> $list1[$b] } 0..($n - 1); + for (my $i=0; $i <(scalar @list_pos_sorted); $i++){ + $index=$list_pos_sorted[$i]; + $position = $list1[$index]; + printf OUTPUT "%s\n", $hash2{$position}; + #print $list2[$index], "\n"; + $counter++; + } +#print $counter; diff -r 000000000000 -r 229d36377838 Tools/Matrix/gene-TF-matrix-csv-galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/Tools/Matrix/gene-TF-matrix-csv-galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,121 @@ +#!/usr/bin/perl -w +$|=1; +use strict; +use warnings; + + + +# Script to create csv formatted gene vs TF matrix from a filtered gff +# file. GFF file can contain just Positive or Just neagtive strand +# TFBS. Has two types of matrix produced: (0) resence/Abscence with only +# 1 and 0s. With option=0. (1) counts of TFs with numbers 1,3,5 etc. + + + + +my $line; +my $line3; +my @cols; +my @TF_array; +my @gene_array; +my %matrix_1= (); +my %matrix_2= (); +my $TF; +my $gene; +my %matrix; +my $matrixType; + +if(@ARGV < 3){ +print "\nUsage: gene-TF-matrix.pl fimo-nol-P.gff/fimo-nol-N.gff gene-matrix-P.csv/gene-matrix-N.csv +\n Options: Presence/Abscence=0 counts=1\n\n"; +exit(0); +} +open (FIMO, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n" ; +open(MATRIX, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n"; + +$matrixType = $ARGV[2]; +print "MatrixTYpe is $matrixType\n"; + +#Put all the motifs and genes in two separate arrays: each appears +#only once in each array. +while () { + $line=$_; + if ($line!~/^##/) {#ignore header line + @cols=split; + $TF= substr $cols[8],5,8; + if (not exists $matrix_1{$TF}) { + $matrix_1{$TF}=""; + push @TF_array, $TF; + } + $gene=substr $cols[0],0,21; + if (not exists $matrix_2{$gene}) { + $matrix_2{$gene}=""; + push @gene_array, $gene + } + } + } + +my $n_motifs=scalar @TF_array; +my $n_genes=scalar@gene_array; +#printf "Scalar motifs is %d\n", scalar@TF_array; +#printf "Scalar genes is %d\n", scalar@gene_array; + +close(FIMO); +#I want to create a hash on which each gene has a list of 0s. Then I want to "read" the .gff file +#and if a gene has a certain TF it will add "+1" to the possition of the TF, and it will look like this. 
+ + +open (FIMO, "$ARGV[0]") || + die "File '$ARGV[0]' not found\n" ; + +#$matrix{"PGSC0003DMG400006788"}=(0,0,1,0,2,0,3,0,0,...,0) + +#Filling 2d gene/motif array with zeros to start +foreach my $element (@gene_array){ + my @auxilary_list = (); + for (my $i=1; $i <= $n_motifs; $i++){ + $auxilary_list[$i-1] =0; + } + $matrix{$element}=\@auxilary_list; +} + +#This is how I want to read the .gff file and check if a gene has a certain TF. I dont consider the positions yet. I just +# want to see if this first step works. + +while (){ + $line3 = $_; + if ($line3!~/^##/) { + for (my $j=0; $j < scalar@gene_array; $j++){ + for (my $h=0; $h < scalar@TF_array; $h++){ + #printf "Genes[%d] -%s- Motifs[%d] -%s- \n",$j, $gene_array[$j], $h, $TF_array[$h]; + if (($line3 =~/$gene_array[$j]/) and ($line3 =~/$TF_array[$h]/)) { + if ($matrixType ==0){${$matrix{$gene_array[$j]}}[$h]=1;} + if ($matrixType ==1){${$matrix{$gene_array[$j]}}[$h]++;} + } + } + } + } +} + +printf MATRIX "Gene,"; +for (my $h=0; $h < scalar@TF_array; $h++){ + if ($h!=scalar@TF_array-1) { + printf MATRIX "$TF_array[$h],"; + } + else{printf MATRIX "$TF_array[$h]"} +} +printf MATRIX "\n"; +foreach my $element(sort keys %matrix){ + printf MATRIX "$element,"; + for (my $r=0; $r + Script to create csv formatted gene vs TF matrix from a filtered gff file. GFF file can contain just Positive or Just neagtive strand TFBS. Has two types of matrix produced: (0) resence/Abscence with only 1 and 0s. With option=0. (1) counts of TFs with numbers 1,3,5 etc. + gene-TF-matrix-csv-galaxy.pl $input $output $value + + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/Motif_search/motif_search_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/motif_search_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,56 @@ +#!/usr/bin/perl -w +$|=1; +use warnings; +use strict; + +#Script that returns the lines of a gff file according to the ID motifs that we want. 
+ + + +my $motif=" "; +my @motifs; +my $line; +my @cols; +my $motif_fimo; + +if ($ARGV[3] ne "#"){ + push @motifs, $ARGV[3]; +} +if ($ARGV[4] ne "#"){ + push @motifs, $ARGV[3]; +} +if ($ARGV[5] ne "#"){ + push @motifs, $ARGV[3]; +} +if ($ARGV[6] ne "#"){ + push @motifs, $ARGV[3]; +} + + + + + +open(FIMO, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n"; +open(OUTPUT, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n"; + +while () { + foreach my $tf (@motifs){ + $line= $_; + chomp $line; + @cols=split; + if ($line=~/^#/){ + printf OUTPUT "%s\n", " "; + } + elsif ($line!~/^##/ and $tf eq (substr $cols[8],5,8)) { + + printf OUTPUT "%s\n", $line; + + + + } + } + + +} \ No newline at end of file diff -r 000000000000 -r 229d36377838 Tools/Motif_search/motif_search_galaxy.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/motif_search_galaxy.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,15 @@ + + tool to compare contigous motifs in gff file and, in case that two overlap, remove the motif with the highest p value + motif_search_galaxy.pl $input $output $motif1 $motif2 $motif3 $motif4 $motif5 + + + + + + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/Motif_search/rules.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/rules.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,11 @@ + + tool that looks for genes that have motifs from a certain rule + rules_galaxy.pl $input $output $value + + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/Motif_search/rules_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/rules_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,95 @@ +#!/usr/bin/perl -w + +$|=1; +use warnings; +use strict; +#Script that looks for genes that have motifs from a certain rule. 
+ +#Declaration of variables +my $line; +my $line2; +my @cols; +my @cols2; +my %hash; +my %hash1; +my %hash3; + +my $gene; +my $TF; +my $num_motifs; +my @genes_rules; + + +$num_motifs=$ARGV[2]; + + +open(FIMO, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n"; + +open(OUTPUT, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n"; + + +while () { + $line=$_; + chomp $line; + @cols=split; + if (not $line=~/^ /){ + $TF= substr $cols[8],5,8; + $gene=substr $cols[0],0,21; + + if (not exists $hash{$gene}) { + $hash1{$gene}=0; + + } + if (not exists $hash{$gene}{$TF}) { + $hash1{$gene}++; + $hash{$gene}{$TF}=1; + #print $hash1{$gene}; + } + + + if ($hash1{$gene}==$num_motifs and not exists $hash3{$gene}) { + $hash3{$gene}=1; + #print $line, "\n"; + + } + + + + } + +} + +close FIMO; +open(FIMO, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n"; + +while () { + $line2=$_; + chomp $line2; + @cols2=split; + + if (not $line2=~/^ /){ + $TF= substr $cols2[8],5,8; + $gene=substr $cols2[0],0,21; + foreach my $gene_listed (keys %hash3){ + + if ($gene_listed eq $gene) { + printf OUTPUT "%s\n", $line2; + + } + + + } + + } +} + + +print "Genes that have this rule:", "\n"; +foreach my $gene_listed (keys %hash3){ + print $gene_listed,"\n"; + } + + diff -r 000000000000 -r 229d36377838 Tools/Motif_search/sort_genes_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/sort_genes_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,67 @@ +#!/usr/bin/perl -w +$|=1; +use warnings; +use strict; + +#Script that takes a gff format file as input and orders it by genes, +#so it will create an output with all the information grouped by genes. Motifs will be mixed. 
+ +my $line; +my @cols; +my %hash1; +my %hash2; +my @list1; +my @list2; +my $gene; +my $pos1; +my $n; +my $index; +my $position; +my $scalar; +my $TF; + + +if(@ARGV < 2){ +print "\nUsage: step1.pl fimo.gff motif_search-position-sorted.gff e\n\n"; +exit(0); +} + +my $counter=0; +open(FIMO, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n"; +open(OUTPUT, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n"; + +while () { + $line=$_; + chomp $line; + @cols=split; + + if ($line=~/^ /){ + printf OUTPUT "%s\n", $line; + $counter++; + } + else { + $gene=substr $cols[0],0,21; + $pos1 = $cols[3]; + $TF= substr $cols[8],5,8; + if (not exists $hash1{$gene}{$TF}{$pos1}) { + $hash1{$gene}{$TF}{$pos1}=1; + push @list1, $gene; + push @list2, $line; + } + + } + +} + +$n= scalar @list1; +my @list_gen_sorted= sort { $list1[$a] cmp $list1[$b] } 0..($n - 1); + for (my $i=0; $i <(scalar @list_gen_sorted); $i++){ + $index=$list_gen_sorted[$i]; + $position = $list1[$index]; + #print $hash2{$position}; + printf OUTPUT "%s\n", $list2[$index]; + $counter++; + } +print $counter; diff -r 000000000000 -r 229d36377838 Tools/Motif_search/sort_genes_galaxy.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/sort_genes_galaxy.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,10 @@ + + Script that takes a gff format file from MEME suite as input and orders it by genes, so it will create an output with all the information grouped by genes. Motifs will be mixed. + sort_genes_galaxy.pl $input $output + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/Motif_search/sort_positions_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/sort_positions_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,89 @@ +#!/usr/bin/perl -w +$|=1; +use warnings; +use strict; +#Script that takes a gff format file from step1.pl as input and orders +#each block of gene data by the start position of the motif. 
+my $line; +my @cols; +my %hash1; +my %hash2; +my @list1; +my @list2; +my $gene; +my $pos1; +my $n; +my $index; +my $position; +my $scalar; +my $TF; + + +if(@ARGV < 2){ +print "\nUsage: step2.pl motif_search-position-sorted.gff motif_search-gene-&-position-sorted.gff e\n\n"; +exit(0); +} + +my $counter=0; +open(FIMO, "<$ARGV[0]") || + die "File '$ARGV[0]' not found\n"; +open(OUTPUT, ">$ARGV[1]") || + die "File '>$ARGV[1]' not found\n"; + +while () { + $line=$_; + chomp $line; + @cols=split; + + if ($line=~/^ /){ + printf OUTPUT "%s\n", $line; + $counter++; + } + else { + $gene=substr $cols[0],0,21; + $pos1 = $cols[3]; + $TF= substr $cols[8],5,8; + $scalar= scalar @list1; + if (not exists $hash1{$gene} and not $scalar == 0) { + $n= scalar @list1; + my @list_pos_sorted= sort { $list1[$a] <=> $list1[$b] } 0..($n - 1); + for (my $i=0; $i <(scalar @list_pos_sorted); $i++){ + $index=$list_pos_sorted[$i]; + $position = $list1[$index]; + #printf OUTPUT "%s\n",$hash2{$position}; + printf OUTPUT "%s\n", $list2[$index]; + $counter++; + } + } + if (not exists $hash1{$gene}) { + %hash1=(); + %hash2=(); + @list1=(); + @list2=(); + $hash1{$gene}=1; + $hash2{$pos1}=$line; + push @list1, $pos1; + push @list2, $line; + } + + elsif (exists $hash1{$gene}) { + $hash2{$pos1}=$line; + push @list1, $pos1; + push @list2, $line; + } + + + } + +} + +$n= scalar @list1; +my @list_pos_sorted= sort { $list1[$a] <=> $list1[$b] } 0..($n - 1); + for (my $i=0; $i <(scalar @list_pos_sorted); $i++){ + $index=$list_pos_sorted[$i]; + $position = $list1[$index]; + printf OUTPUT "%s\n", $hash2{$position}; + #printf OUTPUT "%s\n", $list2[$index]; + $counter++; + } +print $counter; diff -r 000000000000 -r 229d36377838 Tools/Motif_search/sort_positions_galaxy.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/sort_positions_galaxy.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,10 @@ + + Script that takes a gff format file from step1.pl as input and orders each block of gene data by the 
start position of the motif. + sort_positions_galaxy.pl $input $output + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/Motif_search/testrules_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/testrules_galaxy.pl Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,73 @@ +#!/usr/bin/perl -w + +$|=1; +use warnings; +use strict; +#Script that looks for genes that have motifs from a certain rule. + +#Declaration of variables +my %hash; +my $line; +my @cols; +my @pos; +my @motif; +my @genes; +my $pos1; +my $gene; +my $TF; +my $current_gene; +my $size; + +if(@ARGV < 2){ +print "\nUsage: testrules_galaxy.pl fimo.gff testrules.gff \n\n"; +exit(0); +} + +open(FIMO, ">$ARGV[0]") || + die "File '$ARGV[0]' not found\n"; +open(OUTPUT, "<$ARGV[1]") || + die "File '<$ARGV[0]' not found\n"; + +$current_gene=""; + +while () { + $line=$_; + @cols=split; + $TF= substr $cols[8],5,8; + $gene=substr $cols[0],0,21; + $pos1 = $cols[3]; + $size=scalar @motif; + if (not exists $hash{$gene} ) { + + if ($current_gene ne "") { + printf OUTPUT "%s\n", $current_gene, " ", "=>"," "; + } + for (my $i=0;$i<$size;$i++){ + printf OUTPUT "%s\n", $motif[$i],"($pos[$i])","\t"; + } + print "\n"; + @motif=(); + @pos=(); + $current_gene=$gene; + push @motif,$TF; + push @pos, $pos1; + + $hash{$gene}=1; + + + } + + else { + push @motif,$TF; + push @pos, $pos1; + } + + } + +$size=scalar @motif; +printf OUTPUT "%s\n", $current_gene, " ", "=>"," "; +for (my $i=0;$i<$size;$i++){ + printf OUTPUT "%s\n", $motif[$i],"($pos[$i])","\t"; + } + + diff -r 000000000000 -r 229d36377838 Tools/Motif_search/testrules_galaxy.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/Motif_search/testrules_galaxy.xml Mon Sep 05 05:53:08 2016 -0400 @@ -0,0 +1,10 @@ + + tool prints out motifs of a certain rule that are present in a gene + testrules_galaxy.pl $input $output + + + + + + + diff -r 000000000000 -r 229d36377838 Tools/Second/remove_motifs_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 
+0000
+++ b/Tools/Second/remove_motifs_galaxy.pl Mon Sep 05 05:53:08 2016 -0400
@@ -0,0 +1,150 @@
+#!/usr/bin/perl -w
+
+$|=1;
+use warnings;
+use strict;
+#Script that takes a gff format file from step2.pl as input and compares contiguous motifs listed in the gff file.
+#If motifs overlap and surpass the threshold, then it will remove that motif with the highest p value.
+#
+#Usage: rm_overlap_motifs_posneg.pl fimo-test-sue.gff fimo-nol-pos.gff fimo-nol-neg.gff overlap_percentage
+#
+#Lines starting with "#" are copied to both outputs. Motifs on the "+" strand (column 7) are
+#filtered into the second file and motifs on the "-" strand into the third one. Each strand
+#is filtered on its own, always against the last motif that was kept for the same gene.
+#Lines with any other strand value are written to neither file.
+
+if (@ARGV < 4) {
+    print "\nUsage: rm_overlap_motifs_posneg.pl fimo-test-sue.gff fimo-nol-pos.gff fimo-nol-neg.gff overlap_percentage\n\n";
+    exit(0);
+}
+
+my ($input_file, $positive_file, $negative_file, $overlap_percent) = @ARGV;
+
+open(my $fimo, '<', $input_file)
+    or die "Cannot open '$input_file' for reading: $!\n";
+open(my $positive, '>', $positive_file)
+    or die "Cannot open '$positive_file' for writing: $!\n";
+open(my $negative, '>', $negative_file)
+    or die "Cannot open '$negative_file' for writing: $!\n";
+
+# Getting overlap value from user and testing to see if it's 0-100 and
+# converting to 0-1 scale.
+my $OL;
+if ($overlap_percent > 0.0 && $overlap_percent <= 100) {
+    $OL = $overlap_percent / 100;
+}
+else {
+    print " ERROR: overlap is a value 0-100\n";
+    exit(0);
+}
+
+#One independent filter state per strand:
+#  fh           output handle of the strand
+#  last_span    gene => [start, end] of the last motif kept for that gene
+#  last_pvalue  p value of the last motif kept on the strand
+#  kept         the lines kept so far, in input order
+my %filter_of = (
+    '+' => { fh => $positive, last_span => {}, last_pvalue => undef, kept => [] },
+    '-' => { fh => $negative, last_span => {}, last_pvalue => undef, kept => [] },
+);
+
+#Returns the length of the sequence stored in column 10: the text after the first "="
+#of the field, without its last character.
+#NOTE(review): presumably the field ends in "sequence=ACGT;" - confirm against the input.
+sub sequence_length {
+    my ($field) = @_;
+    my @parts = split "=", $field;
+    return int(length(substr($parts[1], 0, -1)));
+}
+
+#Returns the p value read from fixed offsets at the end of column 9: the 4 characters that
+#start 16 from the end are the mantissa, the 3 that start 11 from the end are the exponent.
+#NOTE(review): assumes the p value is always printed like 1.23e-05 at that offset - confirm.
+sub parse_pvalue {
+    my ($field) = @_;
+    my $decimal  = substr $field, -16, 4;
+    my $exponent = substr $field, -11, 3;
+    $decimal =~ s/[^.\d]//g;    #keep only digits and the decimal point
+    return $decimal * (10 ** $exponent);
+}
+
+#Decides what to do with a motif, given the last motif kept for the same gene.
+#  $previous         [start, end] of the last kept motif, undef when the gene is new
+#  $previous_pvalue  p value of the last motif kept on the strand
+#  $span             [start, end] of the current motif
+#  $seq_len          length of the current motif's sequence
+#  $pvalue           p value of the current motif
+#  $threshold        allowed overlap, as a fraction of $seq_len
+#Returns 'keep'    (add the motif),
+#        'replace' (add the motif and remove the last kept one) or
+#        'drop'    (ignore the motif).
+sub classify_motif {
+    my ($previous, $previous_pvalue, $span, $seq_len, $pvalue, $threshold) = @_;
+
+    #Every time that a block of a gene starts, its first motif is kept.
+    return 'keep' if not defined $previous;
+
+    my ($prev_start, $prev_end) = @{$previous};
+    my ($start, $end)           = @{$span};
+    my $start_inside = ($start >= $prev_start && $start <= $prev_end);
+    my $end_inside   = ($end >= $prev_start   && $end <= $prev_end);
+
+    #Neither end of the motif lies inside the previous one: handled as not overlapped.
+    return 'keep' if !$start_inside && !$end_inside;
+
+    if (!($start_inside && $end_inside)) {
+        #Only one end lies inside the previous motif: measure the overlap on that side.
+        #An overlap that does not surpass the threshold (equal included) keeps both motifs.
+        my $overlap = $start_inside
+            ? int($prev_end - $start) / $seq_len
+            : int($end - $prev_start) / $seq_len;
+        return 'keep' if $overlap <= $threshold;
+    }
+
+    #Too much overlap, or the motif lies completely inside the previous one:
+    #only the motif with the lowest p value survives.
+    return $pvalue < $previous_pvalue ? 'replace' : 'drop';
+}
+
+while (my $line = <$fimo>) {
+    chomp $line;
+
+    if ($line =~ /^#/) {
+        print {$positive} "$line\n";
+        print {$negative} "$line\n";
+        next;
+    }
+
+    my @cols = split ' ', $line;
+    if (@cols < 10) {
+        #Columns 7, 9 and 10 are needed below. Blank lines are skipped silently.
+        warn "Skipping input line $. (fewer than 10 whitespace-separated fields)\n" if @cols;
+        next;
+    }
+
+    my $filter = $filter_of{ $cols[6] } or next;    #only "+" and "-" motifs are filtered
+
+    my $gene    = substr $cols[0], 0, 21;
+    my @span    = @cols[3, 4];                      #start and end position of the motif
+    my $seq_len = sequence_length($cols[9]);
+    my $pvalue  = parse_pvalue($cols[8]);
+
+    my $verdict = classify_motif($filter->{last_span}{$gene}, $filter->{last_pvalue},
+                                 \@span, $seq_len, $pvalue, $OL);
+    next if $verdict eq 'drop';
+
+    #The kept motif becomes the one the next motif of this gene is compared with.
+    pop @{ $filter->{kept} } if $verdict eq 'replace';
+    push @{ $filter->{kept} }, $line;
+    $filter->{last_span}{$gene} = \@span;
+    $filter->{last_pvalue}      = $pvalue;
+}
+
+for my $strand ('+', '-') {
+    my $filter = $filter_of{$strand};
+    for my $kept_line (@{ $filter->{kept} }) {
+        print { $filter->{fh} } "$kept_line\n";
+    }
+    close($filter->{fh}) or die "Cannot close the '$strand' strand output: $!\n";
+}
+close($fimo);
\ No newline at end of file
diff -r 000000000000 -r 229d36377838 Tools/Second/remove_motifs_v2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/Second/remove_motifs_v2.xml Mon Sep 05 05:53:08 2016 -0400
@@ -0,0 +1,12 @@
+
+ Script to compare contigous motifs in gff file and, in case that two overlap, remove the motif with the highest p value
+ remove_motifs_galaxy.pl $input $output $secondoutput $value
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 229d36377838 Tools/Second/step1.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/Second/step1.xml Mon Sep 05 05:53:08 2016 -0400
@@ -0,0 +1,10 @@
+
+ Script that takes a
gff format file from MEME suite as input and orders it by genes, so it will create an output with all the information grouped by genes. Motifs will be mixed.
+ step1_galaxy.pl $input $output
+
+
+
+
+
+
+
diff -r 000000000000 -r 229d36377838 Tools/Second/step1_galaxy.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/Second/step1_galaxy.pl Mon Sep 05 05:53:08 2016 -0400
@@ -0,0 +1,71 @@
+#!/usr/bin/perl -w
+$|=1;
+use warnings;
+use strict;
+
+#Script that takes a gff format file from MEME suite as input and orders it by genes,
+#so it will create an output with all the information grouped by genes. Motifs will be mixed.
+#
+#Usage: step1.pl fimo.gff fimo-position-sorted.gff
+#
+#Lines starting with "#" are copied to the output as soon as they are read. A data line is
+#kept only the first time its (gene, motif, start position) combination is seen; repeats
+#are dropped. The kept lines are written after the input has been read, ordered by gene
+#name (string comparison); lines of the same gene keep their input order.
+#The number of lines written to the output file is printed to STDOUT at the end.
+#
+#The gene is the first 21 characters of column 1, the start position is column 4 and the
+#motif name is the 8 characters at offset 5 of column 9.
+
+if (@ARGV < 2) {
+    print "\nUsage: step1.pl fimo.gff fimo-position-sorted.gff e\n\n";
+    exit(0);
+}
+
+my ($input_file, $output_file) = @ARGV;
+
+#Open both files: the first argument is read, the second one is written.
+open(my $fimo, '<', $input_file)
+    or die "Cannot open '$input_file' for reading: $!\n";
+open(my $output, '>', $output_file)
+    or die "Cannot open '$output_file' for writing: $!\n";
+
+my %seen;           #$seen{gene}{motif}{start} exists once that combination has been kept
+my @genes;          #gene of every kept line ...
+my @records;        #... and the kept line itself, stored at the same index
+my $counter = 0;    #number of lines written; a quick check that no information is lost
+
+while (my $line = <$fimo>) {
+    chomp $line;
+
+    if ($line =~ /^#/) {
+        print {$output} "$line\n";
+        $counter++;
+        next;
+    }
+
+    my @cols = split ' ', $line;
+    next if not @cols;    #blank line: there is no gene to group it under
+
+    my $gene = substr $cols[0], 0, 21;
+    my $pos1 = $cols[3];
+    my $TF   = substr $cols[8], 5, 8;
+
+    next if exists $seen{$gene}{$TF}{$pos1};
+    $seen{$gene}{$TF}{$pos1} = 1;
+    push @genes,   $gene;
+    push @records, $line;
+}
+
+#Sort the INDEXES of @genes by gene name and print the lines in that order. The index is
+#the tie-breaker, so the order inside a gene does not depend on the sort implementation.
+my @order = sort { $genes[$a] cmp $genes[$b] or $a <=> $b } 0 .. $#genes;
+for my $index (@order) {
+    print {$output} "$records[$index]\n";
+    $counter++;
+}
+
+close($fimo);
+close($output) or die "Cannot close '$output_file': $!\n";
+
+print $counter;
diff -r 000000000000 -r 229d36377838 Tools/Second/step2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/Second/step2.xml Mon Sep 05 05:53:08 2016 -0400
@@ -0,0 +1,10 @@
+
+ Script that takes a gff format file from step1.pl as input and orders each block of gene data by the start position of the motif.
+ step2_galaxy.pl $input $output
+
+
+
+
+
+
+
diff -r 000000000000 -r 229d36377838 Tools/Second/step2_galaxy.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/Second/step2_galaxy.pl Mon Sep 05 05:53:08 2016 -0400
@@ -0,0 +1,78 @@
+#!/usr/bin/perl -w
+$|=1;
+use warnings;
+use strict;
+
+#Script that takes a gff format file from step1.pl as input and orders
+#each block of gene data by the start position of the motif.
+#
+#Usage: step2.pl fimo-gene-sorted.gff fimo-gene-&-position-sorted.gff
+#
+#Lines starting with "#" are copied to the output as soon as they are read. Consecutive
+#data lines that share a gene (first 21 characters of column 1) form a block; every block
+#is written sorted numerically by column 4, lines with the same start keeping their
+#input order. The number of lines written to the output file is printed to STDOUT.
+
+if (@ARGV < 2) {
+    print "\nUsage: step2.pl fimo-gene-sorted.gff fimo-gene-&-position-sorted.gff e\n\n";
+    exit(0);
+}
+
+my ($input_file, $output_file) = @ARGV;
+
+#Open both files: the first argument is read, the second one is written.
+open(my $fimo, '<', $input_file)
+    or die "Cannot open '$input_file' for reading: $!\n";
+open(my $output, '>', $output_file)
+    or die "Cannot open '$output_file' for writing: $!\n";
+
+my $current_gene;    #gene of the block being collected; undef before the first data line
+my @starts;          #start position of every line of the current block ...
+my @records;         #... and the line itself, stored at the same index
+my $counter = 0;     #number of lines written; a quick check that no information is lost
+
+#Prints the lines of one block ordered by start position and returns how many were printed.
+#The lines are always taken from the list, never looked up by position, so two motifs
+#that start at the same position are both printed.
+sub print_sorted_block {
+    my ($fh, $starts, $records) = @_;
+    my @order = sort { $starts->[$a] <=> $starts->[$b] or $a <=> $b } 0 .. $#{$starts};
+    for my $index (@order) {
+        print {$fh} "$records->[$index]\n";
+    }
+    return scalar @order;
+}
+
+while (my $line = <$fimo>) {
+    chomp $line;
+
+    if ($line =~ /^#/) {
+        print {$output} "$line\n";
+        $counter++;
+        next;
+    }
+
+    my @cols = split ' ', $line;
+    next if not @cols;    #blank line: it belongs to no gene
+
+    my $gene = substr $cols[0], 0, 21;
+    my $pos1 = $cols[3];
+
+    #Every time a new gene shows up, the block of the previous one is written and reset.
+    if (defined $current_gene and $gene ne $current_gene) {
+        $counter += print_sorted_block($output, \@starts, \@records);
+        @starts  = ();
+        @records = ();
+    }
+    $current_gene = $gene;
+    push @starts,  $pos1;
+    push @records, $line;
+}
+
+#The LAST block of the file is still pending when the input ends.
+$counter += print_sorted_block($output, \@starts, \@records);
+
+close($fimo);
+close($output) or die "Cannot close '$output_file': $!\n";
+
+print $counter;