diff Tools/Second/step1_galaxy.pl @ 3:b30ba2b06326 draft

Uploaded
author amadeo
date Mon, 05 Sep 2016 06:01:48 -0400
parents 229d36377838
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/Second/step1_galaxy.pl	Mon Sep 05 06:01:48 2016 -0400
@@ -0,0 +1,88 @@
+#!/usr/bin/perl -w
+$|=1;
+use warnings;
+use strict;
+
+#Script that takes a gff format file from MEME suite (FIMO) as input and orders it by genes,
+#so it will create an output with all the information grouped by genes. Motifs will be mixed.
+#
+#Usage: step1.pl fimo.gff fimo-position-sorted.gff
+#
+#Lines starting with '#' are copied to the output as soon as they are read. Every other line
+#is kept only the first time its (gene, motif, start position) combination is seen and is
+#written afterwards, grouped by gene. Inside a gene the lines keep the order of the input.
+#Blank lines are ignored; lines with fewer than nine columns are reported and skipped.
+#At the end the number of lines written is printed to standard output.
+
+#Fixed-width slices used to build the gene and motif keys.
+#NOTE(review): these widths are hard-coded for one naming scheme; presumably $TF_OFFSET skips
+#a leading "Name=" in column 9 - confirm against the gff files in use.
+my $GENE_LENGTH = 21; #characters of column 1 that identify the gene
+my $TF_OFFSET   = 5;  #characters skipped at the start of column 9
+my $TF_LENGTH   = 8;  #characters of column 9 that identify the motif
+my $MIN_COLUMNS = 9;  #columns needed to read the gene, the start position and the motif
+
+#A missing argument is an error: report it on STDERR and exit with a failure status.
+if (@ARGV < 2) {
+    print STDERR "\nUsage: step1.pl fimo.gff fimo-position-sorted.gff\n\n";
+    exit(1);
+}
+
+my ($input_file, $output_file) = @ARGV;
+
+#Three-argument open with lexical handles: the file names are never parsed for a mode,
+#and $! tells the real reason of a failure.
+open(my $fimo, '<', $input_file)
+    or die "Cannot open '$input_file' for reading: $!\n";
+open(my $output, '>', $output_file)
+    or die "Cannot open '$output_file' for writing: $!\n";
+
+my %seen;        #$seen{gene}{motif}{start} is true once that hit has been stored
+my @genes;       #gene key of every stored line
+my @lines;       #stored lines; $lines[$i] belongs to $genes[$i]
+my $counter = 0; #number of lines written to the output file
+
+while (my $line = <$fimo>) {
+    chomp $line; #avoid \n on last field
+
+    #Comment lines (the gff header) go straight to the output.
+    if ($line =~ /^#/) {
+        print {$output} "$line\n";
+        $counter++;
+        next;
+    }
+
+    my @cols = split ' ', $line; #split on whitespace, as a bare split does
+    next if !@cols;              #blank line
+    if (@cols < $MIN_COLUMNS) {
+        warn "Skipping line $. of '$input_file': fewer than $MIN_COLUMNS columns\n";
+        next;
+    }
+
+    my $gene = substr $cols[0], 0, $GENE_LENGTH; #name of the gene of the line
+    my $pos1 = $cols[3];                         #motif's start position on the gene
+    #substr warns and returns undef when the offset lies beyond the string, so guard it.
+    my $TF = length($cols[8]) > $TF_OFFSET
+        ? substr($cols[8], $TF_OFFSET, $TF_LENGTH)
+        : '';
+
+    #Keep only the first line seen for each gene / motif / start position.
+    next if $seen{$gene}{$TF}{$pos1}++;
+    push @genes, $gene;
+    push @lines, $line;
+}
+close($fimo);
+
+#Sort the stored positions by gene name and print the lines in that order. Ties are broken
+#by the original position, so the order inside a gene does not depend on sort being stable.
+my @order = sort { $genes[$a] cmp $genes[$b] or $a <=> $b } 0 .. $#genes;
+for my $index (@order) {
+    print {$output} "$lines[$index]\n";
+    $counter++;
+}
+
+#Buffered write errors (for example a full disk) only show up when the file is closed.
+close($output)
+    or die "Cannot finish writing '$output_file': $!\n";
+
+print $counter;