amadeo: Tools/Second/step2_galaxy.pl comparison

comparison Tools/Second/step2_galaxy.pl @ 0:229d36377838 draft

Uploaded

author	amadeo
date	Mon, 05 Sep 2016 05:53:08 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:229d36377838
+#!/usr/bin/perl -w
+$|=1;
+use warnings;
+use strict;
+#Script that takes a gff format file from step1.pl as input and orders
+#each block of gene data by the start position of the motif.
+#Declaration of variables
+my $line;
+my @cols;
+my %hash1;
+my %hash2;
+my @list1;
+my @list2;
+my $gene;
+my $pos1;
+my $n;
+my $index;
+my $position;
+my $scalar;
+my $TF;
+my $counter=0; #it gives you the number of lines of the gff file. It is a good way to check that the information is not lost.
+#Files that I am going to use
+if(@ARGV < 2){
+print "\nUsage: step2.pl fimo-gene-sorted.gff fimo-gene-&-position-sorted.gff e\n\n";
+exit(0);
+}
+#I open both files, FIMO as the input and OUTPUT as the ouput.
+open(FIMO, "$ARGV[0]") ||
+die "File '$ARGV[0]' not found\n";
+open(OUTPUT, ">$ARGV[1]") ||
+die "File '>$ARGV[1]' not found\n";
+while (<FIMO>) {
+$line=$_; #assigning line to variable $line | $_ is a special default variable that here holds the line contents
+chomp $line;  #avoid \n on last field
+@cols=split; #Splits the string EXPR into a list of strings and returns the list in list context, or the size of the list in scalar context.
+#This is very useful because the data of the gff file can be called using this variable.
+if ($line=~/^#/){ #prints the first line of the gff file that is different from the rest
+printf OUTPUT "%s\n", $line;
+$counter++;
+}
+else { #considers the other lines of the file
+$gene=substr $cols[0],0,21; #variable that returns the name of the gene of the line
+$pos1 = $cols[3]; #variable that returns the motif's first position on the gene
+$TF= substr $cols[8],5,8; #variable that returns the name of the motif
+$scalar= scalar @list1; #returns the size of the current list1
+#This script stores infromation in two arrays (list1 and list2). The first one will register the first position of the motif in the
+#gene and the list2 will store the corresponding line.
+if (not exists $hash1{$gene} and not $scalar == 0) { #Every time that a new gene is considered in the loop, it will print out
+#all the information of the previous one
+#This section will print out each line of each block of genes sorted by the first position.
+$n= scalar @list1;
+my @list_pos_sorted= sort { $list1[$a] <=> $list1[$b] } 0..($n - 1); #This will sort the POSITION NUMBERS of the array
+#list1 and store them in a new array name
+#This will print out the information of each gene sorted by the first position.                                                                      #list_pos_sorted
+for (my $i=0; $i <(scalar @list_pos_sorted); $i++){
+$index=$list_pos_sorted[$i];
+#$position = $list1[$index];
+#printf OUTPUT "%s\n",$hash2{$position};
+printf OUTPUT "%s\n", $list2[$index];
+$counter++;
+}
+}
+if (not exists $hash1{$gene}) {#Every time that a new gene is considered in the loop, it will reset the variables
+#so a new gene can be registered
+%hash1=();
+%hash2=();
+@list1=();
+@list2=();
+$hash1{$gene}=1;
+$hash2{$pos1}=$line;
+push @list1, $pos1;
+push @list2, $line;
+}
+elsif (exists $hash1{$gene}) { #if the next line has information of the same gene, it will
+#store the information in the arrays.
+$hash2{$pos1}=$line;
+push @list1, $pos1;
+push @list2, $line;
+}
+}
+}
+#Section that has the same structure of the previous one to print the LAST block of the file.
+$n= scalar @list1;
+my @list_pos_sorted= sort { $list1[$a] <=> $list1[$b] } 0..($n - 1);
+for (my $i=0; $i <(scalar @list_pos_sorted); $i++){
+$index=$list_pos_sorted[$i];
+$position = $list1[$index];
+printf OUTPUT "%s\n", $hash2{$position};
+#printf OUTPUT "%s\n", $list2[$index];
+$counter++;
+}
+print $counter;

Mercurial > repos > amadeo > amadeo

comparison Tools/Second/step2_galaxy.pl @ 0:229d36377838 draft