Mercurial > repos > amadeo > amadeo
view Tools/Second/step2_galaxy.pl @ 0:229d36377838 draft
Uploaded
author | amadeo |
---|---|
date | Mon, 05 Sep 2016 05:53:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w $|=1; use warnings; use strict; #Script that takes a gff format file from step1.pl as input and orders #each block of gene data by the start position of the motif. #Declaration of variables my $line; my @cols; my %hash1; my %hash2; my @list1; my @list2; my $gene; my $pos1; my $n; my $index; my $position; my $scalar; my $TF; my $counter=0; #it gives you the number of lines of the gff file. It is a good way to check that the information is not lost. #Files that I am going to use if(@ARGV < 2){ print "\nUsage: step2.pl fimo-gene-sorted.gff fimo-gene-&-position-sorted.gff e\n\n"; exit(0); } #I open both files, FIMO as the input and OUTPUT as the ouput. open(FIMO, "$ARGV[0]") || die "File '$ARGV[0]' not found\n"; open(OUTPUT, ">$ARGV[1]") || die "File '>$ARGV[1]' not found\n"; while (<FIMO>) { $line=$_; #assigning line to variable $line | $_ is a special default variable that here holds the line contents chomp $line; #avoid \n on last field @cols=split; #Splits the string EXPR into a list of strings and returns the list in list context, or the size of the list in scalar context. #This is very useful because the data of the gff file can be called using this variable. if ($line=~/^#/){ #prints the first line of the gff file that is different from the rest printf OUTPUT "%s\n", $line; $counter++; } else { #considers the other lines of the file $gene=substr $cols[0],0,21; #variable that returns the name of the gene of the line $pos1 = $cols[3]; #variable that returns the motif's first position on the gene $TF= substr $cols[8],5,8; #variable that returns the name of the motif $scalar= scalar @list1; #returns the size of the current list1 #This script stores infromation in two arrays (list1 and list2). The first one will register the first position of the motif in the #gene and the list2 will store the corresponding line. if (not exists $hash1{$gene} and not $scalar == 0) { #Every time that a new gene is considered in the loop, it will print out #all the information of the previous one #This section will print out each line of each block of genes sorted by the first position. $n= scalar @list1; my @list_pos_sorted= sort { $list1[$a] <=> $list1[$b] } 0..($n - 1); #This will sort the POSITION NUMBERS of the array #list1 and store them in a new array name #This will print out the information of each gene sorted by the first position. #list_pos_sorted for (my $i=0; $i <(scalar @list_pos_sorted); $i++){ $index=$list_pos_sorted[$i]; #$position = $list1[$index]; #printf OUTPUT "%s\n",$hash2{$position}; printf OUTPUT "%s\n", $list2[$index]; $counter++; } } if (not exists $hash1{$gene}) {#Every time that a new gene is considered in the loop, it will reset the variables #so a new gene can be registered %hash1=(); %hash2=(); @list1=(); @list2=(); $hash1{$gene}=1; $hash2{$pos1}=$line; push @list1, $pos1; push @list2, $line; } elsif (exists $hash1{$gene}) { #if the next line has information of the same gene, it will #store the information in the arrays. $hash2{$pos1}=$line; push @list1, $pos1; push @list2, $line; } } } #Section that has the same structure of the previous one to print the LAST block of the file. $n= scalar @list1; my @list_pos_sorted= sort { $list1[$a] <=> $list1[$b] } 0..($n - 1); for (my $i=0; $i <(scalar @list_pos_sorted); $i++){ $index=$list_pos_sorted[$i]; $position = $list1[$index]; printf OUTPUT "%s\n", $hash2{$position}; #printf OUTPUT "%s\n", $list2[$index]; $counter++; } print $counter;