#!/usr/bin/perl -w
$| = 1;
use warnings;
use strict;

# Script that takes a gff format file from step1.pl as input and orders
# each block of gene data by the start position of the motif.
#
# Usage: step2.pl <input.gff> <output.gff>
#
# The input is read as consecutive blocks of lines that share the same gene
# name (the first 21 characters of column 1). Whenever the gene name changes,
# the finished block is written to the output sorted numerically by column 4
# (the motif start position). Lines starting with '#' are copied to the
# output immediately, at the point where they are read.
#
# When done, the number of lines written is printed to STDOUT. It should be
# equal to the number of lines of the input file, which is a quick way to
# check that no information was lost.

# Writes one gene block to the output handle, ordered by start position.
#   $out    - output filehandle
#   $starts - array ref with the start position of every line in the block
#   $lines  - array ref with the corresponding full lines (same order)
# Returns the number of lines written (0 for an empty block).
sub flush_block {
    my ($out, $starts, $lines) = @_;

    # Sort the indices rather than the values so that every stored line is
    # printed exactly once, even when several motifs share a start position.
    # Ties are broken by input order to keep the output deterministic.
    my @order = sort { $starts->[$a] <=> $starts->[$b] or $a <=> $b }
                0 .. $#{$starts};

    for my $idx (@order) {
        print {$out} "$lines->[$idx]\n";
    }
    return scalar @order;
}

# Both file names are mandatory. The exit status is kept at 0 so that
# existing callers see the same behaviour as before.
if (@ARGV < 2) {
    print "\nUsage: step2.pl fimo-gene-sorted.gff fimo-gene-&-position-sorted.gff e\n\n";
    exit(0);
}

my ($in_path, $out_path) = @ARGV;

# Three-argument open: the file names are never interpreted as open modes
# or shell pipes, and $! reports the real reason of a failure.
open(my $fimo, '<', $in_path)
    or die "Cannot open '$in_path' for reading: $!\n";
open(my $output, '>', $out_path)
    or die "Cannot open '$out_path' for writing: $!\n";

my $counter = 0;     # number of lines written to the output file
my $current_gene;    # gene name of the block being collected (undef = none yet)
my @starts;          # start positions of the lines of the current block
my @lines;           # full lines of the current block, parallel to @starts

while (my $line = <$fimo>) {
    chomp $line;

    # Header/comment lines are not part of any gene block: copy them as is.
    if ($line =~ /^#/) {
        print {$output} "$line\n";
        $counter++;
        next;
    }

    my @cols = split ' ', $line;
    my $gene = substr $cols[0], 0, 21;    # name of the gene of this line
    my $pos1 = $cols[3];                  # first position of the motif

    # A different gene name means the previous block is complete: print it
    # sorted and start collecting a new one.
    if (!defined $current_gene or $gene ne $current_gene) {
        $counter += flush_block($output, \@starts, \@lines);
        @starts       = ();
        @lines        = ();
        $current_gene = $gene;
    }

    push @starts, $pos1;
    push @lines,  $line;
}

# The LAST block of the file has no following gene to trigger its output.
$counter += flush_block($output, \@starts, \@lines);

close($fimo);
# Buffered write errors (e.g. disk full) only show up when closing.
close($output)
    or die "Error while closing '$out_path': $!\n";

print $counter;