Mercurial > repos > amadeo > amadeo
comparison Tools/Second/step2_galaxy.pl @ 0:229d36377838 draft
Uploaded
author | amadeo |
---|---|
date | Mon, 05 Sep 2016 05:53:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:229d36377838 |
---|---|
#!/usr/bin/perl -w
$|=1;
use warnings;
use strict;

# Script that takes a gff format file from step1.pl as input and orders
# each block of gene data by the start position of the motif.
#
# Usage: step2.pl <input.gff> <output.gff>
#
# A "block" is a run of consecutive data lines that share the same gene key
# (the first 21 characters of column 1). Lines starting with '#' are copied
# to the output at the moment they are read, i.e. before any block that is
# still being collected. When the run finishes, the number of lines written
# is printed to STDOUT so it can be compared with the input line count.

# Writes one gene block to $out, ordered by numeric start position.
#   $out       - output filehandle
#   $positions - array ref with the start position of each line
#   $lines     - array ref with the corresponding full lines
# Returns the number of lines written (0 for an empty block).
sub write_sorted_block {
    my ($out, $positions, $lines) = @_;

    # Sort the INDEXES, not the positions, so every stored line is printed
    # exactly once even when several motifs start at the same position.
    # The index tie-breaker keeps equal positions in input order.
    my @order = sort { $positions->[$a] <=> $positions->[$b] || $a <=> $b }
                0 .. $#{$positions};

    for my $index (@order) {
        printf {$out} "%s\n", $lines->[$index];
    }
    return scalar @order;
}

# Reads gff lines from $in and writes them to $out with every gene block
# sorted by start position (column 4).
#   $in  - input filehandle, expected to be already grouped by gene
#   $out - output filehandle
# Returns the total number of lines written (comment lines included).
sub sort_gff_blocks {
    my ($in, $out) = @_;

    my $written = 0;    # lines written so far; a check that nothing is lost
    my $current_gene;   # gene key of the block being collected
    my @positions;      # start positions of the lines in the current block
    my @lines;          # full lines of the current block, same order

    while (my $line = <$in>) {
        chomp $line;    # avoid \n on last field

        # Comment/header lines are passed straight through.
        if ($line =~ /^#/) {
            printf {$out} "%s\n", $line;
            $written++;
            next;
        }

        my @cols = split ' ', $line;

        # Gene key: first 21 characters of column 1 -- presumably the
        # fixed-width gene identifier emitted by step1.pl (TODO confirm).
        my $gene = substr $cols[0], 0, 21;

        # A new gene starts: print everything collected for the previous
        # one and start a fresh block.
        if (!defined $current_gene || $gene ne $current_gene) {
            $written += write_sorted_block($out, \@positions, \@lines);
            $current_gene = $gene;
            @positions    = ();
            @lines        = ();
        }

        push @positions, $cols[3];    # motif's first position on the gene
        push @lines,     $line;
    }

    # The LAST block of the file has no following gene to trigger its output.
    $written += write_sorted_block($out, \@positions, \@lines);

    return $written;
}

# Command-line entry point: checks the arguments, opens both files, runs the
# sort and prints the number of lines written.
sub main {
    my @args = @_;

    if (@args < 2) {
        print "\nUsage: step2.pl fimo-gene-sorted.gff fimo-gene-&-position-sorted.gff e\n\n";
        exit(0);
    }

    my ($in_path, $out_path) = @args;

    # Three-argument open, so special characters in a file name can never be
    # taken as an open mode; $! reports the real reason of a failure.
    open(my $in, '<', $in_path)
        or die "Cannot open input file '$in_path': $!\n";
    open(my $out, '>', $out_path)
        or die "Cannot open output file '$out_path': $!\n";

    my $written = sort_gff_blocks($in, $out);

    close($in);
    # Buffered write errors (e.g. disk full) only show up when closing.
    close($out)
        or die "Error while writing output file '$out_path': $!\n";

    print $written;
    return;
}

# Run only when executed as a script, not when loaded from another file.
main(@ARGV) unless caller;