annotate Tools/Second/step1_galaxy.pl @ 3:b30ba2b06326 draft

Uploaded
author amadeo
date Mon, 05 Sep 2016 06:01:48 -0400
parents 229d36377838
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
229d36377838 Uploaded
amadeo
parents:
diff changeset
#!/usr/bin/perl -w
use strict;
use warnings;

# Groups the feature lines of a MEME-suite (FIMO) GFF file by gene.
#
# Arguments:
#   $ARGV[0]  path of the GFF file to read
#   $ARGV[1]  path of the GFF file to write (created or truncated)
#
# Output file layout:
#   1. every comment line (starting with '#') of the input, in input order;
#   2. every feature line, ordered by gene name (string comparison).
#      Lines of the same gene keep their input order, so motifs stay mixed
#      inside each gene block.
#
# A feature line is written only once per (gene, motif, start position)
# triple; later lines with the same triple are dropped.
#
# The number of lines written to the output file is printed to STDOUT
# (without a trailing newline) as a quick check that no information was lost.
#
# Exit status: 0 on success, non-zero on a usage or I/O error.

$| = 1;    # autoflush STDOUT

# Layout assumptions used to build the grouping key.
my $GENE_NAME_LENGTH = 21;   # leading characters of column 1 used as gene name
my $MOTIF_OFFSET     = 5;    # presumably skips a "Name=" prefix in column 9 -- TODO confirm
my $MOTIF_LENGTH     = 8;    # characters of column 9 used as motif name
my $MIN_FIELDS       = 9;    # column 9 is the last column read below

if (@ARGV < 2) {
    # A usage error is a failure: report it on STDERR with a non-zero status.
    print STDERR "\nUsage: step1.pl fimo.gff fimo-position-sorted.gff e\n\n";
    exit 1;
}

my ($in_path, $out_path) = @ARGV;

# Three-argument open keeps the mode separate from the caller-supplied path,
# so a path such as ">file" or "cmd |" cannot change what is opened.
open(my $in_fh, '<', $in_path)
    or die "Cannot open input file '$in_path': $!\n";
open(my $out_fh, '>', $out_path)
    or die "Cannot open output file '$out_path' for writing: $!\n";

my %seen;           # $seen{gene}{motif}{start} > 0 once that triple was kept
my @genes;          # gene name of each kept feature line
my @records;        # the kept feature lines; same indexing as @genes
my $written = 0;    # lines written to the output file

while (my $line = <$in_fh>) {
    chomp $line;

    # Comment lines are passed straight through, ahead of all feature lines.
    if ($line =~ /^#/) {
        print {$out_fh} "$line\n";
        $written++;
        next;
    }

    my @fields = split ' ', $line;

    # Blank lines carry no feature; skip them instead of feeding undefined
    # values to substr and the hash key below.
    next unless @fields;

    if (@fields < $MIN_FIELDS) {
        warn "Skipping line $. of '$in_path': "
            . "expected at least $MIN_FIELDS fields, found "
            . scalar(@fields) . "\n";
        next;
    }

    my $gene  = substr $fields[0], 0, $GENE_NAME_LENGTH;
    my $start = $fields[3];

    # substr past the end of the string returns undef with a warning, so a
    # short column 9 is mapped to an empty motif name explicitly.
    my $motif = length($fields[8]) > $MOTIF_OFFSET
        ? substr($fields[8], $MOTIF_OFFSET, $MOTIF_LENGTH)
        : '';

    # Keep only the first line seen for each (gene, motif, start) triple.
    next if $seen{$gene}{$motif}{$start}++;

    push @genes,   $gene;
    push @records, $line;
}

close $in_fh;

# Sort the indices rather than the lines so @genes and @records stay paired.
# The index tie-breaker makes "input order within a gene" explicit instead of
# relying on the stability of the underlying sort implementation.
my @order = sort { $genes[$a] cmp $genes[$b] or $a <=> $b } 0 .. $#genes;

for my $idx (@order) {
    print {$out_fh} "$records[$idx]\n";
    $written++;
}

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the close must be checked.
close $out_fh
    or die "Error while writing output file '$out_path': $!\n";

print $written;