amadeo: Tools/Second/step2_galaxy.pl annotate

annotate Tools/Second/step2_galaxy.pl @ 3:b30ba2b06326 draft

Uploaded

author	amadeo
date	Mon, 05 Sep 2016 06:01:48 -0400
parents	229d36377838
children

rev	line source
#!/usr/bin/perl -w
$|=1;
use warnings;
use strict;

# Script that takes a GFF-format file from step1.pl as input and orders each
# block of gene data by the start position of the motif.
#
# Usage: step2.pl <input.gff> <output.gff>
#
# A "block" is a run of consecutive input lines that share the same gene key
# (the first 21 characters of column 1). The input is therefore expected to be
# grouped by gene already; a gene that reappears later in the file starts a
# new, independent block.
#
# Lines starting with '#' are copied to the output as soon as they are read.
# When the run finishes, the number of lines written is printed to STDOUT
# (without a trailing newline) so it can be compared with the input line count
# to check that no information was lost.

# Writes one block to $fh, ordered by ascending numeric start position.
#   $fh     - output filehandle
#   $starts - array ref of motif start positions, one per line
#   $lines  - array ref of the corresponding (chomped) GFF lines
# Returns the number of lines written (0 for an empty block).
sub print_block_sorted {
    my ($fh, $starts, $lines) = @_;

    # Sort the indices rather than the values so each line stays attached to
    # its own position. Ties fall back to input order, so lines that share a
    # start position are all kept and come out in their original order.
    my @order = sort { $starts->[$a] <=> $starts->[$b] or $a <=> $b }
                0 .. $#{$starts};

    print {$fh} "$lines->[$_]\n" for @order;
    return scalar @order;
}

# Usage text and exit status are kept exactly as they have always been.
if (@ARGV < 2) {
    print "\nUsage: step2.pl fimo-gene-sorted.gff fimo-gene-&-position-sorted.gff e\n\n";
    exit(0);
}

my ($in_path, $out_path) = @ARGV;

# Three-argument open with lexical handles: the file names are used literally,
# so characters such as '>' or '|' in a name cannot change the open mode.
open(my $fimo_fh, '<', $in_path)
    or die "Cannot open input file '$in_path': $!\n";
open(my $out_fh, '>', $out_path)
    or die "Cannot open output file '$out_path': $!\n";

my $counter = 0;    # number of lines written to the output file
my $current_gene;   # gene key of the block being collected (undef before the first)
my @starts;         # start positions of the current block
my @lines;          # lines of the current block, parallel to @starts

while (my $line = <$fimo_fh>) {
    chomp $line;

    # Header/comment lines are passed through immediately.
    if ($line =~ /^#/) {
        print {$out_fh} "$line\n";
        $counter++;
        next;
    }

    my @cols  = split ' ', $line;
    my $gene  = substr $cols[0], 0, 21;   # gene key: first 21 characters of column 1
                                          # NOTE(review): fixed width presumably matches
                                          # the IDs produced by step1.pl - confirm
    my $start = $cols[3];                 # motif start position (GFF column 4)

    # A different gene key closes the previous block: write it out sorted and
    # start collecting the new one.
    if (!defined $current_gene or $gene ne $current_gene) {
        $counter += print_block_sorted($out_fh, \@starts, \@lines);
        @starts       = ();
        @lines        = ();
        $current_gene = $gene;
    }

    push @starts, $start;
    push @lines,  $line;
}

# Write the LAST block of the file with the same routine as every other block.
$counter += print_block_sorted($out_fh, \@starts, \@lines);

close($fimo_fh);
# Buffered write errors only surface at close, so check it.
close($out_fh)
    or die "Error while writing '$out_path': $!\n";

print $counter;

Mercurial > repos > amadeo > amadeo

annotate Tools/Second/step2_galaxy.pl @ 3:b30ba2b06326 draft