view Tools/Matrix/gene-TF-matrix-csv-galaxy.pl @ 0:229d36377838 draft

Uploaded
author amadeo
date Mon, 05 Sep 2016 05:53:08 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl -w
$|=1;
use strict;
use warnings;



# Script to create csv formatted gene vs TF matrix from a filtered gff
# file. GFF file can contain just Positive or Just neagtive strand
# TFBS. Has two types of matrix produced: (0) resence/Abscence with only
# 1 and 0s. With option=0. (1) counts of TFs with numbers 1,3,5 etc. 




my $line;
my $line3;
my @cols;
my @TF_array;
my @gene_array;
my %matrix_1= ();
my %matrix_2= ();
my $TF;
my $gene;
my %matrix;
my $matrixType;

if(@ARGV < 3){
print "\nUsage: gene-TF-matrix.pl fimo-nol-P.gff/fimo-nol-N.gff gene-matrix-P.csv/gene-matrix-N.csv 
\n Options: Presence/Abscence=0 counts=1\n\n";
exit(0);
}
open (FIMO, "<$ARGV[0]") || 
    die "File '$ARGV[0]' not found\n" ;
open(MATRIX, ">$ARGV[1]") ||
    die "File '>$ARGV[1]' not found\n";

$matrixType = $ARGV[2];
print "MatrixTYpe is $matrixType\n";
   
#Put all the motifs and genes in two separate arrays: each appears
#only once in each array.
while (<FIMO>) {
    $line=$_;   
     if ($line!~/^##/) {#ignore header line
       @cols=split;
       $TF= substr $cols[8],5,8;
       if (not exists $matrix_1{$TF}) {
	 $matrix_1{$TF}="";
	 push @TF_array, $TF;
       }
       $gene=substr $cols[0],0,21;
        if (not exists $matrix_2{$gene}) { 
	  $matrix_2{$gene}="";
	  push @gene_array, $gene
	}
     }
  }

my $n_motifs=scalar @TF_array;
my $n_genes=scalar@gene_array;
#printf "Scalar motifs is %d\n", scalar@TF_array;
#printf "Scalar genes is %d\n", scalar@gene_array;

close(FIMO);
#I want to create a hash on which each gene has a list of 0s. Then I want to "read" the .gff file
#and if a gene has a certain TF it will add "+1" to the possition of the TF, and it will look like this.


open (FIMO, "$ARGV[0]") || 
   die "File '$ARGV[0]' not found\n" ;
   
#$matrix{"PGSC0003DMG400006788"}=(0,0,1,0,2,0,3,0,0,...,0)

#Filling 2d gene/motif array with zeros to start
foreach my $element (@gene_array){
  my @auxilary_list = ();
  for (my $i=1; $i <= $n_motifs; $i++){
   $auxilary_list[$i-1] =0;
  }
  $matrix{$element}=\@auxilary_list;
}

#This is how I want to read the .gff file and check if a gene has a certain TF. I dont consider the positions yet. I just
# want to see if this first step works.

while (<FIMO>){
  $line3 = $_;
  if ($line3!~/^##/) {
    for (my $j=0; $j < scalar@gene_array; $j++){  
      for (my $h=0; $h < scalar@TF_array; $h++){
	#printf "Genes[%d] -%s-  Motifs[%d] -%s- \n",$j, $gene_array[$j], $h, $TF_array[$h]; 
	if (($line3 =~/$gene_array[$j]/) and ($line3 =~/$TF_array[$h]/)) {      
            if ($matrixType ==0){${$matrix{$gene_array[$j]}}[$h]=1;}
	    if ($matrixType ==1){${$matrix{$gene_array[$j]}}[$h]++;}
	} 
      }
    }
  }
}

printf MATRIX "Gene,";
for (my $h=0; $h < scalar@TF_array; $h++){
    if ($h!=scalar@TF_array-1) {
    printf MATRIX  "$TF_array[$h],";
    }
    else{printf MATRIX  "$TF_array[$h]"}
}   
printf MATRIX "\n";
foreach my $element(sort keys %matrix){
    printf MATRIX  "$element,";
    for (my $r=0; $r<scalar@{$matrix{$element}};$r++){
        if ($r!=scalar@{$matrix{$element}}-1) {
            printf MATRIX "$matrix{$element}[$r],"
        }
        else{printf MATRIX "$matrix{$element}[$r]"}
        
    }
    printf MATRIX "\n"
}