comparison Tools/Matrix/gene-TF-matrix-csv-galaxy.pl @ 0:229d36377838 draft

Uploaded
author amadeo
date Mon, 05 Sep 2016 05:53:08 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:229d36377838
1 #!/usr/bin/perl -w
2 $|=1;
3 use strict;
4 use warnings;
5
6
7
8 # Script to create csv formatted gene vs TF matrix from a filtered gff
9 # file. GFF file can contain just Positive or Just neagtive strand
10 # TFBS. Has two types of matrix produced: (0) resence/Abscence with only
11 # 1 and 0s. With option=0. (1) counts of TFs with numbers 1,3,5 etc.
12
13
14
15
16 my $line;
17 my $line3;
18 my @cols;
19 my @TF_array;
20 my @gene_array;
21 my %matrix_1= ();
22 my %matrix_2= ();
23 my $TF;
24 my $gene;
25 my %matrix;
26 my $matrixType;
27
28 if(@ARGV < 3){
29 print "\nUsage: gene-TF-matrix.pl fimo-nol-P.gff/fimo-nol-N.gff gene-matrix-P.csv/gene-matrix-N.csv
30 \n Options: Presence/Abscence=0 counts=1\n\n";
31 exit(0);
32 }
33 open (FIMO, "<$ARGV[0]") ||
34 die "File '$ARGV[0]' not found\n" ;
35 open(MATRIX, ">$ARGV[1]") ||
36 die "File '>$ARGV[1]' not found\n";
37
38 $matrixType = $ARGV[2];
39 print "MatrixTYpe is $matrixType\n";
40
41 #Put all the motifs and genes in two separate arrays: each appears
42 #only once in each array.
43 while (<FIMO>) {
44 $line=$_;
45 if ($line!~/^##/) {#ignore header line
46 @cols=split;
47 $TF= substr $cols[8],5,8;
48 if (not exists $matrix_1{$TF}) {
49 $matrix_1{$TF}="";
50 push @TF_array, $TF;
51 }
52 $gene=substr $cols[0],0,21;
53 if (not exists $matrix_2{$gene}) {
54 $matrix_2{$gene}="";
55 push @gene_array, $gene
56 }
57 }
58 }
59
60 my $n_motifs=scalar @TF_array;
61 my $n_genes=scalar@gene_array;
62 #printf "Scalar motifs is %d\n", scalar@TF_array;
63 #printf "Scalar genes is %d\n", scalar@gene_array;
64
65 close(FIMO);
66 #I want to create a hash on which each gene has a list of 0s. Then I want to "read" the .gff file
67 #and if a gene has a certain TF it will add "+1" to the possition of the TF, and it will look like this.
68
69
70 open (FIMO, "$ARGV[0]") ||
71 die "File '$ARGV[0]' not found\n" ;
72
73 #$matrix{"PGSC0003DMG400006788"}=(0,0,1,0,2,0,3,0,0,...,0)
74
75 #Filling 2d gene/motif array with zeros to start
76 foreach my $element (@gene_array){
77 my @auxilary_list = ();
78 for (my $i=1; $i <= $n_motifs; $i++){
79 $auxilary_list[$i-1] =0;
80 }
81 $matrix{$element}=\@auxilary_list;
82 }
83
84 #This is how I want to read the .gff file and check if a gene has a certain TF. I dont consider the positions yet. I just
85 # want to see if this first step works.
86
87 while (<FIMO>){
88 $line3 = $_;
89 if ($line3!~/^##/) {
90 for (my $j=0; $j < scalar@gene_array; $j++){
91 for (my $h=0; $h < scalar@TF_array; $h++){
92 #printf "Genes[%d] -%s- Motifs[%d] -%s- \n",$j, $gene_array[$j], $h, $TF_array[$h];
93 if (($line3 =~/$gene_array[$j]/) and ($line3 =~/$TF_array[$h]/)) {
94 if ($matrixType ==0){${$matrix{$gene_array[$j]}}[$h]=1;}
95 if ($matrixType ==1){${$matrix{$gene_array[$j]}}[$h]++;}
96 }
97 }
98 }
99 }
100 }
101
102 printf MATRIX "Gene,";
103 for (my $h=0; $h < scalar@TF_array; $h++){
104 if ($h!=scalar@TF_array-1) {
105 printf MATRIX "$TF_array[$h],";
106 }
107 else{printf MATRIX "$TF_array[$h]"}
108 }
109 printf MATRIX "\n";
110 foreach my $element(sort keys %matrix){
111 printf MATRIX "$element,";
112 for (my $r=0; $r<scalar@{$matrix{$element}};$r++){
113 if ($r!=scalar@{$matrix{$element}}-1) {
114 printf MATRIX "$matrix{$element}[$r],"
115 }
116 else{printf MATRIX "$matrix{$element}[$r]"}
117
118 }
119 printf MATRIX "\n"
120 }
121