0
|
1 #!/usr/bin/perl -w
|
|
2 $|=1;
|
|
3 use strict;
|
|
4 use warnings;
|
|
5
|
|
6
|
|
7
|
|
8 # Script to create csv formatted gene vs TF matrix from a filtered gff
|
|
9 # file. GFF file can contain just Positive or Just neagtive strand
|
|
10 # TFBS. Has two types of matrix produced: (0) resence/Abscence with only
|
|
11 # 1 and 0s. With option=0. (1) counts of TFs with numbers 1,3,5 etc.
|
|
12
|
|
13
|
|
14
|
|
15
|
|
16 my $line;
|
|
17 my $line3;
|
|
18 my @cols;
|
|
19 my @TF_array;
|
|
20 my @gene_array;
|
|
21 my %matrix_1= ();
|
|
22 my %matrix_2= ();
|
|
23 my $TF;
|
|
24 my $gene;
|
|
25 my %matrix;
|
|
26 my $matrixType;
|
|
27
|
|
28 if(@ARGV < 3){
|
|
29 print "\nUsage: gene-TF-matrix.pl fimo-nol-P.gff/fimo-nol-N.gff gene-matrix-P.csv/gene-matrix-N.csv
|
|
30 \n Options: Presence/Abscence=0 counts=1\n\n";
|
|
31 exit(0);
|
|
32 }
|
|
33 open (FIMO, "<$ARGV[0]") ||
|
|
34 die "File '$ARGV[0]' not found\n" ;
|
|
35 open(MATRIX, ">$ARGV[1]") ||
|
|
36 die "File '>$ARGV[1]' not found\n";
|
|
37
|
|
38 $matrixType = $ARGV[2];
|
|
39 print "MatrixTYpe is $matrixType\n";
|
|
40
|
|
41 #Put all the motifs and genes in two separate arrays: each appears
|
|
42 #only once in each array.
|
|
43 while (<FIMO>) {
|
|
44 $line=$_;
|
|
45 if ($line!~/^##/) {#ignore header line
|
|
46 @cols=split;
|
|
47 $TF= substr $cols[8],5,8;
|
|
48 if (not exists $matrix_1{$TF}) {
|
|
49 $matrix_1{$TF}="";
|
|
50 push @TF_array, $TF;
|
|
51 }
|
|
52 $gene=substr $cols[0],0,21;
|
|
53 if (not exists $matrix_2{$gene}) {
|
|
54 $matrix_2{$gene}="";
|
|
55 push @gene_array, $gene
|
|
56 }
|
|
57 }
|
|
58 }
|
|
59
|
|
60 my $n_motifs=scalar @TF_array;
|
|
61 my $n_genes=scalar@gene_array;
|
|
62 #printf "Scalar motifs is %d\n", scalar@TF_array;
|
|
63 #printf "Scalar genes is %d\n", scalar@gene_array;
|
|
64
|
|
65 close(FIMO);
|
|
66 #I want to create a hash on which each gene has a list of 0s. Then I want to "read" the .gff file
|
|
67 #and if a gene has a certain TF it will add "+1" to the possition of the TF, and it will look like this.
|
|
68
|
|
69
|
|
70 open (FIMO, "$ARGV[0]") ||
|
|
71 die "File '$ARGV[0]' not found\n" ;
|
|
72
|
|
73 #$matrix{"PGSC0003DMG400006788"}=(0,0,1,0,2,0,3,0,0,...,0)
|
|
74
|
|
75 #Filling 2d gene/motif array with zeros to start
|
|
76 foreach my $element (@gene_array){
|
|
77 my @auxilary_list = ();
|
|
78 for (my $i=1; $i <= $n_motifs; $i++){
|
|
79 $auxilary_list[$i-1] =0;
|
|
80 }
|
|
81 $matrix{$element}=\@auxilary_list;
|
|
82 }
|
|
83
|
|
84 #This is how I want to read the .gff file and check if a gene has a certain TF. I dont consider the positions yet. I just
|
|
85 # want to see if this first step works.
|
|
86
|
|
87 while (<FIMO>){
|
|
88 $line3 = $_;
|
|
89 if ($line3!~/^##/) {
|
|
90 for (my $j=0; $j < scalar@gene_array; $j++){
|
|
91 for (my $h=0; $h < scalar@TF_array; $h++){
|
|
92 #printf "Genes[%d] -%s- Motifs[%d] -%s- \n",$j, $gene_array[$j], $h, $TF_array[$h];
|
|
93 if (($line3 =~/$gene_array[$j]/) and ($line3 =~/$TF_array[$h]/)) {
|
|
94 if ($matrixType ==0){${$matrix{$gene_array[$j]}}[$h]=1;}
|
|
95 if ($matrixType ==1){${$matrix{$gene_array[$j]}}[$h]++;}
|
|
96 }
|
|
97 }
|
|
98 }
|
|
99 }
|
|
100 }
|
|
101
|
|
102 printf MATRIX "Gene,";
|
|
103 for (my $h=0; $h < scalar@TF_array; $h++){
|
|
104 if ($h!=scalar@TF_array-1) {
|
|
105 printf MATRIX "$TF_array[$h],";
|
|
106 }
|
|
107 else{printf MATRIX "$TF_array[$h]"}
|
|
108 }
|
|
109 printf MATRIX "\n";
|
|
110 foreach my $element(sort keys %matrix){
|
|
111 printf MATRIX "$element,";
|
|
112 for (my $r=0; $r<scalar@{$matrix{$element}};$r++){
|
|
113 if ($r!=scalar@{$matrix{$element}}-1) {
|
|
114 printf MATRIX "$matrix{$element}[$r],"
|
|
115 }
|
|
116 else{printf MATRIX "$matrix{$element}[$r]"}
|
|
117
|
|
118 }
|
|
119 printf MATRIX "\n"
|
|
120 }
|
|
121
|