annotate Tools/Matrix/gene-TF-matrix-csv-galaxy.pl @ 3:b30ba2b06326 draft

Uploaded
author amadeo
date Mon, 05 Sep 2016 06:01:48 -0400
parents 229d36377838
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
229d36377838 Uploaded
amadeo
parents:
diff changeset
1 #!/usr/bin/perl -w
229d36377838 Uploaded
amadeo
parents:
diff changeset
2 $|=1;
229d36377838 Uploaded
amadeo
parents:
diff changeset
3 use strict;
229d36377838 Uploaded
amadeo
parents:
diff changeset
4 use warnings;
229d36377838 Uploaded
amadeo
parents:
diff changeset
5
229d36377838 Uploaded
amadeo
parents:
diff changeset
6
229d36377838 Uploaded
amadeo
parents:
diff changeset
7
229d36377838 Uploaded
amadeo
parents:
diff changeset
8 # Script to create a CSV-formatted gene vs. TF matrix from a filtered GFF
229d36377838 Uploaded
amadeo
parents:
diff changeset
9 # file. The GFF file can contain just positive- or just negative-strand
229d36377838 Uploaded
amadeo
parents:
diff changeset
10 # TFBS. Two types of matrix can be produced: (0) presence/absence with only
229d36377838 Uploaded
amadeo
parents:
diff changeset
11 # 1s and 0s (option=0); (1) counts of TFs, with numbers 1,3,5 etc. (option=1).
229d36377838 Uploaded
amadeo
parents:
diff changeset
12
229d36377838 Uploaded
amadeo
parents:
diff changeset
13
229d36377838 Uploaded
amadeo
parents:
diff changeset
14
229d36377838 Uploaded
amadeo
parents:
diff changeset
15
229d36377838 Uploaded
amadeo
parents:
diff changeset
16 my $line;
229d36377838 Uploaded
amadeo
parents:
diff changeset
17 my $line3;
229d36377838 Uploaded
amadeo
parents:
diff changeset
18 my @cols;
229d36377838 Uploaded
amadeo
parents:
diff changeset
19 my @TF_array;
229d36377838 Uploaded
amadeo
parents:
diff changeset
20 my @gene_array;
229d36377838 Uploaded
amadeo
parents:
diff changeset
21 my %matrix_1= ();
229d36377838 Uploaded
amadeo
parents:
diff changeset
22 my %matrix_2= ();
229d36377838 Uploaded
amadeo
parents:
diff changeset
23 my $TF;
229d36377838 Uploaded
amadeo
parents:
diff changeset
24 my $gene;
229d36377838 Uploaded
amadeo
parents:
diff changeset
25 my %matrix;
229d36377838 Uploaded
amadeo
parents:
diff changeset
26 my $matrixType;
229d36377838 Uploaded
amadeo
parents:
diff changeset
27
229d36377838 Uploaded
amadeo
parents:
diff changeset
28 if(@ARGV < 3){
229d36377838 Uploaded
amadeo
parents:
diff changeset
29 print "\nUsage: gene-TF-matrix.pl fimo-nol-P.gff/fimo-nol-N.gff gene-matrix-P.csv/gene-matrix-N.csv
229d36377838 Uploaded
amadeo
parents:
diff changeset
30 \n Options: Presence/Abscence=0 counts=1\n\n";
229d36377838 Uploaded
amadeo
parents:
diff changeset
31 exit(0);
229d36377838 Uploaded
amadeo
parents:
diff changeset
32 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
33 open (FIMO, "<$ARGV[0]") ||
229d36377838 Uploaded
amadeo
parents:
diff changeset
34 die "File '$ARGV[0]' not found\n" ;
229d36377838 Uploaded
amadeo
parents:
diff changeset
35 open(MATRIX, ">$ARGV[1]") ||
229d36377838 Uploaded
amadeo
parents:
diff changeset
36 die "File '>$ARGV[1]' not found\n";
229d36377838 Uploaded
amadeo
parents:
diff changeset
37
229d36377838 Uploaded
amadeo
parents:
diff changeset
38 $matrixType = $ARGV[2];
229d36377838 Uploaded
amadeo
parents:
diff changeset
39 print "MatrixTYpe is $matrixType\n";
229d36377838 Uploaded
amadeo
parents:
diff changeset
40
229d36377838 Uploaded
amadeo
parents:
diff changeset
41 #Put all the motifs and genes in two separate arrays: each appears
229d36377838 Uploaded
amadeo
parents:
diff changeset
42 #only once in each array.
229d36377838 Uploaded
amadeo
parents:
diff changeset
43 while (<FIMO>) {
229d36377838 Uploaded
amadeo
parents:
diff changeset
44 $line=$_;
229d36377838 Uploaded
amadeo
parents:
diff changeset
45 if ($line!~/^##/) {#ignore header line
229d36377838 Uploaded
amadeo
parents:
diff changeset
46 @cols=split;
229d36377838 Uploaded
amadeo
parents:
diff changeset
47 $TF= substr $cols[8],5,8;
229d36377838 Uploaded
amadeo
parents:
diff changeset
48 if (not exists $matrix_1{$TF}) {
229d36377838 Uploaded
amadeo
parents:
diff changeset
49 $matrix_1{$TF}="";
229d36377838 Uploaded
amadeo
parents:
diff changeset
50 push @TF_array, $TF;
229d36377838 Uploaded
amadeo
parents:
diff changeset
51 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
52 $gene=substr $cols[0],0,21;
229d36377838 Uploaded
amadeo
parents:
diff changeset
53 if (not exists $matrix_2{$gene}) {
229d36377838 Uploaded
amadeo
parents:
diff changeset
54 $matrix_2{$gene}="";
229d36377838 Uploaded
amadeo
parents:
diff changeset
55 push @gene_array, $gene
229d36377838 Uploaded
amadeo
parents:
diff changeset
56 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
57 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
58 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
59
229d36377838 Uploaded
amadeo
parents:
diff changeset
60 my $n_motifs=scalar @TF_array;
229d36377838 Uploaded
amadeo
parents:
diff changeset
61 my $n_genes=scalar@gene_array;
229d36377838 Uploaded
amadeo
parents:
diff changeset
62 #printf "Scalar motifs is %d\n", scalar@TF_array;
229d36377838 Uploaded
amadeo
parents:
diff changeset
63 #printf "Scalar genes is %d\n", scalar@gene_array;
229d36377838 Uploaded
amadeo
parents:
diff changeset
64
229d36377838 Uploaded
amadeo
parents:
diff changeset
65 close(FIMO);
229d36377838 Uploaded
amadeo
parents:
diff changeset
66 #I want to create a hash in which each gene has a list of 0s. Then I want to "read" the .gff file
229d36377838 Uploaded
amadeo
parents:
diff changeset
67 #and if a gene has a certain TF it will add "+1" at the position of that TF, so it will look like this:
229d36377838 Uploaded
amadeo
parents:
diff changeset
68
229d36377838 Uploaded
amadeo
parents:
diff changeset
69
229d36377838 Uploaded
amadeo
parents:
diff changeset
70 open (FIMO, "$ARGV[0]") ||
229d36377838 Uploaded
amadeo
parents:
diff changeset
71 die "File '$ARGV[0]' not found\n" ;
229d36377838 Uploaded
amadeo
parents:
diff changeset
72
229d36377838 Uploaded
amadeo
parents:
diff changeset
73 #$matrix{"PGSC0003DMG400006788"}=(0,0,1,0,2,0,3,0,0,...,0)
229d36377838 Uploaded
amadeo
parents:
diff changeset
74
229d36377838 Uploaded
amadeo
parents:
diff changeset
75 #Fill the 2D gene/motif array with zeros to start
229d36377838 Uploaded
amadeo
parents:
diff changeset
76 foreach my $element (@gene_array){
229d36377838 Uploaded
amadeo
parents:
diff changeset
77 my @auxilary_list = ();
229d36377838 Uploaded
amadeo
parents:
diff changeset
78 for (my $i=1; $i <= $n_motifs; $i++){
229d36377838 Uploaded
amadeo
parents:
diff changeset
79 $auxilary_list[$i-1] =0;
229d36377838 Uploaded
amadeo
parents:
diff changeset
80 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
81 $matrix{$element}=\@auxilary_list;
229d36377838 Uploaded
amadeo
parents:
diff changeset
82 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
83
229d36377838 Uploaded
amadeo
parents:
diff changeset
84 #This is how I want to read the .gff file and check whether a gene has a certain TF. I don't consider the positions yet. I just
229d36377838 Uploaded
amadeo
parents:
diff changeset
85 # want to see if this first step works.
229d36377838 Uploaded
amadeo
parents:
diff changeset
86
229d36377838 Uploaded
amadeo
parents:
diff changeset
87 while (<FIMO>){
229d36377838 Uploaded
amadeo
parents:
diff changeset
88 $line3 = $_;
229d36377838 Uploaded
amadeo
parents:
diff changeset
89 if ($line3!~/^##/) {
229d36377838 Uploaded
amadeo
parents:
diff changeset
90 for (my $j=0; $j < scalar@gene_array; $j++){
229d36377838 Uploaded
amadeo
parents:
diff changeset
91 for (my $h=0; $h < scalar@TF_array; $h++){
229d36377838 Uploaded
amadeo
parents:
diff changeset
92 #printf "Genes[%d] -%s- Motifs[%d] -%s- \n",$j, $gene_array[$j], $h, $TF_array[$h];
229d36377838 Uploaded
amadeo
parents:
diff changeset
93 if (($line3 =~/$gene_array[$j]/) and ($line3 =~/$TF_array[$h]/)) {
229d36377838 Uploaded
amadeo
parents:
diff changeset
94 if ($matrixType ==0){${$matrix{$gene_array[$j]}}[$h]=1;}
229d36377838 Uploaded
amadeo
parents:
diff changeset
95 if ($matrixType ==1){${$matrix{$gene_array[$j]}}[$h]++;}
229d36377838 Uploaded
amadeo
parents:
diff changeset
96 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
97 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
98 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
99 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
100 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
101
229d36377838 Uploaded
amadeo
parents:
diff changeset
102 printf MATRIX "Gene,";
229d36377838 Uploaded
amadeo
parents:
diff changeset
103 for (my $h=0; $h < scalar@TF_array; $h++){
229d36377838 Uploaded
amadeo
parents:
diff changeset
104 if ($h!=scalar@TF_array-1) {
229d36377838 Uploaded
amadeo
parents:
diff changeset
105 printf MATRIX "$TF_array[$h],";
229d36377838 Uploaded
amadeo
parents:
diff changeset
106 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
107 else{printf MATRIX "$TF_array[$h]"}
229d36377838 Uploaded
amadeo
parents:
diff changeset
108 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
109 printf MATRIX "\n";
229d36377838 Uploaded
amadeo
parents:
diff changeset
110 foreach my $element(sort keys %matrix){
229d36377838 Uploaded
amadeo
parents:
diff changeset
111 printf MATRIX "$element,";
229d36377838 Uploaded
amadeo
parents:
diff changeset
112 for (my $r=0; $r<scalar@{$matrix{$element}};$r++){
229d36377838 Uploaded
amadeo
parents:
diff changeset
113 if ($r!=scalar@{$matrix{$element}}-1) {
229d36377838 Uploaded
amadeo
parents:
diff changeset
114 printf MATRIX "$matrix{$element}[$r],"
229d36377838 Uploaded
amadeo
parents:
diff changeset
115 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
116 else{printf MATRIX "$matrix{$element}[$r]"}
229d36377838 Uploaded
amadeo
parents:
diff changeset
117
229d36377838 Uploaded
amadeo
parents:
diff changeset
118 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
119 printf MATRIX "\n"
229d36377838 Uploaded
amadeo
parents:
diff changeset
120 }
229d36377838 Uploaded
amadeo
parents:
diff changeset
121