0
|
#!/usr/bin/perl
# Compares the unique genes (or probe ids) found in two EMBER .targets files:
# reports how many are specific to each file and how many are shared, and
# optionally writes one of those sets to an output file.

use Getopt::Long;

#
# command line arguments
#
# These are package globals on purpose: read_list() reads $n to decide which
# column of a .targets line to keep.
our $t1 = "";    # first .targets file (required)
our $t2 = "";    # second .targets file (required)
our $o  = "";    # output file for the gene list (optional)
our $of = 0;     # which set to write to the output file (0..3, see usage)
our $n  = 1;     # 0 = compare probe ids, 1 = compare gene names

# Built before GetOptions() runs so that "$of" and "$n" show the defaults.
our $options = "Usage: ./Compare_Targets.pl <OPTIONS>
    -t1  .targets file 1 (EMBER output, required)
    -t2  .targets file 2 (EMBER output, required)
    -o   output file (optional, if you want a gene list printed)
         output list prints all unique genes
    -of  output type (default $of)
         0 - all shared targets
         1 - all targets in list 1 only
         2 - all targets in list 2 only
         3 - union of list 1 and 2
    -n   compare gene names or probe ids (0 = ids, 1 = names, default $n)
\n";

GetOptions(
    't1=s' => \$t1,
    't2=s' => \$t2,
    'o=s'  => \$o,
    'of=i' => \$of,
    'n=i'  => \$n,
) or die "\n$options\n";

# Usage errors go to STDERR and exit non-zero so calling scripts can detect
# them.
if ( $t1 eq "" ) {
    print STDERR "\nError: set a value for -t1\n\n$options\n";
    exit 1;
}
if ( $t2 eq "" ) {
    print STDERR "\nError: set a value for -t2\n\n$options\n";
    exit 1;
}
if ( $of != 0 && $of != 1 && $of != 2 && $of != 3 ) {
    print STDERR "\nError: set -of to be 0, 1, 2, or 3\n\n$options\n";
    exit 1;
}
if ( $n != 0 && $n != 1 ) {
    print STDERR "\nError: set -n to be 0 or 1\n\n$options\n";
    exit 1;
}

#
# read in gene list from each file
#
# read_list() returns each file's identifiers sorted (string order) and
# de-duplicated, which is what the merge below relies on.
my @list1 = read_list($t1);
my @list2 = read_list($t2);

printf( "\nFound %i unique genes in %s, %i in %s\n",
    scalar(@list1), $t1, scalar(@list2), $t2 );

#
# compare lists and print out if desired
#
my $out;
if ( $o ne "" ) {
    open( $out, '>', $o ) or die "Error: can't open output file $o: $!\n";
}

# Writes $gene to the output file when an output file was requested and the
# selected -of mode covers this class of gene.
# $class: 0 = shared, 1 = list 1 only, 2 = list 2 only; -of 3 (union) takes all.
my $emit = sub {
    my ( $gene, $class ) = @_;
    print {$out} "$gene\n" if $out && ( $of == $class || $of == 3 );
};

my ( $pos1,  $pos2 )  = ( 0, 0 );
my ( $only1, $only2 ) = ( 0, 0 );
my $shared = 0;

# Merge-style walk over the two sorted lists.
while ( $pos1 <= $#list1 && $pos2 <= $#list2 ) {
    my $order = $list1[$pos1] cmp $list2[$pos2];
    if ( $order == 0 ) {
        $emit->( $list1[$pos1], 0 );
        $shared++;
        $pos1++;
        $pos2++;
    }
    elsif ( $order < 0 ) {
        $emit->( $list1[$pos1], 1 );
        $only1++;
        $pos1++;
    }
    else {
        $emit->( $list2[$pos2], 2 );
        $only2++;
        $pos2++;
    }
}

# The walk above stops as soon as either list runs out; whatever is left in
# the other list cannot be shared, so it is counted (and written) as
# list-specific. At most one of these two loops does any work.
for ( ; $pos1 <= $#list1 ; $pos1++ ) {
    $emit->( $list1[$pos1], 1 );
    $only1++;
}
for ( ; $pos2 <= $#list2 ; $pos2++ ) {
    $emit->( $list2[$pos2], 2 );
    $only2++;
}

if ($out) {
    # Buffered write errors only surface at close time.
    close($out) or die "Error: can't write output file $o: $!\n";
}

printf( "\n%s only: %i\n%s only: %i\nshared: %i\n\n",
    $t1, $only1, $t2, $only2, $shared );

exit 0;
|
|
##############
# read_list( $file )
#
# Reads a .targets file and returns the distinct identifiers it contains,
# sorted in string order.
#
# Only lines whose first whitespace-separated field is "GENE:" or "TGENE:"
# contribute. The column kept depends on the package global $n (documented in
# the usage text as 0 = probe ids, 1 = gene names):
#   GENE:  lines -> field 1 + $n
#   TGENE: lines -> field 2 + $n
# Lines that are too short to contain that column are ignored.
#
# Returns an empty list when the file holds no usable lines.
# Dies if the file cannot be opened.
sub read_list {
    my ($file) = @_;

    # Three-argument open: the file name is never interpreted as a mode or
    # a pipe.
    open( my $in, '<', $file ) or die "Error: can't open file $file: $!\n";

    my %seen;
    while ( my $line = <$in> ) {
        chomp($line);
        my @parts = split( ' ', $line );
        next unless @parts;

        my $gene;
        if ( $parts[0] eq "GENE:" ) {
            $gene = $parts[ 1 + $n ];
        }
        elsif ( $parts[0] eq "TGENE:" ) {
            $gene = $parts[ 2 + $n ];
        }

        # Skip unrelated lines and lines missing the requested column.
        next unless defined $gene;
        $seen{$gene} = 1;
    }
    close($in);

    # Hash keys are already unique; sort so callers can merge two lists.
    my @final = sort { $a cmp $b } keys %seen;
    return @final;
}
|
|
136
|
|
137
|
|
138
|
|
139
|
|
140
|
|
141
|
|
142
|
|
143
|
|
144
|
|
145
|
|
146
|
|
147
|
|
148
|