comparison multi_join_left/addColumnsFromFile2ToFile1.pl @ 5:1de2a8f041b3 draft

Corrected xml syntax errors in config file, added missing pl script and output dataset to test-data
author mir-bioinf
date Tue, 21 Apr 2015 16:20:03 -0400
parents
children
comparison
equal deleted inserted replaced
4:46c880ae6db2 5:1de2a8f041b3
1 #! /usr/bin/perl -w
2 #===============================================================================
3 #
4 # FILENAME: addColumnsFromFile2ToFile1.pl
5 #
6 # USAGE: see -help
7 #
8 # DESCRIPTION: This program adds columns in File 2 to File 1,
9 # if there are corresponding entries in File 1
10 #
11 # AUTHOR: Ron Stewart
12 # VERSION: 1.1
13 # CREATED: 12/18/06 CDT
14 #===============================================================================
15
16 use lib '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased'; ##CMS ADDED 11-05-14, DIR CONTAINS CUSTOM MODULE
17
18 use strict;
19 use IO::File;
20 use GetOptWC;
# ---------------------------------------------------------------------------
# Command-line option definitions and parsing.
#
# Every value handed to GetOptWC is a single backtick-separated string; from
# the entries below the layout appears to be
#     default-value ` Getopt-style type spec ` help text
# NOTE(review): GetOptWC is a site-local module -- confirm the exact format
# against its source.
#
# The column options (cola*/colb*/file*ColToCopy) are used further down
# directly as Perl array subscripts, i.e. they are zero-based.  A value of -1
# for a "FirstColToCopy" option means "copy every column of that file".
# ---------------------------------------------------------------------------
my %optVarsIn = (
    'File1'               => './File1.txt`=s`Input file of genes to include',
    'File2'               => './File2.txt`=s`Input file to be added',
    'cola1'               => '2`=f`first column to look for match in File1',
    'colb1'               => '4`=f`second column to look for match in File1',
    'cola2'               => '2`=f`first column to look for match in File2',
    'colb2'               => '4`=f`second column to look for match in File2',
    'file1FirstColToCopy' => '-1`=f`first column to copy in File1',
    'file1LastColToCopy'  => '-1`=f`last column to copy in File1',
    'file2FirstColToCopy' => '-1`=f`first column to copy in File2',
    'file2LastColToCopy'  => '-1`=f`last column to copy in File2',
    'HelpPrefix'          => 'This script is for adding entries in File2 to the corresponding entries in File1.',
    # The example previously advertised "-calb1", which is not an option this
    # script defines; it is corrected to "-colb1" here.
    'HelpSuffix'          => 'example call: ./addColumnsFromFile2ToFile1.pl -File1=./esAndDiffMarkersWithSage20061211.txt -cola1=2 -colb1=4 -File2=./pan_whole_table_fold_ann.txt -cola2=1 -colb2=4`Note: Two input files should have title lines`NOTE:Files must be text files, NOT .xls files. If you have an .xls file, save it as "Text(Windows)" in Excel.',
);

# getOptions() returns a hash reference holding the parsed option values;
# copy it into a plain hash so the rest of the script can use %retVars.
my $retVarsRef = GetOptWC::getOptions(\%optVarsIn);
my %retVars    = %$retVarsRef;

# When help was requested there is nothing further to do.
if ($retVars{'HelpCalled'}) {
    print "exiting now, help called\n";
    exit;
}
# ---------------------------------------------------------------------------
# Open both input files for reading and the (fixed-name) output file for
# writing.  The mode is passed as a separate argument so that characters in
# the file name can never be interpreted as part of the open mode, and every
# open is checked so a bad path fails here with a clear message instead of
# later with "can't call method getline on an undefined value".
# ---------------------------------------------------------------------------
my $File1   = $retVars{'File1'};
my $FHFile1 = IO::File->new($File1, 'r')
    or die "Cannot open File1 '$File1' for reading: $!\n";

my $File2   = $retVars{'File2'};
my $FHFile2 = IO::File->new($File2, 'r')
    or die "Cannot open File2 '$File2' for reading: $!\n";

# Flattened versions of the input names ('.' and '/' replaced by '_').
# They are only reported for diagnostics; see the note on $Out below.
my $File1name = $File1;
print "file1name: $File1name\n";
$File1name =~ s/[\.\/]/_/g;
print "file1name: $File1name\n";

my $File2name = $File2;
$File2name =~ s/[\.\/]/_/g;

# The output name is fixed: building it from $File1name.'.'.$File2name
# can produce a name that is too long in some cases.
my $Out = "file1_file2.txt";
print" out is $Out\n";
my $OutFile = IO::File->new($Out, 'w')
    or die "Cannot open output file '$Out' for writing: $!\n";
# ---------------------------------------------------------------------------
# Column configuration (all values are zero-based array subscripts).
#   cola*/colb*      : the two key columns compared between the files
#   first/lastCol*   : inclusive range of columns copied to the output
# ---------------------------------------------------------------------------
my $cola1 = $retVars{'cola1'};
my $colb1 = $retVars{'colb1'};
my $cola2 = $retVars{'cola2'};
my $colb2 = $retVars{'colb2'};
my $firstColFile1 = $retVars{'file1FirstColToCopy'};
my $lastColFile1 = $retVars{'file1LastColToCopy'};
my $firstColFile2 = $retVars{'file2FirstColToCopy'};
my $lastColFile2 = $retVars{'file2LastColToCopy'};

# Lookup tables filled while reading File2:
#   %genes2    : first key column value          -> copied columns (tab-joined)
#   %genes4    : second key symbol -> first key  -> copied columns (tab-joined)
#   %genes4key : second key symbol               -> "x" (presence marker)
my %genes2 = ();
my %genes4 = ();
my %genes4key = ();
my $lineCtr = 0;
my @cols = ();

# The first line of File2 is its title line; it supplies the column count
# and the header text for the copied columns.
my $firstLineFile2 = $FHFile2->getline();
defined $firstLineFile2
    or die "File2 '$File2' is empty; a title line is required\n";
$firstLineFile2 =~ s/\s+$//;
@cols = split "\t",$firstLineFile2;
my $numColFile2 = @cols;

# -1 for the first column means "copy every column of File2".
if ($firstColFile2 == -1) {
    $firstColFile2 = 0;
    $lastColFile2 = $numColFile2-1;
}

# A requested range that extends past the title line yields empty titles
# rather than undefined values (which would warn when joined later).
my @titleFile2 = map { defined $_ ? $_ : '' } @cols[$firstColFile2..$lastColFile2];
# ---------------------------------------------------------------------------
# Index every data row of File2 by its two key columns so that File1 rows can
# be matched against them afterwards.
# ---------------------------------------------------------------------------
while (my $line = $FHFile2->getline()) {
    $lineCtr++;

    # Remove any kind of line break (LF, CRLF, CR); other trailing
    # whitespace is deliberately kept.
    $line =~ s/\R//g;
    @cols = split "\t",$line;

    # split drops trailing empty fields and rows may be short, so make sure
    # every subscript used below holds a defined (possibly empty) string.
    # The copy range is inclusive, hence it is padded through $lastColFile2.
    for my $idx ($firstColFile2 .. $lastColFile2, $cola2, $colb2) {
        $cols[$idx] = "" if $idx >= 0 and not defined $cols[$idx];
    }

    # First key: the raw column value with all blanks removed.
    $cols[$cola2] =~ s/ //g;

    # Second key: if the column looks like  CDS; <symbol>; ...  (optionally
    # preceded by a double quote) use <symbol>; otherwise use the whole
    # column with blanks removed.
    my $colsBSymbol = "";
    if ($cols[$colb2] =~ /"?CDS; ([^;]+);/) {
        $colsBSymbol = $1;
    }
    else {
        $cols[$colb2] =~ s/ //g;
        $colsBSymbol = $cols[$colb2];
    }

    # Store the columns to copy under both keys.  A later row with the same
    # first key replaces an earlier one.
    my $copiedColumns = join("\t",@cols[$firstColFile2..$lastColFile2]);
    $genes2{$cols[$cola2]} = $copiedColumns;
    $genes4{$colsBSymbol}->{$cols[$cola2]} = $copiedColumns;
    $genes4key{$colsBSymbol}="x";
}
print "linectr: $lineCtr\n";
# ---------------------------------------------------------------------------
# Read the title line of File1 and write the combined title line
# (File1 titles followed by File2 titles) to the output file.
# ---------------------------------------------------------------------------
$lineCtr = 0;
@cols = ();
my $firstLineFile1 = $FHFile1->getline();
defined $firstLineFile1
    or die "File1 '$File1' is empty; a title line is required\n";
$firstLineFile1 =~ s/\s+$//;
@cols = split "\t",$firstLineFile1;
my $numColFile1 = @cols;

# -1 for the first column means "copy every column of File1".
if ($firstColFile1 == -1) {
    $firstColFile1 = 0;
    $lastColFile1 = $numColFile1-1;
}

# A requested range that extends past the title line yields empty titles
# rather than undefined values (which would warn inside join).
my @titleFile1 = map { defined $_ ? $_ : '' } @cols[$firstColFile1..$lastColFile1];

print $OutFile join("\t",@titleFile1)."\t".join("\t",@titleFile2)."\n";
137 #my $numCol = $lastColFile1 - $firstColFile1 +1;
# ---------------------------------------------------------------------------
# Walk through the data rows of File1 and append the matching File2 columns.
#
# Match order for each row:
#   1. first key column found in %genes2
#   2. second key symbol found in %genes2
#   3. second key symbol found in %genes4 -> one output row per File2 entry
#   4. no match -> the File1 columns are written on their own
# The literal key "N/A" never counts as a match.
# ---------------------------------------------------------------------------
while (my $line = $FHFile1->getline()) {
    $line =~ s/\s+$//;
    my @fields = split "\t",$line;

    # split drops trailing empty fields and rows may be short, so make sure
    # every subscript used below holds a defined (possibly empty) string.
    for my $idx ($firstColFile1 .. $lastColFile1, $cola1, $colb1) {
        $fields[$idx] = "" if $idx >= 0 and not defined $fields[$idx];
    }

    # Capture the columns to copy before the key columns are normalised, so
    # the output keeps the original spelling of the File1 values.
    my $selectedEntries = join("\t",@fields[$firstColFile1..$lastColFile1]);

    # First key: the raw column value with all blanks removed.
    $fields[$cola1] =~ s/ //g;
    my $keyA = $fields[$cola1];

    # Second key: if the column looks like  CDS; <symbol>; ...  (optionally
    # preceded by a double quote) use <symbol>; otherwise use the whole
    # column with blanks removed.
    my $colsBSymbol = "";
    if ($fields[$colb1] =~ /"?CDS; ([^;]+);/) {
        $colsBSymbol = $1;
    }
    else {
        $fields[$colb1] =~ s/ //g;
        $colsBSymbol = $fields[$colb1];
    }

    # Evaluate each candidate together with its "N/A" exclusion.  Testing the
    # exclusion separately from the lookup allowed a row whose first key is
    # "N/A" to be joined on "N/A" whenever the second key also matched.
    my $matchA = ($keyA ne "N/A" and exists $genes2{$keyA});
    my $matchB = ($colsBSymbol ne "N/A" and exists $genes2{$colsBSymbol});

    if ($matchA) {
        print $OutFile "$selectedEntries"."\t".$genes2{$keyA}."\n";
    }
    elsif ($matchB) {
        print $OutFile "$selectedEntries"."\t".$genes2{$colsBSymbol}."\n";
    }
    elsif ($colsBSymbol ne "N/A" and exists $genes4key{$colsBSymbol}) {
        # Sorted so repeated runs on the same input give the same row order.
        foreach my $symbol (sort keys %{$genes4{$colsBSymbol}}) {
            print $OutFile "$selectedEntries"."\t".$genes4{$colsBSymbol}->{$symbol}."\n";
        }
    }
    else {
        print $OutFile "$selectedEntries"."\n";
    }
}

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so close explicitly and check the result before exiting.
$OutFile->close() or die "Error while closing output file '$Out': $!\n";
exit;