Mercurial > repos > mir-bioinf > multi_join_left
view multi_join_left/addColumnsFromFile2ToFile1.pl @ 5:1de2a8f041b3 draft
Corrected xml syntax errors in config file, added missing pl script and output dataset to test-data
author | mir-bioinf |
---|---|
date | Tue, 21 Apr 2015 16:20:03 -0400 |
parents | |
children |
line wrap: on
line source
#! /usr/bin/perl -w #=============================================================================== # # FILENAME: addColumnsFromFile2ToFile1.pl # # USAGE: see -help # # DESCRIPTION: This program adds columns in File 2 to File 1, # if there are correnponding entries in File 1 # # AUTHOR: Ron Stewart # VERSION: 1.1 # CREATED: 12/18/06 CDT #=============================================================================== use lib '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased'; ##CMS ADDED 11-05-14, DIR CONTAINS CUSTOM MODULE use strict; use IO::File; use GetOptWC; # parsing the command line my %optVarsIn = (); # help information $optVarsIn{'File1'} = './File1.txt`=s`Input file of genes to include'; $optVarsIn{'File2'} = './File2.txt`=s`Input file to be added'; $optVarsIn{'cola1'} = '2`=f`first column to look for match in File1'; $optVarsIn{'colb1'} = '4`=f`second column to look for match in File1'; $optVarsIn{'cola2'} = '2`=f`first column to look for match in File2'; $optVarsIn{'colb2'} = '4`=f`second column to look for match in File2'; $optVarsIn{'file1FirstColToCopy'} = '-1`=f`first column to copy in File1'; $optVarsIn{'file1LastColToCopy'} = '-1`=f`last column to copy in File1'; $optVarsIn{'file2FirstColToCopy'} = '-1`=f`first column to copy in File2'; $optVarsIn{'file2LastColToCopy'} = '-1`=f`last column to copy in File2'; $optVarsIn{'HelpPrefix'} = 'This script is for adding entries in File2 to the corresponding entries in File1.'; $optVarsIn{'HelpSuffix'} = 'example call: ./addColumnsFromFile2ToFile1.pl -File1=./esAndDiffMarkersWithSage20061211.txt -cola1=2 -calb1=4 -File2=./pan_whole_table_fold_ann.txt -cola2=1 -colb2=4`Note: Two input files should have title lines`NOTE:Files must be text files, NOT .xls files. If you have an .xls file, save it as "Text(Windows) in Excel.'; my %retVars = (); my $retVarsRef = GetOptWC::getOptions(\%optVarsIn); %retVars = %$retVarsRef; if ($retVars{'HelpCalled'}) { print "exiting now, help called\n"; exit; } my $File1 = $retVars{'File1'}; my $FHFile1; $FHFile1 = IO::File->new("<$File1"); my $File2 = $retVars{'File2'}; my $FHFile2; $FHFile2 = IO::File->new("<$File2"); my $File1name = $File1; print "file1name: $File1name\n"; $File1name =~ s/[\.\/]/_/g; print "file1name: $File1name\n"; my $File2name = $File2; $File2name =~ s/[\.\/]/_/g; #my $Out = $File1name.'.'.$File2name; # this can be too long in some cases my $Out = "file1_file2.txt"; print" out is $Out\n"; my $OutFile = IO::File->new(">$Out"); my $cola1 = $retVars{'cola1'}; my $colb1 = $retVars{'colb1'}; my $cola2 = $retVars{'cola2'}; my $colb2 = $retVars{'colb2'}; my $firstColFile1 = $retVars{'file1FirstColToCopy'}; my $lastColFile1 = $retVars{'file1LastColToCopy'}; my $firstColFile2 = $retVars{'file2FirstColToCopy'}; my $lastColFile2 = $retVars{'file2LastColToCopy'}; my %genes2 = (); my %genes4 = (); my %genes4key = (); my $lineCtr = 0; my @cols = (); my $firstLineFile2 = $FHFile2->getline(); $firstLineFile2 =~ s/\s+$//; @cols = split "\t",$firstLineFile2; my $numColFile2 = @cols; if($firstColFile2==-1){ $firstColFile2 = 0; $lastColFile2 = $numColFile2-1; } my @titleFile2 = @cols[$firstColFile2..$lastColFile2]; while (my $line = $FHFile2->getline()) { $lineCtr++; #$line =~ s/\s+$//; $line =~ s/\R//g; ##CMS 11-6-14 chomp($line); @cols = split "\t",$line; my $numCols = (@cols + 0); #$cols[$cola2] = uc($cols[$cola2]); ##CMS 11-6-14 #$cols[$colb2] = uc($cols[$colb2]); ##CMS 11-6-14 $cols[$cola2] =~ s/ //g; my $colsBSymbol = ""; if ($cols[$colb2] =~ /\"{0,1}CDS\; ([^\;]+);/) { $colsBSymbol = $1; } else { #if ($numCols == ($colb2 +1)) { ##CMS COMMENTED OUT 11-6-14 #print "$line\n"; #print "Please check this line\n"; #exit; #} ##CMS END COMMENTS 11-6-14 $cols[$colb2] =~ s/ //g; $colsBSymbol = $cols[$colb2]; } if($numCols<$lastColFile2){ for(my $i = $numCols;$i<$lastColFile2;$i++){ $cols[$i] = ""; } } $genes2{$cols[$cola2]} = join("\t",@cols[$firstColFile2..$lastColFile2]); $genes4{$colsBSymbol}->{$cols[$cola2]} = join("\t",@cols[$firstColFile2..$lastColFile2]); $genes4key{$colsBSymbol}="x"; } print "linectr: $lineCtr\n"; $lineCtr = 0; @cols = (); my $firstLineFile1 = $FHFile1->getline(); $firstLineFile1 =~ s/\s+$//; @cols = split "\t",$firstLineFile1; my $numColFile1 = @cols; if($firstColFile1==-1){ $firstColFile1 = 0; $lastColFile1 = $numColFile1-1; } #print "numcolsfile1: $numColFile1\n"; #print "lastcolsfile1: $lastColFile1\n"; my @titleFile1 = @cols[$firstColFile1..$lastColFile1]; #print "tf1: @titleFile1\n"; #print "tf2: @titleFile2\n"; #print "outfile: $OutFile\n"; print $OutFile join("\t",@titleFile1)."\t".join("\t",@titleFile2)."\n"; #my $numCol = $lastColFile1 - $firstColFile1 +1; my $numCurrentLine =0; while (my $line = $FHFile1->getline()) { $lineCtr++; $line =~ s/\s+$//; my $selectedEntries; @cols = split "\t",$line; $numCurrentLine = $#cols;#[$firstColFile1..$LastColFile1]; #print "numcurrentline: $numCurrentLine\n"; $line = $line."\t"; if($numCurrentLine<$lastColFile1){ #print "in if\n"; for(my $i =$numCurrentLine+1;$i<=$lastColFile1;$i++){ #print "in for. i=$i\n"; $cols[$i]=""; #$line = $line."\t"; } } $selectedEntries = join("\t",@cols[$firstColFile1..$lastColFile1]); my $numCols = (@cols + 0); #$cols[$cola1] = uc($cols[$cola1]); ##CMS 11-6-14 #$cols[$colb1] = uc($cols[$colb1]); ##CMS 11-6-14 $cols[$cola1] =~ s/ //g; my $colsBSymbol = ""; if ($cols[$colb1] =~ /\"{0,1}CDS\; ([^\;]+);/) { $colsBSymbol = $1; } else { #if ($numCols == ($colb1 +1)) { ##CMS COMMENTED 11-6-14 # print"$line"; # print " please check this line\n"; # exit; #} ##CMS END COMMENTS 11-6-14 $cols[$colb1] =~ s/ //g; $colsBSymbol = $cols[$colb1]; } if((exists ($genes2{$cols[$cola1]}) and $cols[$cola1] ne "N/A") or (exists ($genes2{$colsBSymbol}) and $colsBSymbol ne "N/A") ) { if (exists ($genes2{$cols[$cola1]})) { print $OutFile "$selectedEntries"."\t".$genes2{$cols[$cola1]}."\n"; } elsif (exists ($genes2{$colsBSymbol})) { print $OutFile "$selectedEntries"."\t".$genes2{$colsBSymbol}."\n"; } else { print "WHOA, we've got a problem Here!!!!!\n"; } } elsif(exists ($genes4key{$colsBSymbol}) and $colsBSymbol ne "N/A" ) { foreach my $symbol (keys %{$genes4{$colsBSymbol}}){ print $OutFile "$selectedEntries"."\t".$genes4{$colsBSymbol}->{$symbol}."\n"; } } else { print $OutFile "$selectedEntries"."\n"; } } exit;