Mercurial > repos > mir-bioinf > multi_join_left
changeset 5:1de2a8f041b3 draft
Corrected xml syntax errors in config file, added missing pl script and output dataset to test-data
author | mir-bioinf |
---|---|
date | Tue, 21 Apr 2015 16:20:03 -0400 |
parents | 46c880ae6db2 |
children | 2d88439f6448 |
files | multi_join_left/addColumnsFromFile2ToFile1.pl multi_join_left/multi_join_serial.xml multi_join_left/run-multi_join_serial.pl multi_join_left/test-data/multi_join_serial_out.tab |
diffstat | 4 files changed, 214 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
#! /usr/bin/perl -w
#===============================================================================
#
#     FILENAME: addColumnsFromFile2ToFile1.pl
#
#        USAGE: see -help
#
#  DESCRIPTION: Left-joins File2 onto File1.  Every data row of File1 is written
#               to ./file1_file2.txt; when a matching row exists in File2 its
#               selected columns are appended to the File1 row.
#
#               Both files are tab-separated and their first line is treated
#               as a title line.  -cola*/-colb* are used directly as array
#               indices into the tab-split fields (so they are zero-based).
#               A file*FirstColToCopy of -1 means "copy every column that the
#               title line has".
#
#               Matching order for each File1 row:
#                 1. File1 column a  == File2 column a
#                 2. File1 symbol b  == File2 column a
#                 3. File1 symbol b  == File2 symbol b (one output row per
#                    File2 row sharing that symbol)
#               The literal value "N/A" never matches anything.  When several
#               File2 rows share the same column-a value, the last one wins.
#
#       AUTHOR: Ron Stewart
#      VERSION: 1.1
#      CREATED: 12/18/06 CDT
#===============================================================================

use strict;
use warnings;

# Look for GetOptWC.pm next to this script as well.  "use lib" prepends, so the
# site-specific directory declared second below is still searched first.
use FindBin;
use lib $FindBin::Bin;
use lib '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased'; ##CMS ADDED 11-05-14, DIR CONTAINS CUSTOM MODULE

use IO::File;
use GetOptWC;

# Open $path read-only and return the handle; dies with a readable message.
# $label names the option (File1/File2) in the error text.
sub open_for_reading {
    my ($path, $label) = @_;
    die "No path given for $label\n" unless defined $path and length $path;

    # The previous two-argument open ignored whitespace around the name;
    # keep that tolerance while using an explicit mode.
    (my $clean = $path) =~ s/^\s+|\s+$//g;
    my $fh = IO::File->new($clean, '<')
        or die "Cannot open $label '$clean' for reading: $!\n";
    return $fh;
}

# Read the title line from $fh and return its tab-separated fields.
# An empty file yields an empty list instead of an undef warning.
sub read_title_fields {
    my ($fh) = @_;
    my $title = $fh->getline();
    $title = '' unless defined $title;
    $title =~ s/\s+$//;
    return split /\t/, $title;
}

# Normalise the two match columns of a row IN PLACE and return
# (column-a key, column-b symbol).  Spaces are removed from column a; column b
# either yields the name captured from a 'CDS; name;' annotation or is
# space-stripped and used whole.
sub extract_keys {
    my ($cols_ref, $col_a, $col_b) = @_;
    for my $idx ($col_a, $col_b) {
        $cols_ref->[$idx] = '' unless defined $cols_ref->[$idx];
    }
    $cols_ref->[$col_a] =~ s/ //g;

    my $symbol;
    if ($cols_ref->[$col_b] =~ /\"{0,1}CDS\; ([^\;]+);/) {
        $symbol = $1;
    }
    else {
        $cols_ref->[$col_b] =~ s/ //g;
        $symbol = $cols_ref->[$col_b];
    }
    return ($cols_ref->[$col_a], $symbol);
}

# parsing the command line
# NOTE(review): each value looks like default`type`description, separated by
# backticks, as consumed by GetOptWC -- confirm against that module.
my %optVarsIn = ();
# help information
$optVarsIn{'File1'} = './File1.txt`=s`Input file of genes to include';
$optVarsIn{'File2'} = './File2.txt`=s`Input file to be added';
$optVarsIn{'cola1'} = '2`=f`first column to look for match in File1';
$optVarsIn{'colb1'} = '4`=f`second column to look for match in File1';
$optVarsIn{'cola2'} = '2`=f`first column to look for match in File2';
$optVarsIn{'colb2'} = '4`=f`second column to look for match in File2';
$optVarsIn{'file1FirstColToCopy'} = '-1`=f`first column to copy in File1';
$optVarsIn{'file1LastColToCopy'} = '-1`=f`last column to copy in File1';
$optVarsIn{'file2FirstColToCopy'} = '-1`=f`first column to copy in File2';
$optVarsIn{'file2LastColToCopy'} = '-1`=f`last column to copy in File2';
$optVarsIn{'HelpPrefix'} = 'This script is for adding entries in File2 to the corresponding entries in File1.';
$optVarsIn{'HelpSuffix'} = 'example call: ./addColumnsFromFile2ToFile1.pl -File1=./esAndDiffMarkersWithSage20061211.txt -cola1=2 -colb1=4 -File2=./pan_whole_table_fold_ann.txt -cola2=1 -colb2=4`Note: Two input files should have title lines`NOTE:Files must be text files, NOT .xls files. If you have an .xls file, save it as "Text(Windows)" in Excel.';

my $retVarsRef = GetOptWC::getOptions(\%optVarsIn);
my %retVars = %$retVarsRef;
if ($retVars{'HelpCalled'}) {
    print "exiting now, help called\n";
    exit;
}

my $File1 = $retVars{'File1'};
my $FHFile1 = open_for_reading($File1, 'File1');

my $File2 = $retVars{'File2'};
my $FHFile2 = open_for_reading($File2, 'File2');

# Progress/debug output goes to STDOUT.
my $File1name = $File1;
print "file1name: $File1name\n";
$File1name =~ s/[\.\/]/_/g;
print "file1name: $File1name\n";

# A name built from both input names can be too long, so the output name is fixed.
my $Out = "file1_file2.txt";
print" out is $Out\n";
my $OutFile = IO::File->new($Out, '>')
    or die "Cannot open '$Out' for writing: $!\n";

my $cola1 = $retVars{'cola1'};
my $colb1 = $retVars{'colb1'};
my $cola2 = $retVars{'cola2'};
my $colb2 = $retVars{'colb2'};
my $firstColFile1 = $retVars{'file1FirstColToCopy'};
my $lastColFile1 = $retVars{'file1LastColToCopy'};
my $firstColFile2 = $retVars{'file2FirstColToCopy'};
my $lastColFile2 = $retVars{'file2LastColToCopy'};

my %genes2 = ();    # File2 column-a key    => copied File2 columns (tab-joined)
my %genes4 = ();    # File2 column-b symbol => { column-a key => copied columns }

#--- File2: title line, then index every data row ------------------------------
my @titleCols2 = read_title_fields($FHFile2);
if ($firstColFile2 == -1) {
    $firstColFile2 = 0;
    $lastColFile2 = @titleCols2 - 1;
}
my @titleFile2 = @titleCols2[$firstColFile2..$lastColFile2];

my $lineCtr = 0;
# getline() is a method call, so test definedness explicitly: a final line
# holding just "0" without a newline must not end the loop.
while (defined(my $line = $FHFile2->getline())) {
    $lineCtr++;
    $line =~ s/\R//g; ##CMS 11-6-14
    my @cols = split /\t/, $line;

    # split drops trailing empty fields; pad so every copied column, including
    # the last one, is defined.
    push @cols, '' while $#cols < $lastColFile2;

    my ($keyA, $symbolB) = extract_keys(\@cols, $cola2, $colb2);

    # The copied columns carry the space-stripped key values.
    my $copied = join("\t", @cols[$firstColFile2..$lastColFile2]);
    $genes2{$keyA} = $copied;
    $genes4{$symbolB}->{$keyA} = $copied;
}
$FHFile2->close;
print "linectr: $lineCtr\n";

#--- File1: title line, then emit one or more joined rows per data row ---------
my @titleCols1 = read_title_fields($FHFile1);
if ($firstColFile1 == -1) {
    $firstColFile1 = 0;
    $lastColFile1 = @titleCols1 - 1;
}
my @titleFile1 = @titleCols1[$firstColFile1..$lastColFile1];

print {$OutFile} join("\t",@titleFile1)."\t".join("\t",@titleFile2)."\n";

while (defined(my $line = $FHFile1->getline())) {
    $line =~ s/\s+$//;
    my @cols = split /\t/, $line;
    push @cols, '' while $#cols < $lastColFile1;

    # Capture the File1 columns BEFORE the keys are normalised, so File1 data
    # is written exactly as read.
    my $selectedEntries = join("\t", @cols[$firstColFile1..$lastColFile1]);

    my ($keyA, $symbolB) = extract_keys(\@cols, $cola1, $colb1);

    my @matches;
    if ($keyA ne "N/A" and exists $genes2{$keyA}) {
        @matches = ($genes2{$keyA});
    }
    elsif ($symbolB ne "N/A" and exists $genes2{$symbolB}) {
        @matches = ($genes2{$symbolB});
    }
    elsif ($symbolB ne "N/A" and exists $genes4{$symbolB}) {
        # Several File2 rows may share this symbol; sort the keys so the
        # output order is reproducible between runs.
        @matches = map { $genes4{$symbolB}->{$_} } sort keys %{ $genes4{$symbolB} };
    }

    if (@matches) {
        print {$OutFile} "$selectedEntries\t$_\n" for @matches;
    }
    else {
        # Left join: unmatched File1 rows are kept without File2 columns.
        print {$OutFile} "$selectedEntries\n";
    }
}
$FHFile1->close;

# Buffered write errors only show up at close time.
$OutFile->close or die "Error while writing '$Out': $!\n";
exit;
--- a/multi_join_left/multi_join_serial.xml Wed Apr 15 17:51:39 2015 -0400 +++ b/multi_join_left/multi_join_serial.xml Tue Apr 21 16:20:03 2015 -0400 @@ -34,17 +34,17 @@ <param name="Files_2|joinMe" value="multi_join_serial_in3.tab" ftype="tabular"/> <param name="Files_2joinCol" value="2"/> <param name="headerYes" value="yes"/> - <output name="Joined_all" value="multi_join_serial_out.tab" ftype="tabular"/> - <test/> - <tests/> + <output name="Joined_all" value="multi_join_serial_out.tab"/> + </test> + </tests> <help> -This tool performs a left-outer join on multiple (at least two) files using a perl script that Ron wrote (thanks, Ron!). The resulting joined file will have the same number of rows as the first file chosen and subsequent files' matches will be shown if present. Rows in the first file without matches in the other files will have empty cells. If none of the input files have a header present, a simple column number header will be added to the output file to denote the start of each set of matches (from each file, start denoted by "C1"). +This tool performs a left-outer join on multiple (at least two) files using an external perl script (included in tool repository) called addColumnsFromFile2ToFile1.pl. The resulting joined file will have the same number of rows as the first file chosen and subsequent files' matches will be shown if present. Rows in the first file without matches in the other files will have empty cells. If none of the input files have a header present, a simple column number header will be added to the output file to denote the start of each set of matches (from each file, start denoted by "C1"). .. class:: warningmark -This tool may fail due to the system running out of memory depending on the number and size of input files and number of matching lines. The higher all of these are, the more likely the tool is to fail. 
A red output dataset saying "Job killed" typically means the system ran into an out of memory error and as a result the job was killed. This issue has yet to be addressed at the moment... +This tool may fail due to the system running out of memory depending on the number and size of input files and number of matching lines. The higher all of these are, the more likely the tool is to fail. A red output dataset saying "Job killed" typically means the system ran into an out of memory error and as a result the job was killed. **Steps:**
--- a/multi_join_left/run-multi_join_serial.pl Wed Apr 15 17:51:39 2015 -0400 +++ b/multi_join_left/run-multi_join_serial.pl Tue Apr 21 16:20:03 2015 -0400 @@ -6,10 +6,6 @@ use IO::File; use Data::Dumper; -#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/multi_join_shell.pl'; ##comment this line out when finished testing -#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl'; -#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/GetOptWC.pm'; - GetOptions( "join_file=s" => \$data_in, "join_col=s" => \$coljoin, @@ -35,8 +31,8 @@ # # use IO::Handle; -STDOUT->fdopen( \*OUTPUT, 'a' ) or die "cant open file $!\n"; #cms changing mode from 'w' to 'a' for multiple files in one run -STDERR->fdopen( \*ERROR, 'a' ) or die "cant open file $!\n"; #cms changing mode from 'w' to 'a' for multiple files in one run +STDOUT->fdopen( \*OUTPUT, 'a' ) or die "cant open file $!\n"; # changing mode from 'w' to 'a' for multiple files in one run +STDERR->fdopen( \*ERROR, 'a' ) or die "cant open file $!\n"; # changing mode from 'w' to 'a' for multiple files in one run # # # my @options; @@ -57,7 +53,7 @@ open(my $tmpfile, "<", "temp_filenames.txt") or die "Cannot open temp file: $!"; my @fileArray = <$tmpfile>; #unshift @fileArray,$conditions; ##don't need to do this since conditions aren't used here - close($tmpfile) or die "what is that??!!! 
$!"; + close($tmpfile) or die "ERROR: $!"; ##Need to send output file name to shell script: @@ -74,7 +70,7 @@ @first = split('\t',$fileArray[$f]); ##was filename\tJoinCol - ##CMS DEALING WITH HEADER OR NOT: + ##DEALING WITH HEADER OR NOT: if ($header_yes eq "no") { my $fh1; $fh1 = IO::File->new("<$first[0]"); @@ -87,7 +83,7 @@ $head1.="C$i\t"; } $head1.="C$numcols1\n"; - open(my $fh_sub, '>', './header1.txt') or die "OOPIES: $!\n"; + open(my $fh_sub, '>', './header1.txt') or die "ERROR: $!\n"; print $fh_sub $head1; close $fh_sub; system("cat $first[0] >> ./header1.txt"); ##put header in front of file @@ -118,7 +114,7 @@ $second[0]="./header2.txt"; } - system("/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl", "-File1=$first[0]", "-File2=$second[0]", "-cola1=$first[1]", "-cola2=$second[1]", "-colb1=$first[1]", "-colb2=$second[1]"); + system("./addColumnsFromFile2ToFile1.pl", "-File1=$first[0]", "-File2=$second[0]", "-cola1=$first[1]", "-cola2=$second[1]", "-colb1=$first[1]", "-colb2=$second[1]"); $f+=2; system("mv file1_file2.txt joined.txt"); if ($header_yes eq "no") { @@ -142,14 +138,14 @@ $head.="C$i\t"; } $head.="C$numcols\n"; - open(my $fh_sub, '>', './header.txt') or die "OOPIES: $!\n"; + open(my $fh_sub, '>', './header.txt') or die "ERROR: $!\n"; print $fh_sub $head; close $fh_sub; system("cat $current[0] >> ./header.txt"); $current[0]="./header.txt"; } - system("/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl","-File1=joined.txt", "-File2=$current[0]", "-cola1=$first[1]", "-cola2=$current[1]", "-colb1=$first[1]", "-colb2=$current[1]"); + system("./addColumnsFromFile2ToFile1.pl","-File1=joined.txt", "-File2=$current[0]", "-cola1=$first[1]", "-cola2=$current[1]", "-colb1=$first[1]", "-colb2=$current[1]"); system("mv file1_file2.txt joined.txt"); if ($header_yes eq "no") { system("rm ./header.txt"); @@ -158,15 +154,11 @@ system("mv joined.txt $fileArray[-2]"); - ##NOT SURE WHAT TO DO WITH THIS FOR THE 
MULTI-JOIN TOOL: - ##Now, make the EC files from the genes-results files (extract appropriate columns): - #my $condStr = multi_join_shell(@fileArray); ##RSEMgetTPMs needs to take care of carriage returns - ##NEED TO MODIFY RSEMTOEBSEQ_SHELL SO IT TAKES THE OUTPUT FILENAME AS WELL system("rm temp_filenames.txt"); } -elsif ($N<2) { +elsif ($N<2) { ##DO NOTHING }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multi_join_left/test-data/multi_join_serial_out.tab Tue Apr 21 16:20:03 2015 -0400 @@ -0,0 +1,7 @@ +symbol sample1 sample2 sample3 Gene Sample_4 Sample_5 Sample_6 Sample_7 Experiment GENE SampleX SampleY SampleZ +CSNK2A1P 12345 1.2345 8.90 +APOM 0.0 0.0 0.0 APOM 123 123 123 123 1 APOM 132 85 97 +HIST1H2AJ 89.5 75.6 32.3 1 HIST1H2AJ 0 0 0 +ASPHD1 0.001 6.98 2.33 ASPHD1 10 0.075 3.57 1.14 +STBD1 0.2545 600.7 0.05 +