Mercurial > repos > mir-bioinf > multi_join_left

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multi_join_left/addColumnsFromFile2ToFile1.pl	Tue Apr 21 16:20:03 2015 -0400
@@ -0,0 +1,193 @@
+#! /usr/bin/perl -w
+#===============================================================================
+#
+#     FILENAME:  addColumnsFromFile2ToFile1.pl
+#
+#        USAGE:  see -help
+#
+#  DESCRIPTION:  This program adds columns in File 2 to File 1,
+#                if there are corresponding entries in File 1
+#
+#       AUTHOR:  Ron Stewart
+#      VERSION:  1.1
+#      CREATED:  12/18/06 CDT
+#===============================================================================
+
+use lib '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased';  ##CMS ADDED 11-05-14, DIR CONTAINS CUSTOM MODULE
+
+use strict;
+use IO::File;
+use GetOptWC;
+# Command-line options, parsed by the project-local GetOptWC module.
+my %optVarsIn = ();
+# Each spec reads 'default`type`description' (backtick-separated) - presumably split that way by GetOptWC; confirm against the module.
+$optVarsIn{'File1'} = './File1.txt`=s`Input file of genes to include';
+$optVarsIn{'File2'} = './File2.txt`=s`Input file to be added';
+$optVarsIn{'cola1'} = '2`=f`first column to look for match in File1';
+$optVarsIn{'colb1'} = '4`=f`second column to look for match in File1';
+$optVarsIn{'cola2'} = '2`=f`first column to look for match in File2';
+$optVarsIn{'colb2'} = '4`=f`second column to look for match in File2';
+$optVarsIn{'file1FirstColToCopy'} = '-1`=f`first column to copy in File1';
+$optVarsIn{'file1LastColToCopy'} = '-1`=f`last column to copy  in File1';
+$optVarsIn{'file2FirstColToCopy'} = '-1`=f`first column to copy in File2';
+$optVarsIn{'file2LastColToCopy'} = '-1`=f`last column to copy  in File2';
+$optVarsIn{'HelpPrefix'} = 'This script is for adding entries in File2 to the corresponding entries in File1.';
+$optVarsIn{'HelpSuffix'} = 'example call:  ./addColumnsFromFile2ToFile1.pl -File1=./esAndDiffMarkersWithSage20061211.txt -cola1=2 -colb1=4 -File2=./pan_whole_table_fold_ann.txt -cola2=1 -colb2=4`Note: Two input files should have title lines`NOTE:Files must be text files, NOT .xls files.  If you have an .xls file, save it as "Text(Windows)" in Excel.';
+# getOptions hands back a hash reference; its 'HelpCalled' entry is true when help was requested.
+my $retVarsRef =  GetOptWC::getOptions(\%optVarsIn);
+my %retVars = %{$retVarsRef};
+if ($retVars{'HelpCalled'}) {
+	   print "exiting now, help called\n";
+	   exit;
+}
+# Open both inputs for reading and stop with a clear message when one cannot
+# be opened, instead of failing later on an undefined filehandle.
+my $File1 = $retVars{'File1'};
+my $FHFile1 = IO::File->new($File1, '<') or die "Cannot open File1 '$File1': $!\n";
+my $File2 = $retVars{'File2'};
+my $FHFile2 = IO::File->new($File2, '<') or die "Cannot open File2 '$File2': $!\n";
+
+# Sanitised file names ('.' and '/' replaced by '_'); only echoed as diagnostics.
+my $File1name = $File1;
+print "file1name: $File1name\n";
+$File1name =~ s/[\.\/]/_/g;
+print "file1name: $File1name\n";
+my $File2name = $File2;
+$File2name =~ s/[\.\/]/_/g;
+#my $Out = $File1name.'.'.$File2name; # this can be too long in some cases
+my $Out = "file1_file2.txt";   # fixed output name, relative to the current directory
+print" out is $Out\n";
+my $OutFile = IO::File->new($Out, '>') or die "Cannot create output '$Out': $!\n";
+# Key columns; the values are used directly as subscripts into the tab-split
+# row, so they are 0-based.
+my ($cola1, $colb1) = @retVars{qw(cola1 colb1)};   # File1
+my ($cola2, $colb2) = @retVars{qw(cola2 colb2)};   # File2
+# Inclusive column range to copy from each file; a first column of -1 is
+# expanded to "every column" once that file's title row has been read.
+my ($firstColFile1, $lastColFile1) = @retVars{qw(file1FirstColToCopy file1LastColToCopy)};
+my ($firstColFile2, $lastColFile2) = @retVars{qw(file2FirstColToCopy file2LastColToCopy)};
+# Lookup tables built from File2; each stored value is the tab-joined copied columns.
+my %genes2    = ();   # key A -> copied columns
+my %genes4    = ();   # key B -> { key A -> copied columns }
+my %genes4key = ();   # key B -> "x" (membership marker)
+my $lineCtr = 0;
+my @cols = ();
+
+# The first line of File2 is its title row: drop trailing whitespace, split on tabs.
+my $firstLineFile2 = $FHFile2->getline();
+$firstLineFile2 =~ s/\s+$//;
+@cols = split /\t/, $firstLineFile2;
+my $numColFile2 = scalar @cols;
+($firstColFile2, $lastColFile2) = (0, $numColFile2-1) if $firstColFile2==-1;   # -1: copy every column
+my @titleFile2 = @cols[$firstColFile2..$lastColFile2];
+# Index every data row of File2 by its two key columns.
+#   %genes2    : key A (column cola2, spaces removed) -> copied columns
+#   %genes4    : key B -> { key A -> copied columns }
+#   %genes4key : marks every key B that was seen
+# Key B is the symbol captured from a 'CDS; <symbol>;' annotation in column
+# colb2 when that pattern is present, otherwise the cell with spaces removed.
+while (my $line = $FHFile2->getline()) {
+	   $lineCtr++;
+	   $line =~ s/\R//g;  ##CMS 11-6-14  (removes every line-break sequence, so no chomp is needed)
+	   @cols = split "\t",$line;
+
+	   # Pad short rows through index $lastColFile2 inclusive (the same bound the
+	   # File1 loop uses) so the slice below never contains undef.  The former
+	   # bound stopped one cell early and left the last copied column undefined.
+	   if ($#cols < $lastColFile2) {
+	   		push @cols, ('') x ($lastColFile2 - $#cols);
+	   }
+	   # Key cells outside the copied range may still be missing on short rows.
+	   $cols[$_] //= '' for ($cola2, $colb2);
+
+	   $cols[$cola2] =~ s/ //g;
+	   my $colsBSymbol = "";
+	   if ($cols[$colb2] =~ /\"{0,1}CDS\; ([^\;]+);/) {
+	   		$colsBSymbol = $1;
+	   }
+	   else {
+	   		$cols[$colb2] =~ s/ //g;
+	   		$colsBSymbol = $cols[$colb2];
+	   }
+	   my $copied = join("\t",@cols[$firstColFile2..$lastColFile2]);
+	   $genes2{$cols[$cola2]} = $copied;
+	   $genes4{$colsBSymbol}->{$cols[$cola2]} = $copied;
+	   $genes4key{$colsBSymbol}="x";
+}
+# $lineCtr was incremented once per File2 data row; report it and start over for File1.
+print "linectr: $lineCtr\n";
+$lineCtr = 0;
+
+# File1's first line is its title row: drop trailing whitespace, split on tabs.
+my $firstLineFile1 = $FHFile1->getline();
+$firstLineFile1 =~ s/\s+$//;
+@cols = split /\t/, $firstLineFile1;
+my $numColFile1 = scalar @cols;
+
+# A first column of -1 means "copy every column of the title row".
+($firstColFile1, $lastColFile1) = (0, $numColFile1-1) if $firstColFile1==-1;
+#print "numcolsfile1: $numColFile1\n";
+#print "lastcolsfile1: $lastColFile1\n";
+my @titleFile1 = @cols[$firstColFile1..$lastColFile1];
+#print "tf1:  @titleFile1\n";
+#print "tf2:  @titleFile2\n";
+
+# Title row of the output: selected File1 titles, a tab, then selected File2 titles.
+print {$OutFile} join("\t",@titleFile1), "\t", join("\t",@titleFile2), "\n";
+# Left-outer join of File1 against the File2 lookup tables: every File1 data
+# row is written at least once.  Each row gets two keys:
+#   key A - the cell in column cola1 with spaces removed
+#   key B - the symbol from a 'CDS; <symbol>;' annotation in column colb1,
+#           otherwise that cell with spaces removed
+# The first rule that applies decides the output (a key equal to "N/A" never matches):
+#   1. key A is a File2 key A   -> File1 columns + that File2 row
+#   2. key B is a File2 key A   -> File1 columns + that File2 row
+#   3. key B is a File2 key B   -> one output row per File2 row sharing it
+#   4. otherwise                -> File1 columns only
+while (my $line = $FHFile1->getline()) {
+	   $lineCtr++;
+	   $line =~ s/\s+$//;
+	   @cols = split "\t",$line;
+	   # Pad short rows so the copied slice always has its full width.
+	   if ($#cols < $lastColFile1) {
+	   		push @cols, ('') x ($lastColFile1 - $#cols);
+	   }
+	   # Taken before the key cells are normalised: File1 cells are copied verbatim.
+	   my $selectedEntries = join("\t",@cols[$firstColFile1..$lastColFile1]);
+
+	   # Key cells outside the copied range may still be missing on short rows.
+	   $cols[$_] //= '' for ($cola1, $colb1);
+	   $cols[$cola1] =~ s/ //g;
+	   # Derive key B from column colb1.
+	   my $colsBSymbol = "";
+	   if ($cols[$colb1] =~ /\"{0,1}CDS\; ([^\;]+);/) {
+	   		$colsBSymbol = $1;
+	   }
+	   else {
+	   		$cols[$colb1] =~ s/ //g;
+	   		$colsBSymbol = $cols[$colb1];
+	   }
+
+	   # Apply the "N/A" exclusion to each key separately.  Previously a row whose
+	   # key A was "N/A" could still be joined through key A whenever key B matched.
+	   my $keyAMatches = ($cols[$cola1] ne "N/A" && exists $genes2{$cols[$cola1]});
+	   my $keyBMatches = ($colsBSymbol ne "N/A" && exists $genes2{$colsBSymbol});
+	   if ($keyAMatches) {
+	   		print $OutFile "$selectedEntries"."\t".$genes2{$cols[$cola1]}."\n";
+	   }
+	   elsif ($keyBMatches) {
+	   		print $OutFile "$selectedEntries"."\t".$genes2{$colsBSymbol}."\n";
+	   }
+	   elsif (exists ($genes4key{$colsBSymbol}) and $colsBSymbol ne "N/A" ) {
+	   		# Sorted so the order of the duplicated rows is reproducible between runs.
+	   		foreach my $symbol (sort keys %{$genes4{$colsBSymbol}}) {
+	   			print $OutFile "$selectedEntries"."\t".$genes4{$colsBSymbol}->{$symbol}."\n";
+	   		}
+	   }
+	   else {
+	   		print $OutFile "$selectedEntries"."\n";
+	   }
+}
+# Buffered write errors only surface on close, so check it before exiting.
+$OutFile->close() or die "Error while writing '$Out': $!\n";
+exit;
--- a/multi_join_left/multi_join_serial.xml	Wed Apr 15 17:51:39 2015 -0400
+++ b/multi_join_left/multi_join_serial.xml	Tue Apr 21 16:20:03 2015 -0400
@@ -34,17 +34,17 @@
 	<param name="Files_2|joinMe" value="multi_join_serial_in3.tab" ftype="tabular"/>
 	<param name="Files_2joinCol" value="2"/>
 	<param name="headerYes" value="yes"/>
-	<output name="Joined_all" value="multi_join_serial_out.tab" ftype="tabular"/>
-     <test/>
-  <tests/>
+	<output name="Joined_all" value="multi_join_serial_out.tab"/>
+     </test>
+  </tests>
   <help>

-This tool performs a left-outer join on multiple (at least two) files using a perl script that Ron wrote (thanks, Ron!). The resulting joined file will have the same number of rows as the first file chosen and subsequent files' matches will be shown if present. Rows in the first file without matches in the other files will have empty cells. If none of the input files have a header present, a simple column number header will be added to the output file to denote the start of each set of matches (from each file, start denoted by "C1").
+This tool performs a left-outer join on multiple (at least two) files using an external perl script (included in tool repository) called addColumnsFromFile2ToFile1.pl. The resulting joined file will have the same number of rows as the first file chosen and subsequent files' matches will be shown if present. Rows in the first file without matches in the other files will have empty cells. If none of the input files have a header present, a simple column number header will be added to the output file to denote the start of each set of matches (from each file, start denoted by "C1").


 .. class:: warningmark

-This tool may fail due to the system running out of memory depending on the number and size of input files and number of matching lines. The higher all of these are, the more likely the tool is to fail. A red output dataset saying "Job killed" typically means the system ran into an out of memory error and as a result the job was killed. This issue has yet to be addressed at the moment...
+This tool may fail due to the system running out of memory depending on the number and size of input files and number of matching lines. The higher all of these are, the more likely the tool is to fail. A red output dataset saying "Job killed" typically means the system ran into an out of memory error and as a result the job was killed.


 **Steps:**
--- a/multi_join_left/run-multi_join_serial.pl	Wed Apr 15 17:51:39 2015 -0400
+++ b/multi_join_left/run-multi_join_serial.pl	Tue Apr 21 16:20:03 2015 -0400
@@ -6,10 +6,6 @@
 use IO::File;
 use Data::Dumper;

-#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/multi_join_shell.pl';  ##comment this line out when finished testing
-#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl';
-#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/GetOptWC.pm';
-
 GetOptions(
     "join_file=s"             => \$data_in,
     "join_col=s"	     => \$coljoin,
@@ -35,8 +31,8 @@

 # #
 use IO::Handle;
-STDOUT->fdopen( \*OUTPUT, 'a' ) or die "cant open file $!\n";   #cms changing mode from 'w' to 'a' for multiple files in one run
-STDERR->fdopen( \*ERROR,  'a' ) or die "cant open file $!\n";   #cms changing mode from 'w' to 'a' for multiple files in one run
+STDOUT->fdopen( \*OUTPUT, 'a' ) or die "cant open file $!\n";   # changing mode from 'w' to 'a' for multiple files in one run
+STDERR->fdopen( \*ERROR,  'a' ) or die "cant open file $!\n";   # changing mode from 'w' to 'a' for multiple files in one run
 # # #

 my @options;
@@ -57,7 +53,7 @@
 	open(my $tmpfile, "<", "temp_filenames.txt") or die "Cannot open temp file: $!";
 	my @fileArray = <$tmpfile>;
 	#unshift @fileArray,$conditions; ##don't need to do this since conditions aren't used here
-	close($tmpfile) or die "what is that??!!! $!";
+	close($tmpfile) or die "ERROR: $!";


 	##Need to send output file name to shell script:
@@ -74,7 +70,7 @@

 		@first = split('\t',$fileArray[$f]);  ##was filename\tJoinCol

-		##CMS DEALING WITH HEADER OR NOT:
+		##DEALING WITH HEADER OR NOT:
                 if ($header_yes eq "no") {
 			my $fh1;
                 	$fh1 = IO::File->new("<$first[0]");
@@ -87,7 +83,7 @@
 				$head1.="C$i\t";
 			}
 			$head1.="C$numcols1\n";
-			open(my $fh_sub, '>', './header1.txt') or die "OOPIES: $!\n";
+			open(my $fh_sub, '>', './header1.txt') or die "ERROR: $!\n";
 			print $fh_sub $head1;
 			close $fh_sub;
 			system("cat $first[0] >> ./header1.txt");  ##put header in front of file
@@ -118,7 +114,7 @@
 			$second[0]="./header2.txt";
                 }

-		system("/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl", "-File1=$first[0]", "-File2=$second[0]", "-cola1=$first[1]", "-cola2=$second[1]", "-colb1=$first[1]", "-colb2=$second[1]");
+		system("./addColumnsFromFile2ToFile1.pl", "-File1=$first[0]", "-File2=$second[0]", "-cola1=$first[1]", "-cola2=$second[1]", "-colb1=$first[1]", "-colb2=$second[1]");
 		$f+=2;
 		system("mv file1_file2.txt joined.txt");
 		if ($header_yes eq "no") {
@@ -142,14 +138,14 @@
                                 $head.="C$i\t";
                         }
                         $head.="C$numcols\n";
-                        open(my $fh_sub, '>', './header.txt') or die "OOPIES: $!\n";
+                        open(my $fh_sub, '>', './header.txt') or die "ERROR: $!\n";
                         print $fh_sub $head;
                         close $fh_sub;
                         system("cat $current[0] >> ./header.txt");
                         $current[0]="./header.txt";
                 }

-                system("/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl","-File1=joined.txt", "-File2=$current[0]", "-cola1=$first[1]", "-cola2=$current[1]", "-colb1=$first[1]", "-colb2=$current[1]");
+                system("./addColumnsFromFile2ToFile1.pl","-File1=joined.txt", "-File2=$current[0]", "-cola1=$first[1]", "-cola2=$current[1]", "-colb1=$first[1]", "-colb2=$current[1]");
 		system("mv file1_file2.txt joined.txt");
 		if ($header_yes eq "no") {
 			system("rm ./header.txt");
@@ -158,15 +154,11 @@

 	system("mv joined.txt $fileArray[-2]");

-	##NOT SURE WHAT TO DO WITH THIS FOR THE MULTI-JOIN TOOL:
-	##Now, make the EC files from the genes-results files (extract appropriate columns):
-	#my $condStr =  multi_join_shell(@fileArray);  ##RSEMgetTPMs needs to take care of carriage returns
-	##NEED TO MODIFY RSEMTOEBSEQ_SHELL SO IT TAKES THE OUTPUT FILENAME AS WELL

 	system("rm temp_filenames.txt");

 }
-elsif ($N<2) {
+elsif ($N<2) {  ##DO NOTHING
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multi_join_left/test-data/multi_join_serial_out.tab	Tue Apr 21 16:20:03 2015 -0400
@@ -0,0 +1,7 @@
+symbol	sample1	sample2	sample3	Gene	Sample_4	Sample_5	Sample_6	Sample_7	Experiment	GENE	SampleX	SampleY	SampleZ
+CSNK2A1P	12345	1.2345	8.90
+APOM	0.0	0.0	0.0	APOM	123	123	123	123	1	APOM	132	85	97
+HIST1H2AJ	89.5	75.6	32.3						1	HIST1H2AJ	0	0	0
+ASPHD1	0.001	6.98	2.33	ASPHD1	10	0.075	3.57	1.14
+STBD1	0.2545	600.7	0.05
+