Mercurial > repos > mir-bioinf > multi_join_left
diff run-multi_join_serial.pl @ 2:3a9cc859f4c1 draft
Uploaded
author | mir-bioinf |
---|---|
date | Wed, 15 Apr 2015 14:43:04 -0400 |
parents | |
children | 0aa0ebcd307c |
line wrap: on
line diff
#!/usr/bin/perl

# run-multi_join_serial.pl
#
# Wrapper that joins several tab-delimited files by chaining
# addColumnsFromFile2ToFile1.pl. The script is invoked once per input file
# ("iteration"). Every invocation appends its input file and join column to
# temp_filenames.txt in the current working directory. The invocation for
# which --iteration equals --totalfiles reads that list back, joins the first
# two files, joins every further file onto the intermediate result
# (joined.txt), and finally moves the result to --resultsfile.
#
# Options:
#   --log=FILE          STDOUT and STDERR of this script are appended to FILE
#   --join_file=FILE    input file handled by this invocation (required)
#   --join_col=N        join column of that file; decremented by one before
#                       it is recorded and handed to the join tool (required)
#   --iteration=I       index of this invocation (defaults to 0)
#   --totalfiles=N      the joins run when I == N (defaults to 0)
#   --with_header=STR   "no" makes the script prepend a synthetic
#                       C1..Cn header line to each input before joining
#   --resultsfile=FILE  destination of the joined table (needed when joining)
#   --input_name=STR, --time, -q/--quiet
#                       accepted for compatibility; not used by this script

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;
use IO::File;
use IO::Handle;
use Data::Dumper;
use File::Copy qw(move);

#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/multi_join_shell.pl'; ##comment this line out when finished testing
#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl';
#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/GetOptWC.pm';

# External join tool and the fixed file names this script and the tool share.
my $JOIN_TOOL   = '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl';
my $FILE_LIST   = 'temp_filenames.txt';    # one "file<TAB>column" line per invocation
my $JOIN_OUTPUT = 'file1_file2.txt';       # renamed to $JOINED after every join step
my $JOINED      = 'joined.txt';            # running intermediate result

my ( $log, $data_in, $coljoin, $mTime, $quiet, $I, $N, $header_yes, $in_name, $out_file );

GetOptions(
    "log=s"         => \$log,
    "join_file=s"   => \$data_in,
    "join_col=s"    => \$coljoin,
    "time"          => \$mTime,
    "q|quiet"       => \$quiet,
    "iteration=i"   => \$I,
    "totalfiles=i"  => \$N,
    "with_header=s" => \$header_yes,
    "input_name=s"  => \$in_name,
    "resultsfile=s" => \$out_file,
#    "h|help"        => \$help
) or pod2usage( -exitval => 2, -verbose => 2 );

#check parameters and options
# Without these three values the run cannot do anything meaningful, and a
# half-filled line in the file list would corrupt later invocations.
for my $required ( [ log => $log ], [ join_file => $data_in ], [ join_col => $coljoin ] ) {
    my ( $name, $value ) = @{$required};
    pod2usage( -msg => "Missing required option --$name", -exitval => 2, -verbose => 0 )
        unless defined $value && length $value;
}

# Absent counters behave like 0, which is how they were treated numerically
# before; this keeps the "only one file" path usable without them.
$I = 0 unless defined $I;
$N = 0 unless defined $N;

$coljoin--;    # the join tool is handed the column index minus one
$N++;          # after this, the joins run when $I == $N - 1

# Send everything this script (and its child processes) prints to the log.
# Append mode, because several invocations share one log file.
open STDOUT, '>>', $log or die "cant open this file for OUTPUT: $log. Computer says: $!\n";
open STDERR, '>>', $log or die "cant open this file for ERROR: $log. Computer says: $!\n";

##Keeping track of the input files (one per iteration of this script) in an external file:
open my $list_fh, '>>', $FILE_LIST or die "cannot open the temporary file $!\n";
print {$list_fh} "$data_in\t$coljoin\n";
close $list_fh or die "cannot close the temporary file $!\n";

my $add_header = ( defined $header_yes && $header_yes eq "no" ) ? 1 : 0;

if ( $I == $N - 1 && $N >= 2 ) {
    ## At the end of the last iteration
    print "\nLAST ITERATION COMPLETED and at least two input files provided.\n";

    die "Option --resultsfile is required to store the joined table\n"
        unless defined $out_file && length $out_file;

    ##Read in file temp_filenames.txt
    open my $tmpfile, '<', $FILE_LIST or die "Cannot open temp file: $!";
    # chomp, so the join column does not drag a newline into the tool arguments
    chomp( my @fileArray = <$tmpfile> );
    close $tmpfile or die "what is that??!!! $!";

    # The list now lives in memory; remove the file straight away so that a
    # failure further down cannot leave stale entries for a later run.
    remove_files($FILE_LIST);

    my $entry_count = @fileArray;
    die "Expected $N entries in $FILE_LIST but found only $entry_count\n"
        if $entry_count < $N;

    ##adds out_file and the yes/no header flag to the end of fileArray
    push @fileArray, $out_file;
    push @fileArray, ( defined $header_yes ? $header_yes : '' );

    ##Debug:
    print "\nFirst file fileArray[0] is $fileArray[0].";
    print "\nOutput file is next-to-last val in fileArray, $fileArray[-2].";
    print "\nUse header? is last val in fileArray, $fileArray[-1].";
    print "\nSecond file now is fileArray[1], $fileArray[1].";

    ## --- first two files --------------------------------------------------
    my @first = split /\t/, $fileArray[0];    ##was filename\tJoinCol
    print "\njoin column from first line is $first[1].";
    my @second = split /\t/, $fileArray[1];

    my ( $left, $right ) = ( $first[0], $second[0] );
    if ($add_header) {
        $left  = write_with_header( $first[0],  './header1.txt', "header first file $first[0]" );
        $right = write_with_header( $second[0], './header2.txt', "header from $second[0]" );
    }

    run_join( $left, $right, $first[1], $second[1] );
    remove_files( './header2.txt', './header1.txt' ) if $add_header;

    ## --- every further file is joined onto the running result -------------
    for my $index ( 2 .. $N - 1 ) {
        my @current = split /\t/, $fileArray[$index];    ##was filename\tJoinCol
        print "\njoin column from first line is $first[1].";

        my $next_file = $current[0];
        if ($add_header) {
            $next_file = write_with_header( $current[0], './header.txt', "header from file $current[0]" );
        }

        # joined.txt is always addressed with the first file's join column.
        run_join( $JOINED, $next_file, $first[1], $current[1] );
        remove_files('./header.txt') if $add_header;
    }

    move( $JOINED, $out_file )
        or die "Cannot move $JOINED to $out_file: $!\n";

    ##NOT SURE WHAT TO DO WITH THIS FOR THE MULTI-JOIN TOOL:
    ##Now, make the EC files from the genes-results files (extract appropriate columns):
    #my $condStr = multi_join_shell(@fileArray); ##RSEMgetTPMs needs to take care of carriage returns
    ##NEED TO MODIFY RSEMTOEBSEQ_SHELL SO IT TAKES THE OUTPUT FILENAME AS WELL
}
elsif ( $N < 2 ) {
    print "\n<br /><i>Only one file; not running join.</i>\n";
}

#print "LOG $mv\n";

# Copy $source to $target with a synthetic header line (C1<TAB>C2...Cn) in
# front. The column count is taken from the first line of $source after
# trailing whitespace has been stripped. $label is the lead-in of the debug
# message written to the log. Returns $target. Dies if either file cannot be
# opened or the copy cannot be completed.
sub write_with_header {
    my ( $source, $target, $label ) = @_;

    open my $in, '<', $source or die "Cannot open $source to build its header: $!\n";
    my $first_line = <$in>;

    my $probe = defined $first_line ? $first_line : '';
    $probe =~ s/\s+$//;
    my @cols    = split /\t/, $probe;
    my $numcols = @cols;

    # The last name is always emitted, so an empty first line yields "C0".
    my @names = map { "C$_" } 1 .. $numcols - 1;
    push @names, "C$numcols";
    my $header = join( "\t", @names ) . "\n";

    open my $out, '>', $target or die "OOPIES: $!\n";
    print "\n$label on next line:\n$header";    ##DEBUG
    print {$out} $header;
    print {$out} $first_line if defined $first_line;
    while ( my $line = <$in> ) {
        print {$out} $line;
    }
    close $out or die "Cannot finish writing $target: $!\n";
    close $in;

    return $target;
}

# Run the join tool on ($file1, $file2) using $col1 / $col2 for both the
# "cola" and "colb" arguments, log the command and its outcome, then rename
# the tool's file1_file2.txt to joined.txt. A non-zero status only produces a
# warning; the script dies when the expected output file cannot be renamed.
sub run_join {
    my ( $file1, $file2, $col1, $col2 ) = @_;

    my @command = (
        $JOIN_TOOL,
        "-File1=$file1", "-File2=$file2",
        "-cola1=$col1",  "-cola2=$col2",
        "-colb1=$col1",  "-colb2=$col2",
    );

    print "\ncommand following:\n";
    print join( ' ', @command );

    # List form: no shell is involved, so paths need no quoting.
    my $status = system(@command);

    my $outcome
        = $status == -1 ? "could not start: $!"
        : $status & 127 ? 'killed by signal ' . ( $status & 127 )
        :                 'exit status ' . ( $status >> 8 );
    print "\nOut from system call on next line:\n$outcome";
    warn "\n$JOIN_TOOL did not finish cleanly ($outcome)\n" if $status != 0;

    rename $JOIN_OUTPUT, $JOINED
        or die "\nJoin step left no usable $JOIN_OUTPUT (cannot rename to $JOINED): $!\n";

    return;
}

# Delete the given files; a failure is reported in the log but is not fatal.
sub remove_files {
    my @paths = @_;
    for my $path (@paths) {
        unlink $path or warn "could not remove $path: $!\n";
    }
    return;
}