Mercurial > repos > mir-bioinf > multi_join_left
view run-multi_join_serial.pl @ 3:0aa0ebcd307c draft
Uploaded
author | mir-bioinf |
---|---|
date | Wed, 15 Apr 2015 16:31:04 -0400 |
parents | 3a9cc859f4c1 |
children |
line wrap: on
line source
#!/usr/bin/perl
#
# Serial driver for a multi-file join.
#
# The script is meant to be run once per input file.  Every run appends
# "<join_file>\t<join_col - 1>" to temp_filenames.txt in the current
# directory.  The run whose --iteration equals --totalfiles (and only when
# at least two files are expected) reads that list back, joins the first two
# files with addColumnsFromFile2ToFile1.pl, folds each remaining file into
# the running result (joined.txt), moves the result to --resultsfile and
# deletes temp_filenames.txt.
#
# Options:
#   --join_file=PATH    input file recorded by this run (required)
#   --join_col=N        join column of that file; stored decremented by one
#                       (required)
#   --iteration=I       index of this run (required)
#   --totalfiles=N      compared against --iteration to detect the last run
#                       (required)
#   --with_header=S     when S is "no", a synthetic "C1..Cn" header line is
#                       put in front of a temporary copy of every input
#                       before joining; any other value leaves inputs as-is
#   --resultsfile=PATH  destination of the joined table (needed by the last
#                       run)
#   --time, -q|--quiet, --input_name=S
#                       accepted for compatibility; not used by this script

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;
use IO::File;
use Data::Dumper;
use File::Copy qw(move);

#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/multi_join_shell.pl'; ##comment this line out when finished testing
#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl';
#require '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/GetOptWC.pm';

use constant {
    # External script that performs one pairwise join.
    JOIN_SCRIPT =>
        '/opt/galaxy/galaxy-dist/tools/ngs_rna/Unreleased/addColumnsFromFile2ToFile1.pl',

    # File this script picks up after each call to JOIN_SCRIPT
    # (presumably written by JOIN_SCRIPT into the current directory -- confirm).
    JOIN_OUTPUT => 'file1_file2.txt',

    # Running result that every further file is joined onto.
    JOINED_FILE => 'joined.txt',

    # List of "<file>\t<column>" lines accumulated across runs.
    LIST_FILE => 'temp_filenames.txt',
};

my ( $data_in, $coljoin, $mTime, $quiet, $I, $N, $header_yes, $in_name,
    $out_file );

GetOptions(
    "join_file=s"   => \$data_in,
    "join_col=s"    => \$coljoin,
    "time"          => \$mTime,
    "q|quiet"       => \$quiet,
    "iteration=i"   => \$I,
    "totalfiles=i"  => \$N,
    "with_header=s" => \$header_yes,
    "input_name=s"  => \$in_name,
    "resultsfile=s" => \$out_file,

    # "h|help" => \$help
) or pod2usage( -exitval => 2, -verbose => 2 );

# Check parameters and options: without these the list file would be
# corrupted or the last run could never be detected.
for my $required (
    [ join_file  => $data_in ],
    [ join_col   => $coljoin ],
    [ iteration  => $I ],
    [ totalfiles => $N ],
    )
{
    my ( $name, $value ) = @{$required};
    pod2usage(
        -msg     => "Missing required option --$name",
        -exitval => 2,
        -verbose => 0,
    ) unless defined $value && length $value;
}

# The column is stored decremented by one (presumably converting a 1-based
# column number into the 0-based index the join script expects -- confirm).
$coljoin--;

# Number of entries expected in the list file; it is one more than
# --totalfiles, so the final run is the one with --iteration == --totalfiles.
my $file_count = $N + 1;

# NOTE(review): the original carried STDOUT/STDERR redirections at this
# point.  Neither OUTPUT nor ERROR is opened anywhere in this script, so the
# fdopen calls could only fail; they are kept here disabled.
#use IO::Handle;
#STDOUT->fdopen( \*OUTPUT, 'a' ) or die "cant open file $!\n"; #cms changing mode from 'w' to 'a' for multiple files in one run
#STDERR->fdopen( \*ERROR, 'a' ) or die "cant open file $!\n"; #cms changing mode from 'w' to 'a' for multiple files in one run

## Keeping track of the input files (one per iteration of this script) in an
## external file:
open( my $list_out, '>>', LIST_FILE )
    or die "cannot open the temporary file $!\n";
print {$list_out} "$data_in\t$coljoin\n";
close($list_out)
    or die "cannot write the temporary file $!\n";

if ( $I == $file_count - 1 && $file_count >= 2 ) {
    ## At the end of the last iteration: perform the actual joins.
    pod2usage(
        -msg     => 'Missing required option --resultsfile',
        -exitval => 2,
        -verbose => 0,
    ) unless defined $out_file && length $out_file;

    my @inputs     = read_input_list( LIST_FILE, $file_count );
    my $add_header = defined $header_yes && $header_yes eq 'no';

    ## First two files: joined directly with each other.
    my ( $first, $second ) = @inputs[ 0, 1 ];
    my $first_path  = $first->{path};
    my $second_path = $second->{path};
    if ($add_header) {
        $first_path =
            with_synthetic_header( $first->{path}, './header1.txt' );
        $second_path =
            with_synthetic_header( $second->{path}, './header2.txt' );
    }
    run_join( $first_path, $first->{col}, $second_path, $second->{col} );
    if ($add_header) {
        remove_temp_file('./header2.txt');
        remove_temp_file('./header1.txt');
    }

    ## Every further file is joined onto the running result.  The running
    ## result keeps using the join column of the first file.
    for my $current ( @inputs[ 2 .. $#inputs ] ) {
        my $current_path = $current->{path};
        if ($add_header) {
            $current_path =
                with_synthetic_header( $current->{path}, './header.txt' );
        }
        run_join( JOINED_FILE, $first->{col}, $current_path,
            $current->{col} );
        remove_temp_file('./header.txt') if $add_header;
    }

    move( JOINED_FILE, $out_file )
        or die 'cannot move ' . JOINED_FILE . " to '$out_file': $!\n";

    remove_temp_file(LIST_FILE);
}

# Any other run (not the last one, or fewer than two files expected) only
# records its file in the list and does nothing else.

# Read the "<file>\t<column>" entries recorded by the individual runs.
#
# Params:  $list_file - path of the list file
#          $expected  - number of entries that must be present
# Returns: list of hash refs { path => ..., col => ... } for the first
#          $expected lines (any additional lines are ignored).
# Dies:    when the file cannot be read, has too few lines, or a line does
#          not contain both fields.
sub read_input_list {
    my ( $list_file, $expected ) = @_;

    open( my $list_in, '<', $list_file )
        or die "Cannot open temp file: $!";
    my @lines = <$list_in>;
    close($list_in)
        or die "Cannot close temp file: $!";

    die "Expected $expected entries in $list_file but found "
        . scalar(@lines) . "\n"
        if @lines < $expected;

    my @entries;
    for my $line ( @lines[ 0 .. $expected - 1 ] ) {

        # Strip the line terminator so it does not end up inside the
        # column arguments handed to the join script.
        chomp $line;
        my ( $path, $col ) = split /\t/, $line;
        die "Malformed entry in $list_file: '$line'\n"
            unless defined $path
            && length $path
            && defined $col
            && length $col;
        push @entries, { path => $path, col => $col };
    }
    return @entries;
}

# Write a copy of $source to $target with a generated header line in front.
#
# The header names one column per tab-separated field of the first line of
# $source (after trailing whitespace is removed): "C1\tC2\t...\tCn".  A first
# line without any field yields the single name "C0".
#
# Params:  $source - headerless input file
#          $target - file to create (overwritten if present)
# Returns: $target, so the caller can use it in place of $source.
# Dies:    when either file cannot be opened or written.
sub with_synthetic_header {
    my ( $source, $target ) = @_;

    open( my $in, '<', $source )
        or die "cannot read '$source': $!\n";
    open( my $out, '>', $target )
        or die "cannot write '$target': $!\n";

    my $first_line = <$in>;
    my $probe      = defined $first_line ? $first_line : '';
    $probe =~ s/\s+$//;
    my @fields = split /\t/, $probe;
    my @names  = @fields ? map { "C$_" } 1 .. scalar(@fields) : ('C0');

    print {$out} join( "\t", @names ), "\n";

    # Copy the data unchanged, including the line that was only inspected.
    print {$out} $first_line if defined $first_line;
    while ( my $line = <$in> ) {
        print {$out} $line;
    }

    close($in);
    close($out)
        or die "cannot write '$target': $!\n";
    return $target;
}

# Join two files with the external join script and make the outcome the new
# running result (JOINED_FILE).
#
# Params:  $file1, $col1 - left file and its join column
#          $file2, $col2 - right file and its join column
# Dies:    when the join script fails or its output cannot be renamed.
sub run_join {
    my ( $file1, $col1, $file2, $col2 ) = @_;

    # List form: the arguments bypass the shell, so paths are passed as-is.
    my @command = (
        JOIN_SCRIPT,     "-File1=$file1", "-File2=$file2", "-cola1=$col1",
        "-cola2=$col2",  "-colb1=$col1",  "-colb2=$col2",
    );

    if ( system(@command) != 0 ) {
        my $why
            = $? == -1 ? "could not be started: $!"
            : $? & 127 ? 'was killed by signal ' . ( $? & 127 )
            :            'exited with status ' . ( $? >> 8 );
        die "Join script $why (File1=$file1, File2=$file2)\n";
    }

    move( JOIN_OUTPUT, JOINED_FILE )
        or die 'cannot move ' . JOIN_OUTPUT . ' to ' . JOINED_FILE . ": $!\n";
    return;
}

# Delete a temporary file; a failure is reported but does not stop the run.
sub remove_temp_file {
    my ($path) = @_;
    unlink($path)
        or warn "cannot remove '$path': $!\n";
    return;
}