view PsiCLASS-1.0.2/psiclass @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict ;
use warnings ;

use Cwd 'cwd' ;
use Cwd 'abs_path' ;
use File::Basename;
use threads ;
use threads::shared ;

die "Usage: ./psiclass [OPTIONS]\n".
    "Required:\n".
    "\t-b STRING: paths to the alignment BAM files. Use comma to separate multiple BAM files\n".
    "\t\tor\n".
    "\t--lb STRING: path to the file listing the alignments BAM files\n".
    "Optional:\n".
    "\t-s STRING: path to the trusted splice file (default: not used)\n".
    "\t-o STRING: prefix of output files (default: ./psiclass)\n". 
    "\t-p INT: number of processes/threads (default: 1)\n".
    "\t-c FLOAT: only use the subexons with classifier score <= than the given number (default: 0.05)\n".
    "\t--sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5)\n".
    "\t--vd FLOAT : the minimum average coverage depth of a transcript to be reported (defaults: 1.0)\n".
    "\t--maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf)\n".
    "\t--bamGroup STRING: path to the file listing the group id of BAMs in the --lb file (default: not used)\n".
    "\t--primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments)\n".
    #"\t--mateIdx INT: the read id has suffix such as .1, .2 for a mate pair. (default: auto)\n".
    "\t--version: print version and exit\n".
    "\t--stage INT:  (default: 0)\n".
    "\t\t0-start from beginning - building splice sites for each sample\n".
    "\t\t1-start from building subexon files for each sample\n".
    "\t\t2-start from combining subexon files across samples\n".
    "\t\t3-start from assembling the transcripts for each sample\n".
    "\t\t4-start from voting the consensus transcripts across samples\n"
    if ( @ARGV == 0 ) ;

my $WD = dirname( abs_path( $0 ) ) ;

my $i ;
my $cmd ;
my $prefix = "psiclass_" ;
my $numThreads = 1 ;

sub system_call
{
	print $_[0], "\n" ;
	system( $_[0] ) == 0 or die "Terminated\n" ;
}

# Process the arguments
my @bamFiles : shared ;
my $spliceFile = "" ;
my $bamFileList = "" ;
my $stage = 0 ;
my $classesOpt = "" ;
my $juncOpt = "" ;
my $trustSpliceOpt = "" ;
my $voteOpt = "" ;
my $outdir = "." ;
my $mateIdx = -1 ;
my $spliceAvgSupport = 0.5 ;
my $bamGroup = "" ;
for ( $i = 0 ; $i < @ARGV ; ++$i )
{
	if ( $ARGV[$i] eq "--lb" )
	{
		$bamFileList = $ARGV[$i + 1] ;
		open FP1, $ARGV[$i + 1] ;	
		while ( <FP1> )
		{
			chomp ;
			push @bamFiles, $_ ;
		}
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-b" )
	{
		push @bamFiles, ( split /,/, $ARGV[$i + 1] ) ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-s" )
	{
		$spliceFile = $ARGV[$i + 1] ;		
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-o" )
	{
		my $o = $ARGV[$i + 1] ;
		# Parse the -o option to get the folder and prefix
		if (  $o =~ /\// )
		{
			if ( substr( $o, -1 ) ne "/" )  # file prefix is in -o
			{
				my @cols = split /\/+/, $o ;
				$prefix = $cols[-1] ; 
				$outdir = join( "/", @cols[0..($#cols-1)] ) ;
			}
			else # -o only specifies path
			{
				$outdir = substr( $o, 0, length( $o ) - 1 ) ;
			}
		}
		else
		{
			$prefix = $o ;
		}
		
		if ( substr( $prefix, -1 ) ne "_" )
		{
			$prefix .= "_" ;
		}
		
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--stage" )
	{
		$stage = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "-p" )
	{
		$numThreads = $ARGV[$i + 1] ;
		$classesOpt .= " -p $numThreads" ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "-c" )
	{
		$classesOpt .= " -c ".$ARGV[ $i + 1 ] ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--mateIdx" )
	{
		$mateIdx = $ARGV[ $i + 1 ] ;
		if ( $mateIdx == 1 )
		{
			$classesOpt .= " --hasMateIdSuffix" ;
			$juncOpt .= " --hasMateIdSuffix" ;
		}
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--bamGroup" )
	{
		$bamGroup = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--sa" )
	{
		$trustSpliceOpt .= " -a ".$ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--vd" )
	{
		$voteOpt .= " -d ".$ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--primaryParalog" )
	{
		$classesOpt .= " --primaryParalog" ;
	}
	elsif ( $ARGV[$i] eq "--maxDpConstraintSize" )
	{
		$classesOpt .= " --maxDpConstraintSize ".$ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--version" )
	{
		die "PsiCLASS v1.0.1\n" ;
	}
	else
	{
		die "Unknown argument: ", $ARGV[$i], "\n" ;
	}
}
if ( scalar( @bamFiles ) == 0 )
{
	die "Must use option --lb to specify the list of bam files.\n" ;
}

if ( $mateIdx == -1 )
{	
	open FPsam, "$WD/samtools-0.1.19/samtools view ".$bamFiles[0]."| head -1000 |" ; 
	my $flag = 0 ;
	while ( <FPsam> )
	{
		my @cols = split /\t/ ;
		my $id = $cols[0] ;

		if ( !( $id =~ /[\.|\/][1|2]$/ ) )
		{
			$flag = 1 ;
			last ;
		}
	}
	close FPsam ;

	if ( $flag == 0 )
	{
		print "Found mate read id index suffix(.1 or /1). Calling \"--mateIdx 1\" option. If this is a false calling, please use \"--mateIdx 0\".\n" ;
		$classesOpt .= " --hasMateIdSuffix" ;
		$juncOpt .= " --hasMateIdSuffix" ;

	}
}

mkdir $outdir if ( !-d $outdir ) ;
mkdir "$outdir/splice" if ( !-d "$outdir/splice" ) ;
mkdir "$outdir/subexon" if ( !-d "$outdir/subexon" ) ;


my $threadLock : shared ;
my @sharedFiles : shared ;
my @threads ;
for ( $i = 0 ; $i < $numThreads ; ++$i )
{
	push @threads, $i ;
}

sub threadRunSplice
{
	my $tid = threads->tid() - 1 ;
	my $i ;
	for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i )	
	{
		next if ( ( $i % $numThreads ) != $tid ) ;
		system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ;
	}
}

# Generate the splice file for each bam file.
if ( $stage <= 0 )
{
	if ( $numThreads == 1 )
	{
		for ( $i = 0 ; $i < @bamFiles ; ++$i )
		{
			system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ;
			#if ( $spliceFile ne "" )
			#{
			#	system_call( "perl $WD/ManipulateIntronFile.pl $spliceFile ${prefix}bam_$i.raw_splice > ${prefix}bam_$i.splice" ) ;
			#}
			#else
			#{
			#system_call( "awk \'{if (\$6>1) print;}\' ${prefix}bam_$i.raw_splice > ${prefix}bam_$i.splice" ) ;
			#	system_call( "mv ${prefix}bam_$i.raw_splice ${prefix}bam_$i.splice" ) ;
			#}
		}
	}
	else
	{
		foreach ( @threads )
		{
			$_ = threads->create( \&threadRunSplice ) ;
		}
		foreach ( @threads )
		{
			$_->join() ;
		}
	}
	
	if ( $bamGroup eq "" || $spliceFile ne "" )
	{
		open FPls, ">$outdir/splice/${prefix}splice.list" ;
		for ( $i = 0 ; $i < @bamFiles ; ++$i )
		{
			print FPls  "$outdir/splice/${prefix}bam_$i.raw_splice\n" ;
		}
		close FPls ;

		if ( $spliceFile ne "" )
		{
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $spliceFile > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}
		else
		{
			system_call( "$WD/trust-splice $outdir/splice/${prefix}splice.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam.trusted_splice" ) ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}
	}
	else
	{
		my @bamToGroupId ;
		my %groupNameToId ;
		open FPbg, $bamGroup ;
		my $groupUsed = 0 ;
		while (<FPbg>) 
		{
			chomp ;
			if ( !defined $groupNameToId{$_} )
			{
				$groupNameToId{$_} = $groupUsed ;
				++$groupUsed ;
			}
			push @bamToGroupId, $groupNameToId{$_} ;	
		}
		close FPbg ;
		
		# Process each group one by one
		my $group ;
		for ( $group = 0 ; $group < $groupUsed ; ++$group )
		{
			open FPls, ">$outdir/splice/${prefix}splice_$group.list" ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				next if ( $bamToGroupId[$i] != $group ) ;
				print FPls  "$outdir/splice/${prefix}bam_$i.raw_splice\n" ;
			}
			close FPls ;

			system_call( "$WD/trust-splice $outdir/splice/${prefix}splice_$group.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam_$group.trusted_splice" ) ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				next if ( $bamToGroupId[$i] != $group ) ;
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam_$group.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}

	}
}


# Get subexons from each bam file
sub threadRunSubexonInfo
{
	my $tidAdjust = 0 ;
	$tidAdjust = $numThreads if ( $stage < 1 ) ;
	my $tid = threads->tid() - $tidAdjust - 1 ;
	my $i ;
	for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i )	
	{
		next if ( ( $i % $numThreads ) != $tid ) ;
		system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ;	
	}
}


if ( $stage <= 1 )
{
	if ( $numThreads == 1 )
	{
		for ( $i = 0 ; $i < @bamFiles ; ++$i )
		{
			system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ;	
		}
	}
	else
	{
		#print "hi ", scalar( @threads ), " ", scalar( @bamFiles), "\n" ;
		foreach ( @threads )
		{
			$_ = threads->create( \&threadRunSubexonInfo ) ;
		}
		foreach ( @threads )
		{
			$_->join() ;
		}
	}
	
	open FPls, ">$outdir/subexon/${prefix}subexon.list" ;
	for ( $i = 0 ; $i < @bamFiles ; ++$i )
	{
		print FPls "$outdir/subexon/${prefix}subexon_$i.out\n" ;
	}
	close FPls ;
}

# combine the subexons.
if ( $stage <= 2 )
{
	$cmd = "$WD/combine-subexons --ls $outdir/subexon/${prefix}subexon.list > $outdir/subexon/${prefix}subexon_combined.out" ;
	system_call( "$cmd" ) ;
}

# Run classes
if ( $stage <= 3 )
{
	my $trimPrefix = substr( $prefix, 0, -1 ) ;
	my $bamPath = "" ;
	if ( $bamFileList ne "" )
	{
		$bamPath = " --lb $bamFileList " ;
	}
	else
	{
		foreach my $b (@bamFiles)
		{
			$bamPath .= " -b $b " ;
		}
	}
	$cmd = "$WD/classes $classesOpt $bamPath -s $outdir/subexon/${prefix}subexon_combined.out -o $outdir/${trimPrefix} > $outdir/${prefix}classes.log" ;
	system_call( "$cmd" ) ;
}

# Run voting
if ( $stage <= 4 )
{
	open FPgtflist, ">$outdir/${prefix}gtf.list" ;
	for ( $i = 0 ; $i < @bamFiles ; ++$i )		
	{
		print FPgtflist "$outdir/${prefix}sample_${i}.gtf\n" ;
	}
	close FPgtflist ;
	$cmd = "$WD/vote-transcripts --lg $outdir/${prefix}gtf.list $voteOpt > $outdir/${prefix}vote.gtf" ;
	system_call( "$cmd" ) ;
}