Mercurial > repos > lsong10 > psiclass
view PsiCLASS-1.0.2/psiclass @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict ; use warnings ; use Cwd 'cwd' ; use Cwd 'abs_path' ; use File::Basename; use threads ; use threads::shared ; die "Usage: ./psiclass [OPTIONS]\n". "Required:\n". "\t-b STRING: paths to the alignment BAM files. Use comma to separate multiple BAM files\n". "\t\tor\n". "\t--lb STRING: path to the file listing the alignments BAM files\n". "Optional:\n". "\t-s STRING: path to the trusted splice file (default: not used)\n". "\t-o STRING: prefix of output files (default: ./psiclass)\n". "\t-p INT: number of processes/threads (default: 1)\n". "\t-c FLOAT: only use the subexons with classifier score <= than the given number (default: 0.05)\n". "\t--sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5)\n". "\t--vd FLOAT : the minimum average coverage depth of a transcript to be reported (defaults: 1.0)\n". "\t--maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf)\n". "\t--bamGroup STRING: path to the file listing the group id of BAMs in the --lb file (default: not used)\n". "\t--primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments)\n". #"\t--mateIdx INT: the read id has suffix such as .1, .2 for a mate pair. (default: auto)\n". "\t--version: print version and exit\n". "\t--stage INT: (default: 0)\n". "\t\t0-start from beginning - building splice sites for each sample\n". "\t\t1-start from building subexon files for each sample\n". "\t\t2-start from combining subexon files across samples\n". "\t\t3-start from assembling the transcripts for each sample\n". "\t\t4-start from voting the consensus transcripts across samples\n" if ( @ARGV == 0 ) ; my $WD = dirname( abs_path( $0 ) ) ; my $i ; my $cmd ; my $prefix = "psiclass_" ; my $numThreads = 1 ; sub system_call { print $_[0], "\n" ; system( $_[0] ) == 0 or die "Terminated\n" ; } # Process the arguments my @bamFiles : shared ; my $spliceFile = "" ; my $bamFileList = "" ; my $stage = 0 ; my $classesOpt = "" ; my $juncOpt = "" ; my $trustSpliceOpt = "" ; my $voteOpt = "" ; my $outdir = "." ; my $mateIdx = -1 ; my $spliceAvgSupport = 0.5 ; my $bamGroup = "" ; for ( $i = 0 ; $i < @ARGV ; ++$i ) { if ( $ARGV[$i] eq "--lb" ) { $bamFileList = $ARGV[$i + 1] ; open FP1, $ARGV[$i + 1] ; while ( <FP1> ) { chomp ; push @bamFiles, $_ ; } ++$i ; } elsif ( $ARGV[$i] eq "-b" ) { push @bamFiles, ( split /,/, $ARGV[$i + 1] ) ; ++$i ; } elsif ( $ARGV[$i] eq "-s" ) { $spliceFile = $ARGV[$i + 1] ; ++$i ; } elsif ( $ARGV[$i] eq "-o" ) { my $o = $ARGV[$i + 1] ; # Parse the -o option to get the folder and prefix if ( $o =~ /\// ) { if ( substr( $o, -1 ) ne "/" ) # file prefix is in -o { my @cols = split /\/+/, $o ; $prefix = $cols[-1] ; $outdir = join( "/", @cols[0..($#cols-1)] ) ; } else # -o only specifies path { $outdir = substr( $o, 0, length( $o ) - 1 ) ; } } else { $prefix = $o ; } if ( substr( $prefix, -1 ) ne "_" ) { $prefix .= "_" ; } ++$i ; } elsif ( $ARGV[$i] eq "--stage" ) { $stage = $ARGV[$i + 1] ; ++$i ; } elsif ( $ARGV[ $i ] eq "-p" ) { $numThreads = $ARGV[$i + 1] ; $classesOpt .= " -p $numThreads" ; ++$i ; } elsif ( $ARGV[ $i ] eq "-c" ) { $classesOpt .= " -c ".$ARGV[ $i + 1 ] ; ++$i ; } elsif ( $ARGV[ $i ] eq "--mateIdx" ) { $mateIdx = $ARGV[ $i + 1 ] ; if ( $mateIdx == 1 ) { $classesOpt .= " --hasMateIdSuffix" ; $juncOpt .= " --hasMateIdSuffix" ; } ++$i ; } elsif ( $ARGV[$i] eq "--bamGroup" ) { $bamGroup = $ARGV[$i + 1] ; ++$i ; } elsif ( $ARGV[ $i ] eq "--sa" ) { $trustSpliceOpt .= " -a ".$ARGV[$i + 1] ; ++$i ; } elsif ( $ARGV[ $i ] eq "--vd" ) { $voteOpt .= " -d ".$ARGV[$i + 1] ; ++$i ; } elsif ( $ARGV[$i] eq "--primaryParalog" ) { $classesOpt .= " --primaryParalog" ; } elsif ( $ARGV[$i] eq "--maxDpConstraintSize" ) { $classesOpt .= " --maxDpConstraintSize ".$ARGV[$i + 1] ; ++$i ; } elsif ( $ARGV[$i] eq "--version" ) { die "PsiCLASS v1.0.1\n" ; } else { die "Unknown argument: ", $ARGV[$i], "\n" ; } } if ( scalar( @bamFiles ) == 0 ) { die "Must use option --lb to specify the list of bam files.\n" ; } if ( $mateIdx == -1 ) { open FPsam, "$WD/samtools-0.1.19/samtools view ".$bamFiles[0]."| head -1000 |" ; my $flag = 0 ; while ( <FPsam> ) { my @cols = split /\t/ ; my $id = $cols[0] ; if ( !( $id =~ /[\.|\/][1|2]$/ ) ) { $flag = 1 ; last ; } } close FPsam ; if ( $flag == 0 ) { print "Found mate read id index suffix(.1 or /1). Calling \"--mateIdx 1\" option. If this is a false calling, please use \"--mateIdx 0\".\n" ; $classesOpt .= " --hasMateIdSuffix" ; $juncOpt .= " --hasMateIdSuffix" ; } } mkdir $outdir if ( !-d $outdir ) ; mkdir "$outdir/splice" if ( !-d "$outdir/splice" ) ; mkdir "$outdir/subexon" if ( !-d "$outdir/subexon" ) ; my $threadLock : shared ; my @sharedFiles : shared ; my @threads ; for ( $i = 0 ; $i < $numThreads ; ++$i ) { push @threads, $i ; } sub threadRunSplice { my $tid = threads->tid() - 1 ; my $i ; for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i ) { next if ( ( $i % $numThreads ) != $tid ) ; system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ; } } # Generate the splice file for each bam file. if ( $stage <= 0 ) { if ( $numThreads == 1 ) { for ( $i = 0 ; $i < @bamFiles ; ++$i ) { system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ; #if ( $spliceFile ne "" ) #{ # system_call( "perl $WD/ManipulateIntronFile.pl $spliceFile ${prefix}bam_$i.raw_splice > ${prefix}bam_$i.splice" ) ; #} #else #{ #system_call( "awk \'{if (\$6>1) print;}\' ${prefix}bam_$i.raw_splice > ${prefix}bam_$i.splice" ) ; # system_call( "mv ${prefix}bam_$i.raw_splice ${prefix}bam_$i.splice" ) ; #} } } else { foreach ( @threads ) { $_ = threads->create( \&threadRunSplice ) ; } foreach ( @threads ) { $_->join() ; } } if ( $bamGroup eq "" || $spliceFile ne "" ) { open FPls, ">$outdir/splice/${prefix}splice.list" ; for ( $i = 0 ; $i < @bamFiles ; ++$i ) { print FPls "$outdir/splice/${prefix}bam_$i.raw_splice\n" ; } close FPls ; if ( $spliceFile ne "" ) { for ( $i = 0 ; $i < @bamFiles ; ++$i ) { system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $spliceFile > $outdir/splice/${prefix}bam_$i.splice" ) ; } } else { system_call( "$WD/trust-splice $outdir/splice/${prefix}splice.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam.trusted_splice" ) ; for ( $i = 0 ; $i < @bamFiles ; ++$i ) { system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ; } } } else { my @bamToGroupId ; my %groupNameToId ; open FPbg, $bamGroup ; my $groupUsed = 0 ; while (<FPbg>) { chomp ; if ( !defined $groupNameToId{$_} ) { $groupNameToId{$_} = $groupUsed ; ++$groupUsed ; } push @bamToGroupId, $groupNameToId{$_} ; } close FPbg ; # Process each group one by one my $group ; for ( $group = 0 ; $group < $groupUsed ; ++$group ) { open FPls, ">$outdir/splice/${prefix}splice_$group.list" ; for ( $i = 0 ; $i < @bamFiles ; ++$i ) { next if ( $bamToGroupId[$i] != $group ) ; print FPls "$outdir/splice/${prefix}bam_$i.raw_splice\n" ; } close FPls ; system_call( "$WD/trust-splice $outdir/splice/${prefix}splice_$group.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam_$group.trusted_splice" ) ; for ( $i = 0 ; $i < @bamFiles ; ++$i ) { next if ( $bamToGroupId[$i] != $group ) ; system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam_$group.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ; } } } } # Get subexons from each bam file sub threadRunSubexonInfo { my $tidAdjust = 0 ; $tidAdjust = $numThreads if ( $stage < 1 ) ; my $tid = threads->tid() - $tidAdjust - 1 ; my $i ; for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i ) { next if ( ( $i % $numThreads ) != $tid ) ; system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ; } } if ( $stage <= 1 ) { if ( $numThreads == 1 ) { for ( $i = 0 ; $i < @bamFiles ; ++$i ) { system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ; } } else { #print "hi ", scalar( @threads ), " ", scalar( @bamFiles), "\n" ; foreach ( @threads ) { $_ = threads->create( \&threadRunSubexonInfo ) ; } foreach ( @threads ) { $_->join() ; } } open FPls, ">$outdir/subexon/${prefix}subexon.list" ; for ( $i = 0 ; $i < @bamFiles ; ++$i ) { print FPls "$outdir/subexon/${prefix}subexon_$i.out\n" ; } close FPls ; } # combine the subexons. if ( $stage <= 2 ) { $cmd = "$WD/combine-subexons --ls $outdir/subexon/${prefix}subexon.list > $outdir/subexon/${prefix}subexon_combined.out" ; system_call( "$cmd" ) ; } # Run classes if ( $stage <= 3 ) { my $trimPrefix = substr( $prefix, 0, -1 ) ; my $bamPath = "" ; if ( $bamFileList ne "" ) { $bamPath = " --lb $bamFileList " ; } else { foreach my $b (@bamFiles) { $bamPath .= " -b $b " ; } } $cmd = "$WD/classes $classesOpt $bamPath -s $outdir/subexon/${prefix}subexon_combined.out -o $outdir/${trimPrefix} > $outdir/${prefix}classes.log" ; system_call( "$cmd" ) ; } # Run voting if ( $stage <= 4 ) { open FPgtflist, ">$outdir/${prefix}gtf.list" ; for ( $i = 0 ; $i < @bamFiles ; ++$i ) { print FPgtflist "$outdir/${prefix}sample_${i}.gtf\n" ; } close FPgtflist ; $cmd = "$WD/vote-transcripts --lg $outdir/${prefix}gtf.list $voteOpt > $outdir/${prefix}vote.gtf" ; system_call( "$cmd" ) ; }