Mercurial > repos > sarahinraauzeville > star
view sm_STAR2_V2.pl @ 5:c4fc8ff6e280 draft
Uploaded
author | sarahinraauzeville |
---|---|
date | Tue, 12 Dec 2017 10:16:23 -0500 |
parents | 80e19490ec6a |
children |
line wrap: on
line source
#!/usr/bin/perl -w # usage : perl sm_STAR.pl <read1.fastq.gz> <read2.fastq.gz> # 10/02/2014 - Wrapper du traitement des données RNAseq # Sarah Maman # Copyright (C) 2014 INRA # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # use strict; use File::Basename; use Getopt::Long; use lib "$ENV{'MY_GALAXY_DIR'}"; use GalaxyPath; my $cfg = GalaxyPath->new( -file => $ENV{"GALAXY_CONFIG_FILE"}); my $PATH = $cfg->my_path( 'workPath', 'MYWORKSPACE' ); my $STAR = $cfg->my_path( 'toolsPath', 'STAR_PATH' ); my $Nthreads; my $genome_path; my $reads_selector; my $input_read; my $Read1fastqgz; my $Read2fastqgz; my $alignIntronMin; my $alignIntronMax; my $outFilterMismatchNmax; my $orientation; my $refownfastaref; my $refselector; my $refowngtf; my $compress; my $cufflinks; my $outputfile; my $outputfileT; my $outputlogSJ; my $outputlogfinal; Getopt::Long::Configure( 'no_ignorecase', 'bundling' ); GetOptions ( 'runThreadN=i' => \$Nthreads, 'genomeDir=s' => \$genome_path, 'refselector=s' => \$refselector, 'refownfastaref=s' => \$refownfastaref, 'refowngtf=s' => \$refowngtf, 'compress=s' => \$compress, 'cufflinks=s' => \$cufflinks, 'readsselector=s'=> \$reads_selector, 'readFilesIn1=s' => \$Read1fastqgz, 'readFilesIn2=s' => \$Read2fastqgz, 'readsinputread=s' => \$input_read, 'alignIntronMin=i' => \$alignIntronMin, 'alignIntronMax=i' => \$alignIntronMax, 'outFilterMismatchNmax=i' => \$outFilterMismatchNmax, 'orientation=s' => \$orientation, 'outputfile=s' => \$outputfile, 'outputfileT=s' => \$outputfileT, 'outputlogfinal=s' => \$outputlogfinal, 'outputlogSJ=s' => \$outputlogSJ ) or die "Usage: Error in command line arguments\n"; my $cmd1 = ''; my $cmd2 =''; my $cmd3 = ''; my $cmd4 =''; #STAR --runThreadN 4 --runMode genomeGenerate --genomeDir /work/smaman/TP_RNAseq/INDEX/ --genomeFastaFiles ITAG2.3_genomic_Ch6.fasta --sjdbGTFfile ITAG_pre2.3_gene_models_Ch6.gtf --sjdbOverhang 100 #smaman@node001 /work/smaman/TP_RNAseq $ ls -ltrah INDEX #-rw-r--r-- 1 smaman BIOINFO 331 17 juil. 11:55 genomeParameters.txt #-rw-r--r-- 1 smaman BIOINFO 387K 17 juil. 11:55 exonGeTrInfo.tab #-rw-r--r-- 1 smaman BIOINFO 53K 17 juil. 11:55 geneInfo.tab #-rw-r--r-- 1 smaman BIOINFO 151K 17 juil. 11:55 transcriptInfo.tab #-rw-r--r-- 1 smaman BIOINFO 171K 17 juil. 11:55 exonInfo.tab #-rw-r--r-- 1 smaman BIOINFO 325K 17 juil. 11:55 sjdbList.fromGTF.out.tab #-rw-r--r-- 1 smaman BIOINFO 272K 17 juil. 11:55 sjdbInfo.txt #-rw-r--r-- 1 smaman BIOINFO 325K 17 juil. 11:55 sjdbList.out.tab #-rw-r--r-- 1 smaman BIOINFO 11 17 juil. 11:55 chrName.txt #-rw-r--r-- 1 smaman BIOINFO 9 17 juil. 11:55 chrLength.txt #-rw-r--r-- 1 smaman BIOINFO 11 17 juil. 11:55 chrStart.txt #-rw-r--r-- 1 smaman BIOINFO 20 17 juil. 11:55 chrNameLength.txt #-rw-r--r-- 1 smaman BIOINFO 47M 17 juil. 11:55 Genome #-rw-r--r-- 1 smaman BIOINFO 360M 17 juil. 11:55 SA #-rw-r--r-- 1 smaman BIOINFO 1,5G 17 juil. 11:55 SAindex #STAR --readFilesIn WTr1.fastq WTr2.fastq --genomeDir /work/smaman/TP_RNAseq/INDEX/ --sjdbGTFfile ITAG_pre2.3_gene_models_Ch6.gtf --outSAMtype BAM SortedByCoordinate --alignIntronMin 20 --alignIntronMax 1000000 --outFilterMismatchNmax 10 --outSAMtype BAM SortedByCoordinate --runThreadN 4 --outFileNamePrefix galaxyName --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --outFilterType BySJout --quantMode TranscriptomeSAM #-rw-r--r-- 1 smaman BIOINFO 45M 26 mars 2015 ITAG2.3_genomic_Ch6.fasta #-rw-r--r-- 1 smaman BIOINFO 1,6M 26 mars 2015 ITAG_pre2.3_gene_models_Ch6.gtf #-rw-r--r-- 1 smaman BIOINFO 29 26 mars 2015 ITAG2.3_genomic_Ch6.fasta.fai #-rw-r--r-- 1 smaman BIOINFO 614 17 juil. 10:20 WTr1.fastq #-rw-r--r-- 1 smaman BIOINFO 589 17 juil. 10:20 WTr2.fastq #-rw-r--r-- 1 smaman BIOINFO 14K 17 juil. 11:55 Log.out #-rw-r--r-- 1 smaman BIOINFO 35K 17 juil. 12:03 galaxyNameAligned.toTranscriptome.out.bam #-rw-r--r-- 1 smaman BIOINFO 637 17 juil. 12:03 galaxyNameAligned.sortedByCoord.out.bam +++++++++ #-rw-r--r-- 1 smaman BIOINFO 0 17 juil. 12:03 galaxyNameSJ.out.tab ++++++++++++++++ #-rw-r--r-- 1 smaman BIOINFO 246 17 juil. 12:03 galaxyNameLog.progress.out #-rw-r--r-- 1 smaman BIOINFO 1,7K 17 juil. 12:03 galaxyNameLog.final.out +++++++++++++++ #-rw-r--r-- 1 smaman BIOINFO 16K 17 juil. 12:03 galaxyNameLog.out #workspace my $debug = 0; #Mode debug if ($debug == 0) { print STDOUT "Debug mode OK \n"; } else { $PATH = dirname($outputfile); print STDOUT "No debug \n"; } #Récuperer le numero (unique) de l'output afin, si besoin, de créer un répertoire de travail unique dans /work/galaxy-dev/workspace my ($nb) = ($outputfile=~/dataset_(\d+)\.\S+$/); #Repertoire de sortie cree par le script, verif des droits d'ecriture sur ce repertoire de sortie `cd $PATH/; mkdir $nb/; chmod -R 777 $nb/; cd $nb/;`; my $dirresults= "$PATH/".$nb; print STDOUT "Job working directory : $dirresults \n"; if ($refselector eq "ownfasta"){ my $cmdSTARindex="(cd $dirresults/; mkdir INDEX/; chmod 777 INDEX/; $STAR --runThreadN $Nthreads --runMode genomeGenerate --genomeDir $dirresults/INDEX --genomeFastaFiles $refownfastaref --sjdbGTFfile $refowngtf --sjdbOverhang 100) >& ./out_Starindex.log 2>&1"; system $cmdSTARindex; #Info pour les biologistes print STDOUT "STAR Genome Generate : \n\n $cmdSTARindex \n\n "; $genome_path = "$dirresults/INDEX/"; } my $addcuff; if ($cufflinks eq "cuff"){ $addcuff="--outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --outFilterType BySJout --quantMode TranscriptomeSAM "; }else{ $addcuff=""; } my $cat; if ($reads_selector eq "single"){ my $in; if ($compress eq "compress"){ #Si besoin, recupération du fichier de configuration avec modification de l extension `ln -s $input_read $dirresults/input_read.fastq.gz;`; $in = "$dirresults/input_read.fastq.gz"; $cat="--readFilesCommand zcat"; }else {`ln -s $input_read $dirresults/input_read.fastq;`; $in = "$dirresults/input_read.fastq"; $cat="";} if ($orientation eq "No"){ $cmd1 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; system $cmd1; #Info pour les biologistes print STDOUT "STAR command run on cluster without oriented reads : \n\n $cmd1 \n\n "; } else { $cmd2 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; system $cmd2; #Info pour les biologistes print STDOUT "STAR command run on cluster with oriented reads : \n\n $cmd2 \n\n Instead, you need to run Cufflinks with the library option --library-type options. For example, cufflinks <…> -library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has to be used only for Cufflinks runs and not for STAR runs.\n\n"; } }else{ my $in1; my $in2; if ($compress eq "compress"){ #Si besoin, recupération du fichier de configuration avec modification de l extension `ln -s $Read1fastqgz $dirresults/Read1.fastq.gz; ln -s $Read2fastqgz $dirresults/Read2.fastq.gz;`; $in1="$dirresults/Read1.fastq.gz"; $in2="$dirresults/Read2.fastq.gz"; $cat="--readFilesCommand zcat"; }else {`ln -s $Read1fastqgz $dirresults/Read1.fastq; ln -s $Read2fastqgz $dirresults/Read2.fastq;`; $in1="$dirresults/Read1.fastq"; $in2="$dirresults/Read2.fastq"; $cat="";} if ($orientation eq "No"){ $cmd3 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in1 $in2 --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; system $cmd3; #Info pour les biologistes print STDOUT "STAR command run on cluster without oriented reads : \n\n $cmd3 \n\n "; } else { $cmd4 = "(cd $dirresults; $STAR --runThreadN $Nthreads --genomeDir $genome_path --readFilesIn $in1 $in2 --outSAMtype BAM SortedByCoordinate --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --outFilterMismatchNmax $outFilterMismatchNmax $cat --outFileNamePrefix $nb $addcuff) >& ./out_Star.log 2>&1"; #Info pour les biologistes system $cmd4; print STDOUT "STAR command run on cluster with oriented reads : \n\n $cmd4 \n\n Instead, you need to run Cufflinks with the library option --library-type options. For example, cufflinks <…> -library-type fr-firststrand should be used for the “standard” dUTP protocol. This option has to be used only for Cufflinks runs and not for STAR runs.\n\n"; } } #Si besoin : #TEST 1 : command ligne on vm-galaxy #TEST 2 perl Galaxy file : perl script.pl path/to/tests/files/used/for/galaxy/perl/script out1 #Recuperation des fichiers par Galaxy #-rw-r--r-- 1 smaman BIOINFO 35K 17 juil. 12:03 galaxyNameAligned.toTranscriptome.out.bam +++++ #-rw-r--r-- 1 smaman BIOINFO 637 17 juil. 12:03 galaxyNameAligned.sortedByCoord.out.bam +++++++++ #-rw-r--r-- 1 smaman BIOINFO 0 17 juil. 12:03 galaxyNameSJ.out.tab ++++++++++++++++ #-rw-r--r-- 1 smaman BIOINFO 1,7K 17 juil. 12:03 galaxyNameLog.final.out +++++++++++++++ my $bam = glob("$dirresults/*$nb*Aligned.sortedByCoord.out.bam"); if (! -e $bam){print STDERR "Aligned.sortedByCoord.out.bam file not found. \n";}else{`cp -a $bam $outputfile`;} my $bamT = glob("$dirresults/*$nb*Aligned.toTranscriptome.out.bam"); if (! -e $bamT){print STDERR "Aligned.toTranscriptome.out.bam file not found. \n";}else{`cp -a $bamT $outputfileT`;} my $logSJ = glob("$dirresults/$nb*SJ.out.tab"); if (! -e $logSJ){print STDERR "SJ.out.tab log file not found. \n";}else{`cp -a $logSJ $outputlogSJ`;} my $logfinal = glob("$dirresults/$nb*Log.final.out"); if (! -e $logfinal){print STDERR "Log.final.out log file not found. \n";}else{`cp -a $logfinal $outputlogfinal`;}