Mercurial > repos > big-tiandm > mirplant2
view preProcess.pl @ 50:7b5a48b972e9 draft
Uploaded
author | big-tiandm |
---|---|
date | Fri, 05 Dec 2014 00:11:02 -0500 |
parents | c75593f79aa9 |
children |
line wrap: on
line source
#!/usr/bin/perl -w #Filename: #Author: Tian Dongmei #Email: tiandm@big.ac.cn #Date: 2014-12-2 #Modified: #Description: RNA-seq data pre-process my $version=1.00; use strict; use Getopt::Long; use threads; #use threads::shared; use File::Path; use File::Basename; #use RNA; #use Term::ANSIColor; my %opts; GetOptions(\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h"); if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} ) || defined $opts{h}) { #necessary arguments &usage; } my $time=&Time(); print "miPlant program start:\n The time is $time!\n"; print "Command line:\n $0 @ARGV\n"; my $format=$opts{'format'}; if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { #&printErr(); die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; } my $phred_qv=64; if (defined $opts{'phred'}) {$phred_qv=$opts{'phred'};} my @inputfiles=@{$opts{'i'}}; my @inputtags=@{$opts{'tag'}}; my $mypath=`pwd`; chomp $mypath; my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/preProcess/"; unless ($dir=~/\/$/) {$dir.="/";} if (not -d $dir) { mkdir $dir; } my $config=$dir."/input_config"; open CONFIG,">$config"; for (my $i=0;$i<@inputfiles;$i++) { print CONFIG $inputfiles[$i],"\t",$inputtags[$i],"\n"; } close CONFIG; my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; my $a="ATCTCGTATG"; #adapter if (defined $opts{'a'}) {$a=$opts{'a'};} my $m=6; #adapter minimum mapped nt if (defined $opts{'M'}) {$m=$opts{'M'};} my $t=1; #threads number if (defined $opts{'t'}) {$t=$opts{'t'};} my $min_nt=19; # minimum reads length if (defined $opts{'min'}) {$min_nt=$opts{'min'};} my $max_nt=28; #maximum reads length if (defined $opts{'max'}) {$max_nt=$opts{'max'};} my $mis=0; #mismatch number for microRNA if (defined $opts{'mis'}) {$mis=$opts{'mis'};} my $mis_rfam=0;# mismatch number for rfam if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};} my (@filein,@mark,@clean); #&read_config(); @filein=@inputfiles; @mark=@inputtags; &checkfa($opts{gfa}); ##### clip adpter --> clean data start my $preprocess=$dir."preProcess_clean/"; mkdir $preprocess; my $can_use_threads = eval 'use threads; 1'; if ($can_use_threads) { # Do processing using threads print "Do processing using threads\n"; my @filein1=@filein; my @mark1=@mark; while (@filein1>0) { my @thrs; my @res; for (my $i=0;$i<$t ;$i++) { last if(@filein1==0); my $in=shift @filein1; my $out=shift @mark1; push @clean,$preprocess.$out."_clips_adapter.fq"; $thrs[$i]=threads->create(\&clips,$in,$out); } for (my $i=0;$i<@thrs;$i++) { $res[$i]=$thrs[$i]->join(); } } } else { # Do not processing using threads print "Do not processing using threads\n"; for (my $i=0;$i<@filein ;$i++) { my $in=$filein[$i]; my $out=$mark[$i]; push @clean,$preprocess.$out."_clips_adapter.fq"; &clips($in,$out); } } ##### clip adpter --> clean data end my $collapsed=$preprocess."collapse_reads.fa"; my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data &collapse(\@clean,$collapsed); #collapse reads to tags &filterbylength(); # filter <$min_nt && >$max_nt print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n"; $time=Time(); print "$time: Genome alignment!\n\n"; my $genome_map=$dir."genome_match"; &genome($data); #my $genome_map=&search($dir,"genome_match_"); my $mapfile=$genome_map."/genome_mapped.bwt"; my $mapfa=$genome_map."/genome_mapped.fa"; my $unmap=$genome_map."/genome_not_mapped.fa"; chdir $dir; my $pathfile="$dir/path.txt"; open PA,">$pathfile"; print PA "$config\n"; print PA "$preprocess\n"; print PA "$genome_map\n"; if (defined $opts{'rfam'}) { #rfam mapping and analysis $time=Time(); print "$time: RNA annotate!\n\n"; $time=~s/:/-/g; $time=~s/ /-/g; my $rfam_exp_dir=$dir."rfam_match"; &rfam(); #my $rfam_exp_dir=&search($dir,"rfam_match_"); print PA "$rfam_exp_dir\n"; my $tag=join "\\;" ,@mark; system("perl $scipt_path/count_rfam_express.pl -i $rfam_exp_dir/rfam_mapped.bwt -tag $tag -o rfam_non-miRNA_annotation.txt"); } close PA; system("perl $scipt_path/html_preprocess.pl -i $pathfile -format $format -min $min_nt -max $max_nt -o $dir/preprocessResult.html"); $time=Time(); print "$time: Program end!!\n"; ############################## sub programs ################################### sub genome{ my ($file)=@_; if(defined $opts{'idx'}){ system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -r 1000 -v $mis -p $t -o $dir -index $opts{idx}") ; # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; }else{ system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -r 1000 -v $mis -p $t -o $dir") ; # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; } } sub rfam{ if (defined $opts{'idx2'}) { system("perl $scipt_path/rfam.pl -i $mapfa -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} "); # print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time\n"; }else{ system("perl $scipt_path/rfam.pl -i $mapfa -ref $opts{rfam} -v $mis_rfam -p $t -o $dir "); # print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time\n"; } } sub filterbylength{ my $tmpmark=join ",", @mark; system("perl $scipt_path/filterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark"); system("perl $scipt_path/Length_Distibution.pl -i $preprocess/reads_length_distribution.txt -o $preprocess/length.html"); # print "\nfilterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark\n"; } sub collapse{ my ($ins,$data)=@_; my $str=""; for (my $i=0;$i<@{$ins};$i++) { $str .="-i $$ins[$i] "; } system ("perl $scipt_path/collapseReads2Tags.pl $str -mark seq -o $data -format $format"); # print "\ncollapseReads2Tags.pl $str -mark seq -o $data -format $format\n"; } sub clips{ my ($in,$out)=@_; my $adapter=$preprocess.$out."_clips_adapter.fq"; if($format eq "fq" || $format eq "fastq"){ system("fastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter") ; # print "\nfastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter\n"; } if($format eq "fa" || $format eq "fasta"){ system("fastx_clipper -a $a -M $m -i $in -o $adapter") ; # print "\nfastx_clipper -a $a -M $m -i $in -o $adapter\n"; } #my $clean=$preprocess.$out."_clean.fq"; #system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt "); return; } sub read_config{ open CON,"<$config"; while (my $aline=<CON>) { chomp $aline; my @tmp=split/\t/,$aline; push @filein,$tmp[0]; push @mark,$tmp[1]; &check_rawdata($tmp[0]); } close CON; if (@filein != @mark) { #&printErr(); die "Maybe config file have some wrong!!!\n"; } } sub check_rawdata{ my ($fileforcheck)=@_; if (!(-s $fileforcheck)) { #&printErr(); die "Can not find $fileforcheck, or file is empty!!!\n"; } if ($format eq "fasta" || $format eq "fa") { &checkfa($fileforcheck); } if ($format eq "fastq" || $format eq "fq") { &checkfq($fileforcheck); } } sub checkfa{ my ($file_reads)=@_; open N,"<$file_reads"; my $line=<N>; chomp $line; if($line !~ /^>\S+/){ #printErr(); die "The first line of file $file_reads does not start with '>identifier' Reads file $file_reads is not a valid fasta file\n\n"; } if(<N> !~ /^[ACGTNacgtn]*$/){ #printErr(); die "File $file_reads contains not allowed characters in sequences Allowed characters are ACGTN Reads file $file_reads is not a fasta file\n\n"; } close N; } sub checkfq{ my ($file_reads)=@_; open N,"<$file_reads"; for (my $i=0;$i<10;$i++) { my $a=<N>; my $b=<N>; my $c=<N>; my $d=<N>; chomp $a; chomp $b; chomp $c; chomp $d; if($a!~/^\@/){ #&printErr(); die "$file_reads is not a fastq file\n\n"; } if($b!~ /^[ACGTNacgtn]*$/){ #&printErr(); die "File $file_reads contains not allowed characters in sequences Allowed characters are ACGTN Reads file $file_reads is not a fasta file\n\n"; } if ($c!~/^\@/ && $c!~/^\+/) { #&printErr(); die "$file_reads is not a fastq file\n\n"; } if ((length $b) != (length $d)) { #&printErr(); die "$file_reads is not a fastq file\n\n"; } my @qv=split //,$d; for (my $j=0;$j<@qv ;$j++) { my $q=ord($qv[$j])-64; if($q<0){$phred_qv=33;} } } close N; } sub search{ my ($dir,$str)=@_; opendir I,$dir; my @ret; while (my $file=readdir I) { if ($file=~/$str/) { push @ret, $file; } } closedir I; if (@ret != 1) { #&printErr(); die "Can not find directory or file which name has string: $str !!!\n"; } return $ret[0]; } sub Time{ my $time=time(); my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; $month++; $year+=1900; if (length($sec) == 1) {$sec = "0"."$sec";} if (length($min) == 1) {$min = "0"."$min";} if (length($hour) == 1) {$hour = "0"."$hour";} if (length($day) == 1) {$day = "0"."$day";} if (length($month) == 1) {$month = "0"."$month";} #print "$year-$month-$day $hour:$min:$sec\n"; return("$year-$month-$day $hour:$min:$sec"); } sub usage{ print <<"USAGE"; Version $version Usage: $0 -i -format -gfa -index -rfam -a -M -min -max -mis -v -t -o -path options: -i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ... -tag string # raw data file names, -tag xxx -tag xxx -format string,#specific input rawdata file format : fastq|fq|fasta|fa -phred int # phred quality number, default is 64 -path scirpt path -gfa string, input file # genome fasta. sequence file -idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter string must be the prefix of the bowtie index. For instance, if the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then the prefix is 'h_sapiens_37_asm'.##can be null -rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count. -idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter string must be the prefix of the bowtie index. For instance, if the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then the prefix is 'h_sapiens_37_asm'.##can be null -a string, ADAPTER string. default is ATCTCGTATG. -M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it. -min int, reads min length,default is 19. -max int, reads max length,default is 28. -mis [int] number of allowed mismatches when mapping reads to genome, default 0 -v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment -t int, number of threads [1] -o output directory# absolute path -h help USAGE exit(1); }