# HG changeset patch # User big-tiandm # Date 1417589669 18000 # Node ID c75593f79aa9a7ee014423d804f9a59fb955b4cd # Parent ca05d68aca13fe5665009f527519859a24c69c1f Uploaded diff -r ca05d68aca13 -r c75593f79aa9 convert_bowtie_to_blast.pl --- a/convert_bowtie_to_blast.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -#!/usr/bin/perl - - -use warnings; -use strict; -use Getopt::Std; - -######################################### USAGE ################################ - -my $usage= -"$0 file_bowtie_result file_solexa_seq file_chromosome - -This is a converter which changes Bowtie output into Blast format. -The input includes three files: a Bowtie result file (default Bowtie -output file), a fasta file consisting of small Reads and a chromosome -fasta file. It outputs the alignments in blast_parsed format. - -file_bowtie_result likes: - -AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0 -AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0 - -file_solexa_seq likes: - ->AtFlower100010_x2 -AAGGAGATTCTTTCAGTCCAG - -file_chromosome contains chromosome seq in fasta format - -"; - - -####################################### INPUT FILES ############################ - -my $file_bowtie_result=shift or die $usage; -my $file_short_seq=shift or die $usage; -my $file_chromosome_seq=shift or die $usage; - - -##################################### GLOBAL VARIBALES ######################### - -my %short_seq_length=(); -my %chromosome_length=(); - - -######################################### MAIN ################################# - -#get the short sequence id and its length -sequence_length($file_short_seq,\%short_seq_length); - -#get the chromosome sequence id and its length -sequence_length($file_chromosome_seq,\%chromosome_length); - -#convert bowtie result format to blast format; -change_format($file_bowtie_result); - -exit; - - -##################################### SUBROUTINES ############################## - -sub sequence_length{ - my ($file,$hash) = @_; - my ($id, $desc, $sequence, $seq_length) = (); - - open (FASTA, "<$file") or die "can not open $$file\n"; - while () - { - chomp; - if (/^>(\S+)(.*)/) - { - $id = $1; - $desc = $2; - $sequence = ""; - while (){ - chomp; - if (/^>(\S+)(.*)/){ - $$hash{$id} = length $sequence; - $id = $1; - $desc = $2; - $sequence = ""; - next; - } - $sequence .= $_; - } - } - } - $seq_length=length($sequence); - $$hash{$id} = $seq_length; - close FASTA; -} - - - - - -sub change_format{ - #Change Bowtie format into blast format - my $file=shift @_; - open(FILE,"<$file")||die"can not open the bowtie result file:$!\n"; - #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n"; - - while(){ - chomp; - my @tmp=split("\t",$_); - #Clean the reads ID - my @tmp1=split(" ",$tmp[0]); - print "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t"; - if($tmp[1] eq "+"){ - my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]}; - my $seq_bg=$tmp[3] + 1; - print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n"; - } - if($tmp[1] eq "-"){ - my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3]; - my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1; - print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n"; - } - } - -# close BLASTOUT; - -} - - - diff -r ca05d68aca13 -r c75593f79aa9 html.pl --- a/html.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,273 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2014-5-29 -#Modified: -#Description: -my $version=1.00; - -use strict; -use Getopt::Long; -use File::Basename; - -my %opts; -GetOptions(\%opts,"i=s","format=s","o=s","h"); -if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments -&usage; -} -my ($config,$prepath,$rfampath,$knownpath,$genomepath,$novelpath); -my ($predir,$rfamdir,$knowndir,$genomedir,$noveldir); -open IN,"<$opts{i}"; -$config=; chomp $config; -$prepath=; chomp $prepath; -$rfampath=;chomp $rfampath; -$knownpath=; chomp $knownpath; -$genomepath=; chomp $genomepath; -$novelpath=; chomp $novelpath; -close IN; -my @tmp=split/\//,$prepath; -$predir=$tmp[-1]; -@tmp=split/\//,$rfampath; -$rfamdir=$tmp[-1]; -@tmp=split/\//,$knownpath; -$knowndir=$tmp[-1]; -@tmp=split/\//,$genomepath; -$genomedir=$tmp[-1]; -@tmp=split/\//,$novelpath; -$noveldir=$tmp[-1]; - -my $dir=dirname($opts{'o'}); - -open OUT ,">$opts{'o'}"; -print OUT "\n \n Analysis Report \n - \n

\n \n Small RNA Analysis Report\n \n

-

1. Sequence No. and quality

-

1.1 Sequece No.

-"; - -### raw data no -open IN,"<$config"; -my @files;my @marks; my @rawNo; -while (my $aline=) { - chomp $aline; - my @tmp=split/\t/,$aline; - push @files,$tmp[0]; - - my $no=`less $tmp[0] |wc -l `; - chomp $no; - if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") { - $no=$no/4; - } - else{ - $no=$no/2; - } - push @rawNo,$no; - - push @marks,$tmp[1]; -} -close IN; - -### preprocess -unless ($prepath=~/\/$/) { - $prepath .="/"; -} - -my @trimNo;my @collapse; -my $collapsefile=$prepath."collapse_reads.fa"; -open IN,"<$collapsefile"; -while (my $aline=) { - chomp $aline; - ; - $aline=~/:([\d|_]+)_x(\d+)$/; - my @lng=split/_/,$1; - for (my $i=0;$i<@lng;$i++) { - if ($lng[$i]>0) { - $trimNo[$i] +=$lng[$i]; - $collapse[$i] ++; - } - } -} -close IN; - -my @cleanR;my @cleanT; -my $clean=$prepath."collapse_reads_19_28.fa"; -open IN,"<$clean"; -while (my $aline=) { - chomp $aline; - ; - $aline=~/:([\d|_]+)_x(\d+)$/; - my @lng=split/_/,$1; - for (my $i=0;$i<@lng;$i++) { - if ($lng[$i]>0) { - $cleanR[$i] +=$lng[$i]; - $cleanT[$i] ++; - } - } -} -close IN; - -print OUT " - - -"; -foreach (@marks) { - print OUT "\n"; -} -print OUT " - - -"; -foreach (@rawNo) { - print OUT "\n"; -} -print OUT " - - -"; -foreach (@trimNo) { - print OUT "\n"; -} -print OUT " - - -"; -foreach (@collapse) { - print OUT "\n"; -} -print OUT " - - -"; -foreach (@cleanR) { - print OUT "\n"; -} -print OUT " - - -"; -foreach (@cleanT) { - print OUT "\n"; -} -print OUT "\n
  $_
Raw Reads No. $_
Reads No. After Trimed 3\' adapter $_
Unique Tags No. $_
Clean Reads No. $_
Clean Tags No. $_
"; -print OUT "

-Note:
-The raw data file path is: $files[0]
-"; -for (my $i=1;$i<@files;$i++) { - print OUT "           $files[$i]
"; -} -print OUT "The collapsed file path is: $collapsefile
-The clean data file path is: $clean
-

-

1. Sequence length count

-"; -print OUT "\n"; - -my $length=$prepath."length.html"; -open IN,"<$length"; -while (my $aline=) { - chomp $aline; - print OUT "$aline\n"; -} - -print OUT "

Note:
The sequence length data: length file -

-"; - -#### rfam -unless ($rfampath=~/\/$/) { - $rfampath .="/"; -} -print OUT "

2. Rfam non-miRNA annotation

-

2.1 Reads count

- - -"; - -my @rfamR; my @rfamT; -my $tag=1; -open IN,"<$dir/rfam_non-miRNA_annotation.txt"; -while (my $aline=) { - chomp $aline; - $tag=0 if($aline=~/tags\s+number/); - next if($aline=~/^\#/); - next if($aline=~/^\s*$/); - my @tmp=split/\s+/,$aline; - if($tag == 1){push @rfamR,[@tmp];} - else{push @rfamT,[@tmp];} -} -close IN; - - -print OUT "\n"; -foreach (@marks) { - print OUT "\n"; -} -for (my $i=0;$i<@rfamR;$i++) { - print OUT " - - - "; - for (my $j=1;$j<@{$rfamR[$i]} ;$j++) { - print OUT "\n"; - } -} - -print OUT "\n
RNA Name $_
$rfamR[$i][0] $rfamR[$i][$j]
-

2.2 Tags count

- - - \n"; -foreach (@marks) { - print OUT "\n"; -} -for (my $i=0;$i<@rfamT;$i++) { - print OUT " - - - "; - for (my $j=1;$j<@{$rfamT[$i]} ;$j++) { - print OUT "\n"; - } -} -print OUT "\n
RNA Name $_
$rfamT[$i][0] $rfamT[$i][$j]
-

Note:
The rfam mapping results is: $rfampath"; -print OUT "rfam_mapped.bwt

-

3. MicroRNA result

-

3.1 known microRNA

-

The known microRNA express list: known_microRNA_express.txt
- The known microRNA alngment file: known_microRNA_express.aln
- The known moRs file: known_microRNA_express.moRs
- The known microRNA mature sequence file: known_microRNA_mature.fa
- The knowm microRNA precursor sequence file: known_microRNA_precursor.fa -

- -

3.2 novel microRNA

-

The novel microRNA prediction file: microRNA_prediction.mrd
- The novel microRNA express list: novel_microRNA_express.txt
- The novel microRNA mature sequence file: novel_microRNA_mature.fa
- The novel microRNA precursor sequence file: novel_microRNA_precursor.fa -

-"; - - - -print OUT " - - -"; -close OUT; - -sub usage{ -print <<"USAGE"; -Version $version -Usage: -$0 -o -options: --o output file --h help -USAGE -exit(1); -} - diff -r ca05d68aca13 -r c75593f79aa9 html_preprocess.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html_preprocess.pl Wed Dec 03 01:54:29 2014 -0500 @@ -0,0 +1,388 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-5-29 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","format=s","min=i","max=i","o=s","h"); +if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments +&usage; +} +my ($config,$prepath,$rfampath,$knownpath,$genomepath,$novelpath); +my ($predir,$rfamdir,$knowndir,$genomedir,$noveldir); +open IN,"<$opts{i}"; +$config=; chomp $config; +$prepath=; chomp $prepath; +$genomepath=; chomp $genomepath; +$rfampath=; +close IN; +my @tmp=split/\//,$prepath; +$predir=$tmp[-1]; +@tmp=split/\//,$genomepath; +$genomedir=$tmp[-1]; + +my $dir=dirname($opts{'o'}); + +open OUT ,">$opts{'o'}"; +print OUT "\n \n Analysis Report \n + \n

\n \n Preprocess Report\n \n

+

1. Sequence No. and quality

+

1.1 Sequece No.

+"; + +### raw data no +open IN,"<$config"; +my @files;my @marks; my @rawNo; +while (my $aline=) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @files,$tmp[0]; + + my $no=`less $tmp[0] |wc -l `; + chomp $no; + if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") { + $no=$no/4; + } + else{ + $no=$no/2; + } + push @rawNo,$no; + + push @marks,$tmp[1]; +} +close IN; + +### preprocess +unless ($prepath=~/\/$/) { + $prepath .="/"; +} + +my @trimNo;my @collapse; +my $collapsefile=$prepath."collapse_reads.fa"; +open IN,"<$collapsefile"; +while (my $aline=) { + chomp $aline; + ; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $trimNo[$i] +=$lng[$i]; + $collapse[$i] ++; + } + } +} +close IN; + +my @cleanR;my @cleanT; +my $clean=$prepath."collapse_reads_$opts{min}_$opts{max}.fa"; +open IN,"<$clean"; +while (my $aline=) { + chomp $aline; + ; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $cleanR[$i] +=$lng[$i]; + $cleanT[$i] ++; + } + } +} +close IN; + +print OUT " + + +"; +foreach (@marks) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@rawNo) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@trimNo) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@collapse) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@cleanR) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@cleanT) { + print OUT "\n"; +} +print OUT "\n
  $_
Raw Reads No. $_
Reads No. After Trimed 3\' adapter $_
Unique Tags No. $_
Clean Reads No. $_
Clean Tags No. $_
"; +print OUT "

+Note:
+The raw data file path is: $files[0]
+"; +for (my $i=1;$i<@files;$i++) { + print OUT "           $files[$i]
"; +} +print OUT "The collapsed file path is: $collapsefile
+The clean data file path is: $clean
+

+

1. Sequence length count

+

1.1 Reads length count

+"; +print OUT "\n"; + +my (%length); my $key="Tags Length"; +open IN,"<$prepath/reads_length_distribution.txt"; +while (my $aline=) { + chomp $aline; + next if($aline=~/^\s*$/); + if ($aline=~/^Reads/) { $key="Reads Length";} + my @tmp=split/\t/,$aline; + my @array=split/\s/,$tmp[1]; + push @{$length{$key}},[$tmp[0],@array]; +} +close IN; + +print OUT " +"; +my $hashkey="Reads Length"; +foreach (@{$length{$hashkey}[0]}) { + print OUT "\n"; +} +print OUT ""; + +for (my $i=1;$i<@{$length{$hashkey}};$i++) { + print OUT " + + "; + for(my $j=1;$j<@{$length{$hashkey}[$i]};$j++) { + print OUT "\n"; + } + print OUT "\n"; +} +print OUT "
$_
$length{$hashkey}[$i][0] $length{$hashkey}[$i][$j]
\n"; + +print OUT "

1.2 Tags length count

"; + +print OUT " +"; +$hashkey="Tags Length"; +foreach (@{$length{$hashkey}[0]}) { + print OUT "\n"; +} +print OUT ""; + +for (my $i=1;$i<@{$length{$hashkey}};$i++) { + print OUT " + + "; + for(my $j=1;$j<@{$length{$hashkey}[$i]};$j++) { + print OUT "\n"; + } + print OUT "\n"; +} + +print OUT "
$_
$length{$hashkey}[$i][0] $length{$hashkey}[$i][$j]
\n"; + +print OUT "

2. Sequence length distribution

"; +my $length=$prepath."length.html"; +open IN,"<$length"; +while (my $aline=) { + chomp $aline; + print OUT "$aline\n"; +} + +#print OUT "

Note:
The sequence length data: length file +#

+#"; + + + + +####genome map +#unless ($genomedir=~/\/$/) { +# $genomedir .="/"; +#} + +print OUT "

2. Genome Alignment Result

+

2.1 Mapping count

+"; + +open IN,"<$genomepath/genome_mapped.fa"; +my (@gread,@gtag); +while (my $aline=) { + chomp $aline; + ; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @sss=split/_/,$1; + for (my $i=0;$i<@sss;$i++) { + if ($sss[$i]>0) { + $gread[$i] +=$sss[$i]; + $gtag[$i] ++; + } + } +} +close IN; + +print OUT " + + +"; +foreach (@marks) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@gread) { + print OUT "\n"; +} +print OUT " + + +"; + +for (my $i=0;$i<@gread;$i++) { + my $per=sprintf ("%.2f",$gread[$i]/$cleanR[$i]*100); + print OUT "\n"; +} + +print OUT " + + +"; +foreach (@gtag) { + print OUT "\n"; +} +print OUT " + + +"; + +for (my $i=0;$i<@gtag;$i++) { + my $per=sprintf ("%.2f",$gtag[$i]/$cleanT[$i]*100); + print OUT "\n"; +} +print OUT "\n
  $_
Genome Mapped Reads No. $_
Genome Mapped Reads Percent $per\%
Genome Mapped Tags No. $_
Genome Mapped Tags Percent $per\%
"; +print OUT "

+Note:
+The genome mapped bwt file path is: $genomedir/genome_mapped.bwt
+The genome mapped FASTA file path is: $genomedir/genome_mapped.fa +
+"; + + + +#### rfam +if(defined $rfampath && $rfampath=~/rfam_match/){ +chomp $rfampath; +@tmp=split/\//,$rfampath; +$rfamdir=$tmp[-1]; + +unless ($rfampath=~/\/$/) { + $rfampath .="/"; +} +print OUT "

3. Rfam non-miRNA annotation

+

3.1 Reads count

+ + +"; + +my @rfamR; my @rfamT; +my $tag=1; +open IN,"<$dir/rfam_non-miRNA_annotation.txt"; +while (my $aline=) { + chomp $aline; + $tag=0 if($aline=~/tags\s+number/); + next if($aline=~/^\#/); + next if($aline=~/^\s*$/); + my @tmp=split/\s+/,$aline; + if($tag == 1){push @rfamR,[@tmp];} + else{push @rfamT,[@tmp];} +} +close IN; + + +print OUT "\n"; +foreach (@marks) { + print OUT "\n"; +} +for (my $i=0;$i<@rfamR;$i++) { + print OUT " + + + "; + for (my $j=1;$j<@{$rfamR[$i]} ;$j++) { + print OUT "\n"; + } +} + +print OUT "\n
RNA Name $_
$rfamR[$i][0] $rfamR[$i][$j]
+

3.2 Tags count

+ + + \n"; +foreach (@marks) { + print OUT "\n"; +} +for (my $i=0;$i<@rfamT;$i++) { + print OUT " + + + "; + for (my $j=1;$j<@{$rfamT[$i]} ;$j++) { + print OUT "\n"; + } +} +print OUT "\n
RNA Name $_
$rfamT[$i][0] $rfamT[$i][$j]
+

Note:
The rfam mapping results is: $rfampath"; +print OUT "rfam_mapped.bwt

+"; +} + + +print OUT " + + +"; +close OUT; + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -o +options: +-o output file +-h help +USAGE +exit(1); +} + diff -r ca05d68aca13 -r c75593f79aa9 matching.pl --- a/matching.pl Thu Nov 13 22:43:35 2014 -0500 +++ b/matching.pl Wed Dec 03 01:54:29 2014 -0500 @@ -11,7 +11,7 @@ use Getopt::Long; my %opts; -GetOptions(\%opts,"i=s","g=s","index:s","v:i","p:i","r:s","o=s","time:s","h"); +GetOptions(\%opts,"i=s","g=s","index:s","v:i","p:i","r:s","o=s","h"); if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments &usage; } @@ -27,11 +27,7 @@ my $time=&Time(); -if (defined $opts{'time'}) { - $time=$opts{'time'}; -} - -my $mapdir=$fileout."/genome_match_".$time; +my $mapdir=$fileout."/genome_match"; if(not -d $mapdir){ mkdir $mapdir; } diff -r ca05d68aca13 -r c75593f79aa9 miRDeep_plant.pl --- a/miRDeep_plant.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1622 +0,0 @@ -#!/usr/bin/perl - -use warnings; -use strict; -use Getopt::Std; -#use RNA; - - -################################# MIRDEEP ################################################# - -################################## USAGE ################################################## - - -my $usage= -"$0 file_signature file_structure temp_out_directory - -This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with -information on the positions of reads aligned to potential precursor sequences (signature). -It also takes as input an RNAfold output file, giving information on the sequence, structure -and mimimum free energy of the potential precursor sequences. - -Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA -sequences that should be considered for conservation purposes. -t prints out the potential -precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that -exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors -that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences -obtained through conventional cloning. -z consider the number of base pairings in the lower -stems (this option is not well tested). - --h print this usage --s fasta file with known miRNAs -#-o temp directory ,maked befor running the program. --t print filtered --u limited output (only ids) --v cut-off (default 1) --x sensitive option for Sanger sequences --y use Randfold --z consider Drosha processing -"; - - - - - -############################################################################################ - -################################### INPUT ################################################## - - -#signature file in blast_parsed format -my $file_blast_parsed=shift or die $usage; - -#structure file outputted from RNAfold -my $file_struct=shift or die $usage; - -my $tmpdir=shift or die $usage; -#options -my %options=(); -getopts("hs:tuv:xyz",\%options); - - - - - - -############################################################################################# - -############################# GLOBAL VARIABLES ############################################## - - -#parameters -my $nucleus_lng=11; - -my $score_star=3.9; -my $score_star_not=-1.3; -my $score_nucleus=7.63; -my $score_nucleus_not=-1.17; -my $score_randfold=1.37; -my $score_randfold_not=-3.624; -my $score_intercept=0.3; -my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0); -my $score_min=1; -if($options{v}){$score_min=$options{v};} -if($options{x}){$score_min=-5;} - -my $e=2.718281828; - -#hashes -my %hash_desc; -my %hash_seq; -my %hash_struct; -my %hash_mfe; -my %hash_nuclei; -my %hash_mirs; -my %hash_query; -my %hash_comp; -my %hash_bp; - -#other variables -my $subject_old; -my $message_filter; -my $message_score; -my $lines; -my $out_of_bound; - - - -############################################################################################## - -################################ MAIN ###################################################### - - -#print help if that option is used -if($options{h}){die $usage;} -unless ($tmpdir=~/\/$/) {$tmpdir .="/";} -if(!(-s $tmpdir)){mkdir $tmpdir;} -$tmpdir .="TMP_DIR/"; -mkdir $tmpdir; - -#parse structure file outputted from RNAfold -parse_file_struct($file_struct); - -#if conservation is scored, the fasta file of known miRNA sequences is parsed -if($options{s}){create_hash_nuclei($options{s})}; - -#parse signature file in blast_parsed format and resolve each potential precursor -parse_file_blast_parsed($file_blast_parsed); -#`rm -rf $tmpdir`; -exit; - - - - -############################################################################################## - -############################## SUBROUTINES ################################################### - - - -sub parse_file_blast_parsed{ - -# read through the signature blastparsed file, fills up a hash with information on queries -# (deep sequences) mapping to the current subject (potential precursor) and resolve each -# potential precursor in turn - - my $file_blast_parsed=shift; - - open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n"; - while (my $line=){ - if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){ - my $query=$1; - my $query_lng=$2; - my $query_beg=$3; - my $query_end=$4; - my $subject=$5; - my $subject_lng=$6; - my $subject_beg=$7; - my $subject_end=$8; - my $e_value=$9; - my $pid=$10; - my $bitscore=$11; - my $other=$12; - - #if the new line concerns a new subject (potential precursor) then the old subject must be resolved - if($subject_old and $subject_old ne $subject){ - resolve_potential_precursor(); - } - - #resolve the strand - my $strand=find_strand($other); - - #resolve the number of reads that the deep sequence represents - my $freq=find_freq($query); - - #read information of the query (deep sequence) into hash - $hash_query{$query}{"subject_beg"}=$subject_beg; - $hash_query{$query}{"subject_end"}=$subject_end; - $hash_query{$query}{"strand"}=$strand; - $hash_query{$query}{"freq"}=$freq; - - #save the signature information - $lines.=$line; - - $subject_old=$subject; - } - } - resolve_potential_precursor(); -} - -sub resolve_potential_precursor{ - -# dissects the potential precursor in parts by filling hashes, and tests if it passes the -# initial filter and the scoring filter - -# binary variable whether the potential precursor is still viable - my $ret=1; -#print STDERR ">$subject_old\n"; - - fill_structure(); -#print STDERR "\%hash_bp",scalar keys %hash_bp,"\n"; - fill_pri(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - - fill_mature(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - - fill_star(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - - fill_loop(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - - fill_lower_flanks(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - -# do_test_assemble(); - -# this is the actual classification - unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;} - - print_results($ret); - - reset_variables(); - - return; - -} - - - -sub print_results{ - - my $ret=shift; - -# print out if the precursor is accepted and accepted precursors should be printed out -# or if the potential precursor is discarded and discarded potential precursors should -# be printed out - - if((!$options{t} and $ret) or ($options{t} and !$ret)){ - #full output - unless($options{u}){ - if($message_filter){print $message_filter;} - if($message_score){print $message_score;} - print_hash_comp(); - print $lines,"\n\n"; - return; - } - #limited output (only ids) - my $id=$hash_comp{"pri_id"}; - print "$id\n"; - } -} - - - - - - - -sub pass_threshold_score{ - -# this is the scoring - - #minimum free energy of the potential precursor -# my $score_mfe=score_mfe($hash_comp{"pri_mfe"}); - my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"}); - - #count of reads that map in accordance with Dicer processing - my $score_freq=score_freq($hash_comp{"freq"}); -#print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n"; - - #basic score - my $score=$score_mfe+$score_freq; - - #scoring of conserved nucleus/seed (optional) - if($options{s}){ - - #if the nucleus is conserved - if(test_nucleus_conservation()){ - - #nucleus from position 2-8 - my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); - - #resolve DNA/RNA ambiguities - $nucleus=~tr/[T]/[U]/; - - #print score contribution - score_s("score_nucleus\t$score_nucleus"); - - #print the ids of known miRNAs with same nucleus - score_s("$hash_mirs{$nucleus}"); -#print STDERR "score_nucleus\t$score_nucleus\n"; - - #add to score - $score+=$score_nucleus; - - #if the nucleus is not conserved - }else{ - #print (negative) score contribution - score_s("score_nucleus\t$score_nucleus_not"); - - #add (negative) score contribution - $score+=$score_nucleus_not; - } - } - - #if the majority of potential star reads fall as expected from Dicer processing - if($hash_comp{"star_read"}){ - score_s("score_star\t$score_star"); -#print STDERR "score_star\t$score_star\n"; - $score+=$score_star; - }else{ - score_s("score_star\t$score_star_not"); -#print STDERR "score_star_not\t$score_star_not\n"; - $score+=$score_star_not; - } - - #score lower stems for potential for Drosha recognition (highly optional) - if($options{z}){ - my $stem_bp=$hash_comp{"stem_bp"}; - my $score_stem=$scores_stem[$stem_bp]; - $score+=$score_stem; - score_s("score_stem\t$score_stem"); - } - -#print STDERR "score_intercept\t$score_intercept\n"; - - $score+=$score_intercept; - - #score for randfold (optional) - if($options{y}){ - -# only calculate randfold value if it can make the difference between the potential precursor -# being accepted or discarded - if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){ - - #randfold value<0.05 - if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");} - - #randfold value>0.05 - else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");} - } - } - - #round off values to one decimal - my $round_mfe=round($score_mfe*10)/10; - my $round_freq=round($score_freq*10)/10; - my $round=round($score*10)/10; - - #print scores - score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round"); - - #return 1 if the potential precursor is accepted, return 0 if discarded - unless($score>=$score_min){return 0;} - return 1; -} - -sub test_randfold{ - - #print sequence to temporary file, test randfold value, return 1 or 0 - -# print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); - #my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; - #open(FILE, ">$tmpfile"); - #print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; - #close FILE; - -# my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; - #my $p1=`randfold -s $tmpfile 999 | cut -f 3`; - #my $p2=`randfold -s $tmpfile 999 | cut -f 3`; - my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999); - my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999); - my $p_value=($p1+$p2)/2; - wait; -# system "rm $tmpfile"; - - if($p_value<=0.05){return 1;} - - return 0; -} - -sub randfold_pvalue{ - my $cpt_sup = 0; - my $cpt_inf = 0; - my $cpt_ega = 1; - - my ($seq,$number_of_randomizations)=@_; - #my $str =$seq; - #my $mfe = RNA::fold($seq,$str); - my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; - my @rawfolds=split/\s+/,$rnafold; - my $str=$rawfolds[1]; - my $mfe=$rawfolds[-1]; - $mfe=~s/\(//; - $mfe=~s/\)//; - - for (my $i=0;$i<$number_of_randomizations;$i++) { - $seq = shuffle_sequence_dinucleotide($seq); - #$str = $seq; - - #my $rand_mfe = RNA::fold($str,$str); - $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; - my @rawfolds=split/\s+/,$rnafold; - my $str=$rawfolds[1]; - my $rand_mfe=$rawfolds[-1]; - $rand_mfe=~s/\(//; - $rand_mfe=~s/\)//; - - if ($rand_mfe < $mfe) { - $cpt_inf++; - } - if ($rand_mfe == $mfe) { - $cpt_ega++; - } - if ($rand_mfe > $mfe) { - $cpt_sup++; - } - } - - my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1); - - #print "$name\t$mfe\t$proba\n"; - return $proba; -} - -sub shuffle_sequence_dinucleotide { - - my ($str) = @_; - - # upper case and convert to ATGC - $str = uc($str); - $str =~ s/U/T/g; - - my @nuc = ('A','T','G','C'); - my $count_swap = 0; - # set maximum number of permutations - my $stop = length($str) * 10; - - while($count_swap < $stop) { - - my @pos; - - # look start and end letters - my $firstnuc = $nuc[int(rand 4)]; - my $thirdnuc = $nuc[int(rand 4)]; - - # get positions for matching nucleotides - for (my $i=0;$i<(length($str)-2);$i++) { - if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) { - push (@pos,($i+1)); - $i++; - } - } - - # swap at random trinucleotides - my $max = scalar(@pos); - for (my $i=0;$i<$max;$i++) { - my $swap = int(rand($max)); - if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) { - $count_swap++; - my $w1 = substr($str,$pos[$i],1); - my $w2 = substr($str,$pos[$swap],1); - substr($str,$pos[$i],1,$w2); - substr($str,$pos[$swap],1,$w1); - } - } - } - return($str); -} - -sub test_nucleus_conservation{ - - #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 - - my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); - $nucleus=~tr/[T]/[U]/; - if($hash_nuclei{$nucleus}){return 1;} - - return 0; -} - - - -sub pass_filtering_initial{ - - #test if the structure forms a plausible hairpin - unless(pass_filtering_structure()){filter_p("structure problem"); return 0;} - - #test if >90% of reads map to the hairpin in consistence with Dicer processing - unless(pass_filtering_signature()){filter_p("signature problem");return 0;} - - return 1; - -} - - -sub pass_filtering_signature{ - - #number of reads that map in consistence with Dicer processing - my $consistent=0; - - #number of reads that map inconsistent with Dicer processing - my $inconsistent=0; - -# number of potential star reads map in good consistence with Drosha/Dicer processing -# (3' overhangs relative to mature product) - my $star_perfect=0; - -# number of potential star reads that do not map in good consistence with 3' overhang - my $star_fuzzy=0; - - - #sort queries (deep sequences) by their position on the hairpin - my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query; - - foreach my $query(@queries){ - - #number of reads that the deep sequence represents - unless(defined($hash_query{$query}{"freq"})){next;} - my $query_freq=$hash_query{$query}{"freq"}; - - #test which Dicer product (if any) the deep sequence corresponds to - my $product=test_query($query); - - #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable - if($product){$consistent+=$query_freq;} - - #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable - else{$inconsistent+=$query_freq;} - - #test a potential star sequence has good 3' overhang - if($product eq "star"){ - if(test_star($query)){$star_perfect+=$query_freq;} - else{$star_fuzzy+=$query_freq;} - } - } - -# if the majority of potential star sequences map in good accordance with 3' overhang -# score for the presence of star evidence - if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;} - - #total number of reads mapping to the hairpin - my $freq=$consistent+$inconsistent; - $hash_comp{"freq"}=$freq; - unless($freq>0){filter_s("read frequency too low"); return 0;} - - #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded - my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent); - unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;} - - #the hairpin is retained - return 1; -} - -sub test_star{ - - #test if a deep sequence maps in good consistence with 3' overhang - - my $query=shift; - - #5' begin and 3' end positions - my $beg=$hash_query{$query}{"subject_beg"}; - my $end=$hash_query{$query}{"subject_end"}; - - #the difference between observed and expected begin positions must be 0 or 1 - my $offset=$beg-$hash_comp{"star_beg"}; - if($offset==0 or $offset==1 or $offset==-1){return 1;} - - return 0; -} - - - -sub test_query{ - - #test if deep sequence maps in consistence with Dicer processing - - my $query=shift; - - #begin, end, strand and read count - my $beg=$hash_query{$query}{"subject_beg"}; - my $end=$hash_query{$query}{"subject_end"}; - my $strand=$hash_query{$query}{"strand"}; - my $freq=$hash_query{$query}{"freq"}; - - #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs) - if($strand eq '-'){return 0;} - - #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end - my $fuzz_beg=2; - #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end - my $fuzz_end=2; - - #if in accordance with Dicer processing, return the type of Dicer product - if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";} - if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";} - if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";} - - #if not in accordance, return 0 - return 0; -} - - -sub pass_filtering_structure{ - - #The potential precursor must form a hairpin with miRNA precursor-like characteristics - - #return value - my $ret=1; - - #potential mature, star, loop and lower flank parts must be identifiable - unless(test_components()){return 0;} - - #no bifurcations - unless(no_bifurcations_precursor()){$ret=0;} - - #minimum 14 base pairings in duplex - unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");} - - #not more than 6 nt difference between mature and star length - unless(-60 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} - - #no overlap between the mature and the star sequence - if($hash_comp{"mature_arm"} eq "first"){ - ($hash_comp{"mature_end"}<$beg) or return 0; - }elsif($hash_comp{"mature_arm"} eq "second"){ - ($end<$hash_comp{"mature_beg"}) or return 0; - } - - #there should be no bifurcations and minimum one base pairing - my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+"); - if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){ - return "first"; - }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){ - return "second"; - } - return 0; -} - - -sub test_loop{ - - #tests the loop positions - - my ($beg,$end)=@_; - - #unless the begin and end positions are plausible, test negative - unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} - - return 1; -} - - -sub test_flanks{ - - #tests the positions of the lower flanks - - my ($beg,$end)=@_; - - #unless the begin and end positions are plausible, test negative - unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} - - return 1; -} - - -sub comp{ - - #subroutine to retrive from the 'comp' hash - - my $type=shift; - my $component=$hash_comp{$type}; - return $component; -} - - -sub find_strand_query{ - - #subroutine to find the strand for a given query - - my $query=shift; - my $strand=$hash_query{$query}{"strand"}; - return $strand; -} - - -sub find_positions_query{ - - #subroutine to find the begin and end positions for a given query - - my $query=shift; - my $beg=$hash_query{$query}{"subject_beg"}; - my $end=$hash_query{$query}{"subject_end"}; - return ($beg,$end); -} - - - -sub find_mature_query{ - - #finds the query with the highest frequency of reads and returns it - #is used to determine the positions of the potential mature sequence - - my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query; - my $mature_query=$queries[0]; - return $mature_query; -} - - - - -sub reset_variables{ - - #resets the hashes for the next potential precursor - -# %hash_query=(); -# %hash_comp=(); -# %hash_bp=(); - foreach my $key (keys %hash_query) {delete($hash_query{$key});} - foreach my $key (keys %hash_comp) {delete($hash_comp{$key});} - foreach my $key (keys %hash_bp) {delete($hash_bp{$key});} - -# $message_filter=(); -# $message_score=(); -# $lines=(); - undef($message_filter); - undef($message_score); - undef($lines); - return; -} - - - -sub excise_seq{ - - #excise sub sequence from the potential precursor - - my($seq,$beg,$end,$strand)=@_; - - #begin can be equal to end if only one nucleotide is excised - unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} - - #rarely, permuted combinations of signature and structure cause out of bound excision errors. - #this happens once appr. every two thousand combinations - unless($beg<=length($seq)){$out_of_bound++;return 0;} - - #if on the minus strand, the reverse complement should be excised - if($strand eq "-"){$seq=revcom($seq);} - - #the blast parsed format is 1-indexed, substr is 0-indexed - my $sub_seq=substr($seq,$beg-1,$end-$beg+1); - - return $sub_seq; - -} - -sub excise_struct{ - - #excise sub structure - - my($struct,$beg,$end,$strand)=@_; - my $lng=length $struct; - - #begin can be equal to end if only one nucleotide is excised - unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} - - #rarely, permuted combinations of signature and structure cause out of bound excision errors. - #this happens once appr. every two thousand combinations - unless($beg<=length($struct)){return 0;} - - #if excising relative to minus strand, positions are reversed - if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);} - - #the blast parsed format is 1-indexed, substr is 0-indexed - my $sub_struct=substr($struct,$beg-1,$end-$beg+1); - - return $sub_struct; -} - - -sub create_hash_nuclei{ - #parses a fasta file with sequences of known miRNAs considered for conservation purposes - #reads the nuclei into a hash - - my ($file) = @_; - my ($id, $desc, $sequence, $nucleus) = (); - - open (FASTA, "<$file") or die "can not open $file\n"; - while () - { - chomp; - if (/^>(\S+)(.*)/) - { - $id = $1; - $desc = $2; - $sequence = ""; - $nucleus = ""; - while (){ - chomp; - if (/^>(\S+)(.*)/){ - $nucleus = substr($sequence,1,$nucleus_lng); - $nucleus =~ tr/[T]/[U]/; - $hash_mirs{$nucleus} .="$id\t"; - $hash_nuclei{$nucleus} += 1; - - $id = $1; - $desc = $2; - $sequence = ""; - $nucleus = ""; - next; - } - $sequence .= $_; - } - } - } - $nucleus = substr($sequence,1,$nucleus_lng); - $nucleus =~ tr/[T]/[U]/; - $hash_mirs{$nucleus} .="$id\t"; - $hash_nuclei{$nucleus} += 1; - close FASTA; -} - - -sub parse_file_struct{ - #parses the output from RNAfoldand reads it into hashes - my($file) = @_; - my($id,$desc,$seq,$struct,$mfe) = (); - open (FILE_STRUCT, "<$file") or die "can not open $file\n"; - while (){ - chomp; - if (/^>(\S+)\s*(.*)/){ - $id= $1; - $desc= $2; - $seq= ""; - $struct= ""; - $mfe= ""; - while (){ - chomp; - if (/^>(\S+)\s*(.*)/){ - $hash_desc{$id} = $desc; - $hash_seq{$id} = $seq; - $hash_struct{$id} = $struct; - $hash_mfe{$id} = $mfe; - $id = $1; - $desc = $2; - $seq = ""; - $struct = ""; - $mfe = ""; - next; - } - if(/^\w/){ - tr/uU/tT/; - $seq .= $_; - next; - } - if(/((\.|\(|\))+)/){$struct .=$1;} - if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;} - } - } - } - $hash_desc{$id} = $desc; - $hash_seq{$id} = $seq; - $hash_struct{$id} = $struct; - $hash_mfe{$id} = $mfe; - close FILE_STRUCT; - return; -} - - -sub score_s{ - - #this score message is appended to the end of the string of score messages outputted for the potential precursor - - my $message=shift; - $message_score.=$message."\n";; - return; -} - - - -sub score_p{ - - #this score message is appended to the beginning of the string of score messages outputted for the potential precursor - - my $message=shift; - $message_score=$message."\n".$message_score; - return; -} - - - -sub filter_s{ - - #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor - - my $message=shift; - $message_filter.=$message."\n"; - return; -} - - -sub filter_p{ - - #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor - - my $message=shift; - if(defined $message_filter){$message_filter=$message."\n".$message_filter;} - else{$message_filter=$message."\n";} - return; -} - - -sub find_freq{ - - #finds the frequency of a given read query from its id. - - my($query)=@_; - - if($query=~/x(\d+)/i){ - my $freq=$1; - return $freq; - }else{ - #print STDERR "Problem with read format\n"; - return 0; - } -} - - -sub print_hash_comp{ - - #prints the 'comp' hash - - my @keys=sort keys %hash_comp; - foreach my $key(@keys){ - my $value=$hash_comp{$key}; - print "$key \t$value\n"; - } -} - - - -sub print_hash_bp{ - - #prints the 'bp' hash - - my @keys=sort {$a<=>$b} keys %hash_bp; - foreach my $key(@keys){ - my $value=$hash_bp{$key}; - print "$key\t$value\n"; - } - print "\n"; -} - - - -sub find_strand{ - - #A subroutine to find the strand, parsing different blast formats - - my($other)=@_; - - my $strand="+"; - - if($other=~/-/){ - $strand="-"; - } - - if($other=~/minus/i){ - $strand="-"; - } - return($strand); -} - - -sub contained{ - - #Is the stretch defined by the first positions contained in the stretch defined by the second? - - my($beg1,$end1,$beg2,$end2)=@_; - - testbeginend($beg1,$end1,$beg2,$end2); - - if($beg2<=$beg1 and $end1<=$end2){ - return 1; - }else{ - return 0; - } -} - - -sub testbeginend{ - - #Are the beginposition numerically smaller than the endposition for each pair? - - my($begin1,$end1,$begin2,$end2)=@_; - - unless($begin1<=$end1 and $begin2<=$end2){ - print STDERR "beg can not be larger than end for $subject_old\n"; - exit; - } -} - - -sub rev_pos{ - -# The blast_parsed format always uses positions that are relative to the 5' of the given strand -# This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with -# the n't nucleotide on the plus strand - -# This subroutine reverses the begin and end positions of positions of the minus strand so that they -# are relative to the 5' end of the plus strand - - my($beg,$end,$lng)=@_; - - my $new_end=$lng-$beg+1; - my $new_beg=$lng-$end+1; - - return($new_beg,$new_end); -} - -sub round { - - #rounds to nearest integer - - my($number) = shift; - return int($number + .5); - -} - - -sub rev{ - - #reverses the order of nucleotides in a sequence - - my($sequence)=@_; - - my $rev=reverse $sequence; - - return $rev; -} - -sub com{ - - #the complementary of a sequence - - my($sequence)=@_; - - $sequence=~tr/acgtuACGTU/TGCAATGCAA/; - - return $sequence; -} - -sub revcom{ - - #reverse complement - - my($sequence)=@_; - - my $revcom=rev(com($sequence)); - - return $revcom; -} - - -sub max2 { - - #max of two numbers - - my($a, $b) = @_; - return ($a>$b ? $a : $b); -} - -sub min2 { - - #min of two numbers - - my($a, $b) = @_; - return ($a<$b ? $a : $b); -} - - - -sub score_freq{ - -# scores the count of reads that map to the potential precursor -# Assumes geometric distribution as described in methods section of manuscript - - my $freq=shift; - - #parameters of known precursors and background hairpins - my $parameter_test=0.999; - my $parameter_control=0.6; - - #log_odds calculated directly to avoid underflow - my $intercept=log((1-$parameter_test)/(1-$parameter_control)); - my $slope=log($parameter_test/$parameter_control); - my $log_odds=$slope*$freq+$intercept; - - #if no strong evidence for 3' overhangs, limit the score contribution to 0 - unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);} - - return $log_odds; -} - - - -##sub score_mfe{ - -# scores the minimum free energy in kCal/mol of the potential precursor -# Assumes Gumbel distribution as described in methods section of manuscript - -## my $mfe=shift; - - #numerical value, minimum 1 -## my $mfe_adj=max2(1,-$mfe); - - #parameters of known precursors and background hairpins, scale and location -## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); -## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); - -## my $odds=$prob_test/$prob_background; -## my $log_odds=log($odds); - -## return $log_odds; -##} - -sub score_mfe{ -# use bignum; - -# scores the minimum free energy in kCal/mol of the potential precursor -# Assumes Gumbel distribution as described in methods section of manuscript - - my ($mfe,$mlng)=@_; - - #numerical value, minimum 1 - my $mfe_adj=max2(1,-$mfe); -my $mfe_adj1=$mfe/$mlng; - #parameters of known precursors and background hairpins, scale and location - my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; - my $ev=$e**($mfe_adj1*$c); - #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; - my $log_odds=($a/($b+$ev)); - - - my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); - my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); - - my $odds=$prob_test/$prob_background; - my $log_odds_2=log($odds); - #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; - return $log_odds; -} - - - -sub prob_gumbel_discretized{ - -# discretized Gumbel distribution, probabilities within windows of 1 kCal/mol -# uses the subroutine that calculates the cdf to find the probabilities - - my ($var,$scale,$location)=@_; - - my $bound_lower=$var-0.5; - my $bound_upper=$var+0.5; - - my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location); - my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location); - - my $prob=$cdf_upper-$cdf_lower; - - return $prob; -} - - -sub cdf_gumbel{ - -# calculates the cumulative distribution function of the Gumbel distribution - - my ($var,$scale,$location)=@_; - - my $cdf=$e**(-($e**(-($var-$location)/$scale))); - - return $cdf; -} - diff -r ca05d68aca13 -r c75593f79aa9 miRNA_Express_and_sequence.pl --- a/miRNA_Express_and_sequence.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,173 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2014-6-4 -#Modified: -#Description: solexa miRNA express and sequence -my $version=1.00; - -use strict; -use Getopt::Long; - -my %opts; -GetOptions(\%opts,"i=s","list=s","fa=s","pre=s","tag=s","h"); -if (!(defined $opts{i} and defined $opts{list} and defined $opts{fa} and defined $opts{pre} and defined $opts{tag}) || defined $opts{h}) { #necessary arguments -&usage; -} - -my $filein=$opts{'i'}; -my $fileout=$opts{'list'}; -my $out=$opts{'fa'}; -my $preout=$opts{'pre'}; - -=cut -my %hash_pri; -open PRI,"<$opts{p}"; -while (my $aline=) { - chomp $aline; - if($aline=~/^>(\S+)/){$hash_pri{$1}=$aline;} -} -close PRI; -=cut - -open IN,"<$filein"; #input file -open OUT,">$fileout"; #output file -open FA ,">$out"; -open PRE,">$preout"; - -print OUT "#ID\tcoordinate\tpos1\tpos2"; -my @marks=split/\,/,$opts{'tag'}; -foreach (@marks) { - print OUT "\t",$_,"_matureExp"; -} -foreach (@marks) { - print OUT "\t",$_,"_starExp"; -} -foreach (@marks) { - print OUT "\t",$_,"_totalExp"; -} - -print OUT "\n"; - -my (%uniq_id,$novel); -while (my $aline=) { - chomp $aline; - until ($aline =~ /^score\s+[-\d\.]+/){ - $aline = ; - if (eof) {last;} - } - if (eof) {last;} -########## miRNA ID ################ - $novel++; -########### annotate#################### - do {$aline=;} until($aline=~/flank_first_end/) ; - chomp $aline; - my @flank1=split/\t/,$aline; - do {$aline=;} until($aline=~/flank_second_beg/) ; - chomp $aline; - my @flank2=split/\t/,$aline; -# -########## mature start loop pre #### - do {$aline=;} until($aline=~/mature_beg/) ; - chomp $aline; - my @start=split/\t/,$aline; -# $start[1] -=$flank1[1]; - do {$aline=;} until($aline=~/mature_end/) ; - chomp $aline; - my @end=split/\t/,$aline; -# $end[1] -=$flank1[1]; - do {$aline=;} until($aline=~/mature_seq/) ; - chomp $aline; - my @arr1=split/\t/,$aline; - do {$aline=;} until($aline=~/pre_seq/) ; - chomp $aline; - my @arr2=split/\t/,$aline; - do {$aline=;} until($aline=~/pri_id/) ; - chomp $aline; - my @pri_id=split/\t/,$aline; - do {$aline=;} until($aline=~/pri_seq/) ; - chomp $aline; - my @pri_seq=split/\t/,$aline; - do {$aline=;} until($aline=~/star_beg/) ; - chomp $aline; - my @star_start=split/\t/,$aline; -# $star_start[1] -=$flank1[1]; - do {$aline=;} until($aline=~/star_end/) ; - chomp $aline; - my @star_end=split/\t/,$aline; -# $star_end[1] -=$flank1[1]; - do {$aline=;} until($aline=~/star_seq/) ; - chomp $aline; - my @arr3=split/\t/,$aline; - print OUT "miR-c-$novel\t$pri_id[1]\tmature:$start[1]:$end[1]\tstar:$star_start[1]:$star_end[1]\t"; - #print OUT "$arr1[1]\t$arr3[1]\t$arr2[1]\t\/\t"; - print FA ">miR-c-$novel\n$arr1[1]\n"; - print PRE ">miR-c-$novel\n$pri_seq[1]\n"; -########## reads count ############# - ; - my @count1;my @count2;my @count3;my @count4; - $aline=; - do { - chomp $aline; - my @reads=split/\t/,$aline; - my @pos=(); - $reads[5]=~/(\d+)\.\.(\d+)/; -# $pos[0] =$1-$flank1[1]; -# $pos[1] =$2-$flank1[1]; - $pos[0]=$1; - $pos[1]=$2; - $reads[0]=~/:([\d|_]+)_x(\d+)$/; - my @ss=split/_/,$1; - for (my $i=0;$i<@ss ;$i++) { - if (!(defined $count3[$i])) { - $count3[$i]=0; - } - if (!(defined $count4[$i])) { - $count4[$i]=0; - } - $count2[$i]+=$ss[$i]; - - } -# $count3 +=$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 ); -# $count4 +=$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 ); -# $count1 =$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 && $count1<$1); -# $count2 =$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 && $count2<$1); - if($end[1]-$pos[1]>=-5 && $end[1]-$pos[1]<=5 && $pos[0]-$start[1]>=-3 && $pos[0]-$start[1]<=3 ) - { - for (my $i=0;$i<@ss;$i++) { - $count3[$i]+=$ss[$i]; - } - } - if($star_end[1]-$pos[1]<=5 && $star_end[1]-$pos[1]>=-5 && $pos[0]-$star_start[1]>=-3 && $pos[0]-$star_start[1]<=3){ - for (my $i=0;$i<@ss;$i++) { - $count4[$i]+=$ss[$i]; - } - } - $aline=; - chomp $aline; - } until(length $aline < 1) ; - $"="\t"; - print OUT "@count3\t@count4\t@count2\n"; - $"=" "; -} - -close IN; -close OUT; - -sub usage{ -print <<"USAGE"; -Version $version -Usage: -$0 -i -list -fa -pre -tag -options: --i input file,predictions file --list output file miRNA list file --fa output file ,miRNA sequence fasta file. --pre output file, miRNA precursor fasta file. --tag string, sample names# eg: samA,samB,samC --h help -USAGE -exit(1); -} - diff -r ca05d68aca13 -r c75593f79aa9 miRPlant.pl --- a/miRPlant.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,506 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2014-4-22 -#Modified: -#Description: plant microRNA prediction -my $version=1.00; - -use strict; -use Getopt::Long; -use threads; -#use threads::shared; -use File::Path; -use File::Basename; -#use RNA; -#use Term::ANSIColor; - -my %opts; -GetOptions(\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h"); -if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments -&usage; -} - -my $time=&Time(); -print "miPlant program start:\n The time is $time!\n"; -print "Command line:\n $0 @ARGV\n"; - -my $format=$opts{'format'}; -if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { - #&printErr(); - die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; -} - -my $phred_qv=64; - - -my @inputfiles=@{$opts{'i'}}; -my @inputtags=@{$opts{'tag'}}; - -my $mypath=`pwd`; -chomp $mypath; - -my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRPlant_out/"; - - -unless ($dir=~/\/$/) {$dir.="/";} -if (not -d $dir) { - mkdir $dir; -} -my $config=$dir."/input_config"; -open CONFIG,">$config"; - for (my $i=0;$i<@inputfiles;$i++) { - print CONFIG $inputfiles[$i],"\t",$inputtags[$i],"\n"; - } -close CONFIG; - -my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; - -my $a="ATCTCGTATG"; #adapter -if (defined $opts{'a'}) {$a=$opts{'a'};} - -my $m=6; #adapter minimum mapped nt -if (defined $opts{'M'}) {$m=$opts{'M'};} - -my $t=1; #threads number -if (defined $opts{'t'}) {$t=$opts{'t'};} - -my $min_nt=19; # minimum reads length -if (defined $opts{'min'}) {$min_nt=$opts{'min'};} - -my $max_nt=28; #maximum reads length -if (defined $opts{'max'}) {$max_nt=$opts{'max'};} - -my $mis=0; #mismatch number for microRNA -if (defined $opts{'mis'}) {$mis=$opts{'mis'};} - -my $mis_rfam=0;# mismatch number for rfam -if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};} - -my $hit=25; # maximum reads mapping hits in genome -if (defined $opts{'r'}) {$hit=$opts{'r'};} - -my $upstream = 2; # microRNA 5' extension -$upstream = $opts{'e'} if(defined $opts{'e'}); - -my $downstream = 5;# microRNA 3' extension -$downstream = $opts{'f'} if(defined $opts{'f'}); - -my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200; -my $flank=defined $opts{'flank'} ? $opts{'flank'} :10; -my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20; - -$time=&Time(); -print "$time, Checking input file!\n"; - -my (@filein,@mark,@clean); -#&read_config(); -@filein=@inputfiles; -@mark=@inputtags; - -&checkfa($opts{pre}); -&checkfa($opts{mat}); -&checkfa($opts{gfa}); - - -##### clip adpter --> clean data start -$time=&Time(); -print "$time, Preprocess:\n trim adapter, reads collapse and filter reads by length.\n"; - -$time=~s/:/-/g; -$time=~s/ /-/g; -my $preprocess=$dir."preProcess_${time}/"; -mkdir $preprocess; -my $can_use_threads = eval 'use threads; 1'; -if ($can_use_threads) { -# Do processing using threads - print "Do processing using threads\n"; - my @filein1=@filein; my @mark1=@mark; - while (@filein1>0) { - my @thrs; my @res; - for (my $i=0;$i<$t ;$i++) { - last if(@filein1==0); - my $in=shift @filein1; - my $out=shift @mark1; - push @clean,$preprocess.$out."_clips_adapter.fq"; - $thrs[$i]=threads->create(\&clips,$in,$out); - } - for (my $i=0;$i<@thrs;$i++) { - $res[$i]=$thrs[$i]->join(); - } - } -} else { -# Do not processing using threads - print "Do not processing using threads\n"; - for (my $i=0;$i<@filein ;$i++) { - my $in=$filein[$i]; - my $out=$mark[$i]; - push @clean,$preprocess.$out."_clips_adapter.fq"; - &clips($in,$out); - } -} - -##### clip adpter --> clean data end - -my $collapsed=$preprocess."collapse_reads.fa"; -my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data -my $data2; ### mirbase not mapped reads -my $data3; ### rfam not mapped reads -&collapse(\@clean,$collapsed); #collapse reads to tags - -&filterbylength(); # filter <$min_nt && >$max_nt - -print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n"; - -$time=Time(); -print "$time: known microRNA quantify!\n\n"; - -chdir $dir; - -$time=~s/:/-/g; -$time=~s/ /-/g; -my $known_result=$dir."miRNA_Express_${time}/"; -&quantify(); ### known microRAN quantify - - -#my $miR_exp_dir=&search($known_result,"miRNA_Express_"); -$data2=$known_result."/mirbase_not_mapped.fa"; - -my $pathfile="$dir/path.txt"; -open PA,">$pathfile"; -print PA "$config\n"; -print PA "$preprocess\n"; -print PA "$known_result\n"; - -if (defined $opts{'rfam'}) { #rfam mapping and analysis - $time=Time(); - print "$time: RNA annotate!\n\n"; - $time=~s/:/-/g; - $time=~s/ /-/g; - my $rfam_exp_dir=$dir."rfam_match_${time}"; - &rfam(); - #my $rfam_exp_dir=&search($dir,"rfam_match_"); - $data3=$rfam_exp_dir."/rfam_not_mapped.fa"; -print PA "$rfam_exp_dir\n"; - - my $tag=join "\\;" ,@mark; - system("perl $scipt_path/count_rfam_express.pl -i $rfam_exp_dir/rfam_mapped.bwt -tag $tag -o rfam_non-miRNA_annotation.txt"); -} - -my $data4=$data; -if (defined $opts{'D'}) { #genome mapping - $data4=$data3; -}else{ - $data4=$data2; -} - -$time=Time(); -print "$time: Genome alignment!\n\n"; -$time=~s/:/-/g; -$time=~s/ /-/g; -my $genome_map=$dir."genome_match_${time}"; -&genome($data4); -print PA "$genome_map\n"; -#my $genome_map=&search($dir,"genome_match_"); -my $mapfile=$genome_map."/genome_mapped.bwt"; -my $mapfa=$genome_map."/genome_mapped.fa"; -my $unmap=$genome_map."/genome_not_mapped.fa"; - -#$time=Time(); -#print "$time: Novel microRNA prediction!\n\n"; - -&predict($mapfa); - -close PA; -system("perl $scipt_path/html.pl -i $pathfile -format $format -o $dir/result.html"); - -$time=Time(); -print "$time: Program end!!\n"; - -############################## sub programs ################################### -sub predict{ - my ($file)=@_; - $time=&Time(); - print "$time: Novel microRNA prediction!\n\n"; - $time=~s/:/-/g; - $time=~s/ /-/g; - my $predict=$dir."miRNA_predict_${time}"; -print PA "$predict\n"; - mkdir $predict; - chdir $predict; - system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe"); -# print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n"; - - system("bowtie-build -f excised_precursor.fa excised_precursor"); -# print "\nbowtie-build -f excised_precursor.fa excised_precursor\n"; - - system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log"); -# print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n"; - - system("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst"); -# print "\nconvert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n"; - - system("sort -k 4 precursor_mapped.bst > signatures.bst"); -# print "\nsort +3 -25 precursor_mapped.bst > ../signatures.bst\n"; - - chdir $dir; - system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd"); -# print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n"; - #system("rm novel_tmp_dir -rf"); - my $tag=join "," ,@mark; - system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag"); -} - -sub genome{ - my ($file)=@_; - if(defined $opts{'idx'}){ - system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time") ; -# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; - }else{ - system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time") ; -# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; - } -} -sub rfam{ - if (defined $opts{'idx2'}) { - system("perl $scipt_path/rfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time"); -# print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time\n"; - }else{ - system("perl $scipt_path/rfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time"); -# print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time\n"; - } -} -sub quantify{ - my $tag=join "\\;" ,@mark; - system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -time $time -mis $mis -t $t -e $upstream -f $downstream -tag $tag"); - print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -time $time -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n"; -} -sub filterbylength{ - my $tmpmark=join ",", @mark; - system("perl $scipt_path/filterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark"); - system("perl $scipt_path/Length_Distibution.pl -i $preprocess/reads_length_distribution.txt -o $preprocess/length.html"); -# print "\nfilterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark\n"; - -} -sub collapse{ - my ($ins,$data)=@_; - my $str=""; - for (my $i=0;$i<@{$ins};$i++) { - $str .="-i $$ins[$i] "; - } - system ("perl $scipt_path/collapseReads2Tags.pl $str -mark seq -o $data -format $format"); -# print "\ncollapseReads2Tags.pl $str -mark seq -o $data -format $format\n"; -} - -sub clips{ - my ($in,$out)=@_; - my $adapter=$preprocess.$out."_clips_adapter.fq"; - if($format eq "fq" || $format eq "fastq"){ - system("fastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter") ; - print "\nfastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter\n"; - } - if($format eq "fa" || $format eq "fasta"){ - system("fastx_clipper -a $a -M $m -i $in -o $adapter") ; - # print "\nfastx_clipper -a $a -M $m -i $in -o $adapter\n"; - } - #my $clean=$preprocess.$out."_clean.fq"; - #system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt "); - - return; -} - -sub read_config{ - open CON,"<$config"; - while (my $aline=) { - chomp $aline; - my @tmp=split/\t/,$aline; - push @filein,$tmp[0]; - push @mark,$tmp[1]; - &check_rawdata($tmp[0]); - } - close CON; - if (@filein != @mark) { - #&printErr(); - die "Maybe config file have some wrong!!!\n"; - } -} -sub check_rawdata{ - my ($fileforcheck)=@_; - if (!(-s $fileforcheck)) { - #&printErr(); - die "Can not find $fileforcheck, or file is empty!!!\n"; - } - if ($format eq "fasta" || $format eq "fa") { - &checkfa($fileforcheck); - } - if ($format eq "fastq" || $format eq "fq") { - &checkfq($fileforcheck); - } -} -sub checkfa{ - my ($file_reads)=@_; - open N,"<$file_reads"; - my $line=; - chomp $line; - if($line !~ /^>\S+/){ - #printErr(); - die "The first line of file $file_reads does not start with '>identifier' -Reads file $file_reads is not a valid fasta file\n\n"; - } - if( !~ /^[ACGTNacgtn]*$/){ - #printErr(); - die "File $file_reads contains not allowed characters in sequences -Allowed characters are ACGTN -Reads file $file_reads is not a fasta file\n\n"; - } - close N; -} -sub checkfq{ - my ($file_reads)=@_; - - open N,"<$file_reads"; - for (my $i=0;$i<10;$i++) { - my $a=; - my $b=; - my $c=; - my $d=; - chomp $a; - chomp $b; - chomp $c; - chomp $d; - if($a!~/^\@/){ - #&printErr(); - die "$file_reads is not a fastq file\n\n"; - } - if($b!~ /^[ACGTNacgtn]*$/){ - #&printErr(); - die "File $file_reads contains not allowed characters in sequences -Allowed characters are ACGTN -Reads file $file_reads is not a fasta file\n\n"; - } - if ($c!~/^\@/ && $c!~/^\+/) { - #&printErr(); - die "$file_reads is not a fastq file\n\n"; - } - if ((length $b) != (length $d)) { - #&printErr(); - die "$file_reads is not a fastq file\n\n"; - } - my @qv=split //,$d; - for (my $j=0;$j<@qv ;$j++) { - my $q=ord($qv[$j])-64; - if($q<0){$phred_qv=33;} - } - } - close N; -} - -sub search{ - my ($dir,$str)=@_; - opendir I,$dir; - my @ret; - while (my $file=readdir I) { - if ($file=~/$str/) { - push @ret, $file; - } - } - closedir I; - if (@ret != 1) { - #&printErr(); - - die "Can not find directory or file which name has string: $str !!!\n"; - } - return $ret[0]; -} - -=cut - -sub printErr{ - print STDERR color 'bold red'; - print STDERR "Error: "; - print STDERR color 'reset'; -} -sub Time{ - my $time=time(); - my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; - $month++; - $year+=1900; - if (length($sec) == 1) {$sec = "0"."$sec";} - if (length($min) == 1) {$min = "0"."$min";} - if (length($hour) == 1) {$hour = "0"."$hour";} - if (length($day) == 1) {$day = "0"."$day";} - if (length($month) == 1) {$month = "0"."$month";} - #print "$year-$month-$day $hour:$min:$sec\n"; - return("$year-$month-$day-$hour-$min-$sec"); -} -=cut -sub Time{ - my $time=time(); - my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; - $month++; - $year+=1900; - if (length($sec) == 1) {$sec = "0"."$sec";} - if (length($min) == 1) {$min = "0"."$min";} - if (length($hour) == 1) {$hour = "0"."$hour";} - if (length($day) == 1) {$day = "0"."$day";} - if (length($month) == 1) {$month = "0"."$month";} - #print "$year-$month-$day $hour:$min:$sec\n"; - return("$year-$month-$day $hour:$min:$sec"); -} - - -sub usage{ -print <<"USAGE"; -Version $version -Usage: - -$0 -i -format -gfa -index -pre -mat -rfam -D -a -M -min -max -mis -e -f -v -t -o -path -options: --i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ... --tag string # raw data file names, -tag xxx -tag xxx - --format string,#specific input rawdata file format : fastq|fq|fasta|fa - --path scirpt path - --gfa string, input file # genome fasta. sequence file --idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter - string must be the prefix of the bowtie index. For instance, if - the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then - the prefix is 'h_sapiens_37_asm'.##can be null - --pre string, input file #species specific microRNA precursor sequences --mat string, input file #species specific microRNA mature sequences - --rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count. --idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter - string must be the prefix of the bowtie index. For instance, if - the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then - the prefix is 'h_sapiens_37_asm'.##can be null - --D If [-D] is specified,will discard rfam mapped reads(nead -rfam). - --a string, ADAPTER string. default is ATCTCGTATG. --M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it. --min int, reads min length,default is 19. --max int, reads max length,default is 28. - --mis [int] number of allowed mismatches when mapping reads to precursors, default 0 --e [int] number of nucleotides upstream of the mature sequence to consider, default 2 --f [int] number of nucleotides downstream of the mature sequence to consider, default 5 --v report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment --r int a read is allowed to map up to this number of positions in the genome,default is 25 - --dis Maximal space between miRNA and miRNA* (200) --flank Flank sequence length of miRNA precursor (10) --mfe Maximal free energy allowed for a miRNA precursor (-20) - --t int, number of threads [1] - --o output directory# absolute path --h help -USAGE -exit(1); -} - diff -r ca05d68aca13 -r c75593f79aa9 miRPlant.xml --- a/miRPlant.xml Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - tool for plant microRNA analisis - - - fastx_toolkit - bowtie - SCRIPT_PATH - - SVG - ViennaRNA - - - - - miRPlant.pl - ## Change this to accommodate the number of threads you have available. - -t \${GALAXY_SLOTS:-4} - ## Do or not delet rfam mapped tags - #if $params.delet_rfam == "yes": - -D - #end if - -path \$SCRIPT_PATH - - #for $j, $s in enumerate( $series ) - ##rank_of_series=$j - -i ${s.input} - -tag ${s.tag} - #end for - - -format $format -gfa $gfa -pre $pre -mat $mat -rfam $rfam -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe > run.log - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r ca05d68aca13 -r c75593f79aa9 preProcess.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preProcess.pl Wed Dec 03 01:54:29 2014 -0500 @@ -0,0 +1,383 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-12-2 +#Modified: +#Description: RNA-seq data pre-process +my $version=1.00; + +use strict; +use Getopt::Long; +use threads; +#use threads::shared; +use File::Path; +use File::Basename; +#use RNA; +#use Term::ANSIColor; + +my %opts; +GetOptions(\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h"); +if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $time=&Time(); +print "miPlant program start:\n The time is $time!\n"; +print "Command line:\n $0 @ARGV\n"; + +my $format=$opts{'format'}; +if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { + #&printErr(); + die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; +} + +my $phred_qv=64; +if (defined $opts{'phred'}) {$phred_qv=$opts{'phred'};} + +my @inputfiles=@{$opts{'i'}}; +my @inputtags=@{$opts{'tag'}}; + +my $mypath=`pwd`; +chomp $mypath; + +my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/preProcess/"; + + +unless ($dir=~/\/$/) {$dir.="/";} +if (not -d $dir) { + mkdir $dir; +} +my $config=$dir."/input_config"; +open CONFIG,">$config"; + for (my $i=0;$i<@inputfiles;$i++) { + print CONFIG $inputfiles[$i],"\t",$inputtags[$i],"\n"; + } +close CONFIG; + +my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; + +my $a="ATCTCGTATG"; #adapter +if (defined $opts{'a'}) {$a=$opts{'a'};} + +my $m=6; #adapter minimum mapped nt +if (defined $opts{'M'}) {$m=$opts{'M'};} + +my $t=1; #threads number +if (defined $opts{'t'}) {$t=$opts{'t'};} + +my $min_nt=19; # minimum reads length +if (defined $opts{'min'}) {$min_nt=$opts{'min'};} + +my $max_nt=28; #maximum reads length +if (defined $opts{'max'}) {$max_nt=$opts{'max'};} + +my $mis=0; #mismatch number for microRNA +if (defined $opts{'mis'}) {$mis=$opts{'mis'};} + +my $mis_rfam=0;# mismatch number for rfam +if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};} + +my (@filein,@mark,@clean); +#&read_config(); +@filein=@inputfiles; +@mark=@inputtags; + +&checkfa($opts{gfa}); + + +##### clip adpter --> clean data start +my $preprocess=$dir."preProcess_clean/"; +mkdir $preprocess; +my $can_use_threads = eval 'use threads; 1'; +if ($can_use_threads) { +# Do processing using threads + print "Do processing using threads\n"; + my @filein1=@filein; my @mark1=@mark; + while (@filein1>0) { + my @thrs; my @res; + for (my $i=0;$i<$t ;$i++) { + last if(@filein1==0); + my $in=shift @filein1; + my $out=shift @mark1; + push @clean,$preprocess.$out."_clips_adapter.fq"; + $thrs[$i]=threads->create(\&clips,$in,$out); + } + for (my $i=0;$i<@thrs;$i++) { + $res[$i]=$thrs[$i]->join(); + } + } +} else { +# Do not processing using threads + print "Do not processing using threads\n"; + for (my $i=0;$i<@filein ;$i++) { + my $in=$filein[$i]; + my $out=$mark[$i]; + push @clean,$preprocess.$out."_clips_adapter.fq"; + &clips($in,$out); + } +} + +##### clip adpter --> clean data end + +my $collapsed=$preprocess."collapse_reads.fa"; +my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data +&collapse(\@clean,$collapsed); #collapse reads to tags + +&filterbylength(); # filter <$min_nt && >$max_nt + +print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n"; + + +$time=Time(); +print "$time: Genome alignment!\n\n"; +my $genome_map=$dir."genome_match"; +&genome($data); +#my $genome_map=&search($dir,"genome_match_"); +my $mapfile=$genome_map."/genome_mapped.bwt"; +my $mapfa=$genome_map."/genome_mapped.fa"; +my $unmap=$genome_map."/genome_not_mapped.fa"; + +chdir $dir; +my $pathfile="$dir/path.txt"; +open PA,">$pathfile"; +print PA "$config\n"; +print PA "$preprocess\n"; +print PA "$genome_map\n"; + +if (defined $opts{'rfam'}) { #rfam mapping and analysis + $time=Time(); + print "$time: RNA annotate!\n\n"; + $time=~s/:/-/g; + $time=~s/ /-/g; + my $rfam_exp_dir=$dir."rfam_match"; + &rfam(); + #my $rfam_exp_dir=&search($dir,"rfam_match_"); +print PA "$rfam_exp_dir\n"; + + my $tag=join "\\;" ,@mark; + system("perl $scipt_path/count_rfam_express.pl -i $rfam_exp_dir/rfam_mapped.bwt -tag $tag -o rfam_non-miRNA_annotation.txt"); +} + + +close PA; +system("perl $scipt_path/html_preprocess.pl -i $pathfile -format $format -min $min_nt -max $max_nt -o $dir/preprocessResult.html"); + +$time=Time(); +print "$time: Program end!!\n"; + +############################## sub programs ################################### +sub genome{ + my ($file)=@_; + if(defined $opts{'idx'}){ + system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -r 1000 -v $mis -p $t -o $dir -index $opts{idx}") ; +# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; + }else{ + system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -r 1000 -v $mis -p $t -o $dir") ; +# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; + } +} +sub rfam{ + if (defined $opts{'idx2'}) { + system("perl $scipt_path/rfam.pl -i $mapfa -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} "); +# print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time\n"; + }else{ + system("perl $scipt_path/rfam.pl -i $mapfa -ref $opts{rfam} -v $mis_rfam -p $t -o $dir "); +# print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time\n"; + } +} +sub filterbylength{ + my $tmpmark=join ",", @mark; + system("perl $scipt_path/filterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark"); + system("perl $scipt_path/Length_Distibution.pl -i $preprocess/reads_length_distribution.txt -o $preprocess/length.html"); +# print "\nfilterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark\n"; + +} +sub collapse{ + my ($ins,$data)=@_; + my $str=""; + for (my $i=0;$i<@{$ins};$i++) { + $str .="-i $$ins[$i] "; + } + system ("perl $scipt_path/collapseReads2Tags.pl $str -mark seq -o $data -format $format"); +# print "\ncollapseReads2Tags.pl $str -mark seq -o $data -format $format\n"; +} + +sub clips{ + my ($in,$out)=@_; + my $adapter=$preprocess.$out."_clips_adapter.fq"; + if($format eq "fq" || $format eq "fastq"){ + system("fastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter") ; +# print "\nfastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter\n"; + } + if($format eq "fa" || $format eq "fasta"){ + system("fastx_clipper -a $a -M $m -i $in -o $adapter") ; + # print "\nfastx_clipper -a $a -M $m -i $in -o $adapter\n"; + } + #my $clean=$preprocess.$out."_clean.fq"; + #system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt "); + + return; +} + +sub read_config{ + open CON,"<$config"; + while (my $aline=) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @filein,$tmp[0]; + push @mark,$tmp[1]; + &check_rawdata($tmp[0]); + } + close CON; + if (@filein != @mark) { + #&printErr(); + die "Maybe config file have some wrong!!!\n"; + } +} +sub check_rawdata{ + my ($fileforcheck)=@_; + if (!(-s $fileforcheck)) { + #&printErr(); + die "Can not find $fileforcheck, or file is empty!!!\n"; + } + if ($format eq "fasta" || $format eq "fa") { + &checkfa($fileforcheck); + } + if ($format eq "fastq" || $format eq "fq") { + &checkfq($fileforcheck); + } +} +sub checkfa{ + my ($file_reads)=@_; + open N,"<$file_reads"; + my $line=; + chomp $line; + if($line !~ /^>\S+/){ + #printErr(); + die "The first line of file $file_reads does not start with '>identifier' +Reads file $file_reads is not a valid fasta file\n\n"; + } + if( !~ /^[ACGTNacgtn]*$/){ + #printErr(); + die "File $file_reads contains not allowed characters in sequences +Allowed characters are ACGTN +Reads file $file_reads is not a fasta file\n\n"; + } + close N; +} +sub checkfq{ + my ($file_reads)=@_; + + open N,"<$file_reads"; + for (my $i=0;$i<10;$i++) { + my $a=; + my $b=; + my $c=; + my $d=; + chomp $a; + chomp $b; + chomp $c; + chomp $d; + if($a!~/^\@/){ + #&printErr(); + die "$file_reads is not a fastq file\n\n"; + } + if($b!~ /^[ACGTNacgtn]*$/){ + #&printErr(); + die "File $file_reads contains not allowed characters in sequences +Allowed characters are ACGTN +Reads file $file_reads is not a fasta file\n\n"; + } + if ($c!~/^\@/ && $c!~/^\+/) { + #&printErr(); + die "$file_reads is not a fastq file\n\n"; + } + if ((length $b) != (length $d)) { + #&printErr(); + die "$file_reads is not a fastq file\n\n"; + } + my @qv=split //,$d; + for (my $j=0;$j<@qv ;$j++) { + my $q=ord($qv[$j])-64; + if($q<0){$phred_qv=33;} + } + } + close N; +} + +sub search{ + my ($dir,$str)=@_; + opendir I,$dir; + my @ret; + while (my $file=readdir I) { + if ($file=~/$str/) { + push @ret, $file; + } + } + closedir I; + if (@ret != 1) { + #&printErr(); + + die "Can not find directory or file which name has string: $str !!!\n"; + } + return $ret[0]; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + #print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day $hour:$min:$sec"); +} + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -format -gfa -index -rfam -a -M -min -max -mis -v -t -o -path +options: +-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ... +-tag string # raw data file names, -tag xxx -tag xxx + +-format string,#specific input rawdata file format : fastq|fq|fasta|fa +-phred int # phred quality number, default is 64 + +-path scirpt path + +-gfa string, input file # genome fasta. sequence file +-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null + +-rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count. +-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null + +-a string, ADAPTER string. default is ATCTCGTATG. +-M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it. +-min int, reads min length,default is 19. +-max int, reads max length,default is 28. + +-mis [int] number of allowed mismatches when mapping reads to genome, default 0 +-v report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment + +-t int, number of threads [1] + +-o output directory# absolute path +-h help +USAGE +exit(1); +} + diff -r ca05d68aca13 -r c75593f79aa9 preProcess.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preProcess.xml Wed Dec 03 01:54:29 2014 -0500 @@ -0,0 +1,120 @@ + + tool for Raw data preprocess analisis, including 3' adapter triming, reads collaping, genome mapping and rfam non-miRNA analysis + + + fastx_toolkit + bowtie + SCRIPT_PATH + + SVG + ViennaRNA + + + + + miRPlant.pl + ## Change this to accommodate the number of threads you have available. + -t \${GALAXY_SLOTS:-4} + -path \$SCRIPT_PATH + + #for $j, $s in enumerate( $series ) + ##rank_of_series=$j + -i ${s.input} + -tag ${s.tag} + #end for + + ## Do or not annotate rfam non-miRNA RNAs + #if $params.annotate_rfam == "yes": + -rfam $rfam + #end if + + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + -format $format -gfa ${index_path}.fa -idx $index_path -a $a -M $mapnt -min $min -max $max -mis $mismatch -v $v > run.log + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (params['annotate_rfam'] == 'Yes') + + + (params['annotate_rfam'] == 'Yes') + + + (params['annotate_rfam'] == 'Yes') + + + + + + + diff -r ca05d68aca13 -r c75593f79aa9 precursors.pl --- a/precursors.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,829 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2013/7/19 -#Modified: -#Description: -my $version=1.00; - -use strict; -use Getopt::Long; -#use RNA; - -my %opts; -GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h"); -if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments -&usage; -} - -my $checkno=1; -my $filein=$opts{'map'}; -my $faout=$opts{'o'}; -my $strout=$opts{'s'}; -my $genome= $opts{'g'}; - -my $maxd=defined $opts{'d'} ? $opts{'d'} : 200; -my $flank=defined $opts{'f'}? $opts{'f'} : 10; - -my $MAX_ENERGY=-18; -if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};} -my $MAX_UNPAIR=5; -my $MIN_PAIR=15; -my $MAX_SIZEDIFF=4; -my $MAX_BULGE=2; -my $ASYMMETRY=5; -my $MIN_UNPAIR=0; -my $MIN_SPACE=5; -my $MAX_SPACE=$maxd; -my $FLANK=$flank; - -######### load in genome sequences start ######## -my %genome; -my %lng; -my $name; -open IN,"<$genome"; -while (my $aline=) { - chomp $aline; - next if($aline=~/^\#/); - if ($aline=~/^>(\S+)/) { - $name=$1; - next; - } - $genome{$name} .=$aline; -} -close IN; -foreach my $key (keys %genome) { - $lng{$key}=length($genome{$key}); -} -####### load in genome sequences end ########## - -my %breaks; ### reads number bigger than 3 -open IN,"<$filein"; #input file -while (my $aline=) { - chomp $aline; - my @tmp=split/\t/,$aline; - $tmp[0]=~/_x(\d+)$/; - my $no=$1; - next if($no<3); - #my $trand=&find_strand($tmp[9]); - #my @pos=split/\.\./,$tmp[5]; - my $end=$tmp[3]+length($tmp[4])-1; - if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);} - push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base -} -close IN; - -my %cites; ### peaks -foreach my $chr (keys %breaks) { - foreach my $strand (keys %{$breaks{$chr}}) { - my @array=@{$breaks{$chr}{$strand}}; - @array=sort{$a->[0]<=>$b->[0]} @array; - for (my $i=0;$i<@array;$i++) { - my $start=$array[$i][0];my $end=$array[$i][1]; - my @subarray=(); - push @subarray,$array[$i]; - - for (my $j=$i+1;$j<@array;$j++) { - if ($start<$array[$j][1] && $end>$array[$j][0]) { - push @subarray,$array[$j]; - ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]); - } - else{ - $i=$j; - &find_cites(\@subarray,$chr,$strand); - last; - } - } - } - } -} - -open FA,">$faout"; #output file -open STR,">$strout"; -foreach my $chr (keys %cites) { - foreach my $strand (keys %{$cites{$chr}}) { - - my @array2=@{$cites{$chr}{$strand}}; - @array2=sort{$a->[0]<=>$b->[0]} @array2; - &excise(\@array2,$chr,$strand); - } -} -close FA; -close STR; -sub oneCiteDn{ - my ($array,$a,$chr,$strand)=@_; - - my $ss=$$array[$a][0]-$flank; - $ss=0 if($ss<0); - my $ee=$$array[$a][1]+$maxd+$flank; - $ee=$lng{$chr} if($ee>$lng{$chr}); - - my $seq=substr($genome{$chr},$ss,$ee-$ss+1); - if($strand eq "-"){$seq=revcom($seq);} - - my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); - return $val; -} -sub oneCiteUp{ - my ($array,$a,$chr,$strand)=@_; - - my $ss=$$array[$a][0]-$maxd-$flank; - $ss=0 if($ss<0); - my $ee=$$array[$a][1]+$flank; - $ee=$lng{$chr} if($ee>$lng{$chr}); - - my $seq=substr($genome{$chr},$ss,$ee-$ss+1); - if($strand eq "-"){$seq=revcom($seq);} - - my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); - return $val; - -} - -sub twoCites{ - my ($array,$a,$b,$chr,$strand)=@_; - - my $ss=$$array[$a][0]-$flank; - $ss=0 if($ss<0); - my $ee=$$array[$b][1]+$flank; - $ee=$lng{$chr} if($ee>$lng{$chr}); - - my $seq=substr($genome{$chr},$ss,$ee-$ss+1); - if($strand eq "-"){$seq=revcom($seq);} - -# my( $str,$mfe)=RNA::fold($seq); -# return 0 if($mfe>$MAX_ENERGY); ### minimum mfe - my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee); - - return $val; - -} -sub excise{ - my ($cluster,$chr,$strand)=@_; - - my $last_pos=0; - for (my $i=0;$i<@{$cluster};$i++) { - next if($$cluster[$i][0]<$last_pos); - my $ok=0; - for (my $j=$i+1;$j<@{$cluster} ;$j++) { - if($$cluster[$j][0]-$$cluster[$i][1]>$maxd){ - $i=$j; - last; - }else{ - $ok=&twoCites($cluster,$i,$j,$chr,$strand); - if($ok){ $last_pos=$$cluster[$j][1]+$flank; $i=$j; last;} - } - } - next if($ok); - - $ok=&oneCiteDn($cluster,$i,$chr,$strand); - if($ok){$last_pos=$$cluster[$i][1]+$maxd+$flank; next;} - $ok=&oneCiteUp($cluster,$i,$chr,$strand); - if($ok){$last_pos=$$cluster[$i][1]+$flank;next;} - } - - -} - -sub ffw2{ - my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_; - - my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns - if ($N_count > 5) { - return 0; - } - - my $seq_length=length $seq; - # position tag1 and tag2 - my $tag1_beg=index($seq,$tag1,0)+1; - if ($tag1_beg < 1) { - warn "[ffw2] coordinate error.\n"; -# $fold->{reason}="coordinate error"; - return 0; - } - my $tag2_beg=index($seq,$tag2,0)+1; - if ($tag2_beg < 1) { - warn "[ffw2] coordinate error.\n"; -# $fold->{reason}="coordinate error"; - return 0; - } - if ($tag2_beg < $tag1_beg) { - # swap tag1 and tag2 - ($tag1,$tag2)=($tag2,$tag1); - ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg); - } - my $tag1_end=$tag1_beg+length($tag1)-1; - my $tag2_end=$tag2_beg+length($tag2)-1; - # re-clipping - my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1; - my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length; - $seq=substr($seq,$beg-1,$end-$beg+1); - $seq_length=length $seq; - # re-reposition - $tag1_beg=index($seq,$tag1,0)+1; - if ($tag1_beg < 1) { - warn "[ffw2] coordinate error.\n"; -# $fold->{reason}="coordinate error"; - return 0; - } - - $tag2_beg=index($seq,$tag2,0)+1; - if ($tag2_beg < 1) { - warn "[ffw2] coordinate error.\n"; -# $fold->{reason}="coordinate error"; - return 0; - } - $tag1_end=$tag1_beg+length($tag1)-1; - $tag2_end=$tag2_beg+length($tag2)-1; - - # fold - #my ($struct,$mfe)=RNA::fold($seq); - my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; - my @rawfolds=split/\s+/,$rnafold; - my $struct=$rawfolds[1]; - my $mfe=$rawfolds[-1]; - $mfe=~s/\(//; - $mfe=~s/\)//; - #$mfe=sprintf "%.2f", $mfe; - if ($mfe > $MAX_ENERGY) {return 0;} - - # tag1 - my $tag1_length=$tag1_end-$tag1_beg+1; - my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length); - my $tag1_arm=which_arm($tag1_struct); - my $tag1_unpair=$tag1_struct=~tr/.//; - my $tag1_pair=$tag1_length-$tag1_unpair; - my $tag1_max_bulge=biggest_bulge($tag1_struct); - if ($tag1_arm ne "5p") { return 0;} # tag not in stem -# if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass} - if ($tag1_pair < $MIN_PAIR) {return 0;} - if ($tag1_max_bulge > $MAX_BULGE) {return 0;} - - # tag2 - my $tag2_length=$tag2_end-$tag2_beg+1; - my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length); - my $tag2_arm=which_arm($tag2_struct); - my $tag2_unpair=$tag2_struct=~tr/.//; - my $tag2_pair=$tag2_length-$tag2_unpair; - my $tag2_max_bulge=biggest_bulge($tag2_struct); - if ($tag2_arm ne "3p") {return 0;} # star not in stem -# if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass} - if ($tag2_pair < $MIN_PAIR) {return 0;} - if ($tag2_max_bulge > $MAX_BULGE) {return 0;} - - # space size between miR and miR* - my $space=$tag2_beg-$tag1_end-1; - if ($space < $MIN_SPACE) {return 0;} - if ($space > $MAX_SPACE) {return 0;} - - # size diff of miR and miR* - my $size_diff=abs($tag1_length-$tag2_length); - if ($size_diff > $MAX_SIZEDIFF) {return 0;} - - # build base pairing table - my %pairtable; - &parse_struct($struct,\%pairtable); # coords count from 1 - - my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end); - my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end); - my $asy=($asy1 < $asy2) ? $asy1 : $asy2; - if ($asy > $ASYMMETRY) {return 0} - - # duplex fold, determine whether two matures like a miR/miR* ike duplex - my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2); - # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context - my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end); - if ($like_mir_duplex1==0 && $like_mir_duplex2==0) { - return 0; - } - - print FA ">$chr:$strand:$ss..$ee\n$seq\n"; - print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; - - return 1; -} - -sub ffw1{ - my ($seq,$tag,$chr,$strand,$ss,$ee)=@_; - my $pass=0; - - my $N_count=$seq=~tr/N//; - if ($N_count > 5) { - return 0; - } - - my $seq_length=length $seq; - my $tag_length=length $tag; - - # position - my $tag_beg=index($seq,$tag,0)+1; - if ($tag_beg < 1) { - warn "[ffw1] coordinate error.\n"; - return $pass; - } - my $tag_end=$tag_beg+length($tag)-1; - - - # define candidate precursor by hybrid short arm to long arm, not solid enough - my($beg,$end)=define_precursor($seq,$tag); - if (not defined $beg) { - return $pass; - } - if (not defined $end) { - return $pass; - } - $seq=substr($seq,$beg-1,$end-$beg+1); - $seq_length=length $seq; - - - # fold - #my ($struct,$mfe)=RNA::fold($seq); - my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; - my @rawfolds=split/\s+/,$rnafold; - my $struct=$rawfolds[1]; - my $mfe=$rawfolds[-1]; - $mfe=~s/\(//; - $mfe=~s/\)//; - - if ($mfe > $MAX_ENERGY) { - $pass=0; - return $pass; - } - - # reposition - $tag_beg=index($seq,$tag,0)+1; - if ($tag_beg < 1) { - warn "[ffw1] coordinate error.\n"; - return 0; - } - $tag_end=$tag_beg+length($tag)-1; - - my $tag_struct=substr($struct,$tag_beg-1,$tag_length); - my $tag_arm=which_arm($tag_struct); - my $tag_unpair=$tag_struct=~tr/.//; - my $tag_pair=$tag_length-$tag_unpair; - my $tag_max_bulge=biggest_bulge($tag_struct); - if ($tag_arm eq "-") { return $pass;} -# if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass} - if ($tag_pair < $MIN_PAIR) { return $pass;} - if ($tag_max_bulge > $MAX_BULGE) {return $pass;} - - # build base pairing table - my %pairtable; - &parse_struct($struct,\%pairtable); # coords count from 1 - - # get star - my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end); - my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1); - my $star_length=$star_end-$star_beg+1; - my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1); - my $star_arm=which_arm($star_struct); - my $star_unpair=$star_struct=~tr/.//; - my $star_pair=$star_length-$star_unpair; - my $star_max_bulge=biggest_bulge($star_struct); - if ($star_arm eq "-") { return $pass;} -# if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass} - if ($star_pair < $MIN_PAIR) {return $pass;} - if ($star_max_bulge > $MAX_BULGE) {return $pass;} - - if ($tag_arm eq $star_arm) {return $pass;} - - # space size between miR and miR* - my $space; - if ($tag_beg < $star_beg) { - $space=$star_beg-$tag_end-1; - } - else { - $space=$tag_beg-$star_end-1; - } - if ($space < $MIN_SPACE) { return $pass;} - if ($space > $MAX_SPACE) { return $pass;} - - # size diff - my $size_diff=abs($tag_length-$star_length); - if ($size_diff > $MAX_SIZEDIFF) { return $pass;} - - # asymmetry - my $asy=get_asy(\%pairtable,$tag_beg,$tag_end); - if ($asy > $ASYMMETRY) {return $pass;} - - $pass=1; - print FA ">$chr:$strand:$ss..$ee\n$seq\n"; - print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; - return $pass; - -} -sub get_star { - my($table,$beg,$end)=@_; - - my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2 - foreach my $i ($beg..$end) { - if (defined $table->{$i}) { - my $j=$table->{$i}; - $s1=$i; - $s2=$j; - last; - } - } - foreach my $i (reverse ($beg..$end)) { - if (defined $table->{$i}) { - my $j=$table->{$i}; - $e1=$i; - $e2=$j; - last; - } - } -# print "$s1,$e1 $s2,$e2\n"; - - # correct terminus - my $off1=$s1-$beg; - my $off2=$end-$e1; - $s2+=$off1; - $s2+=2; # 081009 - $e2-=$off2; $e2=1 if $e2 < 1; - $e2+=2; $e2=1 if $e2 < 1; # 081009 - ($s2,$e2)=($e2,$s2) if ($s2 > $e2); - return ($s2,$e2); -} - -sub define_precursor { - my $seq=shift; - my $tag=shift; - - my $seq_length=length $seq; - my $tag_length=length $tag; - my $tag_beg=index($seq,$tag,0)+1; - my $tag_end=$tag_beg+$tag_length-1; - - # split the candidate region into short arm and long arm - my $tag_arm; - my ($larm,$larm_beg,$larm_end); - my ($sarm,$sarm_beg,$sarm_end); - if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm - $sarm=substr($seq,0,$tag_end); - $larm=substr($seq,$tag_end); - $sarm_beg=1; - $sarm_end=$tag_end; - $larm_beg=$tag_end+1; - $larm_end=$seq_length; - $tag_arm="5p"; - } - else { - $larm=substr($seq,0,$tag_beg-1); # on 3' arm - $sarm=substr($seq,$tag_beg-1); - $larm_beg=1; - $larm_end=$tag_beg-1; - $sarm_beg=$tag_beg; - $sarm_end=$seq_length; - $tag_arm="3p"; - } - -# print "$sarm_beg,$sarm_end $sarm\n"; -# print "$larm_beg,$larm_end $larm\n"; - - # clipping short arm - if ($tag_arm eq "5p") { - $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1; - $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); - } - else { - $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length; - $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); - } -# print "$sarm_beg,$sarm_end $sarm\n"; -# print "$larm_beg,$larm_end $larm\n"; - - # define the precursor by hybriding short arm to long arm -=cut #modify in 2014-10-28 - my $duplex=RNA::duplexfold($sarm,$larm); - my $struct=$duplex->{structure}; - my $energy=sprintf "%.2f", $duplex->{energy}; - my ($str1,$str2)=split(/&/,$struct); - my $pair=$str1=~tr/(//; -# print "pair=$pair\n"; - my $beg1=$duplex->{i}+1-length($str1); - my $end1=$duplex->{i}; - my $beg2=$duplex->{j}; - my $end2=$duplex->{j}+length($str2)-1; -=cut -###### new codes begin - my $duplex=`perl -e 'print "$sarm\n$larm"' | RNAduplex`; - #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) - my @tmpduplex=split/\s+/,$duplex; - my $struct=$tmpduplex[0]; - $tmpduplex[-1]=~s/[(|)]//g; - my $energy=$tmpduplex[-1]; - my ($str1,$str2)=split(/&/,$struct); - my $pair=$str1=~tr/(//; - my ($beg1,$end1)=split/,/,$tmpduplex[1]; - my ($beg2,$end2)=split/,/,$tmpduplex[3]; -######## new codes end - -# print "$beg1:$end1 $beg2:$end2\n"; - # transform coordinates - $beg1=$beg1+$sarm_beg-1; - $end1=$end1+$sarm_beg-1; - $beg2=$beg2+$larm_beg-1; - $end2=$end2+$larm_beg-1; -# print "$beg1:$end1 $beg2:$end2\n"; - - my $off5p=$beg1-$sarm_beg; - my $off3p=$sarm_end-$end1; - $beg2-=$off3p; $beg2=1 if $beg2 < 1; - $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length; - -# print "$beg1:$end1 $beg2:$end2\n"; - - my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2; - my $end=$sarm_end > $end2 ? $sarm_end : $end2; - - return if $pair < $MIN_PAIR; -# print "$beg,$end\n"; - return ($beg,$end); -} - - -# duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex -sub likeMirDuplex1 { - my $seq1=shift; - my $seq2=shift; - my $like_mir_duplex=1; - - my $length1=length $seq1; - my $length2=length $seq2; -=cut - my $duplex=RNA::duplexfold($seq1, $seq2); - my $duplex_struct=$duplex->{structure}; - my $duplex_energy=sprintf "%.2f", $duplex->{energy}; - my ($str1,$str2)=split(/&/,$duplex_struct); - my $beg1=$duplex->{i}+1-length($str1); - my $end1=$duplex->{i}; - my $beg2=$duplex->{j}; - my $end2=$duplex->{j}+length($str2)-1; -=cut - my $duplex=`perl -e 'print "$seq1\n$seq2"' | RNAduplex`; - #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) - my @tmpduplex=split/\s+/,$duplex; - my $duplex_struct=$tmpduplex[0]; - $tmpduplex[-1]=~s/[(|)]//g; - my $duplex_energy=$tmpduplex[-1]; - my ($str1,$str2)=split(/&/,$duplex_struct); - #my $pair=$str1=~tr/(//; - my ($beg1,$end1)=split/,/,$tmpduplex[1]; - my ($beg2,$end2)=split/,/,$tmpduplex[3]; - - # revise beg1, end1, beg2, end2 - $str1=~/^(\.*)/; - $beg1+=length($1); - $str1=~/(\.*)$/; - $end1-=length($1); - $str2=~/^(\.*)/; - $beg2+=length($1); - $str2=~/(\.*)$/; - $end2-=length($1); - - my $pair_num=$str1=~tr/(//; - my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom - my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck -# print $pair_num,"\n"; -# print $overhang1,"\n"; -# print $overhang2,"\n"; - if ($pair_num < 13) { - $like_mir_duplex=0; - } - if ($overhang1 < 0 || $overhang2 < 0 ) { - $like_mir_duplex=0; - } - if ($overhang1 > 4 || $overhang2 > 4) { - $like_mir_duplex=0; - } - return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); -} - -# judge whether two matures form miR/miR* duplex, in hairpin context -sub likeMirDuplex2 { - my ($table,$beg1,$end1,$beg2,$end2)=@_; - my $like_mir_duplex=1; - -# s1 e1 -# 5 ----------------------------3 -# | | |||| ||| | -#3 -------------------------------5 -# e2 s2 - - my $pair_num=0; - my $overhang1=0; - my $overhang2=0; - my ($s1,$e1,$s2,$e2); - foreach my $i ($beg1..$end1) { - if (defined $table->{$i}) { - my $j=$table->{$i}; - if ($j <= $end2 && $j >= $beg2) { - $s1=$i; - $e2=$j; - last; - } - } - } - foreach my $i (reverse ($beg1..$end1)) { - if (defined $table->{$i}) { - my $j=$table->{$i}; - if ($j <= $end2 && $j >= $beg2) { - $e1=$i; - $s2=$j; - last; - } - } - } - -# print "$beg1,$end1 $s1,$e1\n"; -# print "$beg2,$end2 $s2,$e2\n"; - - foreach my $i ($beg1..$end1) { - if (defined $table->{$i}) { - my $j=$table->{$i}; - if ($j <= $end2 && $j >= $beg2) { - ++$pair_num; - } - } - } - if (defined $s1 && defined $e2) { - $overhang1=($end2-$e2)-($s1-$beg1); - } - if (defined $e1 && defined $s2) { - $overhang2=($end1-$e1)-($s2-$beg2); - } - - if ($pair_num < 13) { - $like_mir_duplex=0; - } - if ($overhang1 < 0 && $overhang2 < 0) { - $like_mir_duplex=0; - } - return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); -} -sub parse_struct { - my $struct=shift; - my $table=shift; - - my @t=split('',$struct); - my @lbs; # left brackets - foreach my $k (0..$#t) { - if ($t[$k] eq "(") { - push @lbs, $k+1; - } - elsif ($t[$k] eq ")") { - my $lb=pop @lbs; - my $rb=$k+1; - $table->{$lb}=$rb; - $table->{$rb}=$lb; - } - } - if (@lbs) { - warn "unbalanced RNA struct.\n"; - } -} -sub which_arm { - my $substruct=shift; - my $arm; - if ($substruct=~/\(/ && $substruct=~/\)/) { - $arm="-"; - } - elsif ($substruct=~/\(/) { - $arm="5p"; - } - else { - $arm="3p"; - } - return $arm; -} -sub biggest_bulge { - my $struct=shift; - my $bulge_size=0; - my $max_bulge=0; - while ($struct=~/(\.+)/g) { - $bulge_size=length $1; - if ($bulge_size > $max_bulge) { - $max_bulge=$bulge_size; - } - } - return $max_bulge; -} -sub get_asy { - my($table,$a1,$a2)=@_; - my ($pre_i,$pre_j); - my $asymmetry=0; - foreach my $i ($a1..$a2) { - if (defined $table->{$i}) { - my $j=$table->{$i}; - if (defined $pre_i && defined $pre_j) { - my $diff=($i-$pre_i)+($j-$pre_j); - $asymmetry += abs($diff); - } - $pre_i=$i; - $pre_j=$j; - } - } - return $asymmetry; -} - -sub peaks{ - my @cluster=@{$_[0]}; - - return if(@cluster<1); - - my $max=0; my $index=-1; - for (my $i=0;$i<@cluster;$i++) { - if($cluster[$i][2]>$max){ - $max=$cluster[$i][2]; - $index=$i; - } - } -# &excise(\@cluster,$index,$_[1],$_[2]); - return($index); -} - -sub find_cites{ - my @tmp=@{$_[0]}; - my $i=&peaks(\@tmp); - - my $start=$tmp[$i][0]; - my $total=0; my $node5=0; - for (my $j=0;$j<@tmp ;$j++) { - $total+=$tmp[$j][2]; - $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2); - } - push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5); -} - -sub newpos{ - my ($a,$b,$c,$d)=@_; - my $s= $a>$c ? $c : $a; - my $e=$b>$d ? $b : $d; - return($s,$e); -} - -sub rev{ - - my($sequence)=@_; - - my $rev=reverse $sequence; - - return $rev; -} - -sub com{ - - my($sequence)=@_; - - $sequence=~tr/acgtuACGTU/TGCAATGCAA/; - - return $sequence; -} - -sub revcom{ - - my($sequence)=@_; - - my $revcom=rev(com($sequence)); - - return $revcom; -} - -sub find_strand{ - - #A subroutine to find the strand, parsing different blast formats - my($other)=@_; - - my $strand="+"; - - if($other=~/-/){ - $strand="-"; - } - - if($other=~/minus/i){ - $strand="-"; - } - - return($strand); -} -sub usage{ -print <<"USAGE"; -Version $version -Usage: -$0 -map -g -d -f -o -s -e -options: - -map input file# align result # bst. format - -g input file # genome sequence fasta format - -d Maximal space between miRNA and miRNA* (200) - -f Flank sequence length of miRNA precursor (10) - -o output file# percursor fasta file - -s output file# precursor structure file - -e Maximal free energy allowed for a miRNA precursor (-18 kcal/mol) - - -h help -USAGE -exit(1); -} - diff -r ca05d68aca13 -r c75593f79aa9 quantify.pl --- a/quantify.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,503 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2013/7/19 -#Modified: -#Description: -my $version=1.00; - -use File::Path; -use strict; -use File::Basename; -#use Getopt::Std; -use Getopt::Long; -#use RNA; - -my %opts; -GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","time:s","h"); -if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments -&usage; -} - -my $read=$opts{'r'}; -my $pre=$opts{'p'}; -my $mature=$opts{'m'}; - -my $dir=$opts{'o'}; -unless ($dir=~/\/$/) {$dir .="/";} -if (not -d $dir) { - mkdir $dir; -} - -my $threads=defined $opts{'t'} ? $opts{'t'} : 1; -my $mismatch=defined $opts{'mis'} ? $opts{'mis'} : 0; - -my $upstream = 2; -my $downstream = 5; - -$upstream = $opts{'e'} if(defined $opts{'e'}); -$downstream = $opts{'f'} if(defined $opts{'f'}); - -my $marks=defined $opts{'tag'} ? $opts{'tag'} : ""; - -my $time=Time(); -if (defined $opts{'time'}) { $time=$opts{'time'};} - -my $tmpdir="${dir}/miRNA_Express_${time}"; -if(not -d $tmpdir){ - mkdir($tmpdir); -} -chdir $tmpdir; - -`cp $pre ./`; -my $pre_file_name=basename($pre); - -&mapping(); # matures align to precursors && reads align to precursors; - -my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end; -&maturePosOnPre(); # acknowledge mature positions on precursor - -my %pre_read; -&readPosOnPre(); # acknowledge reads positions on precursors - -if(!(defined $opts{'tag'})){ - foreach my $key (keys %pre_read) { - $pre_read{$key}[0][0]=~/:([\d|_]+)_x(\d+)$/; - my @ss=split/_/,$1; - for (my $i=1;$i<=@ss;$i++) { - $marks .="Smp$i;"; - } - last; - } -} - -my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...." -&attachPre(); - -my $preno=scalar (keys %pre); -print "Total Precursor Number is $preno !!!!\n"; - -my %struc; #mature star loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe; -&structure(); - - -##### analysis and print out && moRs -my $aln=$dir."known_microRNA_express.aln"; -my $list=$dir."known_microRNA_express.txt"; -my $moRs=$dir."known_microRNA_express.moRs"; - -system("ln $mature $dir/known_microRNA_mature.fa "); -system("ln $pre $dir/known_microRNA_precursor.fa "); - -open ALN,">$aln"; -open LIST,">$list"; -open MORS,">$moRs"; - -$"="\t"; ##### @array print in \t - -my @marks=split/\;/,$marks; -#print LIST "#matueID\tpreID\tpos1\tpos2\tmatureExp\tstarExp\ttotalExp\n"; -print LIST "#matueID\tpreID\tpos1\tpos2"; -for (my $i=0;$i<@marks;$i++) { - print LIST "\t",$marks[$i],"_matureExp"; -} -for (my $i=0;$i<@marks;$i++) { - print LIST "\t",$marks[$i],"_starExp"; -} -for (my $i=0;$i<@marks;$i++) { - print LIST "\t",$marks[$i],"_totalExp"; -} -print LIST "\n"; -print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n"; -print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n"; -my %moRs; - -foreach my $key (keys %pre) { - print ALN ">$key\n$pre{$key}\n$struc{$key}{struc} ($struc{$key}{mfe})\n"; - next if(! (exists $pre_read{$key})); - my @array=@{$pre_read{$key}}; - @array=sort{$a->[3]<=> $b->[3]} @array; - - my $length=length($pre{$key}); - - my $maxline=-1;my $max=0; ### storage the maxinum express read line - my $totalReadsNo=0; - my @not_over=(); ### new read format better for moRs analysis - -####print out Aln file start - for (my $i=0;$i<@array;$i++) { - my $maps=$array[$i][3]+1; - my $mape=$array[$i][3]+length($array[$i][4]); - my $str=""; - $str .= "." x ($maps-1); - $str .=$array[$i][4]; - $str .="." x ($length-$mape); - $str .=" "; - - $array[$i][0]=~/:([\d|_]+)_x(\d+)$/; - my @sample=split /\_/,$1; - my $total=$2; - print ALN $str,"@sample","\t",$total,"\n"; - - if($total>$max){$max=$total; $maxline=$i;} - $totalReadsNo+=$total; - - push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"]; - } -####print out Aln file end - -#### express list start - my ($ms,$me,$ss,$se); - if (!(exists($pre_mature{$key}))) { - $ms=$array[$maxline][3]+1; - $me=$array[$maxline][3]+length($array[$maxline][4]); - ($ss,$se)=&other_pair($ms,$me,$struc{$key}{'struc'}); - - my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); - print LIST "$key\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; - } - else{ - foreach my $maID (keys %{$pre_mature{$key}}) { - $ms=$pre_mature{$key}{$maID}{"mature"}[0]; - $me=$pre_mature{$key}{$maID}{"mature"}[1]; - $ss=$pre_mature{$key}{$maID}{"star"}[0]; - $se=$pre_mature{$key}{$maID}{"star"}[1]; - my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); - print LIST "$maID\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; - } - } -#### express list end - -#### analysis moRs start - my @result; my @m_texp;my $m_texp=0; ### moRs informations - - while (@not_over>0) { - my @over=@not_over; - @not_over=(); - -#丰度最高tag - my $m_max=0;my $m_maxline=-1;my $m_start=0;my $m_end=0;my $m_exp=0;my @m_exp;my $m_no=1; - for (my $i=0;$i<@over;$i++) { - my @m_array=@{$over[$i]}; - if ($m_max<$m_array[4]) { - $m_max=$m_array[4]; - $m_maxline=$i; - } - } - $m_start=$over[$m_maxline][1]; - $m_end=$over[$m_maxline][2]; - $m_exp=$m_max; - $over[$m_maxline][3]=~/:([\d|_]+)_x(\d+)$/; - my @m_nums=split/_/,$1; - for (my $j=0;$j<@m_nums;$j++) { - $m_exp[$j]=$m_nums[$j]; - } - -#统计以丰度最高tag为坐标的reads, 两端位置差异不超过3nt - for (my $i=0;$i<@over;$i++) { - next if($i==$m_maxline); - my @m_array=@{$over[$i]}; - if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) { - $m_exp+=$m_array[4]; - $m_no++; - $m_array[3]=~/:([\d|_]+)_x(\d+)$/; - my @m_nums=split/_/,$1; - for (my $j=0;$j<@m_nums;$j++) { - $m_exp[$j] +=$m_nums[$j]; - } - } - elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@{$over[$i]}];} #去除跨越block的reads - } - if($m_exp>5){### 5个reads - $m_texp+=$m_exp; - for (my $j=0;$j<@m_exp;$j++) { - $m_texp[$j]+=$m_exp[$j]; - } - my $string=&subseq($pre{$key},$m_start,$m_end,"+"); - push @result,"\t$m_start\t$m_end\t@m_exp\t$m_exp\t$m_no\t$string" ; - } - } - - my $str=scalar @result; - my $percent=sprintf("%.2f",$m_texp/$totalReadsNo); - $str=">$key\t+\t$m_texp\t$percent\t".$str."\t$pre{$key}"; - @{$moRs{$str}}=@result; - -#### analysis moRs end -} - -##### moRs print out start -foreach my $key (keys %moRs) { - my @tmp=split/\t/,$key; - next if ($tmp[4]<=2); - next if($tmp[3]<0.95); - my @over; - for (my $i=0;$i<@{$moRs{$key}};$i++) { - my @arrayi=split/\t/,$moRs{$key}[$i]; - for (my $j=0;$j<@{$moRs{$key}};$j++) { - next if($i==$j); - my @arrayj=split/\t/,$moRs{$key}[$j]; - if ((($arrayj[1]-$arrayi[2]>=0 && $arrayj[1]-$arrayi[2] <=3) || ($arrayj[1]-$arrayi[2]>=18 && $arrayj[1]-$arrayi[2] <=25) )||(($arrayi[1]-$arrayj[2]>=0 && $arrayi[1]-$arrayj[2] <=3)||($arrayi[1]-$arrayj[2]>=18 && $arrayi[1]-$arrayj[2] <=25))) { - push @over,$moRs{$key}[$i]; - } - } - } - if (@over>0) { - print MORS "$key\n"; - foreach (@{$moRs{$key}}) { - print MORS "$_\n"; - } - } -} -###### moRs print out end -close ALN; -close LIST; -close MORS; - -$"=" ";##### reset - - -################### Sub programs ################# -sub express{ - my ($ms,$me,$ss,$se,$read)=@_; - my (@mexp,@sexp,@texp); - $$read[0][0]=~/:([_|\d]+)_x(\d+)$/; - my @numsample=split/_/,$1; - for (my $i=0;$i<@numsample;$i++) { - $mexp[$i]=0; - $sexp[$i]=0; - $texp[$i]=0; - } - - for (my $i=0;$i<@{$read};$i++) { - my $start=$$read[$i][3]+1; - my $end=$$read[$i][3]+length($$read[$i][4]); - $$read[$i][0]=~/:([_|\d]+)_x(\d+)$/; - my $expresses=$1; - my @nums=split/_/,$expresses; - - for (my $j=0;$j<@nums;$j++) { - $texp[$j]+=$nums[$j]; - } - if ($start>=$ms && $end<=$me) { - for (my $j=0;$j<@nums;$j++) { - $mexp[$j]+=$nums[$j]; - } - } - if ($start>=$ss && $end<=$se) { - for (my $j=0;$j<@nums;$j++) { - $sexp[$j]+=$nums[$j]; - } - } - } - return(\@mexp,\@sexp,\@texp); -} - -sub structure{ - foreach my $key (keys %pre_mature) { - if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";} - #my ($str,$mfe)=RNA::fold($pre{$key}); - my $rnafold=`perl -e 'print "$pre{$key}"' | RNAfold --noPS`; - my @rnafolds=split/\s+/,$rnafold; - my $str=$rnafolds[1]; - my $mfe=$rnafolds[-1]; - $mfe=~s/\(//; - $mfe=~s/\)//; - - $struc{$key}{"struc"}=$str; - #$struc{$key}{"mfe"}=sprintf ("%.2f",$mfe); - $struc{$key}{"mfe"}=$mfe; - - foreach my $id (keys %{$pre_mature{$key}}) { - ($pre_mature{$key}{$id}{"star"}[0],$pre_mature{$key}{$id}{"star"}[1])=&other_pair($pre_mature{$key}{$id}{"mature"}[0],$pre_mature{$key}{$id}{"mature"}[1],$str); - } -=cut -##### Nucleotide complementary - my @tmp=split//,$str; - my %a2b; - my @bps; - for (my $i=0;$i<@tmp;$i++) { - if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} - if ($tmp[$i] eq ")") { - my $up=pop @bps; - $a2b{$i+1}=$up; - $a2b{$up}=$i+1; - } - } - -##### search star position - foreach my $id (keys %{$pre_mature{$key}}) { - my $n=0; - for (my $i=$pre_mature{$key}{$id}{"mature"}[0];$i<=$pre_mature{$key}{$id}{"mature"}[1] ; $i++) { - if (defined $a2b{$i}) { - my $a=$i; my $b=$a2b{$i}; - if($a>$b){ - $pre_mature{$key}{$id}{"star"}[0]=$b-$n+2; - $pre_mature{$key}{$id}{"star"}[1]=$b-$n+2+($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); - } - if($a<$b{ - $pre_mature{$key}{$id}{"star"}[1]=$b+$n+2; - $pre_mature{$key}{$id}{"star"}[0]=$b+$n+2-($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); - } - last; - } - $n++; - } - } -=cut - } -} -sub other_pair{ - my ($start,$end,$structure)=@_; - ##### Nucleotide complementary - my @tmp=split//,$structure; - my %a2b; my @bps; - for (my $i=0;$i<@tmp;$i++) { - if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} - if ($tmp[$i] eq ")") { - my $up=pop @bps; - $a2b{$i+1}=$up; - $a2b{$up}=$i+1; - } - } -##### search star position - my $n=0;my $startpos; my $endpos; - for (my $i=$start;$i<=$end ; $i++) { - if (defined $a2b{$i}) { - my $a=$i; my $b=$a2b{$i}; -# if($a>$b){ -# $startpos=$b-$n+2; -# $endpos=$b-$n+2+($end-$start); -# } -# if($a<$b){ - $endpos=$b+$n+2; - if($endpos>length($structure)){$endpos=length($structure);} - $startpos=$b+$n+2-($end-$start); - if($startpos<1){$startpos=1;} -# } - last; - } - $n++; - } - return ($startpos,$endpos); -} -sub attachPre{ - open IN, "<$pre_file_name"; - my $name; - while (my $aline=) { - chomp $aline; - if ($aline=~/^>(\S+)/) { - $name=$1; - next; - } - $pre{$name} .=$aline; - } - close IN; -} -sub readPosOnPre{ - open IN,") { - chomp $aline; - my @tmp=split/\t/,$aline; - my $id=lc($tmp[2]); - push @{$pre_read{$tmp[2]}},[@tmp]; - } - close IN; -} -sub maturePosOnPre{ - open IN,") { - chomp $aline; - my @tmp=split/\t/,$aline; - my $mm=$tmp[0]; -# $mm=~s/\-3P|\-5P//i; - $mm=lc($mm); - my $pm=$tmp[2]; - $pm=lc($pm); - -# next if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a - next if($mm!~/$pm/); -# print "$tmp[2]\t$tmp[0]\n"; -# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream; -# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0); -# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream; - $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1; - $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]); - } - close IN; -} -sub mapping{ - my $err; -## build bowtie index - #print STDERR "building bowtie index\n"; - $err = `bowtie-build $pre_file_name miRNA_precursor`; - -## map mature sequences against precursors - #print STDERR "mapping mature sequences against index\n"; - $err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature > mature_mapped.bwt 2> run.log`; - -## map reads against precursors - #print STDERR "mapping read sequences against index\n"; - $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2> run.log`; - -} - -sub subseq{ - my $seq=shift; - my $beg=shift; - my $end=shift; - my $strand=shift; - - my $subseq=substr($seq,$beg-1,$end-$beg+1); - if ($strand eq "-") { - $subseq=revcom($subseq); - } - return uc $subseq; -} - -sub revcom{ - my $seq=shift; - $seq=~tr/ATCGatcg/TAGCtagc/; - $seq=reverse $seq; - return uc $seq; -} - -sub Time{ - my $time=time(); - my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; - $month++; - $year+=1900; - if (length($sec) == 1) {$sec = "0"."$sec";} - if (length($min) == 1) {$min = "0"."$min";} - if (length($hour) == 1) {$hour = "0"."$hour";} - if (length($day) == 1) {$day = "0"."$day";} - if (length($month) == 1) {$month = "0"."$month";} - #print "$year-$month-$day $hour:$min:$sec\n"; - return("$year-$month-$day-$hour-$min-$sec"); -} - -sub usage{ -print <<"USAGE"; -Version $version -Usage: -$0 -r -p -m -mis -t -e -f -tag -o -time -mandatory parameters: --p precursor.fa miRNA precursor sequences from miRBase # must be absolute path --m mature.fa miRNA sequences from miRBase # must be absolute path --r reads.fa your read sequences #must be absolute path - --o output directory - -options: --mis [int] number of allowed mismatches when mapping reads to precursors, default 0 --t [int] threads number,default 1 --e [int] number of nucleotides upstream of the mature sequence to consider, default 2 --f [int] number of nucleotides downstream of the mature sequence to consider, default 5 --tag [string] sample marks# eg. sampleA;sampleB;sampleC --time sting #make directory time,default is the local time --h help -USAGE -exit(1); -} - diff -r ca05d68aca13 -r c75593f79aa9 rfam.pl --- a/rfam.pl Thu Nov 13 22:43:35 2014 -0500 +++ b/rfam.pl Wed Dec 03 01:54:29 2014 -0500 @@ -12,7 +12,7 @@ use File::Basename; my %opts; -GetOptions(\%opts,"i=s","ref=s","index:s","v:i","p:i","o=s","time:s","h"); +GetOptions(\%opts,"i=s","ref=s","index:s","v:i","p:i","o=s","h"); if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments &usage; } @@ -31,10 +31,8 @@ my $time=Time(); -if (defined $opts{'time'}) { - $time=$opts{'time'}; -} -my $mapdir=$dir."/rfam_match_".$time; + +my $mapdir=$dir."/rfam_match"; if(not -d $mapdir){ mkdir $mapdir; } @@ -95,7 +93,6 @@ -p/--threads number of alignment threads to launch (default: 1) -o output directory --time sting #make directory time,default is the local time -h help USAGE exit(1); diff -r ca05d68aca13 -r c75593f79aa9 tool_dependencies.xml --- a/tool_dependencies.xml Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ - - - - - - - - - - $REPOSITORY_INSTALL_DIR - - - - - - - http://www.tbi.univie.ac.at/RNA/packages/source/ViennaRNA-2.1.8.tar.gz - ./configure --prefix=$INSTALL_DIR - make - make install - - $INSTALL_DIR/bin - - - - - - - - - http://www.cpan.org/authors/id/S/SZ/SZABGAB/SVG-2.59.tar.gz - $INSTALL_DIR/lib/perl5 - - perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && - make && - make install - - - $INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/ - - - - - - -