# HG changeset patch # User big-tiandm # Date 1417756262 18000 # Node ID 7b5a48b972e9bee3c2b66b3f15ffd50d415d12a5 # Parent f008ab2cadc6c6d83a174fc8679012e066f14f61 Uploaded diff -r f008ab2cadc6 -r 7b5a48b972e9 Annotate.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Annotate.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,178 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Chentt +#Email: chentt@big.ac.cn +#Date: 2014/4/10 +#Modified: +#Description: cluster annotate by priority +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","d=i","g=s","o=s","t=s","h"); +if (!(defined $opts{i} and defined $opts{g} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} +#my $genelistout=$opts{'t'}; +my $dis=defined $opts{'d'}? $opts{'d'}:1000; +my %gene; + +#open OUT,">$genelistout"; #output file +#print OUT "#ID\tchr\tstart\tend\tstrand\ns"; +open IN,"<$opts{g}"; +while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline;#ID chr start end strand + #push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]]; + $gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[4]]; +} +#while (my $aline=) { +# chomp $aline; +# next if($aline=~/^\#/); +# my @tmp=split/\t/,$aline; +# my $ID; +# if ($tmp[2] eq "gene") { +# $tmp[0]=~s/Chr\./Chr/; +# $tmp[0]=~s/Chr/chr/; +# my @infor=split/;/,$tmp[8]; +# for (my $i=0;$i<@infor ;$i++) { +# if ($infor[$i]=~/Alias\=(\S+)$/) { +# $ID=$1; +# last; +# } +# } +# $gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] +# print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; +# } +#} +close IN; +#close OUT; + + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +while (my $aline=) { + chomp $aline; + my @tmp=split/\t/,$aline; + if($aline=~/^\#/){print OUT "$aline\tP_annotate\n";next} + my @result; + #shift @tmp; + my @id=split/:/,$tmp[0]; + $id[0]=~s/Chr0/Chr/; + my @posi=split/-/,$id[1]; + foreach my $key (keys %{$gene{$id[0]}}) { + if ($posi[0]<$gene{$id[0]}{$key}[1] && $posi[1]>$gene{$id[0]}{$key}[0]) { + push @result,"gene-body;$key;$gene{$id[0]}{$key}[2]";#$te{$key}"; + next; + } + #if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-1000) { + if ($posi[0]<$gene{$id[0]}{$key}[0] && $posi[1]>$gene{$id[0]}{$key}[0]-$dis) { + push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+"); + push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-"); + next; + } + #if ($posi[0]<$gene{$id[0]}{$key}[1]+1000 && $posi[1]>$gene{$id[0]}{$key}[1]) { + if ($posi[0]<$gene{$id[0]}{$key}[1]+$dis && $posi[1]>$gene{$id[0]}{$key}[1]) { + push @result,"down1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "+"); + push @result,"up1-kb;$key;$gene{$id[0]}{$key}[2]" if($gene{$id[0]}{$key}[2] eq "-"); + next; + } + } + my $result; + if (!(@result)) { + $result="intergenic"; + } + elsif($#result==0){ + $result=$result[0]; + + } + else{ + $result=join "\t",@result; + } +# else{ +# my $te_num=0; +# my @te_overlap; +# my @te_up_down; +# my @non_overlap; +# my @non_up_down; +# for (my $k=0;$k<@result ;$k++) { +# my @rr=split/\;/,$result[$k]; +# if ($rr[3] eq "Y") { +# $te_num++; +# if ($rr[0] eq "overlap") { +# push @te_overlap,$result[$k]; +# } +# else{ +# push @te_up_down,$result[$k]; +# } +# } +# else{ +# if ($rr[0] eq "overlap") { +# push @non_overlap,$result[$k]; +# } +# else{ +# push @non_up_down,$result[$k]; +# } +# } +# } +# if ($te_num==0) {#non TE +# if (!(@te_overlap)) {#down up +# if ($#non_up_down==0) { +# $result=$non_up_down[0]; +# } +# else{#overlap +# my $all_2=join "\t",@non_up_down; +# $result="up&down1-kb\t".$all_2; +# } +# } +# else{ +# $result=join "\t",@non_overlap; +# if ($#non_overlap>=1) { +# print "$aline\t$result\n"; +# } +# } +# } +# else{#TE +# if (!(@te_overlap)) {#down up +# if ($#te_up_down==0) { +# $result=$te_up_down[0]; +# } +# else{#overlap +# my $all_2=join "\t",@te_up_down; +# $result="up&down1-kb\t".$all_2; +# } +# } +# else{ +# $result=join "\t",@te_overlap; +# if ($#te_overlap>=1) { +# print "$aline\t$result\n"; +# } +# } +# } +# } + print OUT "$aline\t$result\n"; +} + +close IN; +close OUT; +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -g -d +options: +-i input file +-g genelist file +-d int the length of the upstream and downstream,default 1000 +-o output file +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 ClassAnnotate.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ClassAnnotate.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,251 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Chen Tingting +#Email: chentt@big.ac.cn +#Date: 2014/5/13 +#Modified: +#Description: cluster annotate +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","g=s","n=s","r=s","p=s","o=s","t=s","l=s","h"); +if (!(defined $opts{i} and defined $opts{g} and defined $opts{n} and defined $opts{r} and defined $opts{p} and defined $opts{o} and defined $opts{t} and defined $opts{l}) || defined $opts{h}) { #necessary arguments +&usage; +} + +#my %gene; +my %gene1; +open IN,"<$opts{g}"; +open OUT ,">$opts{l}"; +print OUT "#ID\tchr\tstart\tend\tstrand\n"; +my $n=1; +while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline; + my $ID; + if ($tmp[2] eq "gene") { + $tmp[0]=~s/Chr\./Chr/; + #$tmp[0]=~s/Chr/chr/; + my @infor=split/;/,$tmp[8]; + for (my $i=0;$i<@infor ;$i++) { + if ($infor[$i]=~/Alias\=(\S+)$/) { + $ID=$1; + last; + } + else { + $ID="unknown$n"; + $n++; + } + } + #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] + push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]]; + print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; + } +} +#while (my $aline=) { +# chomp $aline; +# next if($aline=~/^\#/); +# my @tmp=split/\t/,$aline;#ID chr start end strand +# push @{$gene1{$tmp[0]}},[$tmp[2],$tmp[3],$tmp[1]]; +# #$gene{$tmp[1]}{$tmp[0]}=[$tmp[2],$tmp[3],$tmp[1]]; +#} +close IN; +close OUT; + +my %nat; +open TMP,">$opts{t}"; +print TMP "#NAT_ID\tGene\tStrand\tChr\tGene_start\tGene_end\tAntiGene\tStrand\tChr\tAntiGene_start\tAntiGene_end\tType1\tType2\tNATS1_start\tNATS1_end\tNATS2_start\tNATS2_end\n"; + +open IN,"<$opts{n}"; +my $nat_n=1; +while (my $aline=) { + next if($aline=~/^\#/);#osj LOC_Os05g02659 - LOC_Os01g24200 + trans 1559 1802 660 905 246 100nt - + chomp $aline; + my @arr=split /\t/,$aline; + my ($ns,$ne,$ns2,$ne2)=(0,0,0,0); + if ($arr[11]=~/Nearby/) { + ($ns,$ne)=&nearby($gene1{$arr[1]}[0][0],$gene1{$arr[1]}[0][1],$gene1{$arr[3]}[0][0],$gene1{$arr[3]}[0][1]); + push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs".$nat_n]; + print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns\t$ne\n"; + $nat_n++; + }else{ + $ns=$gene1{$arr[1]}[0][0]+$arr[6]-1; + $ne=$gene1{$arr[1]}[0][0]+$arr[7]-1; + $ns2=$gene1{$arr[3]}[0][1]-$arr[9]+1; + $ne2=$gene1{$arr[3]}[0][1]-$arr[8]+1; + push @{$nat{$gene1{$arr[1]}[0][2]}},[$ns,$ne,$arr[5],$arr[11],"NATs$nat_n"."_1"];#start,end,class1,class2 + push @{$nat{$gene1{$arr[3]}[0][2]}},[$ns2,$ne2,$arr[5],$arr[11],"NATs$nat_n"."_2"]; + print TMP "NATs$nat_n\t$arr[1]\t$arr[2]\t$gene1{$arr[1]}[0][2]\t$gene1{$arr[1]}[0][0]\t$gene1{$arr[1]}[0][1]\t$arr[3]\t$arr[4]\t$gene1{$arr[3]}[0][2]\t$gene1{$arr[3]}[0][0]\t$gene1{$arr[3]}[0][1]\t$arr[5]\t$arr[11]\t$ns\t$ne\t$ns2\t$ne2\n"; + $nat_n++; + } +} +close IN; +close TMP; + +my %repeat; +open IN,"<$opts{'r'}"; +my $first=; +$first=; +$first=; +while (my $aline=) { + chomp $aline; + $aline=~s/^\s+//; + my @tmp=split/\s+/,$aline; + $tmp[4]=~s/chr0/Chr/; + $tmp[4]=~s/chr/Chr/; + push @{$repeat{$tmp[4]}},[$tmp[5],$tmp[6],$tmp[10]];#start,end,class + #print "$tmp[4]\t$tmp[5]\t$tmp[6]\t$tmp[10]\n"; +} +close IN; + +my %phase; +open IN,"<$opts{'p'}"; +while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline; + if ($tmp[5]>=25) { + $phase{$tmp[0]}=$tmp[5]; + } +} +close IN; + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +while (my $aline=) { + chomp $aline; + if($aline=~/^\#/){ + print OUT "$aline\tPhase\tLong\tRepeatClass\tNatClass1\tNatClass2\tNatID\n"; + next; + } + my @tmp=split/\t/,$aline; + my @inf=split/\:/,$tmp[0]; + my @pos=split/\-/,$inf[1]; + my $chr=$inf[0]; + $chr=~s/Chr0/Chr/; + my $start=$pos[0]; + my $end=$pos[1]; + #=========Repeat============ + my @repeat; + if (defined(@{$repeat{$chr}})) { + my @r_array=sort {$a->[0] <=> $b->[0]} @{$repeat{$chr}}; + for (my $i=0;$i<@r_array ;$i++) { + if ($start<$r_array[$i][0] && $end>$r_array[$i][0]) { + push @repeat,$r_array[$i][2]; + } + elsif($start>$r_array[$i][0] && $start<$r_array[$i][1]){ + push @repeat,$r_array[$i][2]; + + } + elsif($end<$r_array[$i][0]){ + last; + } + else{next;} + } + } + my $repeat; + if (@repeat==0) { + $repeat="\/"; + } + else{ + $repeat=join ";",@repeat; + } + #=========nat=============== + my @nat1;#class 1 + my @nat2;#class 2 + my @nat;#nat ID + #foreach my $chr (keys %nat) { + my @n_array=sort {$a->[0] <=> $b->[0] } @{$nat{$chr}}; + for (my $i=0;$i<@n_array ;$i++) { + if ($start<$n_array[$i][0] && $end>$n_array[$i][0]) { + push @nat1,$n_array[$i][2]; + push @nat2,$n_array[$i][3]; + push @nat,$n_array[$i][4]; + } + elsif($start>$n_array[$i][0] && $start<$n_array[$i][1]){ + push @nat1,$n_array[$i][2]; + push @nat2,$n_array[$i][3]; + push @nat,$n_array[$i][4]; + } + elsif($end<$n_array[$i][0]){ + last; + } + else{next;} + } + #} + + my $nat1; + my $nat2; + my $nat; + if (@nat1==0) { + $nat1="\/"; + } + else{ + $nat1=join ";",@nat1; + } + if (@nat2==0) { + $nat2="\/"; + } + else{ + $nat2=join ";",@nat2; + } + if (@nat==0) { + $nat="\/"; + } + else{ + $nat=join ";",@nat; + } + #========phase============== + my $phase="\/"; + if (defined($phase{$tmp[0]})) { + $phase="phase"; + } + #=========long=============== + my $long="\/"; + if ($tmp[1] eq "\>30nt" and $tmp[2]>=0.5) { + $long="long"; + } + print OUT "$aline\t$phase\t$long\t$repeat\t$nat1\t$nat2\t$nat\n"; +} + +close IN; +close OUT; + +sub nearby{ + my @p=@_; + my ($s,$e)=(0,0); + if ($p[1]<$p[2]) { + $s=$p[1]; + $e=$p[2]; + }else{ + $s=$p[3]; + $e=$p[0]; + } + return ($s,$e); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -g -n -r -p -t -l +options: +-i input file + -g gff file + -n NATs file + -r repeat file + -p phase file +-o output file +-t nat output file +-l genelist output file +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 DEGseq_2.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DEGseq_2.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,73 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2009-05-06 +#Modified: +#Description: 删除matched reads +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","outdir=s","column1:i","mark1=s","depth1:i","depth2:i","column2:i","mark2=s","h"); +if (!(defined $opts{i} and defined $opts{outdir} and defined $opts{mark1} and defined $opts{mark2}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $outputdir=$opts{'outdir'}; +unless ($outputdir=~/\/$/) {$outputdir .="/";} +my $column1=defined $opts{column1} ? $opts{column1} : 3; +my $column2=defined $opts{column2} ? $opts{column2} : 4; +my $mark1=$opts{mark1}; +my $mark2=$opts{mark2}; +my $fileout=$outputdir."degseq.R"; + +open OUT,">$fileout"; #output file +#my ($name,$dir); +#$name=basename($filein); +print OUT "library(DEGseq)\n"; +print OUT "geneExpFile <- system.file(package=\"DEGseq\")\n"; +print OUT "geneExpFile<-file.path(\"$filein\")\n"; +print OUT "layout(matrix(c(1,2,3,4,5,6), 3, 2, byrow=TRUE))\npar(mar=c(2, 2, 2,2))\n"; +print OUT "outputdir<-file.path(\"$outputdir\")\n"; +print OUT "geneExpMatrix1 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column1))\n"; +print OUT "geneExpMatrix2 <- readGeneExp(file=geneExpFile, geneCol=1, valCol=c($column2))\n"; +if(defined $opts{'depth1'} && defined $opts{'depth2'}){ +print OUT "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",depth1=$opts{depth1},depth2=$opts{depth2},outputDir=outputdir,method=\"MARS\")\n"; +} +else{ +print OUT "DEGexp(geneExpMatrix1=geneExpMatrix1, geneCol1=1, expCol1=c(2), groupLabel1=\"$mark1\",geneExpMatrix2=geneExpMatrix2, geneCol2=1, expCol2=c(2), groupLabel2=\"$mark2\",outputDir=outputdir,method=\"MARS\")\n"; +} +close OUT; + + +system("R CMD BATCH $fileout"); + +wait; + + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -outdir -column1 -mark1 -column2 -mark2 -depth1 -depth2 +options: +-i input file +-outdir output file dir +-column1 the first column for DEGseq +-mark1 the name of the column1 +-depth1 depth for the first file,use for normalize +-column2 the second column for DEGseq +-mark2 the name of the column2 +-depth2 depth for the second file,use for normalize + +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 SampleDEGseqMerge.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SampleDEGseqMerge.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,94 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: chentt@big.ac.cn +#Date: 2014-05-21 +#Modified: +#Description: merged deg file and total information +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i:s@","mark:s@","f:s","o=s","n=s","h"); +if (!(defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my @filein=@{$opts{'i'}}; +my @mark=@{$opts{'mark'}}; +my $fileout=$opts{'o'}; +my $number=$opts{'n'}; + +my %hash; +open IN,"<$filein[0]"; #input file + +while (my $aline=) { + chomp $aline; + next if($aline=~/^\"/); + my @temp=split/\t/,$aline; + $hash{$temp[0]}=$temp[4]."\t".$temp[6]."\t".$temp[7]."\t".$temp[-1]; +} +close IN; + +for (my $i=1;$i<=$#filein;$i++) { + open IN,"<$filein[$i]"; #input file + + while (my $aline=) { + chomp $aline; + next if($aline=~/^\"/); + my @temp=split/\t/,$aline; + if (!(defined $hash{$temp[0]})) { + print "Not find $temp[0]in sample one!\n"; + next; + } + $hash{$temp[0]} .="\t".$temp[4]."\t".$temp[6]."\t".$temp[7]."\t".$temp[-1]; + } + close IN; +} + +open OUT,">$fileout"; #output file +my $deg_title; +foreach (@mark) { + $deg_title.="log2(Fold_change)\tp_value\tq_value\t".$_."\t"; +} +$deg_title=~s/\s+$//; +my %function; +my $title; +open F,"<$opts{f}"; +while (my $aline=) { + chomp $aline; + if($aline=~/^\#/){ + my $title=$aline; + my @title=split/\t/,$aline; + $title[2+$number].="\t".$deg_title; + $title=join"\t",@title; + print OUT "$title\n"; + next; + } + my @temp=split/\t/,$aline; + $temp[2+$number].="\t".$hash{$temp[0]}; + my $temp=join"\t",@temp; + print OUT "$temp\n"; + +} +close F; +close OUT; + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -mark -f +options: +-i input file # -i output_score.txt -i output_score.txt -i output_score.txt +-mark sample name # -mark sam1_VS_sam2 -mark sam1_VS_sam3 -mark sam2_VS_sam3 +-f cluster file +-n sample number +-o output file +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 conventional.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/conventional.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,156 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Chentt +#Email: chentt@big.ac.cn +#Date: 2014/04/09 +#Modified: +#Description: islands merged of merged samples +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","d=i","o=s","N=i","t=s","mark=s","h"); +if (!(defined $opts{i} and defined $opts{d} and defined $opts{N} and defined $opts{mark} and defined $opts{t} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; +my $distance=$opts{'d'}; +my $tempout=$opts{'t'}; +my $mark=$opts{'mark'}; +my @sample=split/\#/,$mark; +$mark=join"\"\t\"",@sample; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n"; +open TMP,">$tempout"; +print TMP "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n"; +my %hash; +while (my $aline=) { + chomp $aline; + if($aline=~/^\#/){ + #print OUT "$aline\n"; + next; + } + my @tmp=split/\t/,$aline; + my $chr=shift @tmp; + #shift @tmp; + push @{$hash{$chr}},[@tmp]; +} + +close IN; + +foreach my $key (keys %hash) { + my @tag=sort{$a->[1] <=> $b->[1]} @{$hash{$key}}; + my @sample; + my $start=$tag[0][1]; + my $end=$tag[0][2]; + push @sample,[@{$tag[0]}]; + for (my $i=1;$i<@tag-1;$i++) { + if ($tag[$i][1]-$end<=$distance) { + if ($tag[$i][2]>$end) { + $end=$tag[$i][2]; + } + push @sample,[@{$tag[$i]}]; + } + else{ + my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample); + my $cluster_exp=join"\t",@cluster_exp; + if ($max_length>30) { + print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n"; + $max_length="\>30"; + } + else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";} + print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n"; + $start=$tag[$i][1]; + $end=$tag[$i][2]; + + @sample=(); + push @sample,[@{$tag[$i]}]; + } + } + if ($tag[$#tag][1]-$end<=$distance) { + if ($tag[$#tag][2]>$end) { + $end=$tag[$#tag][2]; + } + push @sample,[@{$tag[$#tag]}]; + my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample); + my $cluster_exp=join"\t",@cluster_exp; + if ($max_length>30) { + $max_length="\>30"; + print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n"; + } + else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";} + print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n"; + } + else{ + my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample); + my $cluster_exp=join"\t",@cluster_exp; + if ($max_length>30) { + $max_length="\>30"; + print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n"; + } + else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";} + print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n"; + + } +} +close OUT; +close TMP; +sub Max_length{ + my @exp=@{$_[0]}; + my %sample_length; + my $total_exp; + my @each; + my @tag; + for (my $i=0;$i<=$#exp ;$i++) { + my $length=$exp[$i][2]-$exp[$i][1]+1; + #if ($length>30) { + # $length=40; + #} + my $exp=0; + foreach (1..$opts{'N'}) { + $exp+=$exp[$i][$_+2]; + $each[$_-1]+=$exp[$i][$_+2]; + } + $sample_length{$length}+=$exp; + $total_exp+=$exp; + push @tag,($exp[$i][1].",".$exp[$i][2].",".$exp[$i][0].",".$exp); + } + my $max=0; + my $max_key; + foreach my $key (sort keys %sample_length) { + my $p=$sample_length{$key}/$total_exp; + if ($p>$max) { + $max=$p; + $max_key=$key; + } + $sample_length{$key}=sprintf("%.2f",$p); + } + my $tag_n=@tag; + my $tag=join";",@tag; + $tag=$tag_n."\t".$tag; + return($max_key,$sample_length{$max_key},$tag,@each); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -d -N -t -mark +options: +-i input file +-d distance of two islands +-mark sample name; +-o output file +-N sample number +-t temp output file +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 convert_bowtie_to_blast.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/convert_bowtie_to_blast.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,126 @@ +#!/usr/bin/perl + + +use warnings; +use strict; +use Getopt::Std; + +######################################### USAGE ################################ + +my $usage= +"$0 file_bowtie_result file_solexa_seq file_chromosome + +This is a converter which changes Bowtie output into Blast format. +The input includes three files: a Bowtie result file (default Bowtie +output file), a fasta file consisting of small Reads and a chromosome +fasta file. It outputs the alignments in blast_parsed format. + +file_bowtie_result likes: + +AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0 +AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0 + +file_solexa_seq likes: + +>AtFlower100010_x2 +AAGGAGATTCTTTCAGTCCAG + +file_chromosome contains chromosome seq in fasta format + +"; + + +####################################### INPUT FILES ############################ + +my $file_bowtie_result=shift or die $usage; +my $file_short_seq=shift or die $usage; +my $file_chromosome_seq=shift or die $usage; + + +##################################### GLOBAL VARIBALES ######################### + +my %short_seq_length=(); +my %chromosome_length=(); + + +######################################### MAIN ################################# + +#get the short sequence id and its length +sequence_length($file_short_seq,\%short_seq_length); + +#get the chromosome sequence id and its length +sequence_length($file_chromosome_seq,\%chromosome_length); + +#convert bowtie result format to blast format; +change_format($file_bowtie_result); + +exit; + + +##################################### SUBROUTINES ############################## + +sub sequence_length{ + my ($file,$hash) = @_; + my ($id, $desc, $sequence, $seq_length) = (); + + open (FASTA, "<$file") or die "can not open $$file\n"; + while () + { + chomp; + if (/^>(\S+)(.*)/) + { + $id = $1; + $desc = $2; + $sequence = ""; + while (){ + chomp; + if (/^>(\S+)(.*)/){ + $$hash{$id} = length $sequence; + $id = $1; + $desc = $2; + $sequence = ""; + next; + } + $sequence .= $_; + } + } + } + $seq_length=length($sequence); + $$hash{$id} = $seq_length; + close FASTA; +} + + + + + +sub change_format{ + #Change Bowtie format into blast format + my $file=shift @_; + open(FILE,"<$file")||die"can not open the bowtie result file:$!\n"; + #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n"; + + while(){ + chomp; + my @tmp=split("\t",$_); + #Clean the reads ID + my @tmp1=split(" ",$tmp[0]); + print "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t"; + if($tmp[1] eq "+"){ + my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]}; + my $seq_bg=$tmp[3] + 1; + print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n"; + } + if($tmp[1] eq "-"){ + my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3]; + my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1; + print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n"; + } + } + +# close BLASTOUT; + +} + + + diff -r f008ab2cadc6 -r 7b5a48b972e9 count_ref_length.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/count_ref_length.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,58 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2009-05-06 +#Modified: +#Description: 删除matched reads +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file + +my ($name,$seq); +while (my $aline=) { + chomp $aline; + if ($aline=~/^>(\S+)/) { + $name=$1; + while (my $new=) { + chomp $new; + if ($new=~/^>(\S+)/) { + print OUT $name,"\t",length($seq),"\n"; + $seq =""; + $name=$1; + next; + } + else{$seq .=$new;} + } + } + print OUT $name,"\t",length($seq),"\n"; +} + +close IN; +close OUT; +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-o output file +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 filterReadsByCount.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filterReadsByCount.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2010-01 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","o=s","mark:s","h"); +if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $mark=defined $opts{'mark'} ? $opts{'mark'} : "Sample"; +my @mark=split /\#/,$mark; + +open OUT,">$opts{o}"; +open IN,"<$opts{i}"; +my %hash;my %reads; +while (my $aline=) { + chomp $aline; + my $seq=; + chomp $seq; + if($aline=~/:([\d|_]+)_x(\d+)$/){ + if ($2>3) { + my @ss=split/_/,$1; + for (my $i=0;$i<@ss;$i++) { + $hash{length($seq)}[$i]++ if($ss[$i]>0); + $hash{length($seq)}[$i] +=0 if($ss[$i]==0); + $reads{length($seq)}[$i]+=$ss[$i]; + } + print OUT "$aline\n$seq\n"; + } + } +} +close IN; +close OUT; + +my $dir=dirname($opts{'o'}); +chdir $dir; +my $lengthfile=$dir."/reads_length_distribution_after_count_filter.txt"; +open OUT, ">$lengthfile"; +open R,">$dir/length_distribution_after_count_filter.R"; + +print OUT "Tags length\t@mark\n"; + +my $samNo=@mark; +my $avalue=""; +my @length=sort{$a<=>$b} keys %hash; +foreach (@length) { + print OUT $_,"\t@{$hash{$_}}\n"; + my $vv=join ", ",@{$hash{$_}}; + $avalue .="$vv,"; +} +$avalue =~s/,$//; +my $lengths=join ",",@length; +my $marks=join "\",\"",@mark; + +print R "a<-c($avalue) +b<-matrix(a,ncol=$samNo,byrow=T) +cl<-colors() +names=c($lengths) +legends=c(\"$marks\") +png(\"Tags_length_after_count_filter.png\",width=800,height=600) +barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Tags Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\") +abline(h=0) +dev.off() + +"; +$avalue=""; +print OUT "\nReads length\t@mark\n"; +foreach (@length) { + print OUT $_,"\t@{$reads{$_}}\n"; + my $vv=join ", ", @{$reads{$_}}; + $avalue .= "$vv,"; +} +$avalue =~s/,$//; + +print R "a<-c($avalue)\n +b<-matrix(a,ncol=$samNo,byrow=T) + +png(\"Reads_length_after_count_filter.png\",width=800,height=600) +barplot(t(b),beside=TRUE,col=cl[1:$samNo],main=\"Reads Length Distribution After Count Filter\",names.arg=names,ylim=c(0,max(a)),legend.text=legends,args.legend=\"topleft\") +abline(h=0) +dev.off() + +"; +close OUT; +close R; + +system ("R CMD BATCH $dir/length_distribution_after_count_filter.R"); + +#system ("rm $dir/length_distribution.R"); +#system ("rm $dir/length_distribution.Rout"); +#system ("rm $dir/.RData"); +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -min -max -mark +options: + +-i input file +-o output file +-mark string #sample name eg: samA#samB#samC +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 filterReadsByLength.pl --- a/filterReadsByLength.pl Wed Dec 03 02:03:27 2014 -0500 +++ b/filterReadsByLength.pl Fri Dec 05 00:11:02 2014 -0500 @@ -33,7 +33,7 @@ my @ss=split/_/,$1; for (my $i=0;$i<@ss;$i++) { $hash{length($seq)}[$i]++ if($ss[$i]>0); - $hash{length($seq)}[$i] +=0 if($ss[$i]>0); + $hash{length($seq)}[$i] +=0 if($ss[$i]==0); $reads{length($seq)}[$i]+=$ss[$i]; } } diff -r f008ab2cadc6 -r 7b5a48b972e9 get_genelist.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_genelist.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,62 @@ +#!/usr/bin/perl -w +#Filename: +#Author: chentt +#Email: +#Date: 2012-4-6 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} +open IN,"<$opts{i}"; +open OUT ,">$opts{o}"; +print OUT "#ID\tchr\tstart\tend\tstrand\n"; +my $n=1; +my %gene1; +while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + my @tmp=split/\t/,$aline; + my $ID; + if ($tmp[2] eq "gene") { + $tmp[0]=~s/Chr\./Chr/; + #$tmp[0]=~s/Chr/chr/; + my @infor=split/;/,$tmp[8]; + for (my $i=0;$i<@infor ;$i++) { + if ($infor[$i]=~/Alias\=(\S+)$/) { + $ID=$1; + last; + } + else { + $ID="unknown$n"; + $n++; + } + } + #$gene{$tmp[0]}{$ID}=[$tmp[3],$tmp[4],$tmp[6]];#$gene{chr}{geneID}=[start,end,strand] + push @{$gene1{$ID}},[$tmp[3],$tmp[4],$tmp[0]]; + print OUT "$ID\t$tmp[0]\t$tmp[3]\t$tmp[4]\t$tmp[6]\n"; + } +} +close IN; +close OUT; + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -h +options: +-i input cluster file +-o output file +-h help +USAGE +exit(1); +} \ No newline at end of file diff -r f008ab2cadc6 -r 7b5a48b972e9 html_siRNA.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html_siRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,788 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-5-29 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +use File::Basename; + +my %opts; +GetOptions(\%opts,"i=s","format=s","o=s","h"); +if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments +&usage; +} +my ($config,$prepath,$rfampath,$genomepath,$clusterpath,$annotatepath,$plotpath,$degpath); +my ($predir,$rfamdir,$genomedir,$clusterdir,$annotatedir,$plotdir,$degdir); +open IN,"<$opts{i}"; +$config=; chomp $config; +$prepath=; chomp $prepath; +$rfampath=;chomp $rfampath; +$genomepath=; chomp $genomepath; +$clusterpath=; chomp $clusterpath; +$annotatepath=; chomp $annotatepath; +$plotpath=; chomp $plotpath; +my $deg_tag=1; +if(eof){$deg_tag=0;} +else{ + $degpath=; chomp $degpath; +} +close IN; +my @tmp=split/\//,$prepath; +$predir=$tmp[-1]; +@tmp=split/\//,$rfampath; +$rfamdir=$tmp[-1]; +@tmp=split/\//,$genomepath; +$genomedir=$tmp[-1]; +@tmp=split/\//,$clusterpath; +$clusterdir=$tmp[-1]; +@tmp=split/\//,$annotatepath; +$annotatedir=$tmp[-1]; +@tmp=split/\//,$plotpath; +$plotdir=$tmp[-1]; + +my $dir=dirname($opts{'o'}); + +open OUT ,">$opts{'o'}"; +print OUT "\n \n Analysis Report \n + \n

\n \n Small RNA Analysis Report\n \n

+

1. Sequence No. and quality

+

1.1 Sequece No.

+"; + +### raw data no +open IN,"<$config"; +my @files;my @marks; my @rawNo; +while (my $aline=) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @files,$tmp[0]; + + my $no=`less $tmp[0] |wc -l `; + chomp $no; + if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") { + $no=$no/4; + } + else{ + $no=$no/2; + } + push @rawNo,$no; + + push @marks,$tmp[1]; +} +close IN; + +### preprocess +unless ($prepath=~/\/$/) { + $prepath .="/"; +} + +my @trimNo;my @collapse; +my $collapsefile=$prepath."collapse_reads.fa"; +open IN,"<$collapsefile"; +while (my $aline=) { + chomp $aline; + ; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $trimNo[$i] +=$lng[$i]; + $collapse[$i] ++; + } + } +} +close IN; + +my @cleanR;my @cleanT; +my $clean=$prepath."collapse_reads_18-40.fa"; +open IN,"<$clean"; +while (my $aline=) { + chomp $aline; + ; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $cleanR[$i] +=$lng[$i]; + $cleanT[$i] ++; + } + } +} +close IN; + +my @filterR;my @filterT; +my $filter=$prepath."collapse_reads_out.fa"; +open IN,"<$filter"; +while (my $aline=) { + chomp $aline; + ; + $aline=~/:([\d|_]+)_x(\d+)$/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $filterR[$i] +=$lng[$i]; + $filterT[$i] ++; + } + } +} +close IN; + + +print OUT " + + +"; +foreach (@marks) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@rawNo) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@trimNo) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@collapse) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@cleanR) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@cleanT) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@filterR) { + print OUT "\n"; +} +print OUT " + + +"; +foreach (@filterT) { + print OUT "\n"; +} +print OUT "\n
  $_
Raw Reads No. $_
Reads No. After Trimed 3\' adapter $_
Unique Tags No. $_
Clean Reads No. $_
Clean Tags No. $_
Filter Reads No. \(reads count \>3\) $_
Filter Tags No. \(reads count \>3\) $_
"; +print OUT "

+Note:
+The raw data file path is: $files[0]
+"; +for (my $i=1;$i<@files;$i++) { + print OUT "          $files[$i]
"; +} +print OUT "The collapsed file path is: $collapsefile
+The clean data file path is: $clean
+The filter (remain total reads>3) data file path is: $filter
+

+

1. Sequence length count

+"; +print OUT "\n"; + +my $length=$prepath."length.html"; +open IN,"<$length"; +while (my $aline=) { + chomp $aline; + print OUT "$aline\n"; +} +close IN; + +print OUT "

Note:
The sequence length data: length file +

+"; + +#### rfam +unless ($rfampath=~/\/$/) { + $rfampath .="/"; +} +unless ($genomepath=~/\/$/) { + $genomepath .="/"; +} +print OUT "

2. Rfam non-miRNA annotation

+

2.1 Reads count

+ + +"; + +my @rfamR; my @rfamT; +my $tag=1; +open IN,"<$dir/rfam_match/rfam_non-miRNA_annotation.txt"; +while (my $aline=) { + chomp $aline; + $tag=0 if($aline=~/tags\s+number/); + next if($aline=~/^\#/); + next if($aline=~/^\s*$/); + my @tmp=split/\s+/,$aline; + if($tag == 1){push @rfamR,[@tmp];} + else{push @rfamT,[@tmp];} +} +close IN; + + +print OUT "\n"; +foreach (@marks) { + print OUT "\n"; +} +for (my $i=0;$i<@rfamR;$i++) { + print OUT " + + + "; + for (my $j=1;$j<@{$rfamR[$i]} ;$j++) { + print OUT "\n"; + } +} + +print OUT "\n
RNA Name $_
$rfamR[$i][0] $rfamR[$i][$j]
+

2.2 Tags count

+ + + \n"; +foreach (@marks) { + print OUT "\n"; +} +for (my $i=0;$i<@rfamT;$i++) { + print OUT " + + + "; + for (my $j=1;$j<@{$rfamT[$i]} ;$j++) { + print OUT "\n"; + } +} +print OUT "\n
RNA Name $_
$rfamT[$i][0] $rfamT[$i][$j]
+

Note:
The rfam mapping results is: $rfampath"; +print OUT "rfam_mapped.bwt

"; + +open IN,"<$dir/genome_match/genome_mapped.bwt"; +my @genome_r_u; +my @genome_r_m; +my @genome_t_u; +my @genome_t_m; +my $tags_map_number=0; +while (my $aline=) { + chomp $aline; + my @temp=split/\t/,$aline; + if ($temp[6]==0) { + $aline=~/:([\d|_]+)_x(\d+)/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $genome_r_u[$i] +=$lng[$i]; + $genome_t_u[$i] ++; + } + } + $tags_map_number++; + } + if ($temp[6]>0) { + $aline=~/:([\d|_]+)_x(\d+)/; + my @lng=split/_/,$1; + for (my $i=0;$i<@lng;$i++) { + if ($lng[$i]>0) { + $genome_r_m[$i] +=$lng[$i]; + $genome_t_m[$i] ++; + } + } + for (my $i=0;$i<$temp[6] ;$i++) { + my $next=; + } + $tags_map_number++; + } +} +close IN; +#

3.1 Reads count

+# +# +print OUT "

3. genome mapping result

+
+ +\n +"; +foreach (@marks) { + print OUT "\n"; +} +print OUT " + + +"; +for (my $i=0;$i<@genome_r_u ;$i++) { + print OUT "\n"; +} + +print OUT " + + +"; +for (my $i=0;$i<@genome_t_u ;$i++) { + print OUT "\n"; +} + +print OUT " + + +"; +for (my $i=0;$i<@genome_r_m ;$i++) { + print OUT "\n"; +} + +print OUT " + + +"; +for (my $i=0;$i<@genome_t_m ;$i++) { + print OUT "\n"; +} + +print OUT "\n
Map $_
Uniq Map Reads No. $genome_r_u[$i]
Uniq Map Tags No. $genome_t_u[$i]
Multiple Map Reads No. $genome_r_m[$i]
Multiple Map Tags No. $genome_t_m[$i]
+

Note:
The genome mapping results is: $genomepath"; +print OUT "genome_mapped.bwt

"; + +my $cluster="$clusterpath/sample_reads.cluster"; +my $cluster_number=`less $cluster |wc -l `; +$cluster_number=$cluster_number-1; +my (%cluster_length,@exp,@rpkm); +my @exp_range=qw(0 \(0--10] \(10--100] \(100--1000] \(1000--10000] \(10000--100000] \(100000--**\)); +my @rpkm_range=qw(0 \(0--0.25] \(0.25--0.5] \(0.5--1] \(1.0-5.0] \(5--10] \(10--50] \(50--100] \(100--500] \(500--1000] \(1000--**]); + +open IN,"<$cluster"; +while (my $aline=) { + next if($aline=~/^\"/); + chomp $aline; + my @temp=split/\t/,$aline; + my @id=split/:|-/,$temp[0]; + $cluster_length{$id[2]-$id[1]+1}++; + for (my $i=0;$i<@marks ;$i++) { + if ($temp[$i+3] == 0) {$exp[0][$i]++;} + elsif ($temp[$i+3]>0 && $temp[$i+3]<= 10 ){$exp[1][$i]++;} + elsif ($temp[$i+3]>10 && $temp[$i+3]<=100){$exp[2][$i]++;} + elsif ($temp[$i+3]>100 && $temp[$i+3]<=1000){$exp[3][$i]++;} + elsif ($temp[$i+3]>1000 && $temp[$i+3]<=10000){$exp[4][$i]++;} + elsif ($temp[$i+3]>10000 && $temp[$i+3]<=100000){$exp[5][$i]++;} + elsif ($temp[$i+3]>100000){$exp[6][$i]++;} + } +} +close IN; + +my $cluster_rpkm="$clusterpath/sample_rpkm.cluster"; +open IN,"<$cluster_rpkm"; +while (my $aline=) { + next if($aline=~/^\#/); + chomp $aline; + my @temp=split/\t/,$aline; + for (my $i=0;$i<@marks ;$i++) { + if ($temp[$i+3]==0) {$rpkm[0][$i]++;} + elsif($temp[$i+3]>0 && $temp[$i+3]<=0.25){$rpkm[1][$i]++;} + elsif($temp[$i+3]>0.25 && $temp[$i+3]<=0.5){$rpkm[2][$i]++;} + elsif($temp[$i+3]>0.5 && $temp[$i+3]<=1){$rpkm[3][$i]++;} + elsif($temp[$i+3]>1 && $temp[$i+3]<=5){$rpkm[4][$i]++;} + elsif($temp[$i+3]>5 && $temp[$i+3]<=10){$rpkm[5][$i]++;} + elsif($temp[$i+3]>10 && $temp[$i+3]<=50){$rpkm[6][$i]++;} + elsif($temp[$i+3]>50 && $temp[$i+3]<=100){$rpkm[7][$i]++;} + elsif($temp[$i+3]>100 && $temp[$i+3]<=500){$rpkm[8][$i]++;} + elsif($temp[$i+3]>500 && $temp[$i+3]<=1000){$rpkm[9][$i]++;} + else{$rpkm[10][$i]++;} + } +} + +close IN; + +my $cluster_length_file="$clusterpath/cluster_length.txt"; +open LEN,">$cluster_length_file"; +print LEN "\#length\tcluster_number\n"; +foreach my $key (sort keys %cluster_length) { + print LEN "$key\t$cluster_length{$key}\n"; +} +close LEN; +print OUT "

4. cluster result

+

4.1 Cluster count

+ + + + + + + + + +\n
Merged samples
Tags number$tags_map_number
Cluster number$cluster_number
+"; + +print OUT "

4.2 Cluster length

+

Note:
The clusters length data: length file +

+"; +print OUT "

4.3 Quantify

+ + +\n +"; +foreach (@marks) { + print OUT "\n"; +} +for (my $i=0;$i<@exp_range;$i++) { + print OUT " + + + "; + for (my $j=0;$j<@marks ;$j++) { + if (!(defined($exp[$i][$j]))) { + print OUT "\n"; + } + else{print OUT "\n";} + } +} +print OUT "\n
Reads Range $_
$exp_range[$i] 0 $exp[$i][$j]
"; + +print OUT "\n + +\n +"; +foreach (@marks) { + print OUT "\n"; +} +for (my $i=0;$i<@rpkm_range;$i++) { + print OUT " + + + "; + for (my $j=0;$j<@marks ;$j++) { + if (!(defined($rpkm[$i][$j]))) { + print OUT "\n"; + } + else{print OUT "\n";} + } +} +print OUT "\n
RPKM Range $_
$rpkm_range[$i] 0 $rpkm[$i][$j]
"; + +my $annotate="$annotatepath/sample_c_p.anno"; +my (%posit,%repeat,%nat1,%nat2); +my (@phase,@long,@repeat,@nat); +for (my $j=0;$j<@marks ;$j++) { + $phase[$j]=0; + $long[$j]=0; + $repeat[$j]=0; + $nat[$j]=0; +} + +my $class_anno=1; +open ANNO,"<$annotate"; +while (my $aline=) { + chomp $aline; + my @temp=split/\t/,$aline; + if($aline=~/^\#/){ + if (@temp != 10+@marks) { + $class_anno=0; + } + next; + } + for (my $i=3+@marks+$class_anno;$i<@temp;$i++) { + my @posit=split/\;/,$temp[$i]; + for (my $j=0;$j<@marks ;$j++) { + if ($temp[3+$j]>0) { + $posit{$posit[0]}[$j]++; + } + else{ + if (!(defined($posit{$posit[0]}[$j]))) { + $posit{$posit[0]}[$j]=0; + } + } + } + } + if ($class_anno) { + for (my $j=0;$j<@marks ;$j++) { + if ($temp[3+$j]>0) { + if ($temp[6] eq "phase") { + $phase[$j]++; + } + if ($temp[7] eq "long") { + $long[$j]++; + } + if ($temp[8] ne "\/") { + $repeat[$j]++; + my @rr=split/\;/,$temp[8]; + foreach (@rr) { + $repeat{$_}[$j]++; + } + } + if ($temp[9] ne "\/") { + $nat[$j]++; + my @nn1=split/\;/,$temp[9]; + my @nn2=split/\;/,$temp[10]; + for (my $k=0;$k<@nn1 ;$k++) { + $nat1{$nn1[$k]}[$j]++; + $nat2{$nn2[$k]}[$j]++; + } + } + } + } + } +} +close ANNO; + +print OUT "

5. Cluster Annotate

+

5.1 Cluster genome position annotate

+ + +\n +"; + +foreach (@marks) { + print OUT "\n"; +} +foreach my $key (sort keys %posit) { + print OUT " + + + "; + foreach (@{$posit{$key}}) { + print OUT "\n"; + } +} +print OUT "\n
clusters number $_
$key $_
"; +print OUT "

+Note:
+One cluster mybe annotate to multiple genes
+"; + +if ($class_anno) { + print OUT "

5.2 Cluster source mechanism annotate

+ + + \n + "; + + foreach (@marks) { + print OUT "\n"; + } + print OUT " + + \n + "; + foreach (@phase) { + print OUT "\n"; + } + + print OUT " + + \n + "; + foreach (@long) { + print OUT "\n"; + } + + print OUT " + + \n + "; + foreach (@repeat) { + print OUT "\n"; + } + + print OUT " + + \n + "; + foreach (@nat) { + print OUT "\n"; + } + print OUT "\n
clusters number $_
Phase $_
Long $_
Repeat $_
Nat $_
"; + + print OUT "

+ Repeat subclass annotate: + "; + + print OUT " + + \n + "; + foreach (@marks) { + print OUT "\n"; + } + + foreach my $key (sort keys %repeat) { + print OUT " + + + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($repeat{$key}[$i])) { + print OUT "\n"; + } + else{print OUT "\n";} + } + } + print OUT "\n
Repeat subclass $_
$key $repeat{$key}[$i] 0
"; + + + print OUT "

+ Nat subclass1 annotate: + "; + + print OUT " + + \n + "; + foreach (@marks) { + print OUT "\n"; + } + foreach my $key (sort keys %nat1) { + print OUT " + + + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($nat1{$key}[$i])) { + print OUT "\n"; + } + else{print OUT "\n";} + } + } + print OUT "\n
Nat subclass1 $_
$key $nat1{$key}[$i] 0
"; + + print OUT "

+ Nat subclass2 annotate: + "; + + print OUT " + + \n + "; + foreach (@marks) { + print OUT "\n"; + } + foreach my $key (sort keys %nat2) { + print OUT " + + + "; + for (my $i=0;$i<@marks ;$i++) { + if (defined($nat2{$key}[$i])) { + print OUT "\n"; + } + else{print OUT "\n";} + } + } + print OUT "\n
Nat subclass2 $_
$key $nat2{$key}[$i] 0
"; + print OUT "

+ Note:
+ One cluster mybe annotate to multiple repeats or nats
+ "; +} +else { + print OUT "

5.2 Cluster source mechanism annotate

+
Do not do source mechanism annotate
"; + +} + +print OUT "

6. Graph of Clusters of all samples

\n"; + +my $plot=$plotpath."cluster.html"; +open IN,"<$plot"; +while (my $aline=) { + chomp $aline; + print OUT "$aline\n"; +} +close IN; + + +if ($deg_tag) { + my $deg_file=`ls $degpath`; + chomp $deg_file; + my @deg_file=split/\n/,$deg_file; + my %deg; + foreach (@deg_file) { + my $output="$degpath/$_/output_score.txt"; + open IN,"<$output"; + $deg{$_}[0]=0; + $deg{$_}[1]=0; + $deg{$_}[2]=0; + while (my $aline=) { + next if ($aline=~/^\"/); + chomp $aline; + my @temp=split/\t/,$aline; + if ($temp[9] eq "TRUE") { + $deg{$_}[0]++; + if ($temp[4] >0) { + $deg{$_}[1]++; + } + if ($temp[4] <0) { + $deg{$_}[2]++; + } + } + } + close IN; + } + + print OUT "

7. DEG

+ + + \n + \n + \n + \n + "; + + foreach my $key (sort keys %deg) { + print OUT " + + + "; + for (my $i=0;$i<@{$deg{$key}} ;$i++) { + print OUT "\n"; + } + } + print OUT "\n
Genes number DEG UP DOWN
$key $deg{$key}[$i]
"; +} +else{ + print OUT "

7. DEG

+
Do not do DE clusters
"; +} + +print OUT " + + +"; +close OUT; + + + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -o +options: +-i +-format +-o output file +-h help +USAGE +exit(1); +} diff -r f008ab2cadc6 -r 7b5a48b972e9 install_DEG.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/install_DEG.R Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,2 @@ +source("http://bioconductor.org/biocLite.R") +biocLite("DEGseq") diff -r f008ab2cadc6 -r 7b5a48b972e9 miRDeep_plant.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/miRDeep_plant.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,1622 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use Getopt::Std; +#use RNA; + + +################################# MIRDEEP ################################################# + +################################## USAGE ################################################## + + +my $usage= +"$0 file_signature file_structure temp_out_directory + +This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with +information on the positions of reads aligned to potential precursor sequences (signature). +It also takes as input an RNAfold output file, giving information on the sequence, structure +and mimimum free energy of the potential precursor sequences. + +Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA +sequences that should be considered for conservation purposes. -t prints out the potential +precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that +exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors +that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences +obtained through conventional cloning. -z consider the number of base pairings in the lower +stems (this option is not well tested). + +-h print this usage +-s fasta file with known miRNAs +#-o temp directory ,maked befor running the program. +-t print filtered +-u limited output (only ids) +-v cut-off (default 1) +-x sensitive option for Sanger sequences +-y use Randfold +-z consider Drosha processing +"; + + + + + +############################################################################################ + +################################### INPUT ################################################## + + +#signature file in blast_parsed format +my $file_blast_parsed=shift or die $usage; + +#structure file outputted from RNAfold +my $file_struct=shift or die $usage; + +my $tmpdir=shift or die $usage; +#options +my %options=(); +getopts("hs:tuv:xyz",\%options); + + + + + + +############################################################################################# + +############################# GLOBAL VARIABLES ############################################## + + +#parameters +my $nucleus_lng=11; + +my $score_star=3.9; +my $score_star_not=-1.3; +my $score_nucleus=7.63; +my $score_nucleus_not=-1.17; +my $score_randfold=1.37; +my $score_randfold_not=-3.624; +my $score_intercept=0.3; +my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0); +my $score_min=1; +if($options{v}){$score_min=$options{v};} +if($options{x}){$score_min=-5;} + +my $e=2.718281828; + +#hashes +my %hash_desc; +my %hash_seq; +my %hash_struct; +my %hash_mfe; +my %hash_nuclei; +my %hash_mirs; +my %hash_query; +my %hash_comp; +my %hash_bp; + +#other variables +my $subject_old; +my $message_filter; +my $message_score; +my $lines; +my $out_of_bound; + + + +############################################################################################## + +################################ MAIN ###################################################### + + +#print help if that option is used +if($options{h}){die $usage;} +unless ($tmpdir=~/\/$/) {$tmpdir .="/";} +if(!(-s $tmpdir)){mkdir $tmpdir;} +$tmpdir .="TMP_DIR/"; +mkdir $tmpdir; + +#parse structure file outputted from RNAfold +parse_file_struct($file_struct); + +#if conservation is scored, the fasta file of known miRNA sequences is parsed +if($options{s}){create_hash_nuclei($options{s})}; + +#parse signature file in blast_parsed format and resolve each potential precursor +parse_file_blast_parsed($file_blast_parsed); +#`rm -rf $tmpdir`; +exit; + + + + +############################################################################################## + +############################## SUBROUTINES ################################################### + + + +sub parse_file_blast_parsed{ + +# read through the signature blastparsed file, fills up a hash with information on queries +# (deep sequences) mapping to the current subject (potential precursor) and resolve each +# potential precursor in turn + + my $file_blast_parsed=shift; + + open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n"; + while (my $line=){ + if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){ + my $query=$1; + my $query_lng=$2; + my $query_beg=$3; + my $query_end=$4; + my $subject=$5; + my $subject_lng=$6; + my $subject_beg=$7; + my $subject_end=$8; + my $e_value=$9; + my $pid=$10; + my $bitscore=$11; + my $other=$12; + + #if the new line concerns a new subject (potential precursor) then the old subject must be resolved + if($subject_old and $subject_old ne $subject){ + resolve_potential_precursor(); + } + + #resolve the strand + my $strand=find_strand($other); + + #resolve the number of reads that the deep sequence represents + my $freq=find_freq($query); + + #read information of the query (deep sequence) into hash + $hash_query{$query}{"subject_beg"}=$subject_beg; + $hash_query{$query}{"subject_end"}=$subject_end; + $hash_query{$query}{"strand"}=$strand; + $hash_query{$query}{"freq"}=$freq; + + #save the signature information + $lines.=$line; + + $subject_old=$subject; + } + } + resolve_potential_precursor(); +} + +sub resolve_potential_precursor{ + +# dissects the potential precursor in parts by filling hashes, and tests if it passes the +# initial filter and the scoring filter + +# binary variable whether the potential precursor is still viable + my $ret=1; +#print STDERR ">$subject_old\n"; + + fill_structure(); +#print STDERR "\%hash_bp",scalar keys %hash_bp,"\n"; + fill_pri(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_mature(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_star(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_loop(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_lower_flanks(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + +# do_test_assemble(); + +# this is the actual classification + unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;} + + print_results($ret); + + reset_variables(); + + return; + +} + + + +sub print_results{ + + my $ret=shift; + +# print out if the precursor is accepted and accepted precursors should be printed out +# or if the potential precursor is discarded and discarded potential precursors should +# be printed out + + if((!$options{t} and $ret) or ($options{t} and !$ret)){ + #full output + unless($options{u}){ + if($message_filter){print $message_filter;} + if($message_score){print $message_score;} + print_hash_comp(); + print $lines,"\n\n"; + return; + } + #limited output (only ids) + my $id=$hash_comp{"pri_id"}; + print "$id\n"; + } +} + + + + + + + +sub pass_threshold_score{ + +# this is the scoring + + #minimum free energy of the potential precursor +# my $score_mfe=score_mfe($hash_comp{"pri_mfe"}); + my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"}); + + #count of reads that map in accordance with Dicer processing + my $score_freq=score_freq($hash_comp{"freq"}); +#print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n"; + + #basic score + my $score=$score_mfe+$score_freq; + + #scoring of conserved nucleus/seed (optional) + if($options{s}){ + + #if the nucleus is conserved + if(test_nucleus_conservation()){ + + #nucleus from position 2-8 + my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); + + #resolve DNA/RNA ambiguities + $nucleus=~tr/[T]/[U]/; + + #print score contribution + score_s("score_nucleus\t$score_nucleus"); + + #print the ids of known miRNAs with same nucleus + score_s("$hash_mirs{$nucleus}"); +#print STDERR "score_nucleus\t$score_nucleus\n"; + + #add to score + $score+=$score_nucleus; + + #if the nucleus is not conserved + }else{ + #print (negative) score contribution + score_s("score_nucleus\t$score_nucleus_not"); + + #add (negative) score contribution + $score+=$score_nucleus_not; + } + } + + #if the majority of potential star reads fall as expected from Dicer processing + if($hash_comp{"star_read"}){ + score_s("score_star\t$score_star"); +#print STDERR "score_star\t$score_star\n"; + $score+=$score_star; + }else{ + score_s("score_star\t$score_star_not"); +#print STDERR "score_star_not\t$score_star_not\n"; + $score+=$score_star_not; + } + + #score lower stems for potential for Drosha recognition (highly optional) + if($options{z}){ + my $stem_bp=$hash_comp{"stem_bp"}; + my $score_stem=$scores_stem[$stem_bp]; + $score+=$score_stem; + score_s("score_stem\t$score_stem"); + } + +#print STDERR "score_intercept\t$score_intercept\n"; + + $score+=$score_intercept; + + #score for randfold (optional) + if($options{y}){ + +# only calculate randfold value if it can make the difference between the potential precursor +# being accepted or discarded + if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){ + + #randfold value<0.05 + if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");} + + #randfold value>0.05 + else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");} + } + } + + #round off values to one decimal + my $round_mfe=round($score_mfe*10)/10; + my $round_freq=round($score_freq*10)/10; + my $round=round($score*10)/10; + + #print scores + score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round"); + + #return 1 if the potential precursor is accepted, return 0 if discarded + unless($score>=$score_min){return 0;} + return 1; +} + +sub test_randfold{ + + #print sequence to temporary file, test randfold value, return 1 or 0 + +# print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); + #my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; + #open(FILE, ">$tmpfile"); + #print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; + #close FILE; + +# my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; + #my $p1=`randfold -s $tmpfile 999 | cut -f 3`; + #my $p2=`randfold -s $tmpfile 999 | cut -f 3`; + my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999); + my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999); + my $p_value=($p1+$p2)/2; + wait; +# system "rm $tmpfile"; + + if($p_value<=0.05){return 1;} + + return 0; +} + +sub randfold_pvalue{ + my $cpt_sup = 0; + my $cpt_inf = 0; + my $cpt_ega = 1; + + my ($seq,$number_of_randomizations)=@_; + #my $str =$seq; + #my $mfe = RNA::fold($seq,$str); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $str=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + + for (my $i=0;$i<$number_of_randomizations;$i++) { + $seq = shuffle_sequence_dinucleotide($seq); + #$str = $seq; + + #my $rand_mfe = RNA::fold($str,$str); + $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $str=$rawfolds[1]; + my $rand_mfe=$rawfolds[-1]; + $rand_mfe=~s/\(//; + $rand_mfe=~s/\)//; + + if ($rand_mfe < $mfe) { + $cpt_inf++; + } + if ($rand_mfe == $mfe) { + $cpt_ega++; + } + if ($rand_mfe > $mfe) { + $cpt_sup++; + } + } + + my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1); + + #print "$name\t$mfe\t$proba\n"; + return $proba; +} + +sub shuffle_sequence_dinucleotide { + + my ($str) = @_; + + # upper case and convert to ATGC + $str = uc($str); + $str =~ s/U/T/g; + + my @nuc = ('A','T','G','C'); + my $count_swap = 0; + # set maximum number of permutations + my $stop = length($str) * 10; + + while($count_swap < $stop) { + + my @pos; + + # look start and end letters + my $firstnuc = $nuc[int(rand 4)]; + my $thirdnuc = $nuc[int(rand 4)]; + + # get positions for matching nucleotides + for (my $i=0;$i<(length($str)-2);$i++) { + if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) { + push (@pos,($i+1)); + $i++; + } + } + + # swap at random trinucleotides + my $max = scalar(@pos); + for (my $i=0;$i<$max;$i++) { + my $swap = int(rand($max)); + if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) { + $count_swap++; + my $w1 = substr($str,$pos[$i],1); + my $w2 = substr($str,$pos[$swap],1); + substr($str,$pos[$i],1,$w2); + substr($str,$pos[$swap],1,$w1); + } + } + } + return($str); +} + +sub test_nucleus_conservation{ + + #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 + + my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); + $nucleus=~tr/[T]/[U]/; + if($hash_nuclei{$nucleus}){return 1;} + + return 0; +} + + + +sub pass_filtering_initial{ + + #test if the structure forms a plausible hairpin + unless(pass_filtering_structure()){filter_p("structure problem"); return 0;} + + #test if >90% of reads map to the hairpin in consistence with Dicer processing + unless(pass_filtering_signature()){filter_p("signature problem");return 0;} + + return 1; + +} + + +sub pass_filtering_signature{ + + #number of reads that map in consistence with Dicer processing + my $consistent=0; + + #number of reads that map inconsistent with Dicer processing + my $inconsistent=0; + +# number of potential star reads map in good consistence with Drosha/Dicer processing +# (3' overhangs relative to mature product) + my $star_perfect=0; + +# number of potential star reads that do not map in good consistence with 3' overhang + my $star_fuzzy=0; + + + #sort queries (deep sequences) by their position on the hairpin + my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query; + + foreach my $query(@queries){ + + #number of reads that the deep sequence represents + unless(defined($hash_query{$query}{"freq"})){next;} + my $query_freq=$hash_query{$query}{"freq"}; + + #test which Dicer product (if any) the deep sequence corresponds to + my $product=test_query($query); + + #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable + if($product){$consistent+=$query_freq;} + + #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable + else{$inconsistent+=$query_freq;} + + #test a potential star sequence has good 3' overhang + if($product eq "star"){ + if(test_star($query)){$star_perfect+=$query_freq;} + else{$star_fuzzy+=$query_freq;} + } + } + +# if the majority of potential star sequences map in good accordance with 3' overhang +# score for the presence of star evidence + if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;} + + #total number of reads mapping to the hairpin + my $freq=$consistent+$inconsistent; + $hash_comp{"freq"}=$freq; + unless($freq>0){filter_s("read frequency too low"); return 0;} + + #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded + my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent); + unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;} + + #the hairpin is retained + return 1; +} + +sub test_star{ + + #test if a deep sequence maps in good consistence with 3' overhang + + my $query=shift; + + #5' begin and 3' end positions + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + + #the difference between observed and expected begin positions must be 0 or 1 + my $offset=$beg-$hash_comp{"star_beg"}; + if($offset==0 or $offset==1 or $offset==-1){return 1;} + + return 0; +} + + + +sub test_query{ + + #test if deep sequence maps in consistence with Dicer processing + + my $query=shift; + + #begin, end, strand and read count + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + my $strand=$hash_query{$query}{"strand"}; + my $freq=$hash_query{$query}{"freq"}; + + #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs) + if($strand eq '-'){return 0;} + + #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end + my $fuzz_beg=2; + #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end + my $fuzz_end=2; + + #if in accordance with Dicer processing, return the type of Dicer product + if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";} + if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";} + if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";} + + #if not in accordance, return 0 + return 0; +} + + +sub pass_filtering_structure{ + + #The potential precursor must form a hairpin with miRNA precursor-like characteristics + + #return value + my $ret=1; + + #potential mature, star, loop and lower flank parts must be identifiable + unless(test_components()){return 0;} + + #no bifurcations + unless(no_bifurcations_precursor()){$ret=0;} + + #minimum 14 base pairings in duplex + unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");} + + #not more than 6 nt difference between mature and star length + unless(-60 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + #no overlap between the mature and the star sequence + if($hash_comp{"mature_arm"} eq "first"){ + ($hash_comp{"mature_end"}<$beg) or return 0; + }elsif($hash_comp{"mature_arm"} eq "second"){ + ($end<$hash_comp{"mature_beg"}) or return 0; + } + + #there should be no bifurcations and minimum one base pairing + my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+"); + if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){ + return "first"; + }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){ + return "second"; + } + return 0; +} + + +sub test_loop{ + + #tests the loop positions + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + return 1; +} + + +sub test_flanks{ + + #tests the positions of the lower flanks + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + return 1; +} + + +sub comp{ + + #subroutine to retrive from the 'comp' hash + + my $type=shift; + my $component=$hash_comp{$type}; + return $component; +} + + +sub find_strand_query{ + + #subroutine to find the strand for a given query + + my $query=shift; + my $strand=$hash_query{$query}{"strand"}; + return $strand; +} + + +sub find_positions_query{ + + #subroutine to find the begin and end positions for a given query + + my $query=shift; + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + return ($beg,$end); +} + + + +sub find_mature_query{ + + #finds the query with the highest frequency of reads and returns it + #is used to determine the positions of the potential mature sequence + + my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query; + my $mature_query=$queries[0]; + return $mature_query; +} + + + + +sub reset_variables{ + + #resets the hashes for the next potential precursor + +# %hash_query=(); +# %hash_comp=(); +# %hash_bp=(); + foreach my $key (keys %hash_query) {delete($hash_query{$key});} + foreach my $key (keys %hash_comp) {delete($hash_comp{$key});} + foreach my $key (keys %hash_bp) {delete($hash_bp{$key});} + +# $message_filter=(); +# $message_score=(); +# $lines=(); + undef($message_filter); + undef($message_score); + undef($lines); + return; +} + + + +sub excise_seq{ + + #excise sub sequence from the potential precursor + + my($seq,$beg,$end,$strand)=@_; + + #begin can be equal to end if only one nucleotide is excised + unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} + + #rarely, permuted combinations of signature and structure cause out of bound excision errors. + #this happens once appr. every two thousand combinations + unless($beg<=length($seq)){$out_of_bound++;return 0;} + + #if on the minus strand, the reverse complement should be excised + if($strand eq "-"){$seq=revcom($seq);} + + #the blast parsed format is 1-indexed, substr is 0-indexed + my $sub_seq=substr($seq,$beg-1,$end-$beg+1); + + return $sub_seq; + +} + +sub excise_struct{ + + #excise sub structure + + my($struct,$beg,$end,$strand)=@_; + my $lng=length $struct; + + #begin can be equal to end if only one nucleotide is excised + unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} + + #rarely, permuted combinations of signature and structure cause out of bound excision errors. + #this happens once appr. every two thousand combinations + unless($beg<=length($struct)){return 0;} + + #if excising relative to minus strand, positions are reversed + if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);} + + #the blast parsed format is 1-indexed, substr is 0-indexed + my $sub_struct=substr($struct,$beg-1,$end-$beg+1); + + return $sub_struct; +} + + +sub create_hash_nuclei{ + #parses a fasta file with sequences of known miRNAs considered for conservation purposes + #reads the nuclei into a hash + + my ($file) = @_; + my ($id, $desc, $sequence, $nucleus) = (); + + open (FASTA, "<$file") or die "can not open $file\n"; + while () + { + chomp; + if (/^>(\S+)(.*)/) + { + $id = $1; + $desc = $2; + $sequence = ""; + $nucleus = ""; + while (){ + chomp; + if (/^>(\S+)(.*)/){ + $nucleus = substr($sequence,1,$nucleus_lng); + $nucleus =~ tr/[T]/[U]/; + $hash_mirs{$nucleus} .="$id\t"; + $hash_nuclei{$nucleus} += 1; + + $id = $1; + $desc = $2; + $sequence = ""; + $nucleus = ""; + next; + } + $sequence .= $_; + } + } + } + $nucleus = substr($sequence,1,$nucleus_lng); + $nucleus =~ tr/[T]/[U]/; + $hash_mirs{$nucleus} .="$id\t"; + $hash_nuclei{$nucleus} += 1; + close FASTA; +} + + +sub parse_file_struct{ + #parses the output from RNAfoldand reads it into hashes + my($file) = @_; + my($id,$desc,$seq,$struct,$mfe) = (); + open (FILE_STRUCT, "<$file") or die "can not open $file\n"; + while (){ + chomp; + if (/^>(\S+)\s*(.*)/){ + $id= $1; + $desc= $2; + $seq= ""; + $struct= ""; + $mfe= ""; + while (){ + chomp; + if (/^>(\S+)\s*(.*)/){ + $hash_desc{$id} = $desc; + $hash_seq{$id} = $seq; + $hash_struct{$id} = $struct; + $hash_mfe{$id} = $mfe; + $id = $1; + $desc = $2; + $seq = ""; + $struct = ""; + $mfe = ""; + next; + } + if(/^\w/){ + tr/uU/tT/; + $seq .= $_; + next; + } + if(/((\.|\(|\))+)/){$struct .=$1;} + if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;} + } + } + } + $hash_desc{$id} = $desc; + $hash_seq{$id} = $seq; + $hash_struct{$id} = $struct; + $hash_mfe{$id} = $mfe; + close FILE_STRUCT; + return; +} + + +sub score_s{ + + #this score message is appended to the end of the string of score messages outputted for the potential precursor + + my $message=shift; + $message_score.=$message."\n";; + return; +} + + + +sub score_p{ + + #this score message is appended to the beginning of the string of score messages outputted for the potential precursor + + my $message=shift; + $message_score=$message."\n".$message_score; + return; +} + + + +sub filter_s{ + + #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor + + my $message=shift; + $message_filter.=$message."\n"; + return; +} + + +sub filter_p{ + + #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor + + my $message=shift; + if(defined $message_filter){$message_filter=$message."\n".$message_filter;} + else{$message_filter=$message."\n";} + return; +} + + +sub find_freq{ + + #finds the frequency of a given read query from its id. + + my($query)=@_; + + if($query=~/x(\d+)/i){ + my $freq=$1; + return $freq; + }else{ + #print STDERR "Problem with read format\n"; + return 0; + } +} + + +sub print_hash_comp{ + + #prints the 'comp' hash + + my @keys=sort keys %hash_comp; + foreach my $key(@keys){ + my $value=$hash_comp{$key}; + print "$key \t$value\n"; + } +} + + + +sub print_hash_bp{ + + #prints the 'bp' hash + + my @keys=sort {$a<=>$b} keys %hash_bp; + foreach my $key(@keys){ + my $value=$hash_bp{$key}; + print "$key\t$value\n"; + } + print "\n"; +} + + + +sub find_strand{ + + #A subroutine to find the strand, parsing different blast formats + + my($other)=@_; + + my $strand="+"; + + if($other=~/-/){ + $strand="-"; + } + + if($other=~/minus/i){ + $strand="-"; + } + return($strand); +} + + +sub contained{ + + #Is the stretch defined by the first positions contained in the stretch defined by the second? + + my($beg1,$end1,$beg2,$end2)=@_; + + testbeginend($beg1,$end1,$beg2,$end2); + + if($beg2<=$beg1 and $end1<=$end2){ + return 1; + }else{ + return 0; + } +} + + +sub testbeginend{ + + #Are the beginposition numerically smaller than the endposition for each pair? + + my($begin1,$end1,$begin2,$end2)=@_; + + unless($begin1<=$end1 and $begin2<=$end2){ + print STDERR "beg can not be larger than end for $subject_old\n"; + exit; + } +} + + +sub rev_pos{ + +# The blast_parsed format always uses positions that are relative to the 5' of the given strand +# This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with +# the n't nucleotide on the plus strand + +# This subroutine reverses the begin and end positions of positions of the minus strand so that they +# are relative to the 5' end of the plus strand + + my($beg,$end,$lng)=@_; + + my $new_end=$lng-$beg+1; + my $new_beg=$lng-$end+1; + + return($new_beg,$new_end); +} + +sub round { + + #rounds to nearest integer + + my($number) = shift; + return int($number + .5); + +} + + +sub rev{ + + #reverses the order of nucleotides in a sequence + + my($sequence)=@_; + + my $rev=reverse $sequence; + + return $rev; +} + +sub com{ + + #the complementary of a sequence + + my($sequence)=@_; + + $sequence=~tr/acgtuACGTU/TGCAATGCAA/; + + return $sequence; +} + +sub revcom{ + + #reverse complement + + my($sequence)=@_; + + my $revcom=rev(com($sequence)); + + return $revcom; +} + + +sub max2 { + + #max of two numbers + + my($a, $b) = @_; + return ($a>$b ? $a : $b); +} + +sub min2 { + + #min of two numbers + + my($a, $b) = @_; + return ($a<$b ? $a : $b); +} + + + +sub score_freq{ + +# scores the count of reads that map to the potential precursor +# Assumes geometric distribution as described in methods section of manuscript + + my $freq=shift; + + #parameters of known precursors and background hairpins + my $parameter_test=0.999; + my $parameter_control=0.6; + + #log_odds calculated directly to avoid underflow + my $intercept=log((1-$parameter_test)/(1-$parameter_control)); + my $slope=log($parameter_test/$parameter_control); + my $log_odds=$slope*$freq+$intercept; + + #if no strong evidence for 3' overhangs, limit the score contribution to 0 + unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);} + + return $log_odds; +} + + + +##sub score_mfe{ + +# scores the minimum free energy in kCal/mol of the potential precursor +# Assumes Gumbel distribution as described in methods section of manuscript + +## my $mfe=shift; + + #numerical value, minimum 1 +## my $mfe_adj=max2(1,-$mfe); + + #parameters of known precursors and background hairpins, scale and location +## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); +## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); + +## my $odds=$prob_test/$prob_background; +## my $log_odds=log($odds); + +## return $log_odds; +##} + +sub score_mfe{ +# use bignum; + +# scores the minimum free energy in kCal/mol of the potential precursor +# Assumes Gumbel distribution as described in methods section of manuscript + + my ($mfe,$mlng)=@_; + + #numerical value, minimum 1 + my $mfe_adj=max2(1,-$mfe); +my $mfe_adj1=$mfe/$mlng; + #parameters of known precursors and background hairpins, scale and location + my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; + my $ev=$e**($mfe_adj1*$c); + #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; + my $log_odds=($a/($b+$ev)); + + + my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); + my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); + + my $odds=$prob_test/$prob_background; + my $log_odds_2=log($odds); + #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; + return $log_odds; +} + + + +sub prob_gumbel_discretized{ + +# discretized Gumbel distribution, probabilities within windows of 1 kCal/mol +# uses the subroutine that calculates the cdf to find the probabilities + + my ($var,$scale,$location)=@_; + + my $bound_lower=$var-0.5; + my $bound_upper=$var+0.5; + + my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location); + my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location); + + my $prob=$cdf_upper-$cdf_lower; + + return $prob; +} + + +sub cdf_gumbel{ + +# calculates the cumulative distribution function of the Gumbel distribution + + my ($var,$scale,$location)=@_; + + my $cdf=$e**(-($e**(-($var-$location)/$scale))); + + return $cdf; +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 miRNA_Express_and_sequence.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/miRNA_Express_and_sequence.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,173 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-6-4 +#Modified: +#Description: solexa miRNA express and sequence +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","list=s","fa=s","pre=s","tag=s","h"); +if (!(defined $opts{i} and defined $opts{list} and defined $opts{fa} and defined $opts{pre} and defined $opts{tag}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'list'}; +my $out=$opts{'fa'}; +my $preout=$opts{'pre'}; + +=cut +my %hash_pri; +open PRI,"<$opts{p}"; +while (my $aline=) { + chomp $aline; + if($aline=~/^>(\S+)/){$hash_pri{$1}=$aline;} +} +close PRI; +=cut + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file +open FA ,">$out"; +open PRE,">$preout"; + +print OUT "#ID\tcoordinate\tpos1\tpos2"; +my @marks=split/\,/,$opts{'tag'}; +foreach (@marks) { + print OUT "\t",$_,"_matureExp"; +} +foreach (@marks) { + print OUT "\t",$_,"_starExp"; +} +foreach (@marks) { + print OUT "\t",$_,"_totalExp"; +} + +print OUT "\n"; + +my (%uniq_id,$novel); +while (my $aline=) { + chomp $aline; + until ($aline =~ /^score\s+[-\d\.]+/){ + $aline = ; + if (eof) {last;} + } + if (eof) {last;} +########## miRNA ID ################ + $novel++; +########### annotate#################### + do {$aline=;} until($aline=~/flank_first_end/) ; + chomp $aline; + my @flank1=split/\t/,$aline; + do {$aline=;} until($aline=~/flank_second_beg/) ; + chomp $aline; + my @flank2=split/\t/,$aline; +# +########## mature start loop pre #### + do {$aline=;} until($aline=~/mature_beg/) ; + chomp $aline; + my @start=split/\t/,$aline; +# $start[1] -=$flank1[1]; + do {$aline=;} until($aline=~/mature_end/) ; + chomp $aline; + my @end=split/\t/,$aline; +# $end[1] -=$flank1[1]; + do {$aline=;} until($aline=~/mature_seq/) ; + chomp $aline; + my @arr1=split/\t/,$aline; + do {$aline=;} until($aline=~/pre_seq/) ; + chomp $aline; + my @arr2=split/\t/,$aline; + do {$aline=;} until($aline=~/pri_id/) ; + chomp $aline; + my @pri_id=split/\t/,$aline; + do {$aline=;} until($aline=~/pri_seq/) ; + chomp $aline; + my @pri_seq=split/\t/,$aline; + do {$aline=;} until($aline=~/star_beg/) ; + chomp $aline; + my @star_start=split/\t/,$aline; +# $star_start[1] -=$flank1[1]; + do {$aline=;} until($aline=~/star_end/) ; + chomp $aline; + my @star_end=split/\t/,$aline; +# $star_end[1] -=$flank1[1]; + do {$aline=;} until($aline=~/star_seq/) ; + chomp $aline; + my @arr3=split/\t/,$aline; + print OUT "miR-c-$novel\t$pri_id[1]\tmature:$start[1]:$end[1]\tstar:$star_start[1]:$star_end[1]\t"; + #print OUT "$arr1[1]\t$arr3[1]\t$arr2[1]\t\/\t"; + print FA ">miR-c-$novel\n$arr1[1]\n"; + print PRE ">miR-c-$novel\n$pri_seq[1]\n"; +########## reads count ############# + ; + my @count1;my @count2;my @count3;my @count4; + $aline=; + do { + chomp $aline; + my @reads=split/\t/,$aline; + my @pos=(); + $reads[5]=~/(\d+)\.\.(\d+)/; +# $pos[0] =$1-$flank1[1]; +# $pos[1] =$2-$flank1[1]; + $pos[0]=$1; + $pos[1]=$2; + $reads[0]=~/:([\d|_]+)_x(\d+)$/; + my @ss=split/_/,$1; + for (my $i=0;$i<@ss ;$i++) { + if (!(defined $count3[$i])) { + $count3[$i]=0; + } + if (!(defined $count4[$i])) { + $count4[$i]=0; + } + $count2[$i]+=$ss[$i]; + + } +# $count3 +=$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 ); +# $count4 +=$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 ); +# $count1 =$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 && $count1<$1); +# $count2 =$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 && $count2<$1); + if($end[1]-$pos[1]>=-5 && $end[1]-$pos[1]<=5 && $pos[0]-$start[1]>=-3 && $pos[0]-$start[1]<=3 ) + { + for (my $i=0;$i<@ss;$i++) { + $count3[$i]+=$ss[$i]; + } + } + if($star_end[1]-$pos[1]<=5 && $star_end[1]-$pos[1]>=-5 && $pos[0]-$star_start[1]>=-3 && $pos[0]-$star_start[1]<=3){ + for (my $i=0;$i<@ss;$i++) { + $count4[$i]+=$ss[$i]; + } + } + $aline=; + chomp $aline; + } until(length $aline < 1) ; + $"="\t"; + print OUT "@count3\t@count4\t@count2\n"; + $"=" "; +} + +close IN; +close OUT; + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -list -fa -pre -tag +options: +-i input file,predictions file +-list output file miRNA list file +-fa output file ,miRNA sequence fasta file. +-pre output file, miRNA precursor fasta file. +-tag string, sample names# eg: samA,samB,samC +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 miRPlant.pl --- a/miRPlant.pl Wed Dec 03 02:03:27 2014 -0500 +++ b/miRPlant.pl Fri Dec 05 00:11:02 2014 -0500 @@ -17,7 +17,7 @@ #use Term::ANSIColor; my %opts; -GetOptions(\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h"); +GetOptions(\%opts,"i:s@","tag:s@","phred:i","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h"); if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments &usage; } @@ -33,7 +33,9 @@ } my $phred_qv=64; - +if (defined $opts{'phred'}) { + $phred_qv=$opts{'phred'}; +} my @inputfiles=@{$opts{'i'}}; my @inputtags=@{$opts{'tag'}}; diff -r f008ab2cadc6 -r 7b5a48b972e9 miRPlant.xml --- a/miRPlant.xml Wed Dec 03 02:03:27 2014 -0500 +++ b/miRPlant.xml Fri Dec 05 00:11:02 2014 -0500 @@ -15,10 +15,6 @@ miRPlant.pl ## Change this to accommodate the number of threads you have available. -t \${GALAXY_SLOTS:-4} - ## Do or not delet rfam mapped tags - #if $params.delet_rfam == "yes": - -D - #end if -path \$SCRIPT_PATH #for $j, $s in enumerate( $series ) @@ -27,7 +23,43 @@ -tag ${s.tag} #end for - -format $format -gfa $gfa -pre $pre -mat $mat -rfam $rfam -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe > run.log + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + + ## Do or not annotate rfam non-miRNA RNAs + #if $params.annotate_rfam == "yes": + + ## prepare Rfam bowtie index + #set rfam_index_path = '' + #if str($params.annotate_rfam.reference_rfam.source) == "history": + bowtie-build "$params.annotate_rfam.reference_rfam.own_file" rfam; ln -s "$params.annotate_rfam.reference_rfam.own_file" rfam.fa; + #set rfam_index_path = 'rfam' + #else: + #set rfam_index_path = $params.annotate_rfam.reference_rfam.index.fields.path + #end if + + -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v + ## Do or not delet rfam mapped tags + #if $params.annotate_rfam.rfamresult.delet_rfam == "yes": + -D + #end if + #end if + + + ## Do or not annotate known microRNAs + #if $params.known_microRNA == "yes": + -pre $pre -mat $mat + #end if + + + -format $format -gfa ${index_path}.fa -idx $index_path -phred $phred -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe > run.log @@ -37,25 +69,136 @@ + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -64,7 +207,6 @@ - @@ -72,11 +214,21 @@ - - - - - + + (params['known_microRNA'] == 'Yes') + + + (params['known_microRNA'] == 'Yes') + + + (params['known_microRNA'] == 'Yes') + + + (params['known_microRNA'] == 'Yes') + + + (params['known_microRNA'] == 'Yes') + diff -r f008ab2cadc6 -r 7b5a48b972e9 microRNA.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,253 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2014-4-22 +#Modified: +#Description: plant microRNA prediction +my $version=1.00; + +use strict; +use Getopt::Long; +use threads; +#use threads::shared; +use File::Path; +use File::Basename; +#use RNA; +#use Term::ANSIColor; + +my %opts; +GetOptions(\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h"); +if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $time=&Time(); +print "miPlant program start:\n The time is $time!\n"; +print "Command line:\n $0 @ARGV\n"; + +my $mypath=`pwd`; +chomp $mypath; + +my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/"; + + +unless ($dir=~/\/$/) {$dir.="/";} +if (not -d $dir) { + mkdir $dir; +} +my $config=$opts{'i'}; +my $data=$opts{'fa'}; + +my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; + +my $t=1; #threads number +if (defined $opts{'t'}) {$t=$opts{'t'};} + +my $mis=0; #mismatch number for microRNA +if (defined $opts{'mis'}) {$mis=$opts{'mis'};} + +my $hit=25; # maximum reads mapping hits in genome +if (defined $opts{'r'}) {$hit=$opts{'r'};} + +my $upstream = 2; # microRNA 5' extension +$upstream = $opts{'e'} if(defined $opts{'e'}); + +my $downstream = 5;# microRNA 3' extension +$downstream = $opts{'f'} if(defined $opts{'f'}); + +my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200; +my $flank=defined $opts{'flank'} ? $opts{'flank'} :10; +my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20; + +$time=&Time(); +print "$time, Checking input file!\n"; + +my (@filein,@mark); +&read_config(); + +&checkfa($opts{pre}) if(defined $opts{pre}); +&checkfa($opts{mat}) if(defined $opts{mat}); +&checkfa($opts{gfa}); + +chdir $dir; +my $data2=$data; +my $known_result=$dir."known_miRNA_Express"; +if(defined $opts{pre} and defined $opts{mat}){ + &quantify(); ### known microRAN quantify + $data2=$known_result."/mirbase_not_mapped.fa"; +} + +my $genome_map=$dir."genome_match"; +&genome($data2); + +#my $genome_map=&search($dir,"genome_match_"); +my $mapfile=$genome_map."/genome_mapped.bwt"; +my $mapfa=$genome_map."/genome_mapped.fa"; +my $unmap=$genome_map."/genome_not_mapped.fa"; + +#$time=Time(); +#print "$time: Novel microRNA prediction!\n\n"; + +&predict($mapfa); + +$time=Time(); +print "$time: Program end!!\n"; + +############################## sub programs ################################### +sub predict{ + my ($file)=@_; + $time=&Time(); + print "$time: Novel microRNA prediction!\n\n"; + my $predict=$dir."Novel_miRNA_predict"; + mkdir $predict; + chdir $predict; + system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe"); +# print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n"; + + system("bowtie-build -f excised_precursor.fa excised_precursor"); +# print "\nbowtie-build -f excised_precursor.fa excised_precursor\n"; + + system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log"); +# print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n"; + + system("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst"); +# print "\nconvert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n"; + + system("sort -k 4 precursor_mapped.bst > signatures.bst"); +# print "\nsort +3 -25 precursor_mapped.bst > ../signatures.bst\n"; + + chdir $dir; + system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd"); +# print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n"; + #system("rm novel_tmp_dir -rf"); + my $tag=join "," ,@mark; + system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag"); + + system("perl $scipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa"); + +} + +sub genome{ + my ($file)=@_; + if(defined $opts{'idx'}){ + system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ; +# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; + }else{ + system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ; +# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; + } +} + +sub quantify{ + my $tag=join "\\;" ,@mark; + system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag"); + print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n"; +} + +sub read_config{ + open CON,"<$config"; + while (my $aline=) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @filein,$tmp[0]; + push @mark,$tmp[1]; + #&check_rawdata($tmp[0]); + } + close CON; + if (@filein != @mark) { + #&printErr(); + die "Maybe config file have some wrong!!!\n"; + } +} +sub checkfa{ + my ($file_reads)=@_; + open N,"<$file_reads"; + my $line=; + chomp $line; + if($line !~ /^>\S+/){ + #printErr(); + die "The first line of file $file_reads does not start with '>identifier' +Reads file $file_reads is not a valid fasta file\n\n"; + } + if( !~ /^[ACGTNacgtn]*$/){ + #printErr(); + die "File $file_reads contains not allowed characters in sequences +Allowed characters are ACGTN +Reads file $file_reads is not a fasta file\n\n"; + } + close N; +} +sub search{ + my ($dir,$str)=@_; + opendir I,$dir; + my @ret; + while (my $file=readdir I) { + if ($file=~/$str/) { + push @ret, $file; + } + } + closedir I; + if (@ret != 1) { + #&printErr(); + + die "Can not find directory or file which name has string: $str !!!\n"; + } + return $ret[0]; +} + + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + #print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day $hour:$min:$sec"); +} + + +sub usage{ +print <<"USAGE"; +Version $version +Usage: + +$0 -i -fa -gfa -idx -pre -mat -mis -e -f -t -o -path +options: +-i input files, # config + +-fa ,#fasta sequence file + +-path scirpt path + +-gfa string, input file # genome fasta. sequence file +-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null + +-pre string, input file #species specific microRNA precursor sequences +-mat string, input file #species specific microRNA mature sequences + +-mis [int] number of allowed mismatches when mapping reads to precursors, default 0 +-e [int] number of nucleotides upstream of the mature sequence to consider, default 2 +-f [int] number of nucleotides downstream of the mature sequence to consider, default 5 +-r int a read is allowed to map up to this number of positions in the genome,default is 25 + +-dis Maximal space between miRNA and miRNA* (200) +-flank Flank sequence length of miRNA precursor (10) +-mfe Maximal free energy allowed for a miRNA precursor (-20) + +-t int, number of threads [1] + +-o output directory# absolute path +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 microRNA.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microRNA.xml Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,111 @@ + + Plant microRNA analysis + + + fastx_toolkit + bowtie + SCRIPT_PATH + + SVG + ViennaRNA + + + + + microRNA.pl + ## Change this to accommodate the number of threads you have available. + -t \${GALAXY_SLOTS:-4} + -path \$SCRIPT_PATH + + + ## Do or not annotate known microRNAs + #if $params.known_microRNA == "yes": + -pre $pre -mat $mat + #end if + + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + + -gfa ${index_path}.fa -idx $index_path -mis $mismatch -i $config -fa $reads -e $e -f $f -r $r -dis $dis -flank $flank -mfe $mfe > run.log + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (params['known_microRNA'] == 'Yes') + + + (params['known_microRNA'] == 'Yes') + + + (params['known_microRNA'] == 'Yes') + + + (params['known_microRNA'] == 'Yes') + + + (params['known_microRNA'] == 'Yes') + + + + + + + + + + + + diff -r f008ab2cadc6 -r 7b5a48b972e9 nibls.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nibls.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,319 @@ +#!/usr/bin/perl +##################################################################################################### +#LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it. +# +# Dan MacLean +# dan.maclean@sainsbury-laboratory.ac.uk +# +# This program is free academic software; academic and non-profit +# users may redistribute it freely. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 +# see included file GPL3.txt +# +# + + +###Dont forget you will need ... +##################################################################################################### +# Boost::Graph +#Copyright 2005 by David Burdick +# Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm +#Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself. +##################################################################################################### + + + +use strict; +use warnings; +use Boost::Graph; +use Getopt::Long; + + +my $usage = "usage: $0 -f GFF_FILE [options]\n\n -m minimum inclusion distance (default 5)\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\n"; + +my $gff_file ; +my $min_inc = 5; +my $clus = 0.6; +my $buff = 0; +my $output_file; +my $temp; +my $mark; + +GetOptions( + + 'c=f' => \$clus, + 'm=i' => \$min_inc, + 'f|file=s' => \$gff_file, + 'b=i' => \$buff, + 'o=s' => \$output_file, + 't=s' => \$temp, + 'k=s' => \$mark +) ; + + +die $usage unless $gff_file; + + +my $starttime = time; +warn "started $starttime\n"; + +## load in data +my %molecules; # stores starts and ends of srnas +open GFF, "<$gff_file"; + +while (my $entry = ){ + + chomp $entry; + next if($entry=~/^\#/); + my @data = split(/\t/,$entry); + my $chr=shift @data; + my $strand=shift @data; + my $start=shift @data; + my $end=shift @data; +# my $length1=$end-$start+1; +# if ($length1>30) { +# $length1=40; +# } + my $total; + for (my $s=0;$s<@data ;$s++) { + $total+=$data[$s]; + } + push @data,$total; +# push @data,$length1; +# if (defined $molecules{$chr}{$start}{$end}{$strand}) { +# my @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand}); +# for (my $i=0;$i<$#old_data ;$i++) { +# $data[$i]+=$old_data[$i]; +# } +# } + my $data=join ";",@data; + $molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information + #print "$chr\t$start\t$end\n"; +} + +close GFF; + +warn "Data loaded...\nBuilding graphs and finding loci\nPlease be patient, this can take a while...\n"; + +my @sample=split/\#/,$mark; +$mark=join"\"\t\"",@sample; +open OUT, ">$output_file"; +print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n"; +open CLUSTER,">$temp"; +print CLUSTER "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n"; +foreach my $chromosome (keys %molecules){ + my $g = new Boost::Graph(directed=>0); + my @starts = keys(%{$molecules{$chromosome}} ); + @starts = sort {$a <=> $b} @starts; + + while (my $srna_start = shift @starts){ ## work from left most sRNA to right most, add to graph if they close enough + + + foreach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){ + + + ###use new graph if the next srna is too far away from this one.. + if(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){ + + + ##dump the info from the old graph + if (scalar(@{$g->get_nodes()}) > 2){ + + my $cluster_coeff = get_cc($g); + if ($cluster_coeff >= $clus){ + dump_locus($g, $cluster_coeff); + } + } + + + $g = new Boost::Graph(directed=>0); + + } + + foreach my $e (keys %{$molecules{$chromosome}{$srna_start}}){ ### extra bit because all loci with same start and different end overlap by definition. but are not collected by main search below + + unless ($e eq $srna_end){ + my $sn = $chromosome. ':' . $srna_start . ':' . $srna_end; ## turn coordinate of sRNA inro a node name + my $en = $chromosome. ':' . $srna_start . ':' . $e; + $g->add_edge(node1=>"$sn", node2=>"$en", weight=>'1'); + } + + } + + foreach my $start (@starts){ ##build graph of overlaps + my $new = 0; + last if $start - $min_inc > $srna_end; + if ($start - $min_inc <= $srna_end){ + + my $start_node = $chromosome . ':' . $srna_start . ':' . $srna_end; + foreach my $end (keys %{$molecules{$chromosome}{$start}}){ + + my $end_node = $chromosome . ':' . $start . ':' . $end; + $g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>'1'); + } + + } + } + } + + if (!(defined $starts[0])) { + ##dump the info from the last graph + if (scalar(@{$g->get_nodes()}) > 2){ + + my $cluster_coeff = get_cc($g); + if ($cluster_coeff >= $clus){ + dump_locus($g, $cluster_coeff); + } + } + } + } +} + +warn "Loci printed\nFinished\n"; + +my $endtime = time; + +my $elapsed = $endtime - $starttime; + +warn "Time elapsed = $elapsed s\n"; +close OUT; +close CLUSTER; +######################################################################################### +sub get_cc{ ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2 + + my $graph = shift; + + my @component = @{$graph->get_nodes()}; #number of nodes + my @clustering_coefficients; + + foreach my $vertex (@component) + { + + my @neighbours = @{$graph->neighbors($vertex)}; + + my %edges_in_graph; + + my $n = @neighbours; #n = the number of neighbours + my $k = ($n * ($n - 1))/2; #k = total number of possible connections + + my $e= 0; #actual number of connections within sub-graph + + foreach my $neighbour (@neighbours) + { + foreach my $neighbour_2 (@neighbours) + { + my $edge1 = "$neighbour\t$neighbour_2"; + my $edge2 = "$neighbour_2\t$neighbour"; + unless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2}) + { + if ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour)) + { + ++$e; + $edges_in_graph{$edge1}=1; + $edges_in_graph{$edge2}=1; + } + } + } + } + + if ($k >= 1) + { + my $c = $e / $k; + push @clustering_coefficients, $c; + } + else {push @clustering_coefficients, '0';} + } + + my $graph_n = scalar(@clustering_coefficients); + my $graph_cc = 0; + foreach my $cc (@clustering_coefficients){ + + $graph_cc = $graph_cc + $cc; + + } + $graph_cc = $graph_cc / $graph_n; + + return $graph_cc; +} + +############################################################################################################ + +sub dump_locus{ + + my $g = shift; + my $cc = shift; + my $chr; + my $start = 1000000000000000000000000000000000000000000000; + my $end = -1; + my @sample; + my @tag; + foreach my $node (@{$g->get_nodes()}){ + + $node =~ m/^(\S+):(\d+):(\d+)$/; + my $c=$1; + my $s=$2; + my $e=$3; + # my @data; + foreach my $str (keys %{$molecules{$c}{$s}{$e}}) { + my @data=split(/;/,$molecules{$c}{$s}{$e}{$str}); + push @tag,($s.",".$e.",".$str.",".$data[-1]); +# for (my $i=0;$i<$#old_data ;$i++) { +# $data[$i]+=$old_data[$i]; +# } + my $length=$e-$s+1; + if ($length>30) { + $length=40; + } + push @data,$length; + my $data=join ";",@data;#sample_exp/total_exp/length; + push @sample,$data; + } + + $chr = $c; + $start = $s if $s < $start; + $end = $e if $e > $end; + } + my $tag=join";",@tag; + my $tag_number=@tag; + my ($max_length,$max_p,@cluster_exp)=Max_length(\@sample); + if ($max_length==40) { + $max_length="\>30"; + } + my $cluster_exp=join"\t",@cluster_exp; + my $gff = $chr."\:$start\-$end\t".$max_length."nt\t".$max_p."\t" . $cluster_exp; + print CLUSTER "$chr\:$start\-$end\t$max_length"."nt\t$max_p\t$tag_number\t$tag\n"; + print OUT $gff, "\n"; +} + +sub Max_length{ + my @exp=@{$_[0]}; + my %sample_length; + my $total_exp; + my @each; + for (my $i=0;$i<=$#exp ;$i++) { + my @tag=split/;/,$exp[$i]; + my $length=pop(@tag); + my $exp=pop(@tag); + $sample_length{$length}+=$exp; + $total_exp+=$exp; + for (my $j=0;$j<=$#tag ;$j++) { + $each[$j]+=$tag[$j]; + } + } + my $max=0; + my $max_key; + foreach my $key (sort keys %sample_length) { + my $p=$sample_length{$key}/$total_exp; + if ($p>$max) { + $max=$p; + $max_key=$key; + } + $sample_length{$key}=sprintf("%.2f",$p); + } + return($max_key,$sample_length{$max_key},@each); +} diff -r f008ab2cadc6 -r 7b5a48b972e9 non_miRNA_reads.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/non_miRNA_reads.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,62 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","fa=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'fa'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +my (%fa,%seq); +while (my $aline=){ + chomp $aline; + $aline=~s/^>//; + my $seq=; + chomp $seq; + #$seq{$seq}=$aline; + $fa{$aline}=$seq; +} +close IN; + +open IN,"<$opts{i}"; +while(my $aline=){ + chomp $aline; + if($aline=~/^\S+_x\d+/){ + $aline=~/^(\S+_x\d+)/; + my $name=$1; + delete($fa{$name}); + } +} +close IN; + +open OUT,">$fileout"; #output file +foreach my $key (keys %fa) { + print OUT ">$key\n$fa{$key}\n"; +} +close OUT; +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-o output file +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 phased_siRNA.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phased_siRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,254 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +#use Math::Cephes qw(:hypergeometrics); + +my %opts; +GetOptions(\%opts,"i=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; + +open IN,"<$filein"; #input file +open OUT,">$fileout"; #output file + +while (my $aline=) { + chomp $aline; + if ($aline=~/^\#/) { + print OUT $aline,"\tp-value\n"; + next; + } + my @tmp=split/\t/,$aline; + my @pos=split/:|-/,$tmp[0]; + $tmp[1]=~s/nt//; + my $pv=&phase($tmp[1],$pos[1],$pos[2],$tmp[4]); + + print OUT $aline,"\t",$pv,"\n"; +} +close IN; +close OUT; + +sub phase{ + my ($tagL,$start,$end,$tags)=@_; + my @tmp=split/\;/,$tags; + my %tag; + for (my $i=0;$i<@tmp;$i++) { + my @aa=split/\,/,$tmp[$i]; + next if($aa[1]-$aa[0]+1 != $tagL); +# $tag{$aa[0].",".$aa[2]}+=$aa[3] if($aa[2] eq "+"); +# $tag{($aa[1]).",".$aa[2]}+=$aa[3] if($aa[2] eq "-"); + $tag{$aa[0]}+=$aa[3] if($aa[2] eq "+"); + $tag{($aa[1]+3)}+=$aa[3] if($aa[2] eq "-"); + } + + my $pv=&pvalue2(\%tag,$tagL,$start,$end); + + return $pv; +} + +sub pvalue2{ + my ($tag,$tagL,$start,$end)=@_; + + my $p=1; my $pp=1; + foreach my $ccs(keys %{$tag}){ + my $n=0; + my $k=0; + my $K=0; + my $N=0; + + my $cor= $ccs; + my $ss=$cor; + my $ee=($cor+$tagL*10-1)<$end ? $cor+$tagL*10-1 : $end; + + my $max=0; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i; + if (defined $$tag{$x}) + { + if ($max<$$tag{$x}) {$max=$$tag{$x};} + $n +=$$tag{$x}; + $N++; + } + } + for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i; + if (defined $$tag{$x}) + { + $k +=$$tag{$x}; + $K++; + } + } + + + return $p if($K<3); + return $p if($max/$n>0.8); + + my $pn=0; + next if($n==$k); + $pn=10*$k/($n-$k)+1; + $pn = $pn ** ($K-2); + $pn = log($pn); + if ($p<$pn) { + $p=$pn; + } + + } + + return $p; + +} + +sub pvalue{ + my ($tag,$tagL,$start,$end)=@_; + + my $p=1; + foreach my $ccs(keys %{$tag}){ + my $n=-1; + my $k=-1; + + my ($cor, $str)=split(/,/, $ccs); + if ($str eq "+") # small RNAs on the Watson strand + { + my $ss=$cor; + my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i.","."+"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i.","."+"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + + for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand + { + my $x=$j.","."-"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + + for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand + { + my $x=$j.","."-"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + } + + elsif ($str eq "-") # small RNAs on the Crick strand + { + my $ee=$cor; + my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start; + for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand + { + my $x=$i.","."-"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand + { + my $x=$i.","."-"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + + for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand + { + my $x=$j.","."+"; + if (defined $$tag{$x}) + { + $n=$n+1; + } + } + for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand + { + my $x=$j.","."+"; + if (defined $$tag{$x}) + { + $k=$k+1; + } + } + } + + next if($k<3); + + my $pn=0; my $N=$tagL*11*2-1; my $M=21; + for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k + { + my $c=1; + my $rr=1; + my $rw=1; + + for (my $j=0; $j<=$w-1; $j++) + { + $c=$c*($M-$j)/($j+1); + } + for (my $x=0; $x<=$n-$w-1; $x++) + { + $rr=$rr*($N-$M-$x)/($x+1); + } + for (my $y=0; $y<=$n-1; $y++) + { + $rw=$rw*($y+1)/($N-$y); + } + my $pr=$c*$rr*$rw; + + $pn=$pn+$pr; + } + + $p=$pn<$p ? $pn :$p; + + if ($p<0.001) #select and output small RNA clusters with p<0.001 + + { + + return $p; + + } + + } + return $p; +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-o output file +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 preProcess.xml --- a/preProcess.xml Wed Dec 03 02:03:27 2014 -0500 +++ b/preProcess.xml Fri Dec 05 00:11:02 2014 -0500 @@ -12,7 +12,7 @@ - miRPlant.pl + preProcess.pl ## Change this to accommodate the number of threads you have available. -t \${GALAXY_SLOTS:-4} -path \$SCRIPT_PATH @@ -25,7 +25,16 @@ ## Do or not annotate rfam non-miRNA RNAs #if $params.annotate_rfam == "yes": - -rfam $rfam + ## prepare Rfam bowtie index + #set rfam_index_path = '' + #if str($params.annotate_rfam.reference_rfam.source) == "history": + bowtie-build "$params.annotate_rfam.reference_rfam.own_file" rfam; ln -s "$params.annotate_rfam.reference_rfam.own_file" rfam.fa; + #set rfam_index_path = 'rfam' + #else: + #set rfam_index_path = $params.annotate_rfam.reference_rfam.index.fields.path + #end if + + -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v #end if ## prepare bowtie index @@ -37,7 +46,7 @@ #set index_path = $reference_genome.index.fields.path #end if - -format $format -gfa ${index_path}.fa -idx $index_path -a $a -M $mapnt -min $min -max $max -mis $mismatch -v $v > run.log + -format $format -gfa ${index_path}.fa -idx $index_path -phred $phred -a $a -M $mapnt -min $min -max $max -mis $mismatch > run.log @@ -75,6 +84,10 @@ + + + + @@ -87,8 +100,27 @@ - - + + + + + + + + + + + + + + + + + + + + + @@ -112,6 +144,8 @@ (params['annotate_rfam'] == 'Yes') + + diff -r f008ab2cadc6 -r 7b5a48b972e9 precursors.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/precursors.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,829 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +#use RNA; + +my %opts; +GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h"); +if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $checkno=1; +my $filein=$opts{'map'}; +my $faout=$opts{'o'}; +my $strout=$opts{'s'}; +my $genome= $opts{'g'}; + +my $maxd=defined $opts{'d'} ? $opts{'d'} : 200; +my $flank=defined $opts{'f'}? $opts{'f'} : 10; + +my $MAX_ENERGY=-18; +if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};} +my $MAX_UNPAIR=5; +my $MIN_PAIR=15; +my $MAX_SIZEDIFF=4; +my $MAX_BULGE=2; +my $ASYMMETRY=5; +my $MIN_UNPAIR=0; +my $MIN_SPACE=5; +my $MAX_SPACE=$maxd; +my $FLANK=$flank; + +######### load in genome sequences start ######## +my %genome; +my %lng; +my $name; +open IN,"<$genome"; +while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + if ($aline=~/^>(\S+)/) { + $name=$1; + next; + } + $genome{$name} .=$aline; +} +close IN; +foreach my $key (keys %genome) { + $lng{$key}=length($genome{$key}); +} +####### load in genome sequences end ########## + +my %breaks; ### reads number bigger than 3 +open IN,"<$filein"; #input file +while (my $aline=) { + chomp $aline; + my @tmp=split/\t/,$aline; + $tmp[0]=~/_x(\d+)$/; + my $no=$1; + next if($no<3); + #my $trand=&find_strand($tmp[9]); + #my @pos=split/\.\./,$tmp[5]; + my $end=$tmp[3]+length($tmp[4])-1; + if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);} + push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base +} +close IN; + +my %cites; ### peaks +foreach my $chr (keys %breaks) { + foreach my $strand (keys %{$breaks{$chr}}) { + my @array=@{$breaks{$chr}{$strand}}; + @array=sort{$a->[0]<=>$b->[0]} @array; + for (my $i=0;$i<@array;$i++) { + my $start=$array[$i][0];my $end=$array[$i][1]; + my @subarray=(); + push @subarray,$array[$i]; + + for (my $j=$i+1;$j<@array;$j++) { + if ($start<$array[$j][1] && $end>$array[$j][0]) { + push @subarray,$array[$j]; + ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]); + } + else{ + $i=$j; + &find_cites(\@subarray,$chr,$strand); + last; + } + } + } + } +} + +open FA,">$faout"; #output file +open STR,">$strout"; +foreach my $chr (keys %cites) { + foreach my $strand (keys %{$cites{$chr}}) { + + my @array2=@{$cites{$chr}{$strand}}; + @array2=sort{$a->[0]<=>$b->[0]} @array2; + &excise(\@array2,$chr,$strand); + } +} +close FA; +close STR; +sub oneCiteDn{ + my ($array,$a,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$a][1]+$maxd+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + + my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); + return $val; +} +sub oneCiteUp{ + my ($array,$a,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$maxd-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$a][1]+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + + my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); + return $val; + +} + +sub twoCites{ + my ($array,$a,$b,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$b][1]+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + +# my( $str,$mfe)=RNA::fold($seq); +# return 0 if($mfe>$MAX_ENERGY); ### minimum mfe + my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee); + + return $val; + +} +sub excise{ + my ($cluster,$chr,$strand)=@_; + + my $last_pos=0; + for (my $i=0;$i<@{$cluster};$i++) { + next if($$cluster[$i][0]<$last_pos); + my $ok=0; + for (my $j=$i+1;$j<@{$cluster} ;$j++) { + if($$cluster[$j][0]-$$cluster[$i][1]>$maxd){ + $i=$j; + last; + }else{ + $ok=&twoCites($cluster,$i,$j,$chr,$strand); + if($ok){ $last_pos=$$cluster[$j][1]+$flank; $i=$j; last;} + } + } + next if($ok); + + $ok=&oneCiteDn($cluster,$i,$chr,$strand); + if($ok){$last_pos=$$cluster[$i][1]+$maxd+$flank; next;} + $ok=&oneCiteUp($cluster,$i,$chr,$strand); + if($ok){$last_pos=$$cluster[$i][1]+$flank;next;} + } + + +} + +sub ffw2{ + my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_; + + my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns + if ($N_count > 5) { + return 0; + } + + my $seq_length=length $seq; + # position tag1 and tag2 + my $tag1_beg=index($seq,$tag1,0)+1; + if ($tag1_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + my $tag2_beg=index($seq,$tag2,0)+1; + if ($tag2_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + if ($tag2_beg < $tag1_beg) { + # swap tag1 and tag2 + ($tag1,$tag2)=($tag2,$tag1); + ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg); + } + my $tag1_end=$tag1_beg+length($tag1)-1; + my $tag2_end=$tag2_beg+length($tag2)-1; + # re-clipping + my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1; + my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length; + $seq=substr($seq,$beg-1,$end-$beg+1); + $seq_length=length $seq; + # re-reposition + $tag1_beg=index($seq,$tag1,0)+1; + if ($tag1_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + + $tag2_beg=index($seq,$tag2,0)+1; + if ($tag2_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + $tag1_end=$tag1_beg+length($tag1)-1; + $tag2_end=$tag2_beg+length($tag2)-1; + + # fold + #my ($struct,$mfe)=RNA::fold($seq); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $struct=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + #$mfe=sprintf "%.2f", $mfe; + if ($mfe > $MAX_ENERGY) {return 0;} + + # tag1 + my $tag1_length=$tag1_end-$tag1_beg+1; + my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length); + my $tag1_arm=which_arm($tag1_struct); + my $tag1_unpair=$tag1_struct=~tr/.//; + my $tag1_pair=$tag1_length-$tag1_unpair; + my $tag1_max_bulge=biggest_bulge($tag1_struct); + if ($tag1_arm ne "5p") { return 0;} # tag not in stem +# if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag1_pair < $MIN_PAIR) {return 0;} + if ($tag1_max_bulge > $MAX_BULGE) {return 0;} + + # tag2 + my $tag2_length=$tag2_end-$tag2_beg+1; + my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length); + my $tag2_arm=which_arm($tag2_struct); + my $tag2_unpair=$tag2_struct=~tr/.//; + my $tag2_pair=$tag2_length-$tag2_unpair; + my $tag2_max_bulge=biggest_bulge($tag2_struct); + if ($tag2_arm ne "3p") {return 0;} # star not in stem +# if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag2_pair < $MIN_PAIR) {return 0;} + if ($tag2_max_bulge > $MAX_BULGE) {return 0;} + + # space size between miR and miR* + my $space=$tag2_beg-$tag1_end-1; + if ($space < $MIN_SPACE) {return 0;} + if ($space > $MAX_SPACE) {return 0;} + + # size diff of miR and miR* + my $size_diff=abs($tag1_length-$tag2_length); + if ($size_diff > $MAX_SIZEDIFF) {return 0;} + + # build base pairing table + my %pairtable; + &parse_struct($struct,\%pairtable); # coords count from 1 + + my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end); + my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end); + my $asy=($asy1 < $asy2) ? $asy1 : $asy2; + if ($asy > $ASYMMETRY) {return 0} + + # duplex fold, determine whether two matures like a miR/miR* ike duplex + my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2); + # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context + my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end); + if ($like_mir_duplex1==0 && $like_mir_duplex2==0) { + return 0; + } + + print FA ">$chr:$strand:$ss..$ee\n$seq\n"; + print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; + + return 1; +} + +sub ffw1{ + my ($seq,$tag,$chr,$strand,$ss,$ee)=@_; + my $pass=0; + + my $N_count=$seq=~tr/N//; + if ($N_count > 5) { + return 0; + } + + my $seq_length=length $seq; + my $tag_length=length $tag; + + # position + my $tag_beg=index($seq,$tag,0)+1; + if ($tag_beg < 1) { + warn "[ffw1] coordinate error.\n"; + return $pass; + } + my $tag_end=$tag_beg+length($tag)-1; + + + # define candidate precursor by hybrid short arm to long arm, not solid enough + my($beg,$end)=define_precursor($seq,$tag); + if (not defined $beg) { + return $pass; + } + if (not defined $end) { + return $pass; + } + $seq=substr($seq,$beg-1,$end-$beg+1); + $seq_length=length $seq; + + + # fold + #my ($struct,$mfe)=RNA::fold($seq); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $struct=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + + if ($mfe > $MAX_ENERGY) { + $pass=0; + return $pass; + } + + # reposition + $tag_beg=index($seq,$tag,0)+1; + if ($tag_beg < 1) { + warn "[ffw1] coordinate error.\n"; + return 0; + } + $tag_end=$tag_beg+length($tag)-1; + + my $tag_struct=substr($struct,$tag_beg-1,$tag_length); + my $tag_arm=which_arm($tag_struct); + my $tag_unpair=$tag_struct=~tr/.//; + my $tag_pair=$tag_length-$tag_unpair; + my $tag_max_bulge=biggest_bulge($tag_struct); + if ($tag_arm eq "-") { return $pass;} +# if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag_pair < $MIN_PAIR) { return $pass;} + if ($tag_max_bulge > $MAX_BULGE) {return $pass;} + + # build base pairing table + my %pairtable; + &parse_struct($struct,\%pairtable); # coords count from 1 + + # get star + my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end); + my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1); + my $star_length=$star_end-$star_beg+1; + my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1); + my $star_arm=which_arm($star_struct); + my $star_unpair=$star_struct=~tr/.//; + my $star_pair=$star_length-$star_unpair; + my $star_max_bulge=biggest_bulge($star_struct); + if ($star_arm eq "-") { return $pass;} +# if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass} + if ($star_pair < $MIN_PAIR) {return $pass;} + if ($star_max_bulge > $MAX_BULGE) {return $pass;} + + if ($tag_arm eq $star_arm) {return $pass;} + + # space size between miR and miR* + my $space; + if ($tag_beg < $star_beg) { + $space=$star_beg-$tag_end-1; + } + else { + $space=$tag_beg-$star_end-1; + } + if ($space < $MIN_SPACE) { return $pass;} + if ($space > $MAX_SPACE) { return $pass;} + + # size diff + my $size_diff=abs($tag_length-$star_length); + if ($size_diff > $MAX_SIZEDIFF) { return $pass;} + + # asymmetry + my $asy=get_asy(\%pairtable,$tag_beg,$tag_end); + if ($asy > $ASYMMETRY) {return $pass;} + + $pass=1; + print FA ">$chr:$strand:$ss..$ee\n$seq\n"; + print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; + return $pass; + +} +sub get_star { + my($table,$beg,$end)=@_; + + my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2 + foreach my $i ($beg..$end) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + $s1=$i; + $s2=$j; + last; + } + } + foreach my $i (reverse ($beg..$end)) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + $e1=$i; + $e2=$j; + last; + } + } +# print "$s1,$e1 $s2,$e2\n"; + + # correct terminus + my $off1=$s1-$beg; + my $off2=$end-$e1; + $s2+=$off1; + $s2+=2; # 081009 + $e2-=$off2; $e2=1 if $e2 < 1; + $e2+=2; $e2=1 if $e2 < 1; # 081009 + ($s2,$e2)=($e2,$s2) if ($s2 > $e2); + return ($s2,$e2); +} + +sub define_precursor { + my $seq=shift; + my $tag=shift; + + my $seq_length=length $seq; + my $tag_length=length $tag; + my $tag_beg=index($seq,$tag,0)+1; + my $tag_end=$tag_beg+$tag_length-1; + + # split the candidate region into short arm and long arm + my $tag_arm; + my ($larm,$larm_beg,$larm_end); + my ($sarm,$sarm_beg,$sarm_end); + if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm + $sarm=substr($seq,0,$tag_end); + $larm=substr($seq,$tag_end); + $sarm_beg=1; + $sarm_end=$tag_end; + $larm_beg=$tag_end+1; + $larm_end=$seq_length; + $tag_arm="5p"; + } + else { + $larm=substr($seq,0,$tag_beg-1); # on 3' arm + $sarm=substr($seq,$tag_beg-1); + $larm_beg=1; + $larm_end=$tag_beg-1; + $sarm_beg=$tag_beg; + $sarm_end=$seq_length; + $tag_arm="3p"; + } + +# print "$sarm_beg,$sarm_end $sarm\n"; +# print "$larm_beg,$larm_end $larm\n"; + + # clipping short arm + if ($tag_arm eq "5p") { + $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1; + $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); + } + else { + $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length; + $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); + } +# print "$sarm_beg,$sarm_end $sarm\n"; +# print "$larm_beg,$larm_end $larm\n"; + + # define the precursor by hybriding short arm to long arm +=cut #modify in 2014-10-28 + my $duplex=RNA::duplexfold($sarm,$larm); + my $struct=$duplex->{structure}; + my $energy=sprintf "%.2f", $duplex->{energy}; + my ($str1,$str2)=split(/&/,$struct); + my $pair=$str1=~tr/(//; +# print "pair=$pair\n"; + my $beg1=$duplex->{i}+1-length($str1); + my $end1=$duplex->{i}; + my $beg2=$duplex->{j}; + my $end2=$duplex->{j}+length($str2)-1; +=cut +###### new codes begin + my $duplex=`perl -e 'print "$sarm\n$larm"' | RNAduplex`; + #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) + my @tmpduplex=split/\s+/,$duplex; + my $struct=$tmpduplex[0]; + $tmpduplex[-1]=~s/[(|)]//g; + my $energy=$tmpduplex[-1]; + my ($str1,$str2)=split(/&/,$struct); + my $pair=$str1=~tr/(//; + my ($beg1,$end1)=split/,/,$tmpduplex[1]; + my ($beg2,$end2)=split/,/,$tmpduplex[3]; +######## new codes end + +# print "$beg1:$end1 $beg2:$end2\n"; + # transform coordinates + $beg1=$beg1+$sarm_beg-1; + $end1=$end1+$sarm_beg-1; + $beg2=$beg2+$larm_beg-1; + $end2=$end2+$larm_beg-1; +# print "$beg1:$end1 $beg2:$end2\n"; + + my $off5p=$beg1-$sarm_beg; + my $off3p=$sarm_end-$end1; + $beg2-=$off3p; $beg2=1 if $beg2 < 1; + $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length; + +# print "$beg1:$end1 $beg2:$end2\n"; + + my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2; + my $end=$sarm_end > $end2 ? $sarm_end : $end2; + + return if $pair < $MIN_PAIR; +# print "$beg,$end\n"; + return ($beg,$end); +} + + +# duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex +sub likeMirDuplex1 { + my $seq1=shift; + my $seq2=shift; + my $like_mir_duplex=1; + + my $length1=length $seq1; + my $length2=length $seq2; +=cut + my $duplex=RNA::duplexfold($seq1, $seq2); + my $duplex_struct=$duplex->{structure}; + my $duplex_energy=sprintf "%.2f", $duplex->{energy}; + my ($str1,$str2)=split(/&/,$duplex_struct); + my $beg1=$duplex->{i}+1-length($str1); + my $end1=$duplex->{i}; + my $beg2=$duplex->{j}; + my $end2=$duplex->{j}+length($str2)-1; +=cut + my $duplex=`perl -e 'print "$seq1\n$seq2"' | RNAduplex`; + #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) + my @tmpduplex=split/\s+/,$duplex; + my $duplex_struct=$tmpduplex[0]; + $tmpduplex[-1]=~s/[(|)]//g; + my $duplex_energy=$tmpduplex[-1]; + my ($str1,$str2)=split(/&/,$duplex_struct); + #my $pair=$str1=~tr/(//; + my ($beg1,$end1)=split/,/,$tmpduplex[1]; + my ($beg2,$end2)=split/,/,$tmpduplex[3]; + + # revise beg1, end1, beg2, end2 + $str1=~/^(\.*)/; + $beg1+=length($1); + $str1=~/(\.*)$/; + $end1-=length($1); + $str2=~/^(\.*)/; + $beg2+=length($1); + $str2=~/(\.*)$/; + $end2-=length($1); + + my $pair_num=$str1=~tr/(//; + my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom + my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck +# print $pair_num,"\n"; +# print $overhang1,"\n"; +# print $overhang2,"\n"; + if ($pair_num < 13) { + $like_mir_duplex=0; + } + if ($overhang1 < 0 || $overhang2 < 0 ) { + $like_mir_duplex=0; + } + if ($overhang1 > 4 || $overhang2 > 4) { + $like_mir_duplex=0; + } + return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); +} + +# judge whether two matures form miR/miR* duplex, in hairpin context +sub likeMirDuplex2 { + my ($table,$beg1,$end1,$beg2,$end2)=@_; + my $like_mir_duplex=1; + +# s1 e1 +# 5 ----------------------------3 +# | | |||| ||| | +#3 -------------------------------5 +# e2 s2 + + my $pair_num=0; + my $overhang1=0; + my $overhang2=0; + my ($s1,$e1,$s2,$e2); + foreach my $i ($beg1..$end1) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + $s1=$i; + $e2=$j; + last; + } + } + } + foreach my $i (reverse ($beg1..$end1)) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + $e1=$i; + $s2=$j; + last; + } + } + } + +# print "$beg1,$end1 $s1,$e1\n"; +# print "$beg2,$end2 $s2,$e2\n"; + + foreach my $i ($beg1..$end1) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + ++$pair_num; + } + } + } + if (defined $s1 && defined $e2) { + $overhang1=($end2-$e2)-($s1-$beg1); + } + if (defined $e1 && defined $s2) { + $overhang2=($end1-$e1)-($s2-$beg2); + } + + if ($pair_num < 13) { + $like_mir_duplex=0; + } + if ($overhang1 < 0 && $overhang2 < 0) { + $like_mir_duplex=0; + } + return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); +} +sub parse_struct { + my $struct=shift; + my $table=shift; + + my @t=split('',$struct); + my @lbs; # left brackets + foreach my $k (0..$#t) { + if ($t[$k] eq "(") { + push @lbs, $k+1; + } + elsif ($t[$k] eq ")") { + my $lb=pop @lbs; + my $rb=$k+1; + $table->{$lb}=$rb; + $table->{$rb}=$lb; + } + } + if (@lbs) { + warn "unbalanced RNA struct.\n"; + } +} +sub which_arm { + my $substruct=shift; + my $arm; + if ($substruct=~/\(/ && $substruct=~/\)/) { + $arm="-"; + } + elsif ($substruct=~/\(/) { + $arm="5p"; + } + else { + $arm="3p"; + } + return $arm; +} +sub biggest_bulge { + my $struct=shift; + my $bulge_size=0; + my $max_bulge=0; + while ($struct=~/(\.+)/g) { + $bulge_size=length $1; + if ($bulge_size > $max_bulge) { + $max_bulge=$bulge_size; + } + } + return $max_bulge; +} +sub get_asy { + my($table,$a1,$a2)=@_; + my ($pre_i,$pre_j); + my $asymmetry=0; + foreach my $i ($a1..$a2) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if (defined $pre_i && defined $pre_j) { + my $diff=($i-$pre_i)+($j-$pre_j); + $asymmetry += abs($diff); + } + $pre_i=$i; + $pre_j=$j; + } + } + return $asymmetry; +} + +sub peaks{ + my @cluster=@{$_[0]}; + + return if(@cluster<1); + + my $max=0; my $index=-1; + for (my $i=0;$i<@cluster;$i++) { + if($cluster[$i][2]>$max){ + $max=$cluster[$i][2]; + $index=$i; + } + } +# &excise(\@cluster,$index,$_[1],$_[2]); + return($index); +} + +sub find_cites{ + my @tmp=@{$_[0]}; + my $i=&peaks(\@tmp); + + my $start=$tmp[$i][0]; + my $total=0; my $node5=0; + for (my $j=0;$j<@tmp ;$j++) { + $total+=$tmp[$j][2]; + $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2); + } + push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5); +} + +sub newpos{ + my ($a,$b,$c,$d)=@_; + my $s= $a>$c ? $c : $a; + my $e=$b>$d ? $b : $d; + return($s,$e); +} + +sub rev{ + + my($sequence)=@_; + + my $rev=reverse $sequence; + + return $rev; +} + +sub com{ + + my($sequence)=@_; + + $sequence=~tr/acgtuACGTU/TGCAATGCAA/; + + return $sequence; +} + +sub revcom{ + + my($sequence)=@_; + + my $revcom=rev(com($sequence)); + + return $revcom; +} + +sub find_strand{ + + #A subroutine to find the strand, parsing different blast formats + my($other)=@_; + + my $strand="+"; + + if($other=~/-/){ + $strand="-"; + } + + if($other=~/minus/i){ + $strand="-"; + } + + return($strand); +} +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -map -g -d -f -o -s -e +options: + -map input file# align result # bst. format + -g input file # genome sequence fasta format + -d Maximal space between miRNA and miRNA* (200) + -f Flank sequence length of miRNA precursor (10) + -o output file# percursor fasta file + -s output file# precursor structure file + -e Maximal free energy allowed for a miRNA precursor (-18 kcal/mol) + + -h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 quantify.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quantify.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,502 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use File::Path; +use strict; +use File::Basename; +#use Getopt::Std; +use Getopt::Long; +#use RNA; + +my %opts; +GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","h"); +if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $read=$opts{'r'}; +my $pre=$opts{'p'}; +my $mature=$opts{'m'}; + +my $dir=$opts{'o'}; +unless ($dir=~/\/$/) {$dir .="/";} +if (not -d $dir) { + mkdir $dir; +} + +my $threads=defined $opts{'t'} ? $opts{'t'} : 1; +my $mismatch=defined $opts{'mis'} ? $opts{'mis'} : 0; + +my $upstream = 2; +my $downstream = 5; + +$upstream = $opts{'e'} if(defined $opts{'e'}); +$downstream = $opts{'f'} if(defined $opts{'f'}); + +my $marks=defined $opts{'tag'} ? $opts{'tag'} : ""; + +my $time=Time(); + +my $tmpdir="${dir}/known_miRNA_Express"; +if(not -d $tmpdir){ + mkdir($tmpdir); +} +chdir $tmpdir; + +`cp $pre ./`; +my $pre_file_name=basename($pre); + +&mapping(); # matures align to precursors && reads align to precursors; + +my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end; +&maturePosOnPre(); # acknowledge mature positions on precursor + +my %pre_read; +&readPosOnPre(); # acknowledge reads positions on precursors + +if(!(defined $opts{'tag'})){ + foreach my $key (keys %pre_read) { + $pre_read{$key}[0][0]=~/:([\d|_]+)_x(\d+)$/; + my @ss=split/_/,$1; + for (my $i=1;$i<=@ss;$i++) { + $marks .="Smp$i;"; + } + last; + } +} + +my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...." +&attachPre(); + +my $preno=scalar (keys %pre); +print "Total Precursor Number is $preno !!!!\n"; + +my %struc; #mature star loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe; +&structure(); + + +##### analysis and print out && moRs +my $aln=$dir."known_microRNA_express.aln"; +my $list=$dir."known_microRNA_express.txt"; +my $moRs=$dir."known_microRNA_express.moRs"; + +system("ln -s $mature $dir/known_microRNA_mature.fa "); +system("ln -s $pre $dir/known_microRNA_precursor.fa "); + +open ALN,">$aln"; +open LIST,">$list"; +open MORS,">$moRs"; + +$"="\t"; ##### @array print in \t + +my @marks=split/\;/,$marks; +#print LIST "#matueID\tpreID\tpos1\tpos2\tmatureExp\tstarExp\ttotalExp\n"; +print LIST "#matueID\tpreID\tpos1\tpos2"; +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_matureExp"; +} +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_starExp"; +} +for (my $i=0;$i<@marks;$i++) { + print LIST "\t",$marks[$i],"_totalExp"; +} +print LIST "\n"; +print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n"; +print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n"; +my %moRs; + +foreach my $key (keys %pre) { + print ALN ">$key\n$pre{$key}\n$struc{$key}{struc} ($struc{$key}{mfe})\n"; + next if(! (exists $pre_read{$key})); + my @array=@{$pre_read{$key}}; + @array=sort{$a->[3]<=> $b->[3]} @array; + + my $length=length($pre{$key}); + + my $maxline=-1;my $max=0; ### storage the maxinum express read line + my $totalReadsNo=0; + my @not_over=(); ### new read format better for moRs analysis + +####print out Aln file start + for (my $i=0;$i<@array;$i++) { + my $maps=$array[$i][3]+1; + my $mape=$array[$i][3]+length($array[$i][4]); + my $str=""; + $str .= "." x ($maps-1); + $str .=$array[$i][4]; + $str .="." x ($length-$mape); + $str .=" "; + + $array[$i][0]=~/:([\d|_]+)_x(\d+)$/; + my @sample=split /\_/,$1; + my $total=$2; + print ALN $str,"@sample","\t",$total,"\n"; + + if($total>$max){$max=$total; $maxline=$i;} + $totalReadsNo+=$total; + + push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"]; + } +####print out Aln file end + +#### express list start + my ($ms,$me,$ss,$se); + if (!(exists($pre_mature{$key}))) { + $ms=$array[$maxline][3]+1; + $me=$array[$maxline][3]+length($array[$maxline][4]); + ($ss,$se)=&other_pair($ms,$me,$struc{$key}{'struc'}); + + my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); + print LIST "$key\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; + } + else{ + foreach my $maID (keys %{$pre_mature{$key}}) { + $ms=$pre_mature{$key}{$maID}{"mature"}[0]; + $me=$pre_mature{$key}{$maID}{"mature"}[1]; + $ss=$pre_mature{$key}{$maID}{"star"}[0]; + $se=$pre_mature{$key}{$maID}{"star"}[1]; + my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); + print LIST "$maID\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; + } + } +#### express list end + +#### analysis moRs start + my @result; my @m_texp;my $m_texp=0; ### moRs informations + + while (@not_over>0) { + my @over=@not_over; + @not_over=(); + +#丰度最高tag + my $m_max=0;my $m_maxline=-1;my $m_start=0;my $m_end=0;my $m_exp=0;my @m_exp;my $m_no=1; + for (my $i=0;$i<@over;$i++) { + my @m_array=@{$over[$i]}; + if ($m_max<$m_array[4]) { + $m_max=$m_array[4]; + $m_maxline=$i; + } + } + $m_start=$over[$m_maxline][1]; + $m_end=$over[$m_maxline][2]; + $m_exp=$m_max; + $over[$m_maxline][3]=~/:([\d|_]+)_x(\d+)$/; + my @m_nums=split/_/,$1; + for (my $j=0;$j<@m_nums;$j++) { + $m_exp[$j]=$m_nums[$j]; + } + +#统计以丰度最高tag为坐标的reads, 两端位置差异不超过3nt + for (my $i=0;$i<@over;$i++) { + next if($i==$m_maxline); + my @m_array=@{$over[$i]}; + if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) { + $m_exp+=$m_array[4]; + $m_no++; + $m_array[3]=~/:([\d|_]+)_x(\d+)$/; + my @m_nums=split/_/,$1; + for (my $j=0;$j<@m_nums;$j++) { + $m_exp[$j] +=$m_nums[$j]; + } + } + elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@{$over[$i]}];} #去除跨越block的reads + } + if($m_exp>5){### 5个reads + $m_texp+=$m_exp; + for (my $j=0;$j<@m_exp;$j++) { + $m_texp[$j]+=$m_exp[$j]; + } + my $string=&subseq($pre{$key},$m_start,$m_end,"+"); + push @result,"\t$m_start\t$m_end\t@m_exp\t$m_exp\t$m_no\t$string" ; + } + } + + my $str=scalar @result; + my $percent=sprintf("%.2f",$m_texp/$totalReadsNo); + $str=">$key\t+\t$m_texp\t$percent\t".$str."\t$pre{$key}"; + @{$moRs{$str}}=@result; + +#### analysis moRs end +} + +##### moRs print out start +foreach my $key (keys %moRs) { + my @tmp=split/\t/,$key; + next if ($tmp[4]<=2); + next if($tmp[3]<0.95); + my @over; + for (my $i=0;$i<@{$moRs{$key}};$i++) { + my @arrayi=split/\t/,$moRs{$key}[$i]; + for (my $j=0;$j<@{$moRs{$key}};$j++) { + next if($i==$j); + my @arrayj=split/\t/,$moRs{$key}[$j]; + if ((($arrayj[1]-$arrayi[2]>=0 && $arrayj[1]-$arrayi[2] <=3) || ($arrayj[1]-$arrayi[2]>=18 && $arrayj[1]-$arrayi[2] <=25) )||(($arrayi[1]-$arrayj[2]>=0 && $arrayi[1]-$arrayj[2] <=3)||($arrayi[1]-$arrayj[2]>=18 && $arrayi[1]-$arrayj[2] <=25))) { + push @over,$moRs{$key}[$i]; + } + } + } + if (@over>0) { + print MORS "$key\n"; + foreach (@{$moRs{$key}}) { + print MORS "$_\n"; + } + } +} +###### moRs print out end +close ALN; +close LIST; +close MORS; + +$"=" ";##### reset + + +################### Sub programs ################# +sub express{ + my ($ms,$me,$ss,$se,$read)=@_; + my (@mexp,@sexp,@texp); + $$read[0][0]=~/:([_|\d]+)_x(\d+)$/; + my @numsample=split/_/,$1; + for (my $i=0;$i<@numsample;$i++) { + $mexp[$i]=0; + $sexp[$i]=0; + $texp[$i]=0; + } + + for (my $i=0;$i<@{$read};$i++) { + my $start=$$read[$i][3]+1; + my $end=$$read[$i][3]+length($$read[$i][4]); + $$read[$i][0]=~/:([_|\d]+)_x(\d+)$/; + my $expresses=$1; + my @nums=split/_/,$expresses; + + for (my $j=0;$j<@nums;$j++) { + $texp[$j]+=$nums[$j]; + } + if ($start>=$ms && $end<=$me) { + for (my $j=0;$j<@nums;$j++) { + $mexp[$j]+=$nums[$j]; + } + } + if ($start>=$ss && $end<=$se) { + for (my $j=0;$j<@nums;$j++) { + $sexp[$j]+=$nums[$j]; + } + } + } + return(\@mexp,\@sexp,\@texp); +} + +sub structure{ + foreach my $key (keys %pre_mature) { + if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";} + #my ($str,$mfe)=RNA::fold($pre{$key}); + my $rnafold=`perl -e 'print "$pre{$key}"' | RNAfold --noPS`; + my @rnafolds=split/\s+/,$rnafold; + my $str=$rnafolds[1]; + my $mfe=$rnafolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + + $struc{$key}{"struc"}=$str; + #$struc{$key}{"mfe"}=sprintf ("%.2f",$mfe); + $struc{$key}{"mfe"}=$mfe; + + foreach my $id (keys %{$pre_mature{$key}}) { + ($pre_mature{$key}{$id}{"star"}[0],$pre_mature{$key}{$id}{"star"}[1])=&other_pair($pre_mature{$key}{$id}{"mature"}[0],$pre_mature{$key}{$id}{"mature"}[1],$str); + } +=cut +##### Nucleotide complementary + my @tmp=split//,$str; + my %a2b; + my @bps; + for (my $i=0;$i<@tmp;$i++) { + if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} + if ($tmp[$i] eq ")") { + my $up=pop @bps; + $a2b{$i+1}=$up; + $a2b{$up}=$i+1; + } + } + +##### search star position + foreach my $id (keys %{$pre_mature{$key}}) { + my $n=0; + for (my $i=$pre_mature{$key}{$id}{"mature"}[0];$i<=$pre_mature{$key}{$id}{"mature"}[1] ; $i++) { + if (defined $a2b{$i}) { + my $a=$i; my $b=$a2b{$i}; + if($a>$b){ + $pre_mature{$key}{$id}{"star"}[0]=$b-$n+2; + $pre_mature{$key}{$id}{"star"}[1]=$b-$n+2+($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); + } + if($a<$b{ + $pre_mature{$key}{$id}{"star"}[1]=$b+$n+2; + $pre_mature{$key}{$id}{"star"}[0]=$b+$n+2-($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); + } + last; + } + $n++; + } + } +=cut + } +} +sub other_pair{ + my ($start,$end,$structure)=@_; + ##### Nucleotide complementary + my @tmp=split//,$structure; + my %a2b; my @bps; + for (my $i=0;$i<@tmp;$i++) { + if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} + if ($tmp[$i] eq ")") { + my $up=pop @bps; + $a2b{$i+1}=$up; + $a2b{$up}=$i+1; + } + } +##### search star position + my $n=0;my $startpos; my $endpos; + for (my $i=$start;$i<=$end ; $i++) { + if (defined $a2b{$i}) { + my $a=$i; my $b=$a2b{$i}; +# if($a>$b){ +# $startpos=$b-$n+2; +# $endpos=$b-$n+2+($end-$start); +# } +# if($a<$b){ + $endpos=$b+$n+2; + if($endpos>length($structure)){$endpos=length($structure);} + $startpos=$b+$n+2-($end-$start); + if($startpos<1){$startpos=1;} +# } + last; + } + $n++; + } + return ($startpos,$endpos); +} +sub attachPre{ + open IN, "<$pre_file_name"; + my $name; + while (my $aline=) { + chomp $aline; + if ($aline=~/^>(\S+)/) { + $name=$1; + next; + } + $pre{$name} .=$aline; + } + close IN; +} +sub readPosOnPre{ + open IN,") { + chomp $aline; + my @tmp=split/\t/,$aline; + my $id=lc($tmp[2]); + push @{$pre_read{$tmp[2]}},[@tmp]; + } + close IN; +} +sub maturePosOnPre{ + open IN,") { + chomp $aline; + my @tmp=split/\t/,$aline; + my $mm=$tmp[0]; +# $mm=~s/\-3P|\-5P//i; + $mm=lc($mm); + my $pm=$tmp[2]; + $pm=lc($pm); + +# next if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a + next if($mm!~/$pm/); +# print "$tmp[2]\t$tmp[0]\n"; +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream; +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0); +# $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream; + $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1; + $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]); + } + close IN; +} +sub mapping{ + my $err; +## build bowtie index + #print STDERR "building bowtie index\n"; + $err = `bowtie-build $pre_file_name miRNA_precursor`; + +## map mature sequences against precursors + #print STDERR "mapping mature sequences against index\n"; + $err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature > mature_mapped.bwt 2> run.log`; + +## map reads against precursors + #print STDERR "mapping read sequences against index\n"; + $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2> run.log`; + +} + +sub subseq{ + my $seq=shift; + my $beg=shift; + my $end=shift; + my $strand=shift; + + my $subseq=substr($seq,$beg-1,$end-$beg+1); + if ($strand eq "-") { + $subseq=revcom($subseq); + } + return uc $subseq; +} + +sub revcom{ + my $seq=shift; + $seq=~tr/ATCGatcg/TAGCtagc/; + $seq=reverse $seq; + return uc $seq; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + #print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day-$hour-$min-$sec"); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -r -p -m -mis -t -e -f -tag -o -time +mandatory parameters: +-p precursor.fa miRNA precursor sequences from miRBase # must be absolute path +-m mature.fa miRNA sequences from miRBase # must be absolute path +-r reads.fa your read sequences #must be absolute path + +-o output directory + +options: +-mis [int] number of allowed mismatches when mapping reads to precursors, default 0 +-t [int] threads number,default 1 +-e [int] number of nucleotides upstream of the mature sequence to consider, default 2 +-f [int] number of nucleotides downstream of the mature sequence to consider, default 5 +-tag [string] sample marks# eg. sampleA;sampleB;sampleC +-time sting #make directory time,default is the local time +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 quantify_siRNA.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quantify_siRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,64 @@ +#!/usr/bin/perl -w +#Filename: +#Author: chentt +#Email: +#Date: 2012-4-6 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","o=s","d=s","h"); +if (!(defined $opts{i} and defined $opts{d} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $input=$opts{'i'}; +my $output=$opts{'o'}; +my $depth=$opts{'d'}; + +open (IN,"<$input")||die"$!"; +open OUT,">$output"; +#my @Total=qw(15797079 18042650 17455254 17295526 18791753 16719596 15150009 18451484 17402501 17729362 19347595 17518516 15699663 16589265 15442892 14012264 14190746 17280260 13213117 12390121 14874304 ); +my @Total=split/\,/,$depth; +#print OUT "#clusterID\tmajor_length\tpercent\n"; +while (my $aline=) { + chomp $aline; + if ($aline=~/^\"/){ + my @title=split/\t/,$aline; + for (my $i=0;$i<@title ;$i++) { + $title[$i]=~s/^\"(\S+)\"$/$1/; + } + my $title=join "\t",@title; + print OUT "\#$title\n"; + next; + } + my @temp=split/\t/,$aline; + print OUT "$temp[0]\t$temp[1]\t$temp[2]"; + my @id=split/:/,$temp[0]; + my @posi=split/-/,$id[1]; + for (my $i=3;$i<@temp;$i++) { + my $rpkm=sprintf("%.2f",$temp[$i]/($posi[1]-$posi[0]+1)/$Total[$i-3]*1000000000); + print OUT "\t$rpkm"; + } + print OUT "\n"; +} +close IN; +close OUT; + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o -h +options: +-i input cluster file +-o output file +-d depth +-h help +USAGE +exit(1); +} \ No newline at end of file diff -r f008ab2cadc6 -r 7b5a48b972e9 sRNA_plot.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRNA_plot.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,411 @@ +#!/usr/bin/perl -w +#========================================================================================== +# Date: +# Title: +# Comment: Program to plot gene structure +# Input: 1. +# 2. +# 3. +# Output: output file of gene structure graph by html or svg formt +# Test Usage: +#======================================================================================== +#use strict; +my $version=1.00; +use SVG; +use Getopt::Long; +my %opt; +GetOptions(\%opt,"g=s","l=s","span=s","c=s","o=s","out=s","cen:s","mark=s","h"); +if (!( defined $opt{o}) || defined $opt{h}) { +&usage; +} +my $span=$opt{span}; +#my $sample_cloumn=$opt{n}; +my $mark=$opt{mark}; +my @mark=split/\#/,$mark; +my $genelist=$opt{g}; +#===============================Define Attribute========================================== +my %attribute=( + canvas=>{ + 'width'=>1500, + 'height'=>1800 + }, + text=>{ + 'stroke'=>"#000000", + 'fill'=>"none", + 'stroke-width'=>0.5 + }, + line=>{ + 'stroke'=>"black", + 'stroke-width'=>1 + }, + csv=>{ + 'stroke'=>"red", + 'stroke-width'=>0.5 + }, + exon=>{ + 'stroke'=>"black", + 'stroke-width'=>1 + }, + intron=>{ + 'stroke'=>"black", + 'stroke-width'=>1.5 + }, + font=>{ + 'fill'=>"#000000", + 'font-size'=>12, + 'font-size2'=>10, + #'font-weight'=>'bold', + 'font-family'=>"Arial" + #'font-family'=>"ArialNarrow-bold" + }, + rect=>{ + 'fill'=>"lightgreen", + 'stroke'=>"black", + 'stroke-width'=>0.5 + }, + readwidth=>0.5 +); +#############################s#define start coordinate and scale +open(TXT,">$opt{out}"); +open(LENGTH,"$opt{l}")||die"cannot open the file $opt{l}"; +my %length; +while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + my @temp=split/\t/,$aline; + $temp[0]=~s/^c/C/; + $length{$temp[0]}=$temp[1]; +} +close LENGTH; +#--------------------------------------------------------------- +open(GENE,"$opt{g}")||die"cannot open the file $opt{g}"; +my %genelist; +while (my $aline=) { + chomp $aline;#LOC_Os01g01280 Chr1 133291 134685 + + next if($aline=~/^\#/); + my @temp=split/\t/,$aline; + if ($temp[1]=~/^Chr(\d)$/) { + $temp[1]="Chr0$1"; + } + push @{$genelist{$temp[1]}},[$temp[0],$temp[2],$temp[3]]; + +} +close GENE; +#my %have_gene; +#foreach my $chr (sort keys %genelist) { +# my @genelist=sort{$a->[1] <=> $b->[1]}@{$genelist{$chr}}; +# my $start=$genelist[0][1]; +# my $end=$genelist[0][2]; +# for (my $i=0;$i<@genelist ;$i++) { +# if ($gene) { +# } +# } +#} + +my %gene_desity; +foreach my $chr (sort keys %genelist) { + my @genelist=sort{$a->[1] <=> $b->[1]}@{$genelist{$chr}}; + for (my $i=0;$i<@genelist ;$i++) { + my $start=int($genelist[$i][1]/$span); + my $end=int($genelist[$i][2]/$span); + #my @t_rpkm=split/\t/,$target_rpkm{$genelist[$i][0]}; + if ($start==$end) { + $gene_desity{$chr}[$start]++; + } + else{ + for (my $k=$start;$k<=$end ;$k++) { + $gene_desity{$chr}[$k]++; + } + } + } +} +#------------------------------------------region_gene_number------------------------- +my $max_gene_number=0; +my $total=0; +foreach my $chr (sort keys %genelist) { + for (my $i=0;$i<@{$gene_desity{$chr}} ;$i++) { + if (!(defined($gene_desity{$chr}[$i]))) { + $gene_desity{$chr}[$i]=0; + } + if ($gene_desity{$chr}[$i]>$max_gene_number) { + $max_gene_number=$gene_desity{$chr}[$i]; + #print "$gene_desity{$chr}[$i]\n"; + } + #print TXT "$i\t$gene_desity[$i]\n"; + $total+=$gene_desity{$chr}[$i]; + #print "$chr\t$i\t$gene_desity{$chr}[$i]\n"; + } +} +#print "Gene max:$max_gene_number\ntotal:$total\n"; + +#--------------------------------------------------------------- +my %centromere; +if (defined($opt{cen})) { + open CEN,"$opt{cen}"; + while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + my @temp=split/\t/,$aline; + $temp[0]=~s/^c/C/; + $centromere{$temp[0]}[0]=$temp[1]; + $centromere{$temp[0]}[1]=$temp[2]; + } + close CEN; +} + +#--------------------------------------------------------------- +my $max_length=0; +foreach my $chr (keys %length) { + if ($max_length<$length{$chr}) { + $max_length=$length{$chr}; + } + print "$chr\n"; +} +#====================================cluster data======================================= +open(CLUSTER,"$opt{c}")||die"cannot open the file $opt{c}"; +my %cluster; +my %cluster_density; +#my @sample=qw(39B3 3PA3 3LC3); +my @cluster_non_add; +while (my $aline=) { + next if($aline=~/^\#/); + chomp $aline;##Chr MajorLength Percent end 19B1 + my @temp=split/\t/,$aline; + my @ID=split/\:/,$temp[0]; + my @posi=split/\-/,$ID[1]; + my @all_rpkm=@temp; + shift @all_rpkm; + shift @all_rpkm; + shift @all_rpkm; +# for (my $s=0;$s<@all_rpkm ;$s++) {#log transfer +# $all_rpkm[$s]=log2($all_rpkm[$s]); +# } + push @{$cluster{$ID[0]}},[$temp[0],$posi[0],$posi[1],@all_rpkm];#ID start end rpkm(19B1,1PA1,1LC1); +} +close CLUSTER; +my %max_cluster; +my $chr_number=0; +print "@mark\n$mark\n"; +foreach my $chr (sort keys %cluster) { + for (my $i=0;$i<@mark ;$i++) { + $max_cluster{$chr}[$i]=0; + } + $chr_number++; +} +foreach my $chr (sort keys %cluster) { + @{$cluster{$chr}}=sort{$a->[1] <=> $b->[1]}@{$cluster{$chr}}; + for (my $i=0;$i<$#{$cluster{$chr}} ;$i++) { + for (my $s=0;$s<@mark;$s++) { + if ($cluster{$chr}[$i][3+$s]>$max_cluster{$chr}) { + $max_cluster{$chr}[$s]=$cluster{$chr}[$i][3+$s]; + } + } + } + +} +foreach my $chr (sort keys %max_cluster) { + for (my $s=0; $s<@mark;$s++) { + # print "$max_cluster{$chr}[$s]\n"; + } +} +#--------------------------------------------------------------------------------------- +foreach my $chr(keys %cluster) { + for(my $i=0;$i<$#{$cluster{$chr}};$i++) { + my $start=int($cluster{$chr}[$i][1]/$span); + my $end=int($cluster{$chr}[$i][2]/$span); + if ($start==$end) { + for (my $s=0;$s<@mark ;$s++) { + $cluster_density{$chr}[$start][$s]+=$cluster{$chr}[$i][3+$s]; + } + + } + else{ + for (my $m=$start;$m<=$end ;$m++) { + for (my $s=0;$s<@mark ;$s++) { + $cluster_density{$chr}[$m][$s]+=$cluster{$chr}[$i][3+$s]; + } + } + } + } +} +my %max_cluster_density; +my $max_all_density=0; +foreach my $chr (sort keys %cluster) {# + for (my $s=0;$s<@mark ;$s++) { + for (my $i=0;$i<$#{$cluster{$chr}} ;$i++) { + $max_cluster_density{$chr}[$s]=0; + } + } + +} +foreach my $chr (sort keys %cluster_density) { + print "$#{$cluster_density{$chr}}\n"; + for (my $k=0;$k<$#{$cluster_density{$chr}} ;$k++) { + print TXT "$chr\t$k"; + for (my $s=0;$s<@mark;$s++) { + if (!(defined($cluster_density{$chr}[$k][$s]))) { + $cluster_density{$chr}[$k][$s]=0; + } + if ($cluster_density{$chr}[$k][$s]>$max_cluster_density{$chr}[$s]) { + $max_cluster_density{$chr}[$s]=$cluster_density{$chr}[$k][$s]; + } + if ($cluster_density{$chr}[$k][$s]>$max_all_density) { + $max_all_density=$cluster_density{$chr}[$k][$s]; + } + print TXT "\t$cluster_density{$chr}[$k][$s]"; + } + print TXT "\n"; + } +} +print "max density: $max_all_density\n"; +#-------------------------------------------------------------------- +my $top_margin=30; +my $tail_margin=30; +my $XOFFSET=50; +my $YOFFSET=60; +my $chr_length=600; +my $Xscale=$chr_length/$max_length;#定义X轴比例尺 1:1000 x轴的坐标长度都要按照此比例尺换算 +#my $high_cov=$high_cov9B1=0.5;#定义峰图最高峰 +#my $Yscale=1/$high_cov;#定义Y轴比例尺 1:60 y轴的坐标长度都要按照此比例尺换算 +#========================================New canvas============================ +#### Starting #### +#新建画布 +my $width=1000; +my $heigth=100+130*$chr_number; +my $svg=SVG->new(width=>$width, height=>$heigth); +#画图起始点 +my $canvas_start_x=$XOFFSET; +my $canvas_end_x=$XOFFSET+$max_length*$Xscale;#按照比例尺 画线 +my $canvas_start_y=$YOFFSET; +my $canvas_end_y=$YOFFSET; +my $chr_heigth=$heigth-$YOFFSET-$tail_margin; +print "chr number:$chr_number\n"; +my $one_chr_heigth=$chr_heigth/$chr_number; +my $Yscale=($one_chr_heigth-15)/$max_all_density; +#my $chr_width=$YOFFSET; +#my $chr_start_y; +#my $chr_end_y; +#my $Yscale=0.01; +#=======================================title of the graph=============================== +#my $span_k=$span/1000; +#$svg->text('x',$width/2,'y',$YOFFSET-20,'style','fill:black;text-anchor:middle','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',15,'font-family',$attribute{font}{'font-family'},'-cdata',"Clusters rpkm/"."$span_k"."kb Distribution"); +#=======================================the top max chr line============================= +$svg->line(id=>'l1',x1=>$canvas_start_x,y1=>$canvas_start_y,x2=>$canvas_end_x,y2=>$canvas_end_y,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); +$long_scale=int ($max_length/10);#十等分 大刻度 +#大坐标刻度 +for ($i=0;$i<=10;$i++) { + my $long_x_start=$XOFFSET+$long_scale*$i*$Xscale; + my $long_x_end=$long_x_start; + my $long_y_start=$YOFFSET; + my $long_y_end=$YOFFSET-5; + $svg->line('x1',$long_x_start,'y1',$long_y_start,'x2',$long_x_end,'y2',$long_y_end,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); + my $Bscale=$long_scale*$i; + my $cdata=int ($Bscale/1000000); + $svg->text('x',$long_x_start,'y',$long_y_start-10,'style','fill:black;text-anchor:middle','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',12,'font-family',$attribute{font}{'font-family'},'-cdata',$cdata."M"); +} +#========================================================================================= +my $cc=1; +foreach my $chr (sort keys %length) { + my $chr_end_x=$XOFFSET+$length{$chr}*$Xscale; + my $chr_start_x=$XOFFSET; + my $chr_start_y=$YOFFSET+$cc*$one_chr_heigth; + my $chr_end_y=$chr_start_y; + #$chr_start_y+=$chr_width; + #$chr_end_y+=$chr_width; +# for (my $i=0;$i<@{$gene_desity{$chr}};$i++) { +# print "$chr\t$i\t$gene_desity{$chr}[$i]\n"; +# my $red=$gene_desity{$chr}[$i]/$max_gene_number*255; +# my $green=$gene_desity{$chr}[$i]/$max_gene_number*255; +# print "$red\t$green\t0\n"; +# $svg->rect('x',$chr_start_x+$i*$span*$Xscale,'y',$chr_start_y,'width',$span*$Xscale,'height',8,'stroke',"rgb($red,$green,0)",'stroke-width',0.1,'fill',"rgb($red,$green,0)"); +# } + + $svg->line(x1=>$chr_start_x,y1=>$chr_start_y,x2=>$chr_end_x,y2=>$chr_end_y,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); + $svg->text('x',$XOFFSET-40,'y',$chr_start_y,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',12,'font-family',$attribute{font}{'font-family'},'-cdata',$chr); + my $m_length=$length{$chr}%1000000; + $svg->text('x',$chr_end_x+20,'y',$chr_start_y,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',12,'font-family',$attribute{font}{'font-family'},'-cdata',$m_length."M"); + + + if (defined($centromere{$chr}[0])) { + $svg->rect('x',$XOFFSET+$centromere{$chr}[0]*$Xscale,'y',$chr_start_y-2,'width',($centromere{$chr}[1]-$centromere{$chr}[0]+1)*$Xscale,'height',5,'stroke',"blue",'stroke-width',$attribute{intron}{'stroke-width'},'fill',"blue"); + } + for (my $s=0;$s<@mark ;$s++) { + for (my $i=0;$i<$#{$cluster_density{$chr}}-1 ;$i++) { + #if ($cluster_density{$chr}[$i]*$Yscale>40) { + #$cluster_density{$chr}[$i]=40/$Yscale; + #$svg->rect('x',$XOFFSET+$i*$span*$Xscale,'y',$chr_start_y-45,'width',$span*$Xscale,'height',5,'stroke',"green",'stroke-width',$attribute{intron}{'stroke-width'},'fill',"green"); + #} + #print "$i\t$cluster_density{$chr}[$i][$s]\t$cluster_density{$chr}[$i+1][$s]\n"; + my $cluster_density_start_x=$XOFFSET+$i*$span*$Xscale; + my $cluster_density_end_x=$XOFFSET+($i+1)*$span*$Xscale; + my $cluster_density_start_y=$chr_start_y-$cluster_density{$chr}[$i][$s]*$Yscale; + my $cluster_density_end_y=$chr_start_y-$cluster_density{$chr}[$i+1][$s]*$Yscale; + my $c_red=($s+1)/@mark*255; + $svg->line('x1',$cluster_density_start_x,'y1',$cluster_density_start_y,'x2',$cluster_density_end_x,'y2',$cluster_density_end_y,'stroke',"rgb($c_red,125,0)",'stroke-width',0.3); + } + + } + #=======Y axis + $svg->line(x1=>$chr_start_x,y1=>$chr_start_y,x2=>$chr_start_x,y2=>$chr_start_y-$one_chr_heigth+15,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); + #=======Y axis ===>3 xiaoge + my $s10=1; + my $e10=0; + my $chr_max=$max_all_density; + while ($chr_max>10) { + $chr_max=int($chr_max/10); + $s10=$s10*10; + $e10++; + } + $chr_max=$chr_max/2; + #print "*****$max_all_density\t$chr_max\t$s10\n"; + for (my $i=1;$i<3 ;$i++) { + my $y1=$chr_start_y-$chr_max*$s10*$Yscale*$i; + my $xiaoge_Y=$chr_max*$i; + $svg->line('x1',$chr_start_x,'y1',$y1,'x2',$chr_start_x+3,'y2',$y1,'stroke',$attribute{line}{'stroke'},'stroke-width',$attribute{line}{'stroke-width'}); + $svg->text('x',$chr_start_x-26,'y',$y1+4,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',8,'font-family',$attribute{font}{'font-family'},'-cdata',$xiaoge_Y."e".$e10); + } + $cc++; +} + +for (my $s=0;$s<@mark ;$s++) { + my $c_red=($s+1)/@mark*255; + print "**$c_red\n"; + $svg->line('x1',$canvas_end_x+100,'y1',$YOFFSET+$s*20+30,'x2',$canvas_end_x+130,'y2',$YOFFSET+$s*20+30,'stroke',"rgb($c_red,125,0)",'stroke-width',1); + $svg->text('x',$canvas_end_x+150,'y',$YOFFSET+$s*20+5+30,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',10,'font-family',$attribute{font}{'font-family'},'-cdata',$mark[$s]); +} +# +# +if (defined($opt{cen})) { + $svg->rect('x',$canvas_end_x+100,'y',$YOFFSET+@mark*20+30,'width',30,'height',5,'stroke',"blue",'stroke-width',$attribute{intron}{'stroke-width'},'fill',"blue"); + $svg->text('x',$canvas_end_x+150,'y',$YOFFSET+@mark*20+30+5,'style','fill:black;text-anchor:left','stroke',$attribute{text}{'stroke'},'stroke-width',$attribute{text}{'stroke-width'},'font-size',10,'font-family',$attribute{font}{'font-family'},'-cdata',"centromere"); +} + +close TXT; + +open (OUT,">$opt{o}"); +print OUT $svg->xmlify(); + +sub log2 { + my $n = shift; + return log($n)/log(2); +} + +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 +options: +-g genelist +-span +-n sample cloumn +-mark sample name +-o output graph file name with html or svg extension +-c cluster file input +-out txt output +-l length of chr +-cen centromere +-h help +USAGE +exit(1); +} \ No newline at end of file diff -r f008ab2cadc6 -r 7b5a48b972e9 sam2Bed_bowtie.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sam2Bed_bowtie.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,74 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2011/11/7 +#Modified: +#Description: sam2BED +my $version=1.00; + +use strict; +use Getopt::Long; + +my %opts; +GetOptions(\%opts,"i=s","mark=s","o=s","h"); +if (!(defined $opts{i} and defined $opts{o}) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $filein=$opts{'i'}; +my $fileout=$opts{'o'}; +my $mark=$opts{'mark'}; +my @sample=split/\#/,$mark; +$mark=join"\t",@sample; +open OUT,">$fileout"; #output file +print OUT "#chr\tstrand\tstart\tend\t$mark\n"; + +open IN,"<$filein"; #input file +my $Tags_num=0; +my @read_num; +#print OUT "#chr\tstart\tend\tnum\t<=20\t21\t22\t23\t24\t>=25\n"; +while (my $aline=) { + chomp $aline; + next if($aline=~/^\@/); + my @tmp=split/\t/,$aline; + my $strand=$tmp[1]; + my $start=$tmp[3]+1; + my $length=length($tmp[4]); + my $end=$start+$length-1; + my $hit=$tmp[6]+1; + #======express caculate weighted=================================== + my $exp; + my @tempID=split/\:/,$tmp[0]; + my @exp=split/\_/,$tempID[1]; + pop @exp; + for (my $j=0;$j<@exp ;$j++) { + #my @tempID1=split/\=/,$tempID[$j]; + $exp[$j]=sprintf("%.2f",$exp[$j]/$hit); + $read_num[$j]+=$exp[$j]; + #print OUT "\t$exp"; + } + $exp=join "\t",@exp; + print OUT $tmp[2],"\t",$strand,"\t",$start,"\t",$end,"\t",$exp,"\n"; + $Tags_num++; + +} +print "Total Tags numer: $Tags_num\n"; +my $read_number=join "\t",@read_num; +print "Each sample numer: $read_number\n"; +close IN; +close OUT; +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -i -o +options: +-i input file +-mark sampleA sampleB sampleC..... +-o output file +-h help +USAGE +exit(1); +} + diff -r f008ab2cadc6 -r 7b5a48b972e9 siRNA.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,403 @@ +#!/usr/bin/perl -w +my $version=1.00; +use strict; +use warnings; +use Getopt::Long; +use Getopt::Std; +use threads; +use threads::shared; +use Parallel::ForkManager; +use lib '/leofs/biotrans/chentt/perl_module/'; +#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq +print " +##################################### +# # +# sRNA cluster # +# # +##################################### +"; +########################################################################################### +my $usage="$0 +Options: +-i input file# fasta +-config input file +-g genome file +-f gff file + +-o workdir file +-path script path +-t int, number of threads [1] +-format fastq, fq, fasta or fa +-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null +-mis int number of allowed mismatches when mapping reads to genome, default 0 + +-n int max hits number,default 25; used in genome alignment +-d int distance of tag to merged a cluster; default 100 +-p cluster method F :conventional default is F + T :NIBLES +-l int the length of the upstream and downstream,default 1000;used in position annotate + +-nat natural antisense transcripts file +-repeat repeat information file out of Repeatmasker +-deg file config of de sample +-cen centromere file input +-span plot span, default 50000 +"; + +my %options; +GetOptions(\%options,"i:s","config=s","g=s","f=s","o=s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","t:i","d:i","l:i","idx:s","cen:s","span:s","h"); +#print help if that option is used +if($options{h}){die $usage;} + +my $filein=$options{'i'}; + +#my $config=$options{'i'}; +my $genome_fa=$options{'g'}; +my $gff=$options{'f'}; + + +########################################################################################## +my $predir=`pwd`; +chomp $predir; +my $workdir=defined($options{'o'}) ? $options{'o'}:$predir; + +my $path=$options{'path'}; + +my $t=defined($options{'t'})? $options{'t'}:1; #threads number + +my $mis=defined $options{'mis'} ? $options{'mis'}:0; + + +my $hit=defined $options{'n'}?$options{'n'}:25; + +my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100; + +my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000; + +my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F"; + +my $format=$options{'format'}; +#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { +# die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; +#} + + + +my $sample_number; +my ($dir,$dir_tmp); +################################ MAIN ################################################## +print "\ncluster program start:"; +my $time=Time(); +make_dir_tmp(); + +my $mark; +my $sample_mark; + +my $config=$opts{'config'}; +my (@filein,@mark); +&read_config(); +$sample_number=@mark; +$mark=join "\t",@mark; +$sample_mark=join "\#",@mark; + + + +my $data3=$filein; ### rfam not mapped reads +genome(); + +my $bed=$dir."cluster\/"."sample\.bed"; +my $read=$dir."cluster\/"."sample_reads\.cluster"; +my $read_txt=$dir."cluster\/"."cluster\.txt"; +my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster"; +my $preprocess; +my $cluster_file; +my $annotate_dir; +my $deg_dir; +my $plot_dir; +my %id; +for (my $i=0;$i<@mark ;$i++) { + $id{$mark[$i]}=$i+4; +} + + +my @map_read; +my $map_tag=0; + +bwt2bed(); + +cluster(); + +quantify(); + +phase(); + +if (defined($options{'nat'})&&defined($options{'repeat'})) { + class(); +} +else{ + get_genelist(); +} + +annotate(); + +genome_length(); + +plot(); + +my @pairdir; +if (defined($options{'deg'})) { + dec(); + infor_merge(); +} +else{infor_merge_no_dec()} +html(); +print "\ncluster program end:"; +Time(); +############################sub program################################################### +sub make_dir_tmp{ + + #make temporary directory + if(not -d "$workdir\/cluster_runs"){ + mkdir("$workdir\/cluster_runs"); + mkdir("$workdir\/cluster_runs\/ref\/"); + } + + $dir="$workdir\/cluster_runs\/"; + #print STDERR "mkdir $dir\n\n"; + return; +} + +sub genome{ + if(defined $options{'idx'}){ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ; + }else{ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ; + } + #=================== mapping sta =================================================== + my $map_file=$dir."genome_match\/genome_mapped\.fa"; + open (MAP,"<$map_file")||die"$!"; + print "\n#each sample mapping reads sta:\n\n"; + print "#$mark\ttotal\n"; + while (my $ID=) { + chomp $ID; + my @tmp=split/\:/,$ID; + my @exp=split/\_/,$tmp[1]; + $exp[-1] =~ s/^x//; + for (my $i=0;$i<@exp ;$i++) { + $map_read[$i]+=$exp[$i]; + } + $map_tag++; + my $seq=; + } + my $map_read=join"\t",@map_read; + print "$map_read\n\n"; + print "#total mapped tags:$map_read\n\n"; + close MAP; + return 0; +} + +sub bwt2bed{ + $cluster_file=$dir."cluster\/"; + mkdir ("$cluster_file"); + print "sam file changed to bed file\n"; + my ($file) = $dir."genome_match\/genome_mapped\.bwt"; + + my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `; + print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n"; + return 0; +} + +sub cluster{ + print "tags is ready to merged clusters\n\n"; + my ($file) =$bed; + if ($cluster_mothod eq "F") { + my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`; + print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n"; + } + elsif($cluster_mothod eq "T"){ + my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark`; + print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -k $sample_mark\n\n"; + } + else{print "\-p is wrong!\n\n";} + return 0; +} + + +sub quantify{ + print "clusters is ready to quantified\n\n"; + my @depth=@map_read; + pop @depth; + my $depth=join ",",@depth; + my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`; + print "perl $path\/quantify_siRNA.pl -i $read -d $depth -o $rpkm\n\n\n"; + return 0; +} + +sub phase{ + $annotate_dir=$dir."annotate\/"; + mkdir ("$annotate_dir"); + print "clusters is to predict phase siRNA\n"; + my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`; + print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n"; + return 0; +} + +sub class{ + print "clusters is ready to annotate by sources\n\n"; + my $nat=$options{'nat'}; + my $repeat=$options{'repeat'}; + my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`; + print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n"; +} + +sub annotate{ + print "clusters is ready to annotate by gff file\n\n"; + my $file; + if (defined($options{'nat'})&&defined($options{'repeat'})) { + $file="$annotate_dir\/sample_class.anno"; + } + else{ + $file=$rpkm; + } + my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`; + print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n"; + return 0; +} +sub get_genelist{ + + my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`; + print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt"; +} + +sub dec{ + print "deg reading\n\n"; + my $deg_file=$options{'deg'}; + open IN,"<$deg_file"; + my @deg; + my $s=0; + while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + $deg[$s]=$aline; + my @ea=split/\s+/,$aline; + push @pairdir,"$ea[0]_VS_$ea[1]\/"; + #print "$deg[$s]\n"; + $s++; + } + close IN; + $deg_dir=$dir."deg\/"; + mkdir ("$deg_dir"); + my $max_process = 10; + my $pm = new Parallel::ForkManager( $max_process ); + my $number=@deg-1; + foreach(0..$number){ + $pm->start and next; + &dec_pel($deg[$_]); + $pm->finish; + } + $pm->wait_all_children; +} + +sub dec_pel{ + print "\n******************\nstart:\n"; + Time(); + my $sample=shift(@_); + my @each=split/\s+/,$sample; + print "$each[0]\t$each[1]\n"; + my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/"; + mkdir ("$deg_sample_dir"); + print "read: $read\n"; + print "deg_sample_dir: $deg_sample_dir\n"; + print "$id{$each[0]}\t$each[0]\n"; + print "$id{$each[1]}\t$each[1]\n"; + my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2 + my $time2=time(); + print "end:\n*************************\n"; + Time(); + sleep 1; +} + +sub infor_merge{ + my ($input,$mark); + foreach (@pairdir) { + print "@pairdir\n"; + $mark.=" -mark $_ "; + $input.=" -i $dir/deg\/$_\/output_score\.txt "; + print "$input\n$mark\n"; + } + my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `; + print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n"; +} + +sub infor_merge_no_dec{ + my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`; +} + +sub genome_length{ + my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`; + print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n" + +} + +sub plot{ + $plot_dir="$dir\/plot\/"; + mkdir ("$plot_dir"); + my $span=defined($options{span})?$options{span}:50000; + my $cen=""; + if (defined $options{cen}) { + $cen="-cen $options{cen}"; + } + my $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `; + "print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \n"; + +} + +sub html{ + my $pathfile="$dir/path.txt"; + open PA,">$pathfile"; + print PA "$config\n"; + print PA "$preprocess\n"; + print PA "$dir"."rfam_match\n"; + print PA "$dir"."genome_match\n"; + print PA "$cluster_file\n"; + print PA "$annotate_dir\n"; + print PA "$plot_dir\n"; + if (defined($deg_dir)) { + print PA "$deg_dir\n"; + } + close PA; + my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day-$hour-$min-$sec"); +} +################################################################################# +sub read_config{ + open CON,"<$config"; + while (my $aline=) { + chomp $aline; + my @tmp=split/\t/,$aline; + push @filein,$tmp[0]; + push @mark,$tmp[1]; + #&check_rawdata($tmp[0]); + } + close CON; + if (@filein != @mark) { + #&printErr(); + die "Maybe config file have some wrong!!!\n"; + } +} diff -r f008ab2cadc6 -r 7b5a48b972e9 siRNA.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA.xml Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,122 @@ + + tool for plant siRNA analisis + + + SCRIPT_PATH + bowtie + R + fastx_toolkit + threads + Parallel-ForkManager + SVG + Boost-Graph + + + siRNA_pipeline.pl + ## Change this to accommodate the number of threads you have available. + -t \${GALAXY_SLOTS:-4} + + -path \$SCRIPT_PATH + + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + + ## Do or not annotate siRNAs by function + #if $params.function_anno == "yes": + -nat $params.nat -repeat $params.repeat + #end if + + ## Do or not DEG + #if $degseq.degseq_analysis == "yes" : + -deg $degseq.deg + #end if + + -i $reads -config $config -n $hits -format $format -g ${index_path}.fa -idx $index_path -f $gff -mis $mis -d $d -p $p -l $l -cen $cen -span $span > run.log + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r f008ab2cadc6 -r 7b5a48b972e9 siRNA_pipeline.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA_pipeline.pl Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,524 @@ +#!/usr/bin/perl -w +my $version=1.00; +use strict; +use warnings; +use Getopt::Long; +use Getopt::Std; +use threads; +use threads::shared; +use Parallel::ForkManager; +use lib '/leofs/biotrans/chentt/perl_module/'; +#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq +print " +##################################### +# # +# sRNA cluster # +# # +##################################### +"; +########################################################################################### +my $usage="$0 +Options: +-i input file# raw data file +-tag string #raw data sample name +-g genome file +-f gff file + +-o workdir file +-path script path +-t int, number of threads [1] +-format fastq, fq, fasta or fa +-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null +-mis int number of allowed mismatches when mapping reads to genome, default 0 +-rfam string, input file# rfam database file. +-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null + +-v int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment + +-a string, ADAPTER string. default is ATCTCGTATG. +-n int max hits number,default 25; used in genome alignment +-d int distance of tag to merged a cluster; default 100 +-p cluster method F :conventional default is F + T :NIBLES +-l int the length of the upstream and downstream,default 1000;used in position annotate + +-nat natural antisense transcripts file +-repeat repeat information file out of Repeatmasker +-deg file config of de sample +-cen centromere file input +-span plot span, default 50000 +"; + +my %options; +GetOptions(\%options,"i:s@","tag:s@","g=s","phed:i","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h"); +#print help if that option is used +if($options{h}){die $usage;} + +my @filein=@{$options{'i'}}; +my @mark=@{$options{'tag'}}; + +#my $config=$options{'i'}; +my $genome_fa=$options{'g'}; +my $gff=$options{'f'}; + + +########################################################################################## +my $predir=`pwd`; +chomp $predir; +my $workdir=defined($options{'o'}) ? $options{'o'}:$predir; + +my $path=$options{'path'}; + +my $t=defined($options{'t'})? $options{'t'}:1; #threads number + +my $mis=defined $options{'mis'} ? $options{'mis'}:0; + +my $mis_rfam=defined $options{'v'} ? $options{'v'}:0; + +my $hit=defined $options{'n'}?$options{'n'}:25; + +my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100; + +my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000; + +my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F"; + +my $format=$options{'format'}; +#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { +# die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; +#} + +my $adpter="ATCTCGTATG"; #adapter +if (defined $options{'a'}) {$adpter=$options{'a'};} + + +my $phred_qv=64; +if(defined $opts{'phred'}){$phred_qv=$opts{'phred'};} +my $sample_number; +my ($dir,$dir_tmp); +################################ MAIN ################################################## +print "\ncluster program start:"; +my $time=Time(); +make_dir_tmp(); + +my @clip; +my $mark; +my $sample_mark; + +my $config=$dir."/input_config"; +open CONFIG,">$config"; + for (my $i=0;$i<@filein;$i++) { + print CONFIG $filein[$i],"\t",$mark[$i],"\n"; + } +close CONFIG; +if (@filein != @mark) { + die "Maybe config file have some wrong!!!\n"; +} +$sample_number=@mark; +$mark=join "\t",@mark; +$sample_mark=join "\#",@mark; + + +#read_config(); + +trim_adapter_and_filter(); + +my $filter_out=$dir."preProcess\/"."collapse_reads_out.fa";## raw clean data +my $data2=$filter_out; ### mirbase not mapped reads +my $data3=$dir."\/rfam_match\/rfam_not_mapped\.fa"; ### rfam not mapped reads +my $bed=$dir."cluster\/"."sample\.bed"; +my $read=$dir."cluster\/"."sample_reads\.cluster"; +my $read_txt=$dir."cluster\/"."cluster\.txt"; +my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster"; +my $preprocess; +my $cluster_file; +my $annotate_dir; +my $deg_dir; +my $plot_dir; +my %id; +for (my $i=0;$i<@mark ;$i++) { + $id{$mark[$i]}=$i+4; +} + +print "\n######## tiandm test start ###########\n"; +print "\@mark: @mark\n\%id keys number:"; +print scalar keys %id; +print "\n"; +foreach my $kyess (keys %id){ + print $kyess," --> $id{$kyess}\n"; +} +print "\n######## tiandm test end ############\n"; +group_and_filter(); #collapse reads to tags + +rfam(); + +my @map_read; +my $map_tag=0; +genome(); + +bwt2bed(); + +cluster(); + +quantify(); + +phase(); + +if (defined($options{'nat'})&&defined($options{'repeat'})) { + class(); +} +else{ + get_genelist(); +} + +annotate(); + +genome_length(); + +plot(); + +my @pairdir; +if (defined($options{'deg'})) { + dec(); + infor_merge(); +} +else{infor_merge_no_dec()} +html(); +print "\ncluster program end:"; +Time(); +############################sub program################################################### +sub make_dir_tmp{ + + #make temporary directory + if(not -d "$workdir\/cluster_runs"){ + mkdir("$workdir\/cluster_runs"); + mkdir("$workdir\/cluster_runs\/ref\/"); + } + + $dir="$workdir\/cluster_runs\/"; + #print STDERR "mkdir $dir\n\n"; + return; +} + +#sub read_config{ +# open IN,"<$config"; +# while (my $aline=) { +# chomp $aline; +# my @tmp=split/\t/,$aline; +# push @filein,$tmp[0]; +# push @mark,$tmp[1]; +# } +# close IN; +# if (@filein != @mark) { +# die "Maybe config file have some wrong!!!\n"; +# } +# $sample_number=@mark; +# $mark=join "\t",@mark; +# $sample_mark=join "\#",@mark; +#} + + +sub trim_adapter_and_filter{ + my $time=time(); + $preprocess=$dir."preProcess/"; + mkdir $preprocess; + my $can_use_threads = eval 'use threads; 1'; + if ($can_use_threads) { + # Do processing using threads + my @filein1=@filein; my @mark1=@mark; + while (@filein1>0) { + my @thrs; my @res; + for (my $i=0;$i<$t ;$i++) { + last if(@filein1==0); + my $in=shift @filein1; + my $out=shift @mark1; + push @clip,$dir."preProcess\/$out\_clip\.fq"; + $thrs[$i]=threads->create(\&clips,$in,$out); + } + for (my $i=0;$i<@thrs;$i++) { + $res[$i]=$thrs[$i]->join(); + } + } + } + else { +# Do not processing using threads + for (my $i=0;$i<@filein ;$i++) { + my $in=$filein[$i]; + my $out=$mark[$i]; + push @clip,$dir."preProcess\/$out\_clip\.fq"; + &clips($in,$out); + } + } +} + +sub clips{ + my ($filein,$fileout)=@_; + my $adapter=$dir."preProcess\/$fileout\_clip\.fq"; + if($format eq "fq" || $format eq "fastq"){ + my $clip=`fastx_clipper -a $adpter -M 6 -Q $phred_qv -i $filein -o $adapter`; + } + if($format eq "fa" || $format eq "fasta"){ + my $clip=`fastx_clipper -a $adpter -M 6 -i $filein -o $adapter`; + } + #my $clean=$dir."preProcess\/$fileout\_clean.fq"; + #my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `; + return $fileout; +} + +sub group_and_filter{ + #my ($ins,$data)=@_; + my @ins=@clip; + my $str=""; + my $group_out_file=$dir."preProcess\/"."collapse_reads.fa"; + #print "$$ins[0]\t$$ins[0]\n"; + for (my $i=0;$i<@clip;$i++) { + $str .="-i $clip[$i] "; + #print "$$ins[$i]\n"; + } + my $group=`perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format`; + print "perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format\n\n"; + + my $l_out=$dir."preProcess\/"."collapse_reads_18-40.fa"; + my $tmpmark=join ",", @mark; + + my $length_f=`perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $tmpmark`; + print "perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $tmpmark\n\n"; + my $cout_f=`perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark`; + print "perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark\n\n"; + my $plot_l_D=`perl $path/Length_Distibution.pl -i $dir/preProcess/reads_length_distribution_after_count_filter.txt -o $dir/preProcess/length.html `; + print "perl $path\/Length_Distibution.pl -i $dir\/preProcess\/reads_length_distribution_after_count_filter.txt -o $dir\/preProcess\/length\.html\n\n"; + return 0; +} + +sub rfam{ + if (defined $options{'idx2'}) { + system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir -index $options{idx2}"); + }else{ + system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir"); + } + my $tag=join "\\;" ,@mark; + my $rfam_count=`perl $path\/count_rfam_express.pl -i $dir\/rfam_match\/rfam_mapped.bwt -tag $tag -o $dir\/rfam_match\/rfam_non-miRNA_annotation.txt`; + return 0; +} +sub genome{ + if(defined $options{'idx'}){ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ; + }else{ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ; + } + #=================== mapping sta =================================================== + my $map_file=$dir."genome_match\/genome_mapped\.fa"; + open (MAP,"<$map_file")||die"$!"; + print "\n#each sample mapping reads sta:\n\n"; + print "#$mark\ttotal\n"; + while (my $ID=) { + chomp $ID; + my @tmp=split/\:/,$ID; + my @exp=split/\_/,$tmp[1]; + $exp[-1] =~ s/^x//; + for (my $i=0;$i<@exp ;$i++) { + $map_read[$i]+=$exp[$i]; + } + $map_tag++; + my $seq=; + } + my $map_read=join"\t",@map_read; + print "$map_read\n\n"; + print "#total mapped tags:$map_read\n\n"; + close MAP; + return 0; +} + +sub bwt2bed{ + $cluster_file=$dir."cluster\/"; + mkdir ("$cluster_file"); + print "sam file changed to bed file\n"; + my ($file) = $dir."genome_match\/genome_mapped\.bwt"; + + my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `; + print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n"; + return 0; +} + +sub cluster{ + print "tags is ready to merged clusters\n\n"; + my ($file) =$bed; + if ($cluster_mothod eq "F") { + my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`; + print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n"; + } + elsif($cluster_mothod eq "T"){ + my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark`; + print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -k $sample_mark\n\n"; + } + else{print "\-p is wrong!\n\n";} + return 0; +} + + +sub quantify{ + print "clusters is ready to quantified\n\n"; + my @depth=@map_read; + pop @depth; + my $depth=join ",",@depth; + my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`; + print "perl $path\/quantify_siRNA.pl -i $read -d $depth -o $rpkm\n\n\n"; + return 0; +} + +sub phase{ + $annotate_dir=$dir."annotate\/"; + mkdir ("$annotate_dir"); + print "clusters is to predict phase siRNA\n"; + my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`; + print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n"; + return 0; +} + +sub class{ + print "clusters is ready to annotate by sources\n\n"; + my $nat=$options{'nat'}; + my $repeat=$options{'repeat'}; + my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`; + print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n"; +} + +sub annotate{ + print "clusters is ready to annotate by gff file\n\n"; + my $file; + if (defined($options{'nat'})&&defined($options{'repeat'})) { + $file="$annotate_dir\/sample_class.anno"; + } + else{ + $file=$rpkm; + } + my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`; + print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n"; + return 0; +} +sub get_genelist{ + + my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`; + print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt"; +} + +sub dec{ + print "deg reading\n\n"; + my $deg_file=$options{'deg'}; + open IN,"<$deg_file"; + my @deg; + my $s=0; + while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + $deg[$s]=$aline; + my @ea=split/\s+/,$aline; + push @pairdir,"$ea[0]_VS_$ea[1]\/"; + #print "$deg[$s]\n"; + $s++; + } + close IN; + $deg_dir=$dir."deg\/"; + mkdir ("$deg_dir"); + my $max_process = 10; + my $pm = new Parallel::ForkManager( $max_process ); + my $number=@deg-1; + foreach(0..$number){ + $pm->start and next; + &dec_pel($deg[$_]); + $pm->finish; + } + $pm->wait_all_children; +} + +sub dec_pel{ + print "\n******************\nstart:\n"; + Time(); + my $sample=shift(@_); + my @each=split/\s+/,$sample; + print "$each[0]\t$each[1]\n"; + my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/"; + mkdir ("$deg_sample_dir"); + print "read: $read\n"; + print "deg_sample_dir: $deg_sample_dir\n"; + print "$id{$each[0]}\t$each[0]\n"; + print "$id{$each[1]}\t$each[1]\n"; + my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2 + my $time2=time(); + print "end:\n*************************\n"; + Time(); + sleep 1; +} + +sub infor_merge{ + my ($input,$mark); + foreach (@pairdir) { + print "@pairdir\n"; + $mark.=" -mark $_ "; + $input.=" -i $dir/deg\/$_\/output_score\.txt "; + print "$input\n$mark\n"; + } + my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `; + print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n"; +} + +sub infor_merge_no_dec{ + my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`; +} + +sub genome_length{ + my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`; + print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n" + +} + +sub plot{ + $plot_dir="$dir\/plot\/"; + mkdir ("$plot_dir"); + my $span=defined($options{span})?$options{span}:50000; + my $cen=""; + if (defined $options{cen}) { + $cen="-cen $options{cen}"; + } + my $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `; + "print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \n"; + +} + +sub html{ + my $pathfile="$dir/path.txt"; + open PA,">$pathfile"; + print PA "$config\n"; + print PA "$preprocess\n"; + print PA "$dir"."rfam_match\n"; + print PA "$dir"."genome_match\n"; + print PA "$cluster_file\n"; + print PA "$annotate_dir\n"; + print PA "$plot_dir\n"; + if (defined($deg_dir)) { + print PA "$deg_dir\n"; + } + close PA; + my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day-$hour-$min-$sec"); +} +################################################################################# diff -r f008ab2cadc6 -r 7b5a48b972e9 siRNA_pipeline.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA_pipeline.xml Fri Dec 05 00:11:02 2014 -0500 @@ -0,0 +1,164 @@ + + tool for plant siRNA analisis + + + SCRIPT_PATH + bowtie + R + fastx_toolkit + threads + Parallel-ForkManager + SVG + Boost-Graph + + + siRNA_pipeline.pl + ## Change this to accommodate the number of threads you have available. + -t \${GALAXY_SLOTS:-4} + + -path \$SCRIPT_PATH + + #for $j, $s in enumerate( $series ) + ##rank_of_series=$j + -i ${s.input} + -tag ${s.tag} + #end for + + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + + ## prepare Rfam bowtie index + #set rfam_index_path = '' + #if str($reference_rfam.source) == "history": + bowtie-build "$reference_rfam.own_file" rfam; ln -s $reference_rfam.own_file" rfam.fa; + #set rfam_index_path = 'rfam' + #else: + #set rfam_index_path = $reference_rfam.index.fields.path + #end if + + + + ## Do or not annotate siRNAs by function + #if $params.function_anno == "yes": + -nat $params.nat -repeat $params.repeat + #end if + + ## Do or not DEG + #if $degseq.degseq_analysis == "yes" : + -deg $degseq.deg + #end if + + -format $format -phred $phred -g ${index_path}.fa -idx $index_path -f $gff -mis $mis -rfam ${rfam_index_path}.fa -idx2 $rfam_index_path -v $v -a $a -n $mapnt -d $d -p $p -l $l -cen $cen -span $span > run.log + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r f008ab2cadc6 -r 7b5a48b972e9 tool_dependencies.xml --- a/tool_dependencies.xml Wed Dec 03 02:03:27 2014 -0500 +++ b/tool_dependencies.xml Fri Dec 05 00:11:02 2014 -0500 @@ -26,6 +26,76 @@ + + + + + + + + + + + R CMD BATCH $REPOSITORY_INSTALL_DIR/install_DEG.R + echo "export PATH=$PATH" > $INSTALL_DIR/env.sh + chmod 755 $INSTALL_DIR/env.sh + + + + + + + + + http://www.cpan.org/authors/id/J/JD/JDHEDDEN/threads-1.96.tar.gz + $INSTALL_DIR/lib/perl5 + + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + make && + make install + + + $INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/ + + + + + + + + + + http://www.cpan.org/authors/id/S/SZ/SZABGAB/Parallel-ForkManager-1.06.tar.gz + $INSTALL_DIR/lib/perl5 + + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + make && + make install + + + $INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/ + + + + + + + + + http://www.cpan.org/authors/id/D/DU/DUFFEE/Boost-Graph-1.4_001.tar.gz + $INSTALL_DIR/lib/perl5 + + perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && + ls && + make && + make install + + + $INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/ + + + +