# HG changeset patch # User big-tiandm # Date 1406279893 14400 # Node ID 4ad3cbe96ded9b96381a0bed73f506aabcf3cb20 # Parent c97dcf05b5d100b061e2da95f70f6fe6591bf86a Deleted selected files diff -r c97dcf05b5d1 -r 4ad3cbe96ded collapseReads2Tags.pl --- a/collapseReads2Tags.pl Fri Jul 25 05:17:20 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,170 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2014-3-20 -#Modified: -#Description: fastq file form reads cluster(the same sequence in the same cluster) -my $version=1.00; - -use strict; -use Getopt::Long; - -my %opts; -GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h"); -if (!(defined $opts{o} and defined $opts{'format'}) || defined $opts{h}) { #necessary arguments -&usage; -} -my @filein=@{$opts{i}} if(defined $opts{i}); -my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq"; -my $fileout=$opts{'o'}; -my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33; -my %hash;##分块存放原始序列 - -my $format=$opts{'format'}; -if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { - die "Parameter -format is error!\n"; -} - -my ($qualT,$qualV); -if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) { #quality filter - my @temp=split /:/,$opts{'qual'}; - $qualT=$temp[0]; - $qualV=$temp[1]; - - for (my $i=0;$i<@filein;$i++) { - open IN,"<$filein[$i]"; - while (my $aline=) { - my $seq=; - my $n=; - my $qv=; - my $tag=&qvcheck($qv,$qualT,$qualV); - next if(!$tag); - my $str=substr($seq,0,6); - $hash{$str}[$i].=$seq; - } - close IN; - } -} -elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads - for (my $i=0;$i<@filein;$i++) { - open IN,"<$filein[$i]"; - while (my $aline=) { - my $seq=; - my $n=; - my $qv=; - my $str=substr($seq,0,6); - $hash{$str}[$i].=$seq; - } - close IN; - } - -} -elsif($format eq "fasta" || $format eq "fa"){ - for (my $i=0;$i<@filein;$i++) { - open IN,"<$filein[$i]"; - while (my $aline=) { - my $seq=; - my $str=substr($seq,0,6); - $hash{$str}[$i].=$seq; - } - close IN; - } -} - -open OUT,">$fileout"; #output file -my $count=0; -foreach my $key (keys %hash) { - my %cluster; - for (my $i=0;$i<@filein;$i++) { - next if(!(defined $hash{$key}[$i])); - my @tmp=split/\n/,$hash{$key}[$i]; - foreach (@tmp) { - $cluster{$_}[$i]++; - } - } - - foreach my $seq (keys %cluster) { - my $exp=""; my $ee=0; - for (my $i=0;$i<@filein;$i++) { - if (defined $cluster{$seq}[$i]) { - $exp.="_$cluster{$seq}[$i]"; - $ee+=$cluster{$seq}[$i]; - }else{ - $exp.="_0"; - } - } - $count+=$ee; - $exp=~s/^_//; - print OUT ">$name","_$count:$exp","_x$ee\n$seq\n"; - } -} -close OUT; - - -sub qvcheck{ - my ($str,$t,$v)=@_; - my $qv=0; - if($t eq "mean"){ - $qv=&getMeanQuality($str); - } - elsif($t eq "min"){ - $qv=&getMinQuality($str); - } - if ($qv<$v) { - return 0; - } - return 1; -} - -sub getMeanQuality(){ - chomp $_[0]; - my @bases = split(//,$_[0]); - my $sum = 0; - for(my $i = 0; $i <= $#bases; $i++){ - my $num = ord($bases[$i]) - $pq; - $sum += $num; - } - - return $sum/($#bases+1); - -} - -### -### This function gives back the Q-value of the worst base -sub getMinQuality(){ - chomp $_[0]; - my @bases = split(//,$_[0]); - my $worst = 1000; - for(my $i = 0; $i <= $#bases; $i++){ -# printf ("base: $bases[$i] --> %d\n",ord($bases[$i])); - my $num = ord($bases[$i]) - $pq; - if($num < $worst){ - $worst = $num; - } - } - return $worst; -} - -sub usage{ -print <<"USAGE"; -Version $version -Usage: -$0 -i -format -mark -qual -qv -o -options: --i input file#fastq file ##can be multiple -i file1 -i file2 ... --mark string#quary name,default is "seq" --o output file --format string # fastq|fasta|fq|fa - --qual #reads filter - eg:(min:value/mean:value) - This parameter just for solexa reads. - If the input files are solid and needs filter,please do filter first . - --qv integer #Phred quality64/33,default 33 --h help -USAGE -exit(1); -} - diff -r c97dcf05b5d1 -r 4ad3cbe96ded convert_bowtie_to_blast.pl --- a/convert_bowtie_to_blast.pl Fri Jul 25 05:17:20 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -#!/usr/bin/perl - - -use warnings; -use strict; -use Getopt::Std; - -######################################### USAGE ################################ - -my $usage= -"$0 file_bowtie_result file_solexa_seq file_chromosome - -This is a converter which changes Bowtie output into Blast format. -The input includes three files: a Bowtie result file (default Bowtie -output file), a fasta file consisting of small Reads and a chromosome -fasta file. It outputs the alignments in blast_parsed format. - -file_bowtie_result likes: - -AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0 -AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0 - -file_solexa_seq likes: - ->AtFlower100010_x2 -AAGGAGATTCTTTCAGTCCAG - -file_chromosome contains chromosome seq in fasta format - -"; - - -####################################### INPUT FILES ############################ - -my $file_bowtie_result=shift or die $usage; -my $file_short_seq=shift or die $usage; -my $file_chromosome_seq=shift or die $usage; - - -##################################### GLOBAL VARIBALES ######################### - -my %short_seq_length=(); -my %chromosome_length=(); - - -######################################### MAIN ################################# - -#get the short sequence id and its length -sequence_length($file_short_seq,\%short_seq_length); - -#get the chromosome sequence id and its length -sequence_length($file_chromosome_seq,\%chromosome_length); - -#convert bowtie result format to blast format; -change_format($file_bowtie_result); - -exit; - - -##################################### SUBROUTINES ############################## - -sub sequence_length{ - my ($file,$hash) = @_; - my ($id, $desc, $sequence, $seq_length) = (); - - open (FASTA, "<$file") or die "can not open $$file\n"; - while () - { - chomp; - if (/^>(\S+)(.*)/) - { - $id = $1; - $desc = $2; - $sequence = ""; - while (){ - chomp; - if (/^>(\S+)(.*)/){ - $$hash{$id} = length $sequence; - $id = $1; - $desc = $2; - $sequence = ""; - next; - } - $sequence .= $_; - } - } - } - $seq_length=length($sequence); - $$hash{$id} = $seq_length; - close FASTA; -} - - - - - -sub change_format{ - #Change Bowtie format into blast format - my $file=shift @_; - open(FILE,"<$file")||die"can not open the bowtie result file:$!\n"; - #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n"; - - while(){ - chomp; - my @tmp=split("\t",$_); - #Clean the reads ID - my @tmp1=split(" ",$tmp[0]); - print "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t"; - if($tmp[1] eq "+"){ - my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]}; - my $seq_bg=$tmp[3] + 1; - print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n"; - } - if($tmp[1] eq "-"){ - my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3]; - my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1; - print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n"; - } - } - -# close BLASTOUT; - -} - - -