Mercurial > repos > big-tiandm > mirplant2
view collapseReads2Tags.pl @ 26:c90e7d84d521 draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 31 Jul 2014 03:03:37 -0400 |
parents | ea2fdf667620 |
children |
line wrap: on
line source
#!/usr/bin/perl -w #Filename: #Author: Tian Dongmei #Email: tiandm@big.ac.cn #Date: 2014-3-20 #Modified: #Description: fastq file form reads cluster(the same sequence in the same cluster) my $version=1.00; use strict; use Getopt::Long; my %opts; GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h"); if (!(defined $opts{o} and defined $opts{'format'}) || defined $opts{h}) { #necessary arguments &usage; } my @filein=@{$opts{i}} if(defined $opts{i}); my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq"; my $fileout=$opts{'o'}; my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33; my %hash;##分块存放原始序列 my $format=$opts{'format'}; if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { die "Parameter -format is error!\n"; } my ($qualT,$qualV); if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) { #quality filter my @temp=split /:/,$opts{'qual'}; $qualT=$temp[0]; $qualV=$temp[1]; for (my $i=0;$i<@filein;$i++) { open IN,"<$filein[$i]"; while (my $aline=<IN>) { my $seq=<IN>; my $n=<IN>; my $qv=<IN>; my $tag=&qvcheck($qv,$qualT,$qualV); next if(!$tag); my $str=substr($seq,0,6); $hash{$str}[$i].=$seq; } close IN; } } elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads for (my $i=0;$i<@filein;$i++) { open IN,"<$filein[$i]"; while (my $aline=<IN>) { my $seq=<IN>; my $n=<IN>; my $qv=<IN>; my $str=substr($seq,0,6); $hash{$str}[$i].=$seq; } close IN; } } elsif($format eq "fasta" || $format eq "fa"){ for (my $i=0;$i<@filein;$i++) { open IN,"<$filein[$i]"; while (my $aline=<IN>) { my $seq=<IN>; my $str=substr($seq,0,6); $hash{$str}[$i].=$seq; } close IN; } } open OUT,">$fileout"; #output file my $count=0; foreach my $key (keys %hash) { my %cluster; for (my $i=0;$i<@filein;$i++) { next if(!(defined $hash{$key}[$i])); my @tmp=split/\n/,$hash{$key}[$i]; foreach (@tmp) { $cluster{$_}[$i]++; } } foreach my $seq (keys %cluster) { my $exp=""; my $ee=0; for (my $i=0;$i<@filein;$i++) { if (defined $cluster{$seq}[$i]) { $exp.="_$cluster{$seq}[$i]"; $ee+=$cluster{$seq}[$i]; }else{ $exp.="_0"; } } $count+=$ee; $exp=~s/^_//; print OUT ">$name","_$count:$exp","_x$ee\n$seq\n"; } } close OUT; sub qvcheck{ my ($str,$t,$v)=@_; my $qv=0; if($t eq "mean"){ $qv=&getMeanQuality($str); } elsif($t eq "min"){ $qv=&getMinQuality($str); } if ($qv<$v) { return 0; } return 1; } sub getMeanQuality(){ chomp $_[0]; my @bases = split(//,$_[0]); my $sum = 0; for(my $i = 0; $i <= $#bases; $i++){ my $num = ord($bases[$i]) - $pq; $sum += $num; } return $sum/($#bases+1); } ### ### This function gives back the Q-value of the worst base sub getMinQuality(){ chomp $_[0]; my @bases = split(//,$_[0]); my $worst = 1000; for(my $i = 0; $i <= $#bases; $i++){ # printf ("base: $bases[$i] --> %d\n",ord($bases[$i])); my $num = ord($bases[$i]) - $pq; if($num < $worst){ $worst = $num; } } return $worst; } sub usage{ print <<"USAGE"; Version $version Usage: $0 -i -format -mark -qual -qv -o options: -i input file#fastq file ##can be multiple -i file1 -i file2 ... -mark string#quary name,default is "seq" -o output file -format string # fastq|fasta|fq|fa -qual #reads filter eg:(min:value/mean:value) This parameter just for solexa reads. If the input files are solid and needs filter,please do filter first . -qv integer #Phred quality64/33,default 33 -h help USAGE exit(1); }