view collapseReads2Tags.pl @ 16:6a878837c24a draft

Uploaded
author big-tiandm
date Fri, 25 Jul 2014 05:21:49 -0400
parents ea2fdf667620
children
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-3-20
#Modified:
#Description: fastq file form reads cluster(the same sequence in the same cluster)
my $version=1.00;

use strict;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i:s@","format=s","mark:s","qual:s","qv:i","o=s","h");
if (!(defined $opts{o} and defined $opts{'format'})  || defined $opts{h}) { #necessary arguments
&usage;
}
my @filein=@{$opts{i}} if(defined $opts{i});
my $name=defined $opts{'mark'} ? $opts{'mark'} : "seq";
my $fileout=$opts{'o'};
my $pq=defined $opts{'qv'} ? $opts{'qv'} : 33;
my %hash;##分块存放原始序列

my $format=$opts{'format'};
if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
	die "Parameter -format is error!\n";
}

my ($qualT,$qualV);
if (defined $opts{'qual'} && ($format eq "fastq" || $format eq "fq")) {  #quality filter
	my @temp=split /:/,$opts{'qual'};
	$qualT=$temp[0];
	$qualV=$temp[1];

	for (my $i=0;$i<@filein;$i++) {
		open IN,"<$filein[$i]";
		while (my $aline=<IN>) {
			my $seq=<IN>;
			my $n=<IN>;
			my $qv=<IN>;
			my $tag=&qvcheck($qv,$qualT,$qualV);
			next if(!$tag);
			my $str=substr($seq,0,6);
			$hash{$str}[$i].=$seq;
		}
		close IN;
	}
}
elsif($format eq "fastq" || $format eq "fq"){ ### do not filter low quality reads
	for (my $i=0;$i<@filein;$i++) {
		open IN,"<$filein[$i]";
		while (my $aline=<IN>) {
			my $seq=<IN>;
			my $n=<IN>;
			my $qv=<IN>;
			my $str=substr($seq,0,6);
			$hash{$str}[$i].=$seq;
		}
		close IN;
	}

}
elsif($format eq "fasta" || $format eq "fa"){
	for (my $i=0;$i<@filein;$i++) {
		open IN,"<$filein[$i]";
		while (my $aline=<IN>) {
			my $seq=<IN>;
			my $str=substr($seq,0,6);
			$hash{$str}[$i].=$seq;
		}
		close IN;
	}
}

open OUT,">$fileout"; #output file  
my $count=0;
foreach my $key (keys %hash) {
	my %cluster;
	for (my $i=0;$i<@filein;$i++) {
		next if(!(defined $hash{$key}[$i]));
		my @tmp=split/\n/,$hash{$key}[$i];
		foreach  (@tmp) {
			$cluster{$_}[$i]++;
		}
	}
	
	foreach my $seq (keys %cluster) {
		my $exp=""; my $ee=0;
		for (my $i=0;$i<@filein;$i++) {
			if (defined $cluster{$seq}[$i]) {
				$exp.="_$cluster{$seq}[$i]";
				$ee+=$cluster{$seq}[$i];
			}else{
				$exp.="_0";
			}
		}
		$count+=$ee;
		$exp=~s/^_//;
		print OUT ">$name","_$count:$exp","_x$ee\n$seq\n";
	}
}
close OUT;


sub qvcheck{
	my ($str,$t,$v)=@_;
	my $qv=0;
	if($t eq "mean"){ 
		$qv=&getMeanQuality($str);
	}
	elsif($t eq "min"){
		$qv=&getMinQuality($str);
	}
	if ($qv<$v) {
		return 0;
	}
	return 1;
}

sub getMeanQuality(){
	chomp $_[0];
	my @bases = split(//,$_[0]);
	my $sum = 0;
	for(my $i = 0; $i <= $#bases; $i++){
		my $num = ord($bases[$i]) - $pq;
		$sum += $num;
	}
	
	return $sum/($#bases+1);
	
}

###
### This function gives back the Q-value of the worst base
sub getMinQuality(){
	chomp $_[0];
	my @bases = split(//,$_[0]);
	my $worst = 1000;
	for(my $i = 0; $i <= $#bases; $i++){
#		printf ("base: $bases[$i]  --> %d\n",ord($bases[$i]));
		my $num = ord($bases[$i]) - $pq;
		if($num < $worst){
			$worst = $num;
		}
	}
	return $worst;
}

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -format -mark -qual -qv -o 
options:
-i input file#fastq file ##can be multiple -i file1 -i file2 ...
-mark string#quary name,default is "seq"
-o output file
-format string # fastq|fasta|fq|fa

-qual #reads filter
   eg:(min:value/mean:value)
   This parameter just for solexa reads.
   If the input files are solid and needs filter,please do filter first .
   
-qv  integer #Phred quality64/33,default 33
-h help
USAGE
exit(1);
}