diff csem_wrapper.pl @ 3:3ce7ee6f43a7

Uploaded
author dongjun
date Mon, 12 Sep 2011 10:01:12 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/csem_wrapper.pl	Mon Sep 12 10:01:12 2011 -0400
@@ -0,0 +1,192 @@
+# Wrapper for CSEM with Bowtie
+# Written by Dongjun Chung, Sep. 8, 2011
+
+#!/usr/bin/env perl;
+#use warnings;
+#use strict;
+use File::Temp qw/tempfile/;
+use File::Temp qw/tmpnam/;
+
+# parse command arguments
+
+die "Usage: perl csem_wrapper.pl [infile_name] [infile_format] [outfile_name] [outfile_format] [ref_genome] [pseudo_tags] [n_mismatch] [maxpos] [window_size] [n_iter] [n_core]" unless @ARGV == 11;
+
+my ( $infile_name, $infile_format, $outfile_name, $outfile_format, $ref_genome, $pseudo_tags, $n_mismatch, $maxpos, $window_size, $n_iter, $n_core ) = @ARGV;
+
+# construct ref genome file (adapted from "genRef.pl")
+
+open ( IN, "bowtie-inspect -s $ref_genome |" ) or die "Cannot run bowtie-inspect!\n";
+
+my $line;
+
+my $size = 0;
+my (@names, @lens) = ();
+	# $size: # of chromosomes
+	# @lens: chromosome size
+	# @names: chromosome name
+
+for (my $i = 0; $i < 3; $i++) {
+    # skip unnecessary lines
+    $line = <IN>;
+}
+
+while ( $line = <IN> ) {
+    ++$size;
+    chomp($line);
+    my ($seqn, $name, $len) = split(/[ \t]+/, $line);
+    push(@names, $name);
+    push(@lens, $len);
+}
+close(IN);
+
+my ($fh, $temp_reffile) = tempfile();
+print $fh "$size\n";
+print $fh "@lens\n";
+print $fh "@names\n";
+close($fh);
+
+# extract read length from FASTA/FASTQ files
+
+open( IN, $infile_name ) or die "Cannot open tag file!\n";
+
+$line = <IN>;	
+if ( $infile_format eq "fasta" ) {
+	while ( $line =~ /^>/ ) {
+		$line = <IN>;	
+	}
+} elsif ( $infile_format eq "fastq" ) {
+	while ( $line =~ /^@/ ) {
+		$line = <IN>;	
+	}
+} else {
+	print "Inappropriate aligned read file format!\n";
+	exit 1;
+}
+chomp($line);
+my $read_length = length $line;
+
+close( IN );
+
+# extract read ID
+
+open( IN, $infile_name ) or die "Cannot open tag file!\n";
+
+my @readID = ();	
+if ( $infile_format eq "fasta" ) {
+	foreach $line (<IN>) { 
+		chomp($line);
+		if ( $line =~ /^>(\S+)/ ) {
+			push @readID, $1;
+		}	
+	}
+} elsif ( $infile_format eq "fastq" ) {
+	foreach $line (<IN>) {
+		chomp($line);
+		if ( $line =~ /^@(\S+)/ ) {
+			push @readID, $1;
+		}
+	}
+} else {
+	print "Inappropriate aligned read file format!\n";
+	exit 1;
+}
+
+close( IN );
+
+# run bowtie & csem
+
+my $outfile_temp = tmpnam();
+
+if ( $infile_format eq "fasta" ) {
+	system( "bowtie -f -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
+} elsif ( $infile_format eq "fastq" ) {
+	system( "bowtie -q -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
+} else {
+	print "Inappropriate aligned read file format!\n";
+	exit 1;
+}
+
+# post-process chromosome & position 
+
+open( IN, $outfile_temp ) or die "Cannot open csem file!\n";
+open( OUT, ">", $outfile_name ) or die "Cannot open output file!\n";
+	
+foreach $line (<IN>) {
+	chomp($line);
+	my @element = split( /\s/, $line );
+		# assume columns are separated by some white space
+		
+	# check for invalid line: may cause error in exporting step
+	
+	if ( scalar(@element)<5 ) {
+		next;
+	}
+	
+	# post-process lines
+	
+	my ($id, $chr, $str, $loc, $prob) = @element;
+	if ( $outfile_format ne "bed" ) {
+		# first base is 0 in bowtie or BED
+		# first base is 1 in table or GFF
+		$loc++;
+	}
+	my $chrname = $names[$chr];	# translate chromosome
+	
+	# write down processed lines
+	# - generate pseudo-tags, if necessary (adapted from "round_tag_to_integer.pl")
+	
+	if ( $pseudo_tags eq "Y" ) {	
+		# if we want to generate pseudo-tags,
+		# then threshold prob at 0.5 & round prob to integer (i.e., set to one)
+		# (exclude prob = 0.5 as well in order to avoid a read appears more than once)
+		
+		if ( $prob <= 0.5 ) {
+			next;
+		} else {
+			$prob = 1;
+		}
+	}
+			
+	# write down results
+	
+	my $start;
+	my $end;
+	my $score;
+	
+	my $id_final = $readID[$id];
+	#my $id_final = $id;
+	
+	if ( $outfile_format eq "table" ) {
+		print OUT "$id_final\t$chrname $str $loc $prob\n";
+	} elsif ( $outfile_format eq "bed" ) {
+		# BED 
+		# - name: read ID
+		# - score = prob * 1000
+		
+		$start = $loc;
+		$end = $start + $read_length - 1;
+		my $name = $id_final;
+		$score = $prob * 1000;
+		
+		print OUT "$chrname\t$start\t$end\t$name\t$score\t$str\n";
+	} elsif ( $outfile_format eq "gff" ) {
+		# GFF
+		# - source: "CSEM"
+		# - feature: read ID
+		# - score = prob * 1000
+		 
+		$start = $loc;
+		$end = $start + $read_length - 1;
+		my $source = "CSEM";
+		my $feature = $id_final;
+		$score = $prob * 1000;
+		
+		print OUT "$chrname\t$source\t$feature\t$start\t$end\t$score\t$str\t.\t.\n";	
+	} else {
+	print "Inappropriate output file format!\n";
+	exit 1;
+}
+}
+
+close( IN );
+close( OUT );