view tools/human_genome_variation/hilbertvis.sh @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
line wrap: on
line source

#!/usr/bin/env bash

input_file="$1"
output_file="$2"
chromInfo_file="$3"
chrom="$4"
score_col="$5"
hilbert_curve_level="$6"
summarization_mode="$7"
chrom_col="$8"
start_col="$9"
end_col="${10}"
strand_col="${11}"

## use first sequence if chrom filed is empty
if [ -z "$chrom" ]; then
    chrom=$( head -n 1 "$input_file" | cut -f$chrom_col )
fi

## get sequence length 
if [ ! -r "$chromInfo_file" ]; then
    echo "Unable to read chromInfo_file $chromInfo_file" 1>&2
    exit 1
fi

chrom_len=$( awk '$1 == chrom {print $2}' chrom=$chrom $chromInfo_file )

## error if we can't find the chrom_len
if [ -z "$chrom_len" ]; then
    echo "Can't find length for sequence \"$chrom\" in chromInfo_file $chromInfo_file" 1>&2
    exit 1
fi

## make sure chrom_len is positive
if [ $chrom_len -le 0 ]; then
    echo "sequence \"$chrom\" length $chrom_len <= 0" 1>&2
    exit 1
fi

## modify R script depending on the inclusion of a score column, strand information
input_cols="\$${start_col}, \$${end_col}"
col_types='beg=0, end=0, strand=""'

# if strand_col == 0 (strandCol metadata is not set), assume everything's on the plus strand
if [ $strand_col -ne 0 ]; then
    input_cols="${input_cols}, \$${strand_col}"
else
    input_cols="${input_cols}, \\\"+\\\""
fi

# set plot value (either from data or use a constant value)
if [ $score_col -eq -1 ]; then
    value=1
else
    input_cols="${input_cols}, \$${score_col}"
    col_types="${col_types}, score=0"
    value='chunk$score[i]'
fi

R --vanilla &> /dev/null <<endOfCode
library(HilbertVis);

chrom_len <- ${chrom_len};
chunk_size <- 1000;
interval_count <- 0;
invalid_strand <- 0;

awk_cmd <- paste(
  "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}",
    "\$${chrom_col} == \"${chrom}\"",
      "{print ${input_cols}}' ${input_file}"
);

col_types <- list(${col_types});
vec <- vector(mode="numeric", length=chrom_len);
conn <- pipe(description=awk_cmd, open="r");

repeat {
  chunk <- scan(file=conn, what=col_types, sep="\t", nlines=chunk_size, quiet=TRUE);

  if ((rows <- length(chunk\$beg)) == 0)
        break;

  interval_count <- interval_count + rows;

  for (i in 1:rows) {
    if (chunk\$strand[i] == '+') {
      start <- chunk\$beg[i] + 1;
      stop <- chunk\$end[i];
    } else if (chunk\$strand[i] == '-') {
      start <- chrom_len - chunk\$end[i] - 1;
      stop <- chrom_len - chunk\$beg[i];
    } else {
      invalid_strand <- invalid_strand + 1;
      interval_count <- interval_count - 1;
      next;
    }
    vec[start:stop] <- ${value};
  }
}

close(conn);

hMat <- hilbertImage(vec, level=$hilbert_curve_level, mode="$summarization_mode");
pdf(file="$output_file", onefile=TRUE, width=8, height=10.5, paper="letter");
showHilbertImage(hMat);
dev.off();
endOfCode