view imgt_concatenate.sh @ 2:d77d4700fd0a draft

Uploaded
author davidvanzessen
date Tue, 27 Dec 2016 10:11:21 -0500
parents b360a373835f
children
line wrap: on
line source

#!/bin/bash
#
# Merge several IMGT archives into a single XZ-compressed tarball.
#
# Usage: imgt_concatenate.sh OUTPUT ARCHIVE1 ID1 [ARCHIVE2 ID2 ...]
#   OUTPUT    path of the tarball that is written at the end of the run
#   ARCHIVEn  IMGT archive to merge
#   IDn       suffix appended to the second column of every data row
#             taken from ARCHIVEn
#
# All intermediate directories are created below the current directory.

# Directory that contains this script.
dir="$(cd "$(dirname "$0")" && pwd)"

# First argument is the output path, everything after it is the
# ARCHIVE/ID list.
args=("$@")
output=$1
inputs=("${args[@]:1}")

workdir="$PWD"

echo "Output: $output"
# [*] joins the array into one word, which is what a log line wants.
echo "Inputs: ${inputs[*]}"

# -p: do not complain when the directory is already there.
mkdir -p "$workdir/output"

#######################################
# Unpack an IMGT archive into a directory.
# The archive format is detected from the description printed by file(1):
# zip archives are extracted with unzip, XZ-compressed data with tar.
# Arguments:
#   $1 - path of the archive
#   $2 - directory to extract into (created when missing)
# Outputs:
#   Echoes the tar command line for XZ archives; writes a warning to
#   stderr when the archive type is not recognised.
# Returns:
#   Exit status of mkdir/unzip/tar, or 1 for an unrecognised archive type.
#######################################
function imgt_unpack {
	local imgt_zip=$1
	local outdir=$2
	local type

	mkdir -p "$outdir" || return

	# -b drops the file name from the description, so a path that happens
	# to contain "Zip archive" cannot influence the detection below.
	type="$(file -b "$imgt_zip")"
	if [[ "$type" == *"Zip archive"* ]] ; then
		unzip "$imgt_zip" -d "$outdir"
	elif [[ "$type" == *"XZ compressed data"* ]] ; then
		echo "tar -xJf $imgt_zip -C $outdir"
		tar -xJf "$imgt_zip" -C "$outdir"
	else
		echo "imgt_unpack: unrecognised archive type for $imgt_zip: $type" >&2
		return 1
	fi
}

#######################################
# Append the ten numbered IMGT tables found below a directory to the
# matching tables in an output directory, tagging each data row.
# For every table N the files named "N_*" below the input directory are
# concatenated, cut to start at the requested line, and appended to the
# fixed output file name for that table. The id is appended to the
# second tab-separated column of each data row.
# Arguments:
#   $1 - directory holding the unpacked IMGT files
#   $2 - directory holding the merged tables (must already exist)
#   $3 - first line to copy: 1 keeps the header row (which is not
#        tagged), 2 skips it
#   $4 - id appended to column 2 of every data row
# Outputs:
#   Appends to the ten table files in $2.
#######################################
function concat_imgt_files {
	local indir=$1
	local outdir=$2
	local start_line=$3 #line # to start at, 2 to skip header
	local id=$4
	local first_tagged_row=1
	local outfile
	local -a outfiles=(
		"1_Summary.txt"
		"2_IMGT-gapped-nt-sequences.txt"
		"3_Nt-sequences.txt"
		"4_IMGT-gapped-AA-sequences.txt"
		"5_AA-sequences.txt"
		"6_Junction.txt"
		"7_V-REGION-mutation-and-AA-change-table.txt"
		"8_V-REGION-nt-mutation-statistics.txt"
		"9_V-REGION-AA-change-statistics.txt"
		"10_V-REGION-mutation-hotspots.txt"
	)

	# When the header row is kept it is row 1 of the stream and must be
	# left untouched; otherwise every row that reaches awk is a data row.
	if [[ "${start_line}" == "1" ]] ; then
		first_tagged_row=2
	fi

	for outfile in "${outfiles[@]}"; do
		# "${outfile%%_*}" is the table number, so the pattern is "N_*".
		# -exec cat {} + does not run cat at all when nothing matches, so
		# a missing table gives an empty stream instead of cat waiting on
		# stdin; it also keeps paths with spaces intact.
		find "$indir/" -name "${outfile%%_*}_*" -exec cat {} + \
			| tail -n "+${start_line}" \
			| awk -F $'\t' -v id="$id" -v first="$first_tagged_row" \
				'BEGIN {OFS = FS} { if (NR >= first) { $2 = $2 id } print }' \
			>> "$outdir/$outfile"
	done
}

# The first archive is copied from line 1, so its header rows become the
# header rows of the merged tables.
echo "Unpacking IMGT file 1.."
imgt_unpack "${inputs[0]}" "$workdir/input1"

echo "Concatenating IMGT file 1..."
id="${inputs[1]}"
concat_imgt_files "$workdir/input1" "$workdir/output" 1 "$id"

# Everything after the first ARCHIVE/ID pair; consumed two at a time.
remaining_inputs=("${inputs[@]:2}")

file_number=2
i=0
while (( i < ${#remaining_inputs[@]} )); do
	input="${remaining_inputs[i]}"
	id="${remaining_inputs[i+1]}"

	echo "Unpacking IMGT file $file_number.."
	current_dir="$workdir/input${file_number}"
	imgt_unpack "${input}" "${current_dir}"
	# Log the archive number here (the original printed $1, the output path).
	echo "Concatenating IMGT file $file_number..."
	# Start at line 2 so the header row of this archive is dropped.
	concat_imgt_files "${current_dir}" "$workdir/output" 2 "$id"

	i=$((i+2))
	file_number=$((file_number+1))
done

# Show the first lines of the merged summary in the log.
head "$workdir/output/1_Summary.txt"


echo "Creating new IMGT zip"
# Stop if the directory cannot be entered: tar would otherwise archive
# whatever the current directory happens to contain.
cd "$workdir/output" || { echo "Cannot enter $workdir/output" >&2; exit 1; }
# NOTE: $output is used after the cd, so a relative path resolves inside
# $workdir/output.
tar cfJ "$output" * || { echo "Failed to create $output" >&2; exit 1; }

# TODO: awk to fix the sequence numbers repeating?

echo "Done"

exit 0