Mercurial > repos > davidvanzessen > imgt_concatenate
view imgt_concatenate.sh @ 2:d77d4700fd0a draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 27 Dec 2016 10:11:21 -0500 |
parents | b360a373835f |
children |
line wrap: on
line source
#!/bin/bash
#
# Concatenate several IMGT result archives into a single XZ-compressed tar.
#
# Usage:
#   imgt_concatenate.sh <output> <archive> <id> [<archive> <id> ...]
#
# Each archive (zip or XZ-compressed tar) is unpacked below the current
# working directory, its numbered result tables (1_* .. 10_*) are appended to
# the merged tables in ./output, and <id> is appended to the second
# tab-separated column of every data row. The header row is kept from the
# first archive only. Finally the merged tables are packed into <output>.

#######################################
# Unpack an IMGT archive into a directory.
# Arguments:
#   $1 - path to the archive (zip or XZ-compressed tar, detected via `file`)
#   $2 - directory to extract into (created when missing)
# Outputs:
#   Extraction tool output on stdout; a message on stderr when the archive
#   type is not recognised.
# Returns:
#   Exit status of unzip/tar, or 1 when the archive type is not recognised.
#######################################
imgt_unpack() {
  local imgt_zip=$1
  local outdir=$2
  local type

  mkdir -p "$outdir"

  type=$(file "$imgt_zip")
  if [[ "$type" == *"Zip archive"* ]]; then
    unzip "$imgt_zip" -d "$outdir"
  elif [[ "$type" == *"XZ compressed data"* ]]; then
    echo "tar -xJf $imgt_zip -C $outdir"
    tar -xJf "$imgt_zip" -C "$outdir"
  else
    # Previously this case was silently ignored, leaving an empty directory.
    echo "imgt_unpack: unsupported archive type for '$imgt_zip': $type" >&2
    return 1
  fi
}

#######################################
# Append the IMGT tables found in a directory to the merged tables.
# Arguments:
#   $1 - directory searched (recursively) for files named 1_* .. 10_*
#   $2 - directory holding the merged tables (appended to, created on demand)
#   $3 - first input line to copy; 1 keeps the header row, 2 skips it
#   $4 - text appended to the second tab-separated column of each data row
# Outputs:
#   A warning on stderr for every table that is missing from $1.
#######################################
concat_imgt_files() {
  local indir=$1
  local outdir=$2
  local start_line=$3 # line number to start at, 2 to skip the header
  local id=$4

  local -a tables=(
    "1_Summary.txt"
    "2_IMGT-gapped-nt-sequences.txt"
    "3_Nt-sequences.txt"
    "4_IMGT-gapped-AA-sequences.txt"
    "5_AA-sequences.txt"
    "6_Junction.txt"
    "7_V-REGION-mutation-and-AA-change-table.txt"
    "8_V-REGION-nt-mutation-statistics.txt"
    "9_V-REGION-AA-change-statistics.txt"
    "10_V-REGION-mutation-hotspots.txt"
  )

  # When the header row is copied (start_line == 1) it must not be tagged,
  # so tagging begins at row 2; otherwise every copied row is a data row.
  local first_tagged_row=1
  if [[ "$start_line" == "1" ]]; then
    first_tagged_row=2
  fi

  local table prefix target path
  local -a matches
  for table in "${tables[@]}"; do
    prefix=${table%%_*}
    target="$outdir/$table"

    # Collect matches NUL-delimited so paths with whitespace stay intact.
    matches=()
    while IFS= read -r -d '' path; do
      matches+=("$path")
    done < <(find "$indir/" -type f -name "${prefix}_*" -print0)

    if (( ${#matches[@]} == 0 )); then
      # An argument-less `cat` would block reading stdin; just make sure the
      # merged table exists and carry on.
      echo "concat_imgt_files: no ${prefix}_* file found in $indir" >&2
      : >> "$target"
      continue
    fi

    cat -- "${matches[@]}" \
      | tail -n "+${start_line}" \
      | awk -F $'\t' -v id="$id" -v first="$first_tagged_row" \
          'BEGIN { OFS = FS } NR >= first { $2 = $2 id } { print }' \
      >> "$target"
  done
}

#######################################
# Script entry point.
# Arguments:
#   $1     - path of the archive to create
#   $2...  - alternating <archive> <id> pairs
# Returns:
#   2 on a usage error, non-zero when the final archive cannot be written,
#   0 otherwise.
#######################################
main() {
  if (( $# < 2 )); then
    echo "Usage: ${0##*/} <output> <archive> <id> [<archive> <id> ...]" >&2
    return 2
  fi

  local output=$1
  shift
  local -a inputs=("$@")
  local workdir=$PWD
  local merged_dir="$workdir/output"

  echo "Output: $output"
  echo "Inputs: ${inputs[*]}"

  mkdir -p "$merged_dir"

  local idx file_no archive id unpack_dir start_line
  for (( idx = 0; idx < ${#inputs[@]}; idx += 2 )); do
    file_no=$(( idx / 2 + 1 ))
    archive=${inputs[idx]}
    id=${inputs[idx + 1]:-}
    unpack_dir="$workdir/input${file_no}"

    # Only the first archive contributes the header row.
    if (( idx == 0 )); then
      start_line=1
    else
      start_line=2
    fi

    echo "Unpacking IMGT file ${file_no}.."
    imgt_unpack "$archive" "$unpack_dir" \
      || echo "Warning: unpacking '$archive' reported a problem" >&2

    echo "Concatenating IMGT file ${file_no}..."
    concat_imgt_files "$unpack_dir" "$merged_dir" "$start_line" "$id"
  done

  head "$merged_dir/1_Summary.txt"

  echo "Creating new IMGT zip"
  cd "$merged_dir" || return 1
  # Bare glob on purpose: archive members keep their plain file names.
  # A relative <output> is resolved against the merged directory, as before.
  tar -cJf "$output" * || return 1

  # TODO: awk to fix the sequence numbers repeating?
  echo "Done"
  return 0
}

main "$@"