annotate 2.4/script/Check_integration.sh @ 18:1163c16cb3c0 draft

Uploaded
author plus91-technologies-pvt-ltd
date Mon, 02 Jun 2014 07:35:53 -0400
parents e3609c8714fb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
1 #!/bin/sh
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
2 #$ -V
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
3 #$ -cwd
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
4 #$ -q 1-day
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
5 #$ -m ae
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
6 #$ -M hart.steven@mayo.edu
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
7 #$ -l h_vmem=8G
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
8 #$ -l h_stack=10M
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
9 VCF_FILE=$1
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
10 x=$2
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
11 #VIRAL_SEQDB=/data2/bsi/tertiary/m110344/SoftTile/Mia/BLAST_DB/OBrien/Virus_PCGS.fasta #must me indexed by bwasw
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
12 VIRAL_SEQDB=$3
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
13 VCF_FILE=$4
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
14 #VCF_FILE=final.vcf
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
15
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
16 set -x
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
17
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
18 perl -ane '$dist=100;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
19 $mate=$F[4];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
20 $mate=~s/[A-Z]|\[|\]//g;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
21 @mate=split(/:/,$mate);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
22 $end1a=@F[1]-$dist;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
23 $end1b=@F[1]+$dist;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
24 $end2a=$dist+@mate[1];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
25 $end2b=$dist+@mate[1];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
26 print "@F[0]\t$end1a\t$end1b\n@mate[0]\t$end2a\t$end2b\n"' $VCF_FILE|
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
27 sortBed > targets.bed
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
28
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
29
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
30 #100 min
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
31 time samtools view -h $x -L targets.bed |awk '(($9==0)&&($11!~/#/)&&($3!~/^chrGL/)&&($3!~/^chrM/))'|perl -ane 'print "\@@F[0]\n@F[9]\n+\n@F[10]\n"' >${x%%.bam}.res.fq
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
32 #23 min
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
33 time bwa mem -t 4 $VIRAL_SEQDB ${x%%.bam}.res.fq |samtools view -S - |grep gi > ${x%%.bam}.tmp.sam
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
34
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
35 #find out how many hits there are
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
36 cut -f3 ${x%%.bam}.tmp.sam|perl -ne '@_=split(":",$_);@res=split(/_/,@_[1],2);print "@res[1]"' | sort|uniq -c|sort -k1nr|tee ${x%%.bam}.Viral_maps.out |head
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
37 #Get the reads mapping to those hits to find out where the integration site is
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
38
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
39 #Read in the viruses until there is a significant drop off in number of reads (i.e. contributing less than 10%)
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
40 perl -ne '@_=split(" ",$_);$i=$_[0]+$i;$j=$_[0];$res=$j/$i;if($res>.1){print "@_[1]\n"}' ${x%%.bam}.Viral_maps.out >${x%%.bam}.to.keep
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
41 fgrep -f ${x%%.bam}.to.keep ${x%%.bam}.tmp.sam |cut -f1 >${x%%.bam}.reads
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
42
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
43 #75min+
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
44
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
45 time samtools view $x -L targets.bed |
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
46 fgrep -f ${x%%.bam}.reads|
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
47 awk '{if(($9==0)&&($11!~/#/)&&($3!~/^chrGL/)&&($3!~/^chrM/)&&($3!~/^\*/)){print $3"\t"$4"\t"$4"\t"$1}}'|
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
48 tee ${x%%.bam}.unsorted.bed|
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
49 sortBed | mergeBed -nms -d 1000|
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
50 perl -e 'open (FILE,"$ARGV[0]") or die "cant open file\n\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
51 $SAM="$ARGV[1]";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
52 $SAM=~chomp;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
53 while(<FILE>){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
54 chomp;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
55 @_=split(/\t/,$_);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
56 @reads=split(/;/,@_[3]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
57 #print "LINE=$_\nRES=grep $reads[0] $SAM\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
58 $res=`grep $reads[0] $SAM` ;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
59 # print "AFTER GREP, RES=$res\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
60 if($res){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
61 @res=split(/\t/,$res);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
62 print join("\t",@_[0..2],@res[2])."\n"
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
63 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
64 };
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
65 close FILE' - ${x%%.bam}.tmp.sam |
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
66 perl -ne 's/\|/\t/g;@_=split("\t",$_);print join ("\t",@_[0..2,7])'|
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
67 perl -pne 's/_/\t/'| cut -f4 --complement |
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
68 perl -e '
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
69 open (FILE,"$ARGV[0]") or die "cant open file\n\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
70 $SAM="$ARGV[1]";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
71 while(<FILE>){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
72 chomp;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
73 @_=split(/\t/,$_);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
74 $res=`grep $_[3] $SAM`;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
75 if($res){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
76 @res=split(" ",$res);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
77 $reads[0]=chomp;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
78 print join("\t",@_[0..4],@res[0])."\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
79 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
80 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
81 close FILE;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
82 ' - ${x%%.bam}.Viral_maps.out|
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
83 perl -pne 's/_/ /g'> ${x%%.bam}.Virus.integrated.bed
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
84
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
85 rm ${x%%.bam}.reads ${x%%.bam}.to.keep ${x%%.bam}.tmp.sam ${x%%.bam}.res.fq
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
86 echo "DONE with $x"