comparison commandline_sample_STR-FM_shortread_profiling @ 2:d5ed5c2e25c3 draft

Uploaded
author arkarachai-fungtammasan
date Wed, 22 Apr 2015 12:48:40 -0400
parents
children
comparison
equal deleted inserted replaced
1:f2bab38e3cbd 2:d5ed5c2e25c3
1 ## This is a sample PBS script for profiling STR from short read using STR-FM version 2.0.0 (April 20, 2015)
2 ##
3 ##requirement
4 ##1 fastq input in sangerfq Phred scale --> ${INPUT}.fastq
5 ##2 index of mapping program (bwa, bowtie, etc)
6 ##3 location of all STR in reference genome (use PBS script name "sampleSTR_reference_profiling.txt) --> /path/to/STR/in/reference/genome.TR (you can make 4 separated TR files for 4 types of STRs)
7 ##4 reference genome in FASTA and in 2bit file --> /path/to/2bit/ref.2bit (use utility from UCSC genome browser to create 2bit file version of reference genome)
8 ##5 local Galaxy (available from Galaxy website for Mac and Unix computer)
9 ##6 STR error rates (can be downloaded from https://usegalaxy.org/u/guru%40psu.edu/h/error-rates-files) --> errorrate.bymajorallele
10 ##
11 echo " "
12 echo " "
13 echo "Job started on `hostname` at `date`"
14 ref=/path/to/reference/sequence/and/bwa/index/ref.fa
15 export PYTHONPATH=/path/to/galaxy-dist/lib/
16 galaxydir=/path/to/galaxy-dist/tools
17 cd /working/directory/
18 echo " "
19 echo " detect STR in short read" ## See detail in microsatellite.xml on https://github.com/Arkarachai/STR-FM
20 python microsatellite.py ${INPUT}.fastq --fastq --period=1 --partialmotifs --minlength=5 --prefix=20 --suffix=20 --hamming=0 --multipleruns >${INPUT}.mono.out
21 python microsatellite.py ${INPUT}.fastq --fastq --period=2 --partialmotifs --minlength=6 --prefix=20 --suffix=20 --hamming=0 --multipleruns >${INPUT}.di.out
22 python microsatellite.py ${INPUT}.fastq --fastq --period=3 --partialmotifs --minlength=9 --prefix=20 --suffix=20 --hamming=0 --multipleruns >${INPUT}.tri.out
23 python microsatellite.py ${INPUT}.fastq --fastq --period=4 --partialmotifs --minlength=12 --prefix=20 --suffix=20 --hamming=0 --multipleruns >${INPUT}.tetra.out
24
25 echo "change read name at " ## See detail in space2underscore_readname.xml on https://github.com/Arkarachai/STR-FM
26 python changespacetounderscore_readname.py ${INPUT}.mono.out ${INPUT}.mono.new 6
27 python changespacetounderscore_readname.py ${INPUT}.di.out ${INPUT}.di.new 6
28 python changespacetounderscore_readname.py ${INPUT}.tri.out ${INPUT}.tri.new 6
29 python changespacetounderscore_readname.py ${INPUT}.tetra.out ${INPUT}.tetra.new 6
30
31 echo "start fetch flanking at `date`" ## See detail in fetchflank.xml on https://github.com/Arkarachai/STR-FM
32 python pair_fetch_DNA_ff.py ${INPUT}.mono.new ${INPUT}.mono_ff_L.txt ${INPUT}.mono_ff_R.txt 20 20
33 python pair_fetch_DNA_ff.py ${INPUT}.di.new ${INPUT}.di_ff_L.txt ${INPUT}.di_ff_R.txt 20 20
34 python pair_fetch_DNA_ff.py ${INPUT}.tri.new ${INPUT}.tri_ff_L.txt ${INPUT}.tri_ff_R.txt 20 20
35 python pair_fetch_DNA_ff.py ${INPUT}.tetra.new ${INPUT}.tetra_ff_L.txt ${INPUT}.tetra_ff_R.txt 20 20
36
37 echo "BWA uniquely mapped no indel no deletion "
38 bwa aln -n 0 -o 0 ${ref} ${INPUT}.mono_ff_L.txt > ${INPUT}.mono_ff_L.sai
39 bwa aln -n 0 -o 0 ${ref} ${INPUT}.mono_ff_R.txt > ${INPUT}.mono_ff_R.sai
40 bwa sampe ${ref} ${INPUT}.mono_ff_L.sai ${INPUT}.mono_ff_R.sai ${INPUT}.mono_ff_L.txt ${INPUT}.mono_ff_R.txt > ${INPUT}.mono.sam
41 samtools view -Sb -F 12 -q 1 ${INPUT}.mono.sam > ${INPUT}.mono.n.all.bam
42 bwa aln -n 0 -o 0 ${ref} ${INPUT}.di_ff_L.txt > ${INPUT}.di_ff_L.sai
43 bwa aln -n 0 -o 0 ${ref} ${INPUT}.di_ff_R.txt > ${INPUT}.di_ff_R.sai
44 bwa sampe ${ref} ${INPUT}.di_ff_L.sai ${INPUT}.di_ff_R.sai ${INPUT}.di_ff_L.txt ${INPUT}.di_ff_R.txt > ${INPUT}.di.sam
45 samtools view -Sb -F 12 -q 1 ${INPUT}.di.sam > ${INPUT}.di.n.all.bam
46 bwa aln -n 0 -o 0 ${ref} ${INPUT}.tri_ff_L.txt > ${INPUT}.tri_ff_L.sai
47 bwa aln -n 0 -o 0 ${ref} ${INPUT}.tri_ff_R.txt > ${INPUT}.tri_ff_R.sai
48 bwa sampe ${ref} ${INPUT}.tri_ff_L.sai ${INPUT}.tri_ff_R.sai ${INPUT}.tri_ff_L.txt ${INPUT}.tri_ff_R.txt > ${INPUT}.tri.sam
49 samtools view -Sb -F 12 -q 1 ${INPUT}.tri.sam > ${INPUT}.tri.n.all.bam
50 bwa aln -n 0 -o 0 ${ref} ${INPUT}.tetra_ff_L.txt > ${INPUT}.tetra_ff_L.sai
51 bwa aln -n 0 -o 0 ${ref} ${INPUT}.tetra_ff_R.txt > ${INPUT}.tetra_ff_R.sai
52 bwa sampe ${ref} ${INPUT}.tetra_ff_L.sai ${INPUT}.tetra_ff_R.sai ${INPUT}.tetra_ff_L.txt ${INPUT}.tetra_ff_R.txt > ${INPUT}.tetra.sam
53 samtools view -Sb -F 12 -q 1 ${INPUT}.tetra.sam > ${INPUT}.tetra.n.all.bam
54
55 echo "sort result by read name"
56 samtools sort -n ${INPUT}.mono.n.all.bam ${INPUT}.mono.n.sorted.all
57 samtools sort -n ${INPUT}.di.n.all.bam ${INPUT}.di.n.sorted.all
58 samtools sort -n ${INPUT}.tri.n.all.bam ${INPUT}.tri.n.sorted.all
59 samtools sort -n ${INPUT}.tetra.n.all.bam ${INPUT}.tetra.n.sorted.all
60 samtools view -h -o ${INPUT}.mono.n.sorted.all.sam ${INPUT}.mono.n.sorted.all.bam
61 samtools view -h -o ${INPUT}.di.n.sorted.all.sam ${INPUT}.di.n.sorted.all.bam
62 samtools view -h -o ${INPUT}.tri.n.sorted.all.sam ${INPUT}.tri.n.sorted.all.bam
63 samtools view -h -o ${INPUT}.tetra.n.sorted.all.sam ${INPUT}.tetra.n.sorted.all.bam
64
65 echo "merge faux paired end reads" ## See detail in PEsortedSAM2readprofile.xml on https://github.com/Arkarachai/STR-FM
66 python PEsortedSAM2readprofile.py ${INPUT}.mono.n.sorted.all.sam /path/to/2bit/ref.2bit 100 250 ${INPUT}.mono.RF
67 python PEsortedSAM2readprofile.py ${INPUT}.di.n.sorted.all.sam /path/to/2bit/ref.2bit 100 250 ${INPUT}.mono.RF
68 python PEsortedSAM2readprofile.py ${INPUT}.tri.n.sorted.all.sam /path/to/2bit/ref.2bit 100 250 ${INPUT}.mono.RF
69 python PEsortedSAM2readprofile.py ${INPUT}.tetra.n.sorted.all.sam /path/to/2bit/ref.2bit 100 250 ${INPUT}.mono.RF
70
71 echo "join mapped coordinate with STR length using read name"
72 python ${galaxydir}/filters/join.py ${INPUT}.mono.new ${INPUT}.mono.RF 6 1 ${INPUT}.mono.RF.j "" "" --index_depth=3 --buffer=50000000 --fill_options_file='None'
73 python ${galaxydir}/filters/join.py ${INPUT}.di.new ${INPUT}.di.RF 6 1 ${INPUT}.mono.RF.j "" "" --index_depth=3 --buffer=50000000 --fill_options_file='None'
74 python ${galaxydir}/filters/join.py ${INPUT}.tri.new ${INPUT}.tri.RF 6 1 ${INPUT}.mono.RF.j "" "" --index_depth=3 --buffer=50000000 --fill_options_file='None'
75 python ${galaxydir}/filters/join.py ${INPUT}.tetra.new ${INPUT}.tetra.RF 6 1 ${INPUT}.mono.RF.j "" "" --index_depth=3 --buffer=50000000 --fill_options_file='None'
76
77 echo "join mapped coordinate and STR length with STR location in genome"
78 python ${galaxydir}/new_operations/gops_join.py /path/to/STR/in/reference/genome.TR ${INPUT}.mono.RF.j ${INPUT}.mono.gop -1 1,2,3,0 -2 10,13,14,0 -m 1 -f
79 python ${galaxydir}/new_operations/gops_join.py /path/to/STR/in/reference/genome.TR ${INPUT}.di.RF.j ${INPUT}.di.gop -1 1,2,3,0 -2 10,13,14,0 -m 1 -f
80 python ${galaxydir}/new_operations/gops_join.py /path/to/STR/in/reference/genome.TR ${INPUT}.tri.RF.j ${INPUT}.tri.gop -1 1,2,3,0 -2 10,13,14,0 -m 1 -f
81 python ${galaxydir}/new_operations/gops_join.py /path/to/STR/in/reference/genome.TR ${INPUT}.tetra.RF.j ${INPUT}.tetra.gop -1 1,2,3,0 -2 10,13,14,0 -m 1 -f
82
83 echo "remove incompatible motif (remove incorrect mapped reads given that there is no STR motif difference from reference genome)" ## See detail in microsatcompat.xml on https://github.com/Arkarachai/STR-FM
84 python microsatcompat.py ${INPUT}.mono.gop 4 10 > ${INPUT}.mono.fulltable1
85 python microsatcompat.py ${INPUT}.di.gop 4 10 > ${INPUT}.di.fulltable1
86 python microsatcompat.py ${INPUT}.tri.gop 4 10 > ${INPUT}.tri.fulltable1
87 python microsatcompat.py ${INPUT}.tetra.gop 4 10 > ${INPUT}.tetra.fulltable1
88
89 echo "remove shifting flanking location (remove cases that come from STR interruption or flanking bases are misread as STRs)"
90 cat ${INPUT}.mono.fulltable1 | awk '($19==$2) && ($20==$3) {print $0}' > ${INPUT}.mono.fulltable2
91 cat ${INPUT}.di.fulltable1 | awk '($19==$2) && ($20==$3) {print $0}' > ${INPUT}.di.fulltable2
92 cat ${INPUT}.tri.fulltable1 | awk '($19==$2) && ($20==$3) {print $0}' > ${INPUT}.tri.fulltable2
93 cat ${INPUT}.tetra.fulltable1 | awk '($19==$2) && ($20==$3) {print $0}' > ${INPUT}.tetra.fulltable2
94
95 echo "keep only column that are necessary for profiling"
96 cat ${INPUT}.mono.fulltable2| cut -f 1,2,3,4,5,7 | sort -k 1n,1 -k 2n,2 -k 3n,3 > ${INPUT}.mono.cuttmp0
97 cat ${INPUT}.di.fulltable2| cut -f 1,2,3,4,5,7 | sort -k 1n,1 -k 2n,2 -k 3n,3 > ${INPUT}.di.cuttmp0
98 cat ${INPUT}.tri.fulltable2| cut -f 1,2,3,4,5,7 | sort -k 1n,1 -k 2n,2 -k 3n,3 > ${INPUT}.tri.cuttmp0
99 cat ${INPUT}.tetra.fulltable2| cut -f 1,2,3,4,5,7 | sort -k 1n,1 -k 2n,2 -k 3n,3 > ${INPUT}.tetra.cuttmp0
100
101 echo "If you multiple analysis by splitting initial fastq, you should merge (cat) all results from the same sample after this step"
102
103 echo "create genomic coordinate column and group by that column"
104 perl ${galaxydir}/filters/fixedValueColumn.pl ${INPUT}.mono.cuttmp0 ${INPUT}.mono.cuttmp1 "_" "no"
105 python ${galaxydir}/filters/mergeCols.py ${INPUT}.mono.cuttmp1 ${INPUT}.mono.cuttmp2 1 7 2 7 3
106 python ${galaxydir}/stats/grouping.py ${INPUT}.mono.cuttmp3 ${INPUT}.mono.cuttmp2 8 0 'cat 6 0' 'cat_uniq 4 0'
107 perl ${galaxydir}/filters/fixedValueColumn.pl ${INPUT}.di.cuttmp0 ${INPUT}.di.cuttmp1 "_" "no"
108 python ${galaxydir}/filters/mergeCols.py ${INPUT}.di.cuttmp1 ${INPUT}.di.cuttmp2 1 7 2 7 3
109 python ${galaxydir}/stats/grouping.py ${INPUT}.di.cuttmp3 ${INPUT}.di.cuttmp2 8 0 'cat 6 0' 'cat_uniq 4 0'
110 perl ${galaxydir}/filters/fixedValueColumn.pl ${INPUT}.tri.cuttmp0 ${INPUT}.tri.cuttmp1 "_" "no"
111 python ${galaxydir}/filters/mergeCols.py ${INPUT}.tri.cuttmp1 ${INPUT}.tri.cuttmp2 1 7 2 7 3
112 python ${galaxydir}/stats/grouping.py ${INPUT}.tri.cuttmp3 ${INPUT}.tri.cuttmp2 8 0 'cat 6 0' 'cat_uniq 4 0'
113 perl ${galaxydir}/filters/fixedValueColumn.pl ${INPUT}.tetra.cuttmp0 ${INPUT}.tetra.cuttmp1 "_" "no"
114 python ${galaxydir}/filters/mergeCols.py ${INPUT}.tetra.cuttmp1 ${INPUT}.tetra.cuttmp2 1 7 2 7 3
115 python ${galaxydir}/stats/grouping.py ${INPUT}.tetra.cuttmp3 ${INPUT}.tetra.cuttmp2 8 0 'cat 6 0' 'cat_uniq 4 0'
116
117 echo "you may filter for minimum sequencing depth here"
118
119 echo "genotyping using error correction model" ## See detail in GenotypingSTR.xml on https://github.com/Arkarachai/STR-FM
120 cat ${INPUT}.mono.cuttmp2 ${INPUT}.di.cuttmp2 ${INPUT}.tri.cuttmp2 ${INPUT}.tetra.cuttmp2 > ${INPUT}.step5
121 python GenotypeTRcorrection.py ${INPUT}.step5 errorrate.bymajorallele ${INPUT}.step5.result 0.5
122 ## final output is ${INPUT}.step5.result
123
124 echo "Job end on `hostname` at `date`"