Mercurial > repos > iuc > te_finder
comparison TEfinder @ 0:838fb3a1678f draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/te_finder/ commit d86db11ee07ccc379667797b9124185ddfde1948
author | iuc |
---|---|
date | Mon, 08 Aug 2022 19:41:18 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:838fb3a1678f |
---|---|
1 #!/usr/bin/env bash | |
2 | |
3 ## | |
4 ## | |
5 ## Authors: Vista Sohrab & Dilay Hazal Ayhan | |
6 ## Date: January 15, 2021 | |
7 ## Description: TEfinder uses discordant reads to detect novel transposable element insertion events in short read paired-end sample sequencing data. | |
8 ## Software dependencies include bedtools 2.28.0 or later, samtools 1.3 or later, picard 2.0.1 or later | |
9 ## Required inputs include sample alignment file (.bam|.sam), reference genome FASTA (.fa), reference TE annotation in GFF/GTF or GFF3 (.gff|.gtf), and TEs of interest (.txt) | |
10 ## | |
11 ## University of Massachusetts Amherst | |
12 ## | |
13 ## | |
14 ## | |
15 ## | |
16 | |
17 set -e | |
18 | |
19 margs=4 | |
20 | |
21 # Functions | |
22 function example { | |
23 echo -e "example: TEfinder -alignment sample.bam -fa reference.fa -gtf TEs.gtf -te List_of_TEs.txt" | |
24 } | |
25 | |
26 function help { | |
27 echo -e "REQUIRED:" | |
28 echo -e " -alignment, --alignmentFile STR sample reads aligned to reference genome (BAM/SAM file)" | |
29 echo -e " -fa, --FastaFile STR reference genome FASTA index (FA file)" | |
30 echo -e " -gtf, --TransposonsInGenome STR reference genome TE annotation (GFF2/GTF file)" | |
31 echo -e " -te, --TransposonsToSearch STR TE names (single column text file)\n" | |
32 echo -e "OPTIONAL:" | |
33 echo -e " -bamo, --DiscordantReads STR BAM output\n" | |
34 echo -e " -bedo, --bTEinsertions STR TEinsertions BED output\n" | |
35 echo -e " -gtfo, --gTEinsertions STR TEinsertions GTF output\n" | |
36 echo -e " -fis, --FragmentInsertSize INT short-read sequencing fragment insert size [400]" | |
37 echo -e " -picard, --pathToPicardjar STR path to picard tools .jar file [picard.jar]" | |
38 echo -e " -md, --MaxDistanceForMerge INT maximum distance between reads for bedtools merge [150]" | |
39 echo -e " -k, --MaxTSDLength INT maximum TE target site duplication (TSD) length [20]" | |
40 echo -e " -maxHeapMem, --MaxHeapMemory INT java maximum heap memory allocation for picard in Mb [2000]" | |
41 echo -e " -workingdir, --WorkingDirectory STR working directory name [TEfinder_<Date>]" | |
42 echo -e " -out, --OutputFormat STR output format as GTF [BED]" | |
43 echo -e " -outname, --OutputName STR output name prefix added to file names [null]" | |
44 echo -e " -threads, --Threads INT number of threads for samtools multi-threading [1]" | |
45 echo -e " -intermed --IntermediateFiles STR keep intermediate files created by pipeline [no]" | |
46 echo -e " -h, --help prints help\n" | |
47 example | |
48 } | |
49 | |
50 # check if mandatory args are empty | |
51 function margs_check { | |
52 if [ $# -lt $margs ]; then | |
53 echo -e "One or more required parameters are missing." | |
54 example | |
55 exit 1 # error | |
56 fi | |
57 } | |
58 | |
59 # main workflow | |
60 #### : comment out | |
61 function pipeline() { | |
62 mkdir ${workingdir}/${line} | |
63 currdir=${workingdir}/${line} | |
64 echo -e $(date) " Transposon analysis for "${line}" has started\n" | |
65 | |
66 grep -P '[^(\w|\d|\-|\_|\#|\.)]'${line}'[^(\w|\d|\-|\_|\#|\.)]' $gtf > ${currdir}/${line}_TE.gff | |
67 echo -e $(date) " Individual TE GFF has been created for "${line}"\n" #### | |
68 | |
69 bedtools intersect -abam ${workingdir}/${outname}Alignments.bam -b ${currdir}/${line}_TE.gff -wa > ${currdir}/${line}_MappedReadsToTE.bam | |
70 echo -e $(date) " Mapped reads to TE via bedtools intersect has been completed for "${line}"\n" #### | |
71 samtools view -@ $threads ${currdir}/${line}_MappedReadsToTE.bam | \ | |
72 awk -v Ins=`expr $fis \* 10` '{if (($7 != "=") || ($9 > Ins) || ($9 < -Ins)) print $1}' > ${currdir}/${line}_ReadID.txt | |
73 echo -e $(date) " Identifying discordant read IDs has been completed for "${line}"\n" #### | |
74 | |
75 # if discordant readID file exists, then continue with remainder of TE analysis | |
76 if [[ -s ${currdir}/${line}_ReadID.txt ]] | |
77 then | |
78 #java $maxHeapMem -jar $picard FilterSamReads I=${workingdir}/${outname}Alignments.bam O=${currdir}/${line}_DiscordantPairs.bam READ_LIST_FILE=${currdir}/${line}_ReadID.txt FILTER=includeReadList WRITE_READS_FILES=false | |
79 $picard $maxHeapMem FilterSamReads -I ${workingdir}/${outname}Alignments.bam -O ${currdir}/${line}_DiscordantPairs.bam -READ_LIST_FILE ${currdir}/${line}_ReadID.txt -FILTER includeReadList -WRITE_READS_FILES false &>/dev/null | |
80 echo -e $(date) " Filtering original alignment based on discordant reads IDs is complete for "${line}"\n" #### | |
81 | |
82 bedtools merge -d $md -S + -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \ | |
83 awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_plusCluster.bed | |
84 echo -e $(date) " Primary reads from the + strand have been merged if read count greater than 3 for "${line}"\n" #### | |
85 | |
86 bedtools merge -d $md -S - -c 1 -o count -i ${currdir}/${line}_DiscordantPairs.bam | \ | |
87 awk '{if ($4 > 3) print $0}' > ${currdir}/${line}_minusCluster.bed | |
88 echo -e $(date) " Primary reads from the - strand have been merged if read count greater than 3 for "${line}"\n" #### | |
89 | |
90 # filtering edges piped into bedtools merge (keeping read counts greater than 3 in the line above) | |
91 ## find the closest minus strand to the plus strand in the cluster | |
92 ## filter by the distance between the plus and minus clusters - only retain pairs if reads are 0-100 bases away | |
93 ## if plus strand start is less than minus strand start and plus strand end is less than minus strand end then in proper orientation | |
94 bedtools closest -d -g ${workingdir}/reference.fa.fai -t first -a ${currdir}/${line}_plusCluster.bed -b ${currdir}/${line}_minusCluster.bed | \ | |
95 awk -v TSD=$k '{if ($9 <= TSD && $9 >= 0) print $0}' | \ | |
96 awk '{if ($2 < $6 && $3 < $7) print $0}' > ${currdir}/${line}_plusminus.bed | |
97 echo -e $(date) " Filtration of clusters in proper orientation using bedtools closest has been completed for "${line}"\n" #### | |
98 | |
99 # if plus strand end is greater than minus strand start, then report the pair | |
100 awk '{if ($3 > $6) print $1"\t"$6"\t"$3"\t"$0}' ${currdir}/${line}_plusminus.bed > ${currdir}/${line}_plusminus_1.bed | |
101 echo -e $(date) " Overlapping reads TE insertions reported for "${line}"\n" #### | |
102 | |
103 #if plus strand end is less than or equal to minus strand start and the region in between is less than a user-defined value k, report the pair | |
104 awk -v TSD=$k '{if ($3 <= $6 && $6 - $3 < TSD) print $1"\t"$3 - 1"\t"$6 + 1"\t"$0}' ${currdir}/${line}_plusminus.bed > \ | |
105 ${currdir}/${line}_plusminus_2.bed | |
106 echo -e $(date) " Non-overlapping reads TE insertions reported for "${line}"\n" #### | |
107 | |
108 #combine reported TE insertions | |
109 cat ${currdir}/${line}_plusminus_1.bed ${currdir}/${line}_plusminus_2.bed | \ | |
110 awk -v TEname=$line '{$0=TEname"\t"$0}1' | sort -k 1 | sort -k 2 > \ | |
111 ${currdir}/${line}_insertionRegion.txt | |
112 | |
113 | |
114 cat ${currdir}/${line}_insertionRegion.txt >> ${workingdir}/insertions.txt | |
115 echo -e $(date) " TE insertions for "${line}" have been reported.\n" #### | |
116 | |
117 | |
118 echo -e $(date) " Transposon named "${line}" is processed.\n" | |
119 else | |
120 echo -e $(date) " Transposon named "${line}" is processed. No discordant reads found.\n" | |
121 rm -r ${currdir} | |
122 fi | |
123 } | |
124 | |
125 # functions end | |
126 | |
127 # get arguments | |
128 fa= | |
129 alignment= | |
130 gtf= | |
131 te= | |
132 out= | |
133 intermed= | |
134 bamo="DiscordantReads.bam" | |
135 bedo="TEinsertions.bed" | |
136 gtfo="TEinsertions.gtf" | |
137 outname="" | |
138 fis=400 | |
139 picard="picard" | |
140 maxHeapMem=-Xmx2000m | |
141 md=150 | |
142 k=20 | |
143 d=$(date +%Y%m%d%H%M%S) | |
144 # workingdir=TEfinder_${d} | |
145 workingdir="TEfinder" | |
146 threads=1 | |
147 | |
148 while [ "$1" != "" ]; | |
149 do | |
150 case $1 in | |
151 -fa | --FastaFile ) | |
152 shift | |
153 fa=$1 ;; | |
154 -alignment | --alignmentFile ) | |
155 shift | |
156 alignment=$1 ;; | |
157 -gtf | --TransposonsInGenome ) | |
158 shift | |
159 gtf=$1 ;; | |
160 -te | --TransposonsToSearch ) | |
161 shift | |
162 te=$1 ;; | |
163 -fis | --FragmentInsertSize ) | |
164 shift | |
165 fis=$1 ;; | |
166 -picard | --pathToPicardjar ) | |
167 shift | |
168 picard=$1 ;; | |
169 -md | --MaxDistanceForMerge ) | |
170 shift | |
171 md=$1 ;; | |
172 -k | --MaxTSDLength ) | |
173 shift | |
174 k=$1 ;; | |
175 -bamo | --DiscordantReads ) | |
176 shift | |
177 bamo=$1 ;; | |
178 -bedo | --bTEinsertions ) | |
179 shift | |
180 bedo=$1 ;; | |
181 -gtfo | --gTEinsertions ) | |
182 shift | |
183 gtfo=$1 ;; | |
184 -maxHeapMem | --MaxHeapMemory ) | |
185 shift | |
186 maxHeapMem="-Xmx"$1"m" ;; | |
187 -workingdir | --WorkingDirectory ) | |
188 shift | |
189 workingdir=$1 ;; | |
190 -out | --OutputFormat ) | |
191 shift | |
192 out=$1 ;; | |
193 -outname | --OutputName ) | |
194 shift | |
195 outname=$1 ;; | |
196 -threads | --Threads ) | |
197 shift | |
198 threads=$1 ;; | |
199 -intermed | --IntermediateFiles ) | |
200 shift | |
201 intermed=$1 ;; | |
202 -h | --help ) | |
203 help | |
204 exit;; | |
205 *) # error | |
206 echo "TEfinder: illegal option $1" | |
207 example | |
208 exit 1 ;; | |
209 esac | |
210 shift | |
211 done | |
212 margs_check $fa $alignment $gtf $te | |
213 | |
214 # main | |
215 | |
216 mkdir ${workingdir} | |
217 | |
218 # remove empty lines from user provided TE list if present | |
219 sed '/^$/d' $te > ${workingdir}"/userTE_noEmptyLines.txt" | |
220 | |
221 # create output file | |
222 printf "%s\t" "track name=TEfinder" "type=bedDetail" "description=FR:forward read, RR:reverse read, InsRegion:insertion region start and end positions, FILTER:comma separated filters" > ${workingdir}/${outname}TEinsertions.bed | |
223 printf "\n" >> ${workingdir}/${outname}TEinsertions.bed | |
224 | |
225 # create fasta index (fai) file | |
226 cp $fa ${workingdir}/reference.fa | |
227 samtools faidx ${workingdir}/reference.fa | |
228 | |
229 # sort alignment input | |
230 samtools sort -@ $threads -o ${workingdir}/alignmentInput.sorted.bam ${alignment} | |
231 echo -e $(date) " Alignment file sorted successfully.\n" | |
232 | |
233 # remove secondary and supplementary alignments from sorted bam | |
234 samtools view -F 2304 -@ $threads -o ${workingdir}/${outname}Alignments.bam ${workingdir}/alignmentInput.sorted.bam | |
235 echo -e $(date) " Alignments are filtered - secondary and supplementary alignments have been removed. \n" | |
236 | |
237 # run pipeline for each TE | |
238 while IFS="" read -r line || [ -n "$line" ] | |
239 do | |
240 pipeline & | |
241 done < ${workingdir}/userTE_noEmptyLines.txt | |
242 wait | |
243 echo -e $(date) " All transposons are processed. Finalizing...\n" | |
244 | |
245 # combine discordant bam files | |
246 samtools merge -@ $threads -r ${workingdir}/${outname}DiscordantReads.bam ${workingdir}/*/*_DiscordantPairs.bam | |
247 echo -e $(date) " BAM Output: Discordant pair alignment file is now available.\n" | |
248 # Sorting by position | |
249 samtools sort -@ $threads ${workingdir}/${outname}DiscordantReads.bam | samtools view -h -o ${workingdir}/${outname}DiscordantReads.sam | |
250 grep -v '^@PG' ${workingdir}/${outname}DiscordantReads.sam > ${workingdir}/${outname}DiscordantReadsNoPG.sam | |
251 rm ${workingdir}/${outname}DiscordantReads.sam | |
252 samtools view -hb -x "PG" --no-PG --remove-flags "PG" -O BAM ${workingdir}/${outname}DiscordantReadsNoPG.sam -o ${bamo} | |
253 rm ${workingdir}/${outname}DiscordantReadsNoPG.sam | |
254 | |
255 # update output BED file with TEfinder results: organize the starting file | |
256 awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$8+$12"\t.\tFR="$8";RR="$12";InsRegion="$6"-"$11";FILTER="}' ${workingdir}/insertions.txt > ${workingdir}/TEinsertions_putative.bed | |
257 # find the entries in repeat regions for filtering | |
258 bedtools intersect -wa -u -a ${workingdir}/TEinsertions_putative.bed -b $gtf > ${workingdir}/TEinsertions_putative_inrepeat.bed | |
259 # filtering process | |
260 while IFS="" read -r line || [ -n "$line" ] | |
261 do | |
262 #located in repeat region | |
263 if (grep -Fxq "$line" "${workingdir}/TEinsertions_putative_inrepeat.bed") | |
264 then | |
265 line=$line"in_repeat," | |
266 fi | |
267 | |
268 #weak evidence | |
269 readc=$(echo $line | awk '{print $5}') | |
270 if (( $readc < 10 )) | |
271 then | |
272 line=$line"weak_evidence," | |
273 fi | |
274 | |
275 #strand-biased | |
276 FR=$(echo $line | grep -o 'FR=[[:digit:]]*' | cut -f2 -d'=') | |
277 RR=$(echo $line | grep -o 'RR=[[:digit:]]*' | cut -f2 -d'=') | |
278 var1=$(echo 'e(l('$FR')*1.25)' | bc -l) | |
279 var2=$(echo 'e(l('$FR')*0.8)' | bc -l) | |
280 | |
281 if [ $(echo "$RR > $var1" | bc) -eq 1 ] || [ $(echo "$RR < $var2" | bc) -eq 1 ] | |
282 then | |
283 line=$line"strand_bias," | |
284 fi | |
285 | |
286 #pass | |
287 lastchar=${line: -1} | |
288 if [ $lastchar == "," ] | |
289 then | |
290 line=${line::${#line}-1} | |
291 else | |
292 line=$line"PASS" | |
293 fi | |
294 | |
295 #write to final output | |
296 printf "%s\n" "$line" >> ${workingdir}/${outname}TEinsertions.bed | |
297 | |
298 done < ${workingdir}/TEinsertions_putative.bed | |
299 wait | |
300 echo -e $(date) " BED Output: TEfinder output BED file is now available.\n" | |
301 # Sorting | |
302 # cp ${workingdir}/${outname}TEinsertions.bed ${outo} | |
303 bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.bed > ${bedo} | |
304 # cat ${bedo} | |
305 | |
306 # gtf option - create output GTF files with TEfinder results | |
307 if [ ! -z "$out" ] | |
308 then | |
309 awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${workingdir}/${outname}TEinsertions.bed > ${workingdir}/${outname}TEinsertions.gtf | |
310 bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo} | |
311 # awk 'FNR > 1 {print $1"\tTEfinder\tTIP\t"$2 + 1"\t"$3"\t"$5"\t.\t.\tte_name \""$4"\"; tags \""$7"\""}' ${bedo} > ${gtfo} | |
312 # Sorting | |
313 # cp ${workingdir}/${outname}TEinsertions.gtf ${gtfo} | |
314 echo -e "\n\n" | |
315 # cat ${gtfo} | |
316 # bedtools sort -chrThenSizeA -i ${workingdir}/${outname}TEinsertions.gtf > ${gtfo} | |
317 echo -e $(date) " GTF Output: TEfinder output GTF file is now available.\n" | |
318 fi | |
319 | |
320 # clean working directory | |
321 if [ -z "$intermed" ] | |
322 then | |
323 rm ${workingdir}/TEinsertions_putative.bed ${workingdir}/TEinsertions_putative_inrepeat.bed ${workingdir}/reference.fa ${workingdir}/reference.fa.fai \ | |
324 ${workingdir}/alignmentInput.sorted.bam ${workingdir}/insertions.txt ${workingdir}/${outname}Alignments.bam ${workingdir}/userTE_noEmptyLines.txt | |
325 rm -r ${workingdir}/*/ | |
326 fi | |
327 | |
328 if [ `wc -l <${workingdir}/${outname}TEinsertions.bed` -le "1" ] | |
329 then | |
330 echo -e $(date) " Error: TEfinder run unsuccessful." | |
331 else | |
332 echo -e $(date) " TE insertion output files have been created. TEfinder completed successfully." | |
333 fi |