# HG changeset patch
# User john-mccallum
# Date 1339716566 14400
# Node ID 21053f7f9ed1025ea85ce219073d62a59d4052c9
First upload of PCR Marker tools
diff -r 000000000000 -r 21053f7f9ed1 CAPS2gff.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CAPS2gff.sh Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,21 @@
+#!/bin/sh
+##convert output of CAPS detection tool to GFF3
+#Copyright 2012 John McCallum
+#New Zealand Institute for Plant and Food Research
+
+#This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+# Usage: CAPS2gff.sh INPUT OUTPUT -- INPUT is tab-separated with a colon-delimited feature ID in field 4; OUTPUT is written tab-separated (OFS) as GFF3 requires.
+inputfile=$1
+outputfile=$2
+awk -F '\t' -v OFS='\t' 'split($4,ID,":") {print ID[1],"FINDCAPS",ID[3],ID[4],ID[4],".",".",".","ID="$1";Enzyme="$5";Phase="$6}' "$inputfile" > "$outputfile"
diff -r 000000000000 -r 21053f7f9ed1 CAPS2gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CAPS2gff.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,32 @@
+
+
+ convert output of CAPS detection to gvf/gff3
+ CAPS2gff.sh $inputFile $outputfile
+
+
+
+
+
+
+
+
+This tool provides a simple conversion from the 6 column tabular output of the CAPS detection tool to GFF3
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 GVF_Features_Extracter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GVF_Features_Extracter.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,54 @@
+
+
diff -r 000000000000 -r 21053f7f9ed1 convert_gsMapper_gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_gsMapper_gff3.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,70 @@
+
+
+ Convert Roche gsMapper to GFF3
+ gsmapper2gff.sh $inputGsFile $outputfile
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP**
+
+This tool parses Roche gsMapper read mapping output into GVF/GFF3 format
+
+
+----
+
+**Example**
+
+*input*
+
+::
+
+ >Reference Start End Ref Var Total Var Ref Var Coding Region Known # Fwd # Rev # Fwd # Rev
+ >Accno Pos Pos Nuc Nuc Depth Freq AA AA Frame Name SNP's w/ var w/ var Total Total
+ ______________________________
+
+ >1118212 673 673 A C 7 100% 6 1 6 1
+
+ Reads with Difference:
+ 1118212 648+ GTTGGTCCACTATTACTCTCAGATT-ATTTCATAACAATAATGG----A-TAC-AA 696
+ **
+ FX289JP01DVQR7 53- GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGGGCTGA-TACTA 1
+ FX289JP02IJT2O (7) 82+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 130
+ FX289JP01B8R7V 84+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATA-TGG----A-TAC-AA 131
+ FX289JP02FX58L 68+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----AC-AC-AA 116
+ FX289JP02JXAX7 (7) 67+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 115
+ FX289JP02JOOQZ (2) 69+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 117
+ FX289JP02GPHPX 45+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 93
+ **
+
+
+*output*
+
+::
+
+ ##gff-version 3
+ 1118212 gsMapper SNP 673 673 . . . ID=1118212:gsMapper:SNP:673;Reference_seq=A;Variant_seq=C;Total_reads=7;Variant_freq=100;
+ 1118212 gsMapper SNP 730 730 . . . ID=1118212:gsMapper:SNP:730;Reference_seq=A;Variant_seq=G;Total_reads=13;Variant_freq=31;
+ 1118212 gsMapper SNP 782 782 . . . ID=1118212:gsMapper:SNP:782;Reference_seq=T;Variant_seq=C;Total_reads=13;Variant_freq=92;
+ 1118212 gsMapper SNP 1319 1319 . . . ID=1118212:gsMapper:SNP:1319;Reference_seq=G;Variant_seq=A;Total_reads=7;Variant_freq=100;
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 design_primers.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/design_primers.py Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,153 @@
+#!/usr/local/bin/python2.6
+##design primers to features in multiple sequences
+##usage: python design_primers.py FASTA_FILE GFF_FILE TARGET_ID_FILE PROD_MIN_SIZE PROD_MAX_SIZE
+
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import os
+import StringIO
+import re
+import tempfile
+import subprocess
+import copy
+import sys
+from BCBio import GFF
+from BCBio.GFF import GFFExaminer
+from Bio import SeqIO
+from Bio.Emboss.Applications import Primer3Commandline
+from Bio.Emboss import Primer3
+
+
+##command line: fasta, gff3, file of target feature IDs, min and max product size
+in_file = sys.argv[1]
+gff_file = sys.argv[2]
+target_file = sys.argv[3]
+prod_min_size = int(sys.argv[4])
+prod_max_size = int(sys.argv[5])
+
+##fixed primer design settings; the product size range is passed as "min,max"
+max_tm_diff = 1
+opt_GC_percent = 50
+maxpolyx = 4
+optimum_length = 20
+productsizerange = str(prod_min_size) + "," + str(prod_max_size)
+
+##read target feature IDs, one per line; tolerate blank lines and CRLF endings
+in_target_handle = open(target_file)
+targets = [line.strip() for line in in_target_handle if line.strip()]
+in_target_handle.close()
+##sequence IDs are the first colon-delimited field of each feature ID
+target_seq_ids = set([t.split(":")[0] for t in targets])
+
+
+def design_primer_pair(target_rec, p3_target, p3_exclude_str):
+    """Run the EMBOSS primer3 tool on target_rec; return the first primer set or None.
+
+    p3_target is a "start,end" range and p3_exclude_str a ':'-joined list of
+    such ranges (possibly empty), both relative to target_rec.  The two
+    scratch files are always removed, even when the external tool fails.
+    """
+    fasta_fd, fasta_path = tempfile.mkstemp()
+    out_fd, out_path = tempfile.mkstemp()
+    os.close(out_fd)
+    try:
+        fasta_handle = os.fdopen(fasta_fd, "w")
+        SeqIO.write([target_rec], fasta_handle, "fasta")
+        fasta_handle.close()
+        primer_cl = Primer3Commandline()
+        ##-auto suppresses the run-success message Galaxy would treat as an error
+        primer_cl.set_parameter("-auto", '1')
+        primer_cl.set_parameter("-sequence", fasta_path)
+        primer_cl.set_parameter("-target", p3_target)
+        ##mask off other features...dumb masking of everything at present, beware
+        if p3_exclude_str:
+            primer_cl.set_parameter("-excludedregion", p3_exclude_str)
+        primer_cl.set_parameter("-outfile", out_path)
+        primer_cl.set_parameter("-maxdifftm", max_tm_diff)
+        primer_cl.set_parameter("-ogcpercent", opt_GC_percent)
+        primer_cl.set_parameter("-opolyxmax", maxpolyx)
+        primer_cl.set_parameter("-osize", optimum_length)
+        primer_cl.set_parameter("-prange", productsizerange)
+        fnull = open(os.devnull, 'w')
+        try:
+            subprocess.check_call(str(primer_cl), shell=True, stdout=fnull, stderr=fnull)
+        finally:
+            fnull.close()
+        handle = open(out_path)
+        try:
+            record = Primer3.read(handle)
+        finally:
+            handle.close()
+    finally:
+        os.remove(fasta_path)
+        os.remove(out_path)
+    if len(record.primers) > 0:
+        return record.primers[0]
+    return None
+
+
+in_seq_handle = open(in_file)
+in_gff_handle = open(gff_file)
+try:
+    for myrec in SeqIO.parse(in_seq_handle, "fasta"):
+        ##only sequences named by at least one target are annotated
+        if myrec.id not in target_seq_ids:
+            continue
+        ##rewind the gff and attach just this sequence's annotations
+        in_gff_handle.seek(0)
+        annotations = list(GFF.parse(in_gff_handle, base_dict={myrec.id: myrec},
+                                     limit_info=dict(gff_id=[myrec.id])))
+        if not annotations:
+            continue
+        rec = annotations[0]
+        for target_ID in targets:
+            target_annotations = [f for f in rec.features if f.id == target_ID]
+            if not target_annotations:
+                continue
+            mytarget = target_annotations[0]
+            ##just consider a window of +/- prod_max_size bp around the
+            ##feature, clipped at the sequence ends (slice is zero-based)
+            featLocation = mytarget.location.start.position
+            slice_start = max(featLocation - prod_max_size, 0)
+            slice_end = min(featLocation + prod_max_size, len(rec))
+            targetRec = rec[slice_start:slice_end]
+            matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
+            if not matching_feature:
+                ##target is absent from the sliced record, so nothing to design against
+                continue
+            target_feat = matching_feature[0]
+            if target_feat.location.start.position == 0:
+                target_feat.location.start.position = 1
+            ##every other feature in the window is masked (all are SNPs/indels);
+            ##a smarter filter could be added
+            exclude_feat = list(targetRec.features)
+            exclude_feat.remove(target_feat)
+            mask_str = [str(f.location.start.position + 1) + "," + str(f.location.end.position + 1)
+                        for f in exclude_feat]
+            p3_exclude_str = ':'.join(mask_str)
+            p3_target = str(target_feat.location.start.position + 1) + "," + str(target_feat.location.end.position + 1)
+            primer = design_primer_pair(targetRec, p3_target, p3_exclude_str)
+            ##just return first set, if there is one
+            if primer is not None:
+                outputstr = [mytarget.id, primer.forward_seq, primer.reverse_seq, primer.size]
+            else:
+                outputstr = [mytarget.id, "NONE", "NONE", "NONE"]
+            print('\t'.join(map(str, outputstr)))
+finally:
+    in_gff_handle.close()
+    in_seq_handle.close()
diff -r 000000000000 -r 21053f7f9ed1 design_primers.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/design_primers.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,79 @@
+
+
+ Design PCR Primers to Features
+ design_primers.py $inputfastaFile $inputSNPfile $inputTargetfile $min_size $max_size > $primer_outputfile
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP**
+
+This tool designs primer pairs to flank features
+
+It takes
+
+* an input reference sequence file
+* a gff3 file containing feature information
+* a single column file containing list of features
+
+----
+
+**Example**
+
+--input GFF
+
+::
+
+ PGSC0003DMB000000001 samtools SNP 6345 6346 4.84 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:6345;Variant_seq=C;Reference_seq=T;Total_reads=2
+ PGSC0003DMB000000001 samtools SNP 6453 6454 18 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:6453;Variant_seq=T;Reference_seq=G;Total_reads=8
+ PGSC0003DMB000000001 samtools SNP 7255 7256 149 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:7255;Variant_seq=G;Reference_seq=T;Total_reads=14
+ PGSC0003DMB000000001 samtools SNP 7371 7372 86.8 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:7371;Variant_seq=C;Reference_seq=T;Total_reads=9
+ PGSC0003DMB000000001 samtools SNP 8288 8289 10.7 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:8288;Variant_seq=A;Reference_seq=G;Total_reads=5
+
+
+--input features
+
+::
+
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1012901
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1021771
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1025761
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026717
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026834
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1029542
+
+
+--output columnar data
+
+::
+
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1012901 AGAGGTCGGCTCTCTAGTAGCA GGGGATCCACTAACTATGTCACTT 86
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1021771 CCTATGCGAGAAAGGGACAC GCCCTTCCATGTTGTACGAG 100
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1025761 TGTGAGTAACTTAGTGTCCTACGTCAA CACTCAATGAGCCAAAGCAA 92
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026717 TTCCTAAGTCATGGGAAAGCA AGTTCATCCAAGGCAAGCAT 76
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1026834 AATGAAGTGACTGGGGAGGA TGCTGGTCGAAGCTTTCTTT 98
+ PGSC0003DMB000000001:SAMTOOLS:SNP:1029542 TAACCAGAAAGTCCGGATGG TTCTGAAGTCAAGTGGGGAGA 75
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 find_CAPS.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_CAPS.py Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,93 @@
+#!/usr/bin/python2.6
+##find snps that condition CAPS
+##usage find_CAPS.py FASTA_FILE GFF_FILE (tabular results are printed to stdout)
+
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+
+import sys
+
+from Bio import SeqIO
+from BCBio import GFF
+from Bio.Restriction import *
+
+###This list is limited to economical enzymes performing well in PCR buffer
+rest_batch = RestrictionBatch(
+    [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI,
+     HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI])
+
+##command line: reference fasta, then the gff3 holding the SNP features
+in_file = sys.argv[1]
+gff_file = sys.argv[2]
+
+in_seq_handle = open(in_file)
+in_gff_handle = open(gff_file)
+
+##half-width of the window digested around each SNP
+flank = 20
+
+##walk the fasta one record at a time
+for fasta_rec in SeqIO.parse(in_seq_handle, "fasta"):
+    ##the gff is re-read from the top for every record, keeping only
+    ##SNP features that belong to this sequence
+    in_gff_handle.seek(0)
+    annotated = list(GFF.parse(in_gff_handle,
+                               base_dict={fasta_rec.id: fasta_rec},
+                               limit_info=dict(gff_id=[fasta_rec.id],
+                                               gff_type=['SNP'])))
+
+    ##nothing annotated on this sequence
+    if not annotated:
+        continue
+    ##work with the first record handed back by the parser
+    rec = annotated[0]
+    for feat in rec.features:
+        snp_start = feat.location.start.position
+        snp_end = feat.location.end.position
+
+        ##SNPs within 20 bp of either end are ignored
+        if not (flank < snp_start < len(rec) - flank):
+            continue
+
+        ref_window = rec.seq[snp_start - flank:snp_start + flank]
+        ##substitute the variant allele at the SNP position in the window
+        editable = ref_window.tomutable()
+        editable[flank] = feat.qualifiers['Variant_seq'][0]
+        var_window = editable.toseq()
+
+        ##digest both alleles with every enzyme in the batch
+        ref_cuts = rest_batch.search(ref_window)
+        var_cuts = rest_batch.search(var_window)
+
+        ##report enzymes whose number of distinct cut sites differs between
+        ##the alleles, labelled with the allele that is cut more often
+        ##columns: record id, start + 1, end + 1, feature id, enzyme, phase
+        for enz in ref_cuts:
+            n_ref = len(set(ref_cuts[enz]))
+            n_var = len(set(var_cuts[enz]))
+            if n_ref == n_var:
+                continue
+            phase = "reference" if n_ref > n_var else "variant"
+            fields = [rec.id, snp_start + 1, snp_end + 1, feat.id, enz, phase]
+            print('\t'.join(map(str, fields)))
+
+in_gff_handle.close()
+in_seq_handle.close()
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 find_CAPS.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_CAPS.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,81 @@
+
+
+ identify SNPs that condition restriction polymorphisms
+ find_CAPS.py $inputFasta $inputSNPGff3File > $outputfile
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP**
+
+This tool identifies SNPs that condition restriction polymorphisms.
+
+Currently it utilizes a fixed list of robust enzymes known to work well in PCR buffers
+
+ AluI,ApaI,BamHI,BbrPI,BfrI,ClaI,DdeI,DpnII,DraI,EcoRI,HaeIII,HindII,HinfI,HpaI,PvuII,RsaI,SacI,Sau3AI,SmaI,TaqI
+
+It produces a tabular output in interval format
+
+record ID, start, stop, feature ID,enzyme, phase (ie whether it cuts reference or variant sequence)
+
+
+
+
+
+**Example**
+
+
+
+*input GFF*
+
+::
+
+ JR843866 gsmapper SNP 63 63 . . . ID=JR843866:gsmapperSNP:63;Reference_seq=T;Variant_seq=C;Total_reads=22;Variant_reads=20
+ JR843866 gsmapper SNP 146 146 . . . ID=JR843866:gsmapperSNP:146;Reference_seq=T;Variant_seq=C;Total_reads=26;Variant_reads=10
+ JR843866 gsmapper SNP 258 258 . . . ID=JR843866:gsmapperSNP:258;Reference_seq=T;Variant_seq=G;Total_reads=4;Variant_reads=3
+ JR848320 gsmapper SNP 157 157 . . . ID=JR848320:gsmapperSNP:157;Reference_seq=C;Variant_seq=T;Total_reads=10;Variant_reads=10
+ JR848554 gsmapper SNP 54 54 . . . ID=JR848554:gsmapperSNP:54;Reference_seq=T;Variant_seq=G;Total_reads=5;Variant_reads=5
+ JR848554 gsmapper SNP 74 74 . . . ID=JR848554:gsmapperSNP:74;Reference_seq=C;Variant_seq=T;Total_reads=7;Variant_reads=7
+ JR848554 gsmapper SNP 123 123 . . . ID=JR848554:gsmapperSNP:123;Reference_seq=T;Variant_seq=A;Total_reads=11;Variant_reads=11
+ JR848554 gsmapper SNP 147 147 . . . ID=JR848554:gsmapperSNP:147;Reference_seq=T;Variant_seq=C;Total_reads=13;Variant_reads=13
+ JR848554 gsmapper SNP 161 161 . . . ID=JR848554:gsmapperSNP:161;Reference_seq=C;Variant_seq=T;Total_reads=13;Variant_reads=13
+
+
+
+*output columnar data*
+
+::
+
+ JR843866 63 64 JR843866:gsmapperSNP:63 HaeIII variant
+ JR848320 157 158 JR848320:gsmapperSNP:157 TaqI variant
+ JR848320 157 158 JR848320:gsmapperSNP:157 HinfI variant
+ JR848554 162 163 JR848554:gsmapperSNP:162 TaqI variant
+ JR848554 162 163 JR848554:gsmapperSNP:162 ClaI variant
+ JR848554 306 307 JR848554:gsmapperSNP:306 TaqI variant
+ JR848554 652 653 JR848554:gsmapperSNP:652 TaqI variant
+
+
+-------------------------------------------------------------------------------
+
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 gsmapper2gff.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gsmapper2gff.sh Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,50 @@
+#!/bin/sh
+##convert gsMapper output into gff3/GVF format
+
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+infile=$1
+outfile=$2
+
+# Variant summary lines start with ">" and carry a percentage in field 7; lines
+# that start with ">" but have no "%" in field 7 are skipped by the sub() test.
+awk '
+BEGIN {OFS="\t"}
+/^>/ && sub(/%/,"",$7) {
+    # drop the leading ">" to recover the reference accession
+    ID=substr($1,2)
+    # multi-base or gapped alleles are indels, everything else is a SNP
+    if (length($4) > 1 || match($4,"-") || length($5) > 1 || match($5,"-"))
+        type="indel"
+    else
+        type="SNP"
+    start=$2
+    end=$3
+    Col9_ID=ID ":gsmapper:" type ":" start
+    Total_reads=$6
+    # the frequency is reported as a percentage, so round to the nearest read
+    # (truncating turned e.g. 92% of 13 reads into 11 instead of 12)
+    Variant_reads=int(Total_reads * $7 / 100 + 0.5)
+    print ID,"gsmapper",type,start,end,".",".",".","ID="Col9_ID";Reference_seq="$4";Variant_seq="$5";Total_reads="Total_reads";Variant_reads="Variant_reads
+}' "$infile" > "$outfile"
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 parse_primersearch.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_primersearch.pl Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,37 @@
+#!/usr/bin/perl
+#parse_primersearch.pl
+#reformat EMBOSS primersearch output into columnar Galaxy interval format
+
+#Copyright 2012 John McCallum
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Usage: parse_primersearch.pl PRIMERSEARCH_OUTPUT TABULAR_OUTPUT
+open (IN, "<", $ARGV[0]) or die "Cannot read $ARGV[0]: $!";
+open (OUT, ">", $ARGV[1]) or die "Cannot write $ARGV[1]: $!";
+
+# Output columns: primer set, sequence ID, mismatches, start, end (start + length), amplimer length; each row is built up piecewise as its lines are read.
+while (<IN>) {
+    /^Primer name (\S+)/ && ($name = $1); # get primer set name
+    # Modified to cope with unnamed sequence input 28/7/05
+    /Sequence: (\S+)/ && print OUT $name,"\t",$1;
+    /Sequence:(\s{4,})/ && print OUT $name,"\t","unnamed_seq";
+    # the forward hit supplies mismatch count and start; \d+ so counts of 10 or more also parse
+    /hits forward strand at (\d+) with (\d+) mismatches/ && ($start = $1) && print OUT "\t",$2,"\t",$start;
+    # the amplimer length closes the row
+    /Amplimer length: (\S+)/ && ($amp_length = $1) && print OUT "\t",$start + $amp_length,"\t",$1,"\n";
+}
+
+close( IN );
+close( OUT );
diff -r 000000000000 -r 21053f7f9ed1 parse_primersearch.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_primersearch.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,68 @@
+
+
+ Parse EMBOSS primer search output to tabular
+ parse_primersearch.pl $inputPrimersearchFile $outputfile
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP**
+
+This tool parses EMBOSS primersearch_ output into columnar format suitable for use as interval data
+
+
+
+Columns are:
+
+1. Primer set ID
+2. Hit ID
+3. Number of mismatches
+4. Amplimer start
+5. Amplimer end
+6. Amplicon length
+
+
+----
+
+**Example**
+
+*output*
+
+::
+
+ ACP032 isotig07062 0 214 363 149
+ ACP223 isotig04647 0 362 574 212
+ ACP224 isotig04647 0 303 519 216
+ ACP225 isotig04647 0 153 355 202
+ ACP363 isotig10393 0 93 193 100
+ ACP394 isotig00271 0 894 986 92
+ ACP394 isotig00273 0 805 897 92
+ ACP440 isotig05277 0 506 601 95
+ ACP615 isotig00271 0 894 978 84
+ ACP615 isotig00273 0 805 889 84
+ AJK295 isotig06005 0 182 651 469
+
+
+.. _primersearch: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/primersearch.html
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 patman.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/patman.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,51 @@
+
+
+ search for approximate patterns in DNA libraries
+ patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This is a Galaxy wrapper for PatMaN: a DNA pattern matcher for short sequences
+
+* Website https://bioinf.eva.mpg.de/patman/
+* PubMed Citation_
+
+*Inputs*
+
+* Patterns in fasta format (create from tabular using tabular-to-fasta tool)
+* Multifasta file of target sequences
+
+*Output*
+
+* 6 Column tabular interval data
+* Columns Chrom, Pattern Name, Start, End, strand, N mismatches
+
+::
+
+ isotig05934 ACP818 368 389 + 0
+ isotig05934 ACP859 377 396 + 0
+ isotig06765 ACP822 448 468 + 0
+ isotig07088 ACP825 49 75 + 0
+ isotig07652 ACP830 199 218 + 0
+ isotig07652 ACP831 257 276 + 0
+ isotig10333 ACP837 474 497 + 0
+ isotig10393 ACP838 10 34 + 0
+
+
+
+.. _Citation: http://www.ncbi.nlm.nih.gov/pubmed/18467344?dopt=Abstract
+
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 patman2gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/patman2gff.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,26 @@
+
+
+ convert output of patman pattern detection to gvf/gff3
+ awk 'OFS="\t" {print $1,"patman","primer_binding_site",$3,$4,$6,$5,".","name="$2}' $inputFile > $outputfile
+
+
+
+
+
+
+
+
+This tool provides a simple conversion from patman column output to GFF3
+
+-------------------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
diff -r 000000000000 -r 21053f7f9ed1 uniq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/uniq.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,12 @@
+
+
+ Return unique lines
+ cat $inputFile | sort | uniq > $outputfile
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 21053f7f9ed1 vcf2gvf.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2gvf.sh Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,46 @@
+#!/bin/sh
+##convert vcf to gvf
+##NOTE This is a very simple basic parser for a complex format.
+
+##usage vcf2gvf.sh VCF_FILE OUTPUT_GVF_FILE
+
+#Copyright 2012 John McCallum & Leshi Chen
+#New Zealand Institute for Plant and Food Research
+
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+
+inputfile=$1
+outputfile=$2
+
+# Write the GVF pragma first; awk then appends the records (it used to truncate the file and wipe the pragma out).
+echo "##gvf-version 1.05" > "$outputfile"
+
+awk '
+BEGIN {OFS="\t"}
+
+# skip VCF meta and header lines
+/^#/ {next}
+
+{
+    # INFO ($8) beginning with "INDEL" marks an indel; anything else is a SNP
+    if (index($8,"INDEL") == 1) {type="INDEL"; feat_length=length($4)}
+    else {type="SNP"; feat_length=1}
+    end=$2+feat_length
+    print $1,"SAMTOOLS",type,$2,end,$6,".",".","ID="$1":SAMTOOLS:"type":"$2";Variant_seq="$5";Reference_seq="$4";"$8
+}
+END {print ""}
+' "$inputfile" >> "$outputfile"
\ No newline at end of file
diff -r 000000000000 -r 21053f7f9ed1 vcf2gvf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2gvf.xml Thu Jun 14 19:29:26 2012 -0400
@@ -0,0 +1,55 @@
+
+
+ convert vcf to gvf/gff3
+ vcf2gvf.sh $inputFile $outputfile
+
+
+
+
+
+
+
+
+This tool provides a simple conversion from vcf to gvf.
+
+Be sure to read the documentation to determine if it meets your requirements.
+
+* vcf documentation at http://samtools.sourceforge.net/samtools.shtml#6
+* GVF/GFF3 at http://www.sequenceontology.org/resources/gvf.html
+
+
+
+**input**
+
+::
+
+ PGSC0003DMB000000010 2042429 . C A 44.6 . DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1 GT:PL:DP:GQ 0/1:14,0,42:5:23 1/1:27,6,0:2:9 1/1:15,3,0:1:7 1/1:30,6,0:2:9
+ PGSC0003DMB000000038 1756646 . G A 3.69 . DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31 GT:PL:DP:GQ 0/1:20,3,0:1:6 0/1:9,0,67:7:8 0/0:0,15,82:5:17 0/1:16,3,0:1:5
+ PGSC0003DMB000000064 1916664 . T C 8.12 . DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5 GT:PL:DP:GQ 1/1:14,3,0:1:5 1/1:0,0,0:0:3 1/1:13,3,0:1:5 1/1:15,3,0:1:5
+
+
+**output**
+
+
+::
+
+ PGSC0003DMB000000010 samtools SNP 2042429 2042430 44.6 . . ID=PGSC0003DMB000000010:SAMTOOLS:SNP:2042429;Variant_seq=A;Reference_seq=C;DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1
+ PGSC0003DMB000000038 samtools SNP 1756646 1756647 3.69 . . ID=PGSC0003DMB000000038:SAMTOOLS:SNP:1756646;Variant_seq=A;Reference_seq=G;DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31
+ PGSC0003DMB000000064 samtools SNP 1916664 1916665 8.12 . . ID=PGSC0003DMB000000064:SAMTOOLS:SNP:1916664;Variant_seq=C;Reference_seq=T;DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5
+
+
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
+