# HG changeset patch # User john-mccallum # Date 1339716566 14400 # Node ID 21053f7f9ed1025ea85ce219073d62a59d4052c9 First upload of PCR Marker tools diff -r 000000000000 -r 21053f7f9ed1 CAPS2gff.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CAPS2gff.sh Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,21 @@ +#!/bin/sh +##convert output of CAPS detection tool to GFF3 +#Copyright 2012 John McCallum +#New Zealand Institute for Plant and Food Research + +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +inputfile=$1 +outputfile=$2 +awk -F '\t' 'split($4,ID,":") {print ID[1], "FINDCAPS",ID[3],ID[4],ID[4],".",".",".","ID="$1";Enzyme="$5";Phase="$6}' $inputfile > $outputfile diff -r 000000000000 -r 21053f7f9ed1 CAPS2gff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CAPS2gff.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,32 @@ + + + convert output of CAPS detection to gvf/gff3 + CAPS2gff.sh $inputFile $outputfile + + + + + + + + +This tool provides a simple conversion from CAPS 3 column output to GFF3 + +----------------------- + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + + + + + + + + diff -r 000000000000 -r 21053f7f9ed1 GVF_Features_Extracter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GVF_Features_Extracter.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,54 @@ + + + Extract unique feature IDs from a GVF/GFF3 file + awk '!/^#/ {print $1":"$2":"$3":"$4}' $inputgff3File | sort | uniq > $column_outputfile + + + + + + + + +.. class:: infomark + +**TIP** + +This tool extracts a unique feature identifier from GVF/GFF3 format, primarily to enable downstream analysis of SNPs or marker design + + +---- + +**Example** + +*input GFF3* + +:: + + ##gff-version 3 + 1119927 gsMapper SNP 434 434 . . . ID=1119927:gsMapper:SNP:434;Reference_seq=G;Variant_seq=A;Total_reads=10;Variant_freq=100;Enzyme=AluI;Phase=variant + 1120709 gsMapper SNP 361 361 . . . ID=1120709:gsMapper:SNP:361;Reference_seq=T;Variant_seq=C;Total_reads=22;Variant_freq=68;Enzyme=TaqI;Phase=variant + 1120709 gsMapper SNP 704 704 . . . ID=1120709:gsMapper:SNP:704;Reference_seq=A;Variant_seq=G;Total_reads=20;Variant_freq=90;Enzyme=RsaI;Phase=variant + +*output tabular text* + +:: + + 1119927:gsMapper:SNP:434 + 1120709:gsMapper:SNP:361 + 1120709:gsMapper:SNP:704 + + ----------------------------------------------------- + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + + + + diff -r 000000000000 -r 21053f7f9ed1 convert_gsMapper_gff3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/convert_gsMapper_gff3.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,70 @@ + + + Convert Roche gsMapper to GFF3 + gsmapper2gff.sh $inputGsFile $outputfile + + + + + + + + +.. class:: infomark + +**TIP** + +This tool parses Roche gsMapper read mapping output into GVF/GFF3 format + + +---- + +**Example** + +*input* + +:: + + >Reference Start End Ref Var Total Var Ref Var Coding Region Known # Fwd # Rev # Fwd # Rev + >Accno Pos Pos Nuc Nuc Depth Freq AA AA Frame Name SNP's w/ var w/ var Total Total + ______________________________ + + >1118212 673 673 A C 7 100% 6 1 6 1 + + Reads with Difference: + 1118212 648+ GTTGGTCCACTATTACTCTCAGATT-ATTTCATAACAATAATGG----A-TAC-AA 696 + ** + FX289JP01DVQR7 53- GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGGGCTGA-TACTA 1 + FX289JP02IJT2O (7) 82+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 130 + FX289JP01B8R7V 84+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATA-TGG----A-TAC-AA 131 + FX289JP02FX58L 68+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----AC-AC-AA 116 + FX289JP02JXAX7 (7) 67+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 115 + FX289JP02JOOQZ (2) 69+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 117 + FX289JP02GPHPX 45+ GTTGGTCCACTATTACTCTCAGATTC-TTTCATAACAATAATGG----A-TAC-AA 93 + ** + + +*output* + +:: + + ##gff-version 3 + 1118212 gsMapper SNP 673 673 . . . ID=1118212:gsMapper:SNP:673;Reference_seq=A;Variant_seq=C;Total_reads=7;Variant_freq=100; + 1118212 gsMapper SNP 730 730 . . . ID=1118212:gsMapper:SNP:730;Reference_seq=A;Variant_seq=G;Total_reads=13;Variant_freq=31; + 1118212 gsMapper SNP 782 782 . . . ID=1118212:gsMapper:SNP:782;Reference_seq=T;Variant_seq=C;Total_reads=13;Variant_freq=92; + 1118212 gsMapper SNP 1319 1319 . . . ID=1118212:gsMapper:SNP:1319;Reference_seq=G;Variant_seq=A;Total_reads=7;Variant_freq=100; + +----------------------- + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + + + + diff -r 000000000000 -r 21053f7f9ed1 design_primers.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/design_primers.py Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,153 @@ +#!/usr/local/bin/python2.6 +##design primers to features in multiple sequences +##usage: python design_primers.py + + +#Copyright 2012 John McCallum & Leshi Chen +#New Zealand Institute for Plant and Food Research +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +import os +import StringIO +import re +import tempfile +import subprocess +import copy +import sys +from BCBio import GFF +from BCBio.GFF import GFFExaminer +from Bio import SeqIO +from Bio.Emboss.Applications import Primer3Commandline +from Bio.Emboss import Primer3 + + +in_file = sys.argv[1] +gff_file = sys.argv[2] +target_file = sys.argv[3] +prod_min_size = int(sys.argv[4]) +prod_max_size = int(sys.argv[5]) + +max_tm_diff=1 ## +opt_GC_percent=50 ## +maxpolyx=4 ## +optimum_length=20 +##target is specified in start, end format +productsizerange = str(prod_min_size) + "," + str(prod_max_size) +#open input files +in_seq_handle = open(in_file) +in_gff_handle = open(gff_file) +in_target_handle=open(target_file) +#read target feature IDs into list +targets=in_target_handle.readlines() +in_target_handle.close() +##and create a hit list of sequences from this +target_seq_id_list = list(set([line.split(":")[0] for line in targets])) +##create iterator returning sequence records +for myrec in SeqIO.parse(in_seq_handle, "fasta"): + #check if this sequence is included in the target list + if myrec.id in target_seq_id_list: + ##create sequence dictionary so we can add in gff annotations + seq_dict = {myrec.id : myrec} + ##just limit to gff annotations for this sequence + limit_info = dict(gff_id = [ myrec.id ]) + ##rewind gff filehandle + in_gff_handle.seek(0) + ##read annotations into sequence dictionary for this sequence record only + annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)] + ##if there are any annotations, then proceed. + if annotations: + rec=annotations[0] + ##iterate over list of target IDs + for t in targets: + target_ID = t.strip('\n') + target_annotations = [f for f in rec.features if f.id == target_ID ] + if target_annotations: + mytarget = target_annotations[0] + #create temporary files + tempfastaFile = tempfile.mktemp() + tempproutfile = tempfile.mktemp() + #just consider slice of sequence in a window of +/- prod_max_size bp + ##from feature UNLESS feature is close to end + ##Note that slice is zero-based + featLocation = mytarget.location.start.position + if featLocation > prod_max_size: + slice_start = featLocation - prod_max_size + featPosition = prod_max_size + else: + slice_start = 0 + featPosition = featLocation + if (len(rec) - featLocation) < prod_max_size: + slice_end = len(rec) + else: + slice_end = featLocation + prod_max_size + ###grab slice of sequence fom this window. + targetRec = rec[slice_start:slice_end] + matching_feature = [f for f in targetRec.features if f.id == mytarget.id] + if matching_feature: + target_feat = matching_feature[0] + if target_feat.location.start.position == 0: + target_feat.location.start.position = 1 + #we get the mask features by removing the target...all features are masked as just using snp and indels + ##a smarter filter could be added + ##note use of list copy to avoid possible side-effects + exclude_feat = list(targetRec.features) + exclude_feat.remove(target_feat) + ##print'targetRec.features', targetRec.features ##for debug + mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat) + #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat) + p3_exclude_str = str(mask_str).replace('\', \'',':') + p3_target = str(target_feat.location.start.position +1) + "," + str(target_feat.location.end.position +1) + #write sequence record into template file as fasta + t_output_handle = open(tempfastaFile, "w") + SeqIO.write([targetRec], t_output_handle, "fasta") + t_output_handle.close() + #create Primer3Commandline() for emboss tool + primer_cl = Primer3Commandline() + #set the emboss tool to suppress output as this will make Galaxy think it is error message although it is a message to state run success + primer_cl.set_parameter("-auto",'1') + #pass sequence file to emboss + primer_cl.set_parameter("-sequence",tempfastaFile) + #add target location + primer_cl.set_parameter("-target", p3_target) + ##mask off other features...dumb masking of everything at present, beware + if (p3_exclude_str != ""): + primer_cl.set_parameter("-excludedregion", p3_exclude_str) + #add temporary output file to get the result + primer_cl.set_parameter("-outfile", tempproutfile) + #specify maximum different of tm + primer_cl.set_parameter("-maxdifftm",max_tm_diff ) + #other useful parameters + primer_cl.set_parameter("-ogcpercent", opt_GC_percent) + primer_cl.set_parameter("-opolyxmax", maxpolyx) + primer_cl.set_parameter("-osize", optimum_length) + #set product size range + primer_cl.set_parameter("-prange", productsizerange) + #using python subprocess method to run emboss command line programe with the parameters given + fnull = open(os.devnull, 'w') + result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull) + #read temporary outputfile + handle = open(tempproutfile) + record = Primer3.read(handle) + ##just return first set, if there is one + if len(record.primers) > 0: + primer= record.primers[0] + outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size] + else: + outputstr=[mytarget.id,"NONE","NONE","NONE"] + print('\t'.join(map(str,outputstr))) + + +in_gff_handle.close() +in_seq_handle.close() diff -r 000000000000 -r 21053f7f9ed1 design_primers.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/design_primers.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,79 @@ + + + Design PCR Primers to Features + design_primers.py $inputfastaFile $inputSNPfile $inputTargetfile $min_size $max_size > $primer_outputfile + + + + + + + + + + + +.. class:: infomark + +**TIP** + +This tool designs primer pairs to flank features + +It takes + +* an input reference sequence file +* a gff3 file containing feature information +* a single column file containing list of features + +---- + +**Example** + +--input GFF + +:: + + PGSC0003DMB000000001 samtools SNP 6345 6346 4.84 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:6345;Variant_seq=C;Reference_seq=T;Total_reads=2 + PGSC0003DMB000000001 samtools SNP 6453 6454 18 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:6453;Variant_seq=T;Reference_seq=G;Total_reads=8 + PGSC0003DMB000000001 samtools SNP 7255 7256 149 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:7255;Variant_seq=G;Reference_seq=T;Total_reads=14 + PGSC0003DMB000000001 samtools SNP 7371 7372 86.8 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:7371;Variant_seq=C;Reference_seq=T;Total_reads=9 + PGSC0003DMB000000001 samtools SNP 8288 8289 10.7 . . ID=PGSC0003DMB000000001:SAMTOOLS:SNP:8288;Variant_seq=A;Reference_seq=G;Total_reads=5 + + +--input features + +:: + + PGSC0003DMB000000001:SAMTOOLS:SNP:1012901 + PGSC0003DMB000000001:SAMTOOLS:SNP:1021771 + PGSC0003DMB000000001:SAMTOOLS:SNP:1025761 + PGSC0003DMB000000001:SAMTOOLS:SNP:1026717 + PGSC0003DMB000000001:SAMTOOLS:SNP:1026834 + PGSC0003DMB000000001:SAMTOOLS:SNP:1029542 + + +--output columnar data + +:: + + PGSC0003DMB000000001:SAMTOOLS:SNP:1012901 AGAGGTCGGCTCTCTAGTAGCA GGGGATCCACTAACTATGTCACTT 86 + PGSC0003DMB000000001:SAMTOOLS:SNP:1021771 CCTATGCGAGAAAGGGACAC GCCCTTCCATGTTGTACGAG 100 + PGSC0003DMB000000001:SAMTOOLS:SNP:1025761 TGTGAGTAACTTAGTGTCCTACGTCAA CACTCAATGAGCCAAAGCAA 92 + PGSC0003DMB000000001:SAMTOOLS:SNP:1026717 TTCCTAAGTCATGGGAAAGCA AGTTCATCCAAGGCAAGCAT 76 + PGSC0003DMB000000001:SAMTOOLS:SNP:1026834 AATGAAGTGACTGGGGAGGA TGCTGGTCGAAGCTTTCTTT 98 + PGSC0003DMB000000001:SAMTOOLS:SNP:1029542 TAACCAGAAAGTCCGGATGG TTCTGAAGTCAAGTGGGGAGA 75 + +----------------------- + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + + + + diff -r 000000000000 -r 21053f7f9ed1 find_CAPS.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/find_CAPS.py Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,93 @@ +#!/usr/bin/python2.6 +##find snps that condition CAPS +##usage find_CAPS.py + + +#Copyright 2012 John McCallum & Leshi Chen +#New Zealand Institute for Plant and Food Research +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + + +import sys + +from Bio import SeqIO +from BCBio import GFF +from Bio.Restriction import * + +###This list is limited to economical enzymes performing well in PCR buffer +rest_batch = RestrictionBatch( + [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI, + HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI]) + +in_file=sys.argv[1] +gff_file=sys.argv[2] + +in_seq_handle = open(in_file) +in_gff_handle=open(gff_file) + +##use iterator +for myrec in SeqIO.parse(in_seq_handle, "fasta"): + ##create single-entry dictionary to accept gff annotations from parser + seq_dict = {myrec.id:myrec} + + ##note that this filters out only SNP features + limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP']) + in_gff_handle.seek(0) + + ##parse annotations into + annotations = [r for r + in GFF.parse(in_gff_handle, + base_dict=seq_dict, + limit_info=limit_info)] + + ##if there are any for this sequence, proceed + if annotations: + rec=annotations[0] + for feat in rec.features: + fstart=feat.location.start.position + fend=feat.location.end.position + + if 20 < fstart < len(rec) - 20: + #just work with +/- 20 bp, ignoring SNPS within this + #distance from ends + fseq=rec.seq[fstart-20:fstart+20] + ref_seq = rec.seq[fstart-20:fstart+20] + variant_seq = ref_seq.tomutable() + + #mutate the variant + variant_seq[20]= feat.qualifiers['Variant_seq'][0] + variant_seq = variant_seq.toseq() + + #digest the sequences + ref_cuts = rest_batch.search(ref_seq) + var_cuts = rest_batch.search(variant_seq) + + #print + for enz in ref_cuts: + kr = set(ref_cuts[enz]) + km = set(var_cuts[enz]) + outputstr=[rec.id, fstart +1,fend+1,feat.id,enz] + if len(kr) > len(km): + outputstr.append("reference") + print('\t'.join(map(str,outputstr))) + elif len(kr) < len(km): + outputstr.append("variant") + print('\t'.join(map(str,outputstr))) + +in_gff_handle.close() +in_seq_handle.close() + + + diff -r 000000000000 -r 21053f7f9ed1 find_CAPS.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/find_CAPS.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,81 @@ + + + identify SNPs that condition restriction polymorphisms + find_CAPS.py $inputFasta $inputSNPGff3File > $outputfile + + + + + + + + +.. class:: infomark + +**TIP** + +This tool identifies SNPs that condition restriction polymorphisms. + +Currently it utilizes a fixed list of robust enzymes known to work well in PCR buffers + + AluI,ApaI,BamHI,BbrPI,BfrI,ClaI,DpnI,DraI,EcoRI,HaeIII,HindII,HinfI,HpaI,PvuII,RsaI,SacI,Sau3AI,SmaI,TaqI + +It produces a tabular output in interval format + +record ID, start, stop, feature ID,enzyme, phase (ie whether it cuts reference or variant sequence) + + + + + +**Example** + + + +*input GFF* + +:: + + JR843866 gsmapper SNP 63 63 . . . ID=JR843866:gsmapperSNP:63;Reference_seq=T;Variant_seq=C;Total_reads=22;Variant_reads=20 + JR843866 gsmapper SNP 146 146 . . . ID=JR843866:gsmapperSNP:146;Reference_seq=T;Variant_seq=C;Total_reads=26;Variant_reads=10 + JR843866 gsmapper SNP 258 258 . . . ID=JR843866:gsmapperSNP:258;Reference_seq=T;Variant_seq=G;Total_reads=4;Variant_reads=3 + JR848320 gsmapper SNP 157 157 . . . ID=JR848320:gsmapperSNP:157;Reference_seq=C;Variant_seq=T;Total_reads=10;Variant_reads=10 + JR848554 gsmapper SNP 54 54 . . . ID=JR848554:gsmapperSNP:54;Reference_seq=T;Variant_seq=G;Total_reads=5;Variant_reads=5 + JR848554 gsmapper SNP 74 74 . . . ID=JR848554:gsmapperSNP:74;Reference_seq=C;Variant_seq=T;Total_reads=7;Variant_reads=7 + JR848554 gsmapper SNP 123 123 . . . ID=JR848554:gsmapperSNP:123;Reference_seq=T;Variant_seq=A;Total_reads=11;Variant_reads=11 + JR848554 gsmapper SNP 147 147 . . . ID=JR848554:gsmapperSNP:147;Reference_seq=T;Variant_seq=C;Total_reads=13;Variant_reads=13 + JR848554 gsmapper SNP 161 161 . . . ID=JR848554:gsmapperSNP:161;Reference_seq=C;Variant_seq=T;Total_reads=13;Variant_reads=13 + + + +*output columnar data* + +:: + + JR843866 63 64 JR843866:gsmapperSNP:63 HaeIII variant + JR848320 157 158 JR848320:gsmapperSNP:157 TaqI variant + JR848320 157 158 JR848320:gsmapperSNP:157 HinfI variant + JR848554 162 163 JR848554:gsmapperSNP:162 TaqI variant + JR848554 162 163 JR848554:gsmapperSNP:162 ClaI variant + JR848554 306 307 JR848554:gsmapperSNP:306 TaqI variant + JR848554 652 653 JR848554:gsmapperSNP:652 TaqI variant + + +------------------------------------------------------------------------------- + + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + + + + + + + diff -r 000000000000 -r 21053f7f9ed1 gsmapper2gff.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gsmapper2gff.sh Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,50 @@ +#!/bin/sh +##convert gsMapper output into gff3/GVF format + +#New Zealand Institute for Plant and Food Research +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +infile=$1 +outfile=$2 + +awk ' +BEGIN {OFS="\t"} +/^>/ && sub(/%/,"",$7) { + ID=substr($1,2) + if (length($4) > 1 || match($4,"-") || length($5) > 1 || match($5,"-")) + type="indel" + else + type="SNP" +start=$2 +end=$3 +Col9_ID=ID ":gsmapper:" type ":"start + +Reference_seq=$4 +Variant_seq=$5 +Total_reads=$6 +Variant_reads=Total_reads * $7 /100 - (Total_reads * $7 % 100)/100 + + + + print ID,"gsmapper",type,start,end,".",".",".","ID="Col9_ID";Reference_seq="Reference_seq";Variant_seq="Variant_seq";Total_reads="Total_reads";Variant_reads="Variant_reads +}' "$infile" > "$outfile" + + + + + + + + diff -r 000000000000 -r 21053f7f9ed1 parse_primersearch.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parse_primersearch.pl Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,37 @@ +#!/usr/bin/perl +#parse_primersearch.pl +#reformat EMBOSS primersearch output into columnar Galaxy interval format + +#Copyright 2012 John McCallum +#New Zealand Institute for Plant and Food Research +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +open (IN, "<$ARGV[0]"); +open (OUT, ">$ARGV[1]"); + +#print OUT "primerset_id","\t","sequence_id","\t","hit_start","\","mismatches","\t","amplimer_size",\n"; + + + +while () { + /^Primer name (\S+)/ && ($name = $1); # get primer set name + # Modified to cope with unnamed sequence input 28/7/05 + /Sequence: (\S+)/ && print OUT $name,"\t",$1; + /Sequence:(\s{4,})/ && print OUT $name,"\t","unnamed_seq"; + /hits forward strand at (\d+) with (\d) mismatches/ && ($start = $1) && print OUT "\t",$2,"\t",$start,; + /Amplimer length: (\S+)/ && ($amp_length = $1) && print OUT "\t",$start + $amp_length,"\t",$1,"\n"; + } + +close( IN ); +close( OUT ); diff -r 000000000000 -r 21053f7f9ed1 parse_primersearch.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parse_primersearch.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,68 @@ + + + Parse EMBOSS primer search output to tabular + parse_primersearch.pl $inputPrimersearchFile $outputfile + + + + + + + + + +.. class:: infomark + +**TIP** + +This tool parses EMBOSS primersearch_ output into columnar format suitable for use as interval + + + +Columns are: + +1. Primer set ID +2. Hit ID +3. Number of mismatches +4. Amplimer start +5. Amplimer end +6. Amplicon length + + +---- + +**Example** + +*output* + +:: + + ACP032 isotig07062 0 214 363 149 + ACP223 isotig04647 0 362 574 212 + ACP224 isotig04647 0 303 519 216 + ACP225 isotig04647 0 153 355 202 + ACP363 isotig10393 0 93 193 100 + ACP394 isotig00271 0 894 986 92 + ACP394 isotig00273 0 805 897 92 + ACP440 isotig05277 0 506 601 95 + ACP615 isotig00271 0 894 978 84 + ACP615 isotig00273 0 805 889 84 + AJK295 isotig06005 0 182 651 469 + + +.. _primersearch: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/primersearch.html + +----------------------- + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + + + + diff -r 000000000000 -r 21053f7f9ed1 patman.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/patman.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,51 @@ + + + search for approximate patterns in DNA libraries + patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile + + + + + + + + + + + + + + +This is a Galaxy wrapper for PatMaN: a DNA pattern matcher for short sequences + +* Website https://bioinf.eva.mpg.de/patman/ +* PubMed Citation_ + +*Inputs* + +* Patterns in fasta format (create from tabular using tabular-to-fasta tool) +* Multifasta file of target sequences + +*Output* + +* 6 Column tabular interval data +* Columns Chrom, Pattern Name, Start, End, strand, N mismatches + +:: + + isotig05934 ACP818 368 389 + 0 + isotig05934 ACP859 377 396 + 0 + isotig06765 ACP822 448 468 + 0 + isotig07088 ACP825 49 75 + 0 + isotig07652 ACP830 199 218 + 0 + isotig07652 ACP831 257 276 + 0 + isotig10333 ACP837 474 497 + 0 + isotig10393 ACP838 10 34 + 0 + + + +.. _Citation: http://www.ncbi.nlm.nih.gov/pubmed/18467344?dopt=Abstract + + + + diff -r 000000000000 -r 21053f7f9ed1 patman2gff.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/patman2gff.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,26 @@ + + + convert output of patman pattern detection to gvf/gff3 + awk 'OFS="\t" {print $1,"patman","primer_binding_site",$3,$4,$6,$5,".","name="$2}' $inputFile > $outputfile + + + + + + + + +This tool provides a simple conversion from patman column output to GFF3 + +------------------------------- + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + + diff -r 000000000000 -r 21053f7f9ed1 uniq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/uniq.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,12 @@ + + + Return unique lines + cat $inputFile | sort | uniq > $outputfile + + + + + + + + diff -r 000000000000 -r 21053f7f9ed1 vcf2gvf.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf2gvf.sh Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,46 @@ +#!/bin/sh +##convert vcf to gvf +##NOTE This is a very simple basic parser for a complex format. + +##usage vcf2gvf.sh + +#Copyright 2012 John McCallum & Leshi Chen +#New Zealand Institute for Plant and Food Research + +#New Zealand Institute for Plant and Food Research +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + + +inputfile=$1 +outputfile=$2 + +echo "##gvf-version 1.05" > $outputfile + +awk ' +BEGIN {OFS="\t"} + +##get feature type +{if (index($8,"INDEL")== 1) {type="INDEL"} else {type="SNP"} } +##get feature length +{if (type=="SNP") + {feat_length=1} + else {feat_length=length($4)} +} +{end=($2+feat_length)} + +!/^#/ { print $1 ,"SAMTOOLS",type,$2,end,$6,".",".","ID="$1":SAMTOOLS:"type":"$2";Variant_seq="$5";Reference_seq="$4";"$8} + +END {print ""} +' "$inputfile" > "$outputfile" \ No newline at end of file diff -r 000000000000 -r 21053f7f9ed1 vcf2gvf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf2gvf.xml Thu Jun 14 19:29:26 2012 -0400 @@ -0,0 +1,55 @@ + + + convert vcf to gvf/gff3 + vcf2gvf.sh $inputFile $outputfile + + + + + + + + +This tool provides a simple conversion from vcf to gvf. + +Be sure to read the documentation to determine if it meets your requirements. + +* vcf documentation at http://samtools.sourceforge.net/samtools.shtml#6 +* GVF/GFF3 at http://www.sequenceontology.org/resources/gvf.html + + + +**input** + +:: + + PGSC0003DMB000000010 2042429 . C A 44.6 . DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1 GT:PL:DP:GQ 0/1:14,0,42:5:23 1/1:27,6,0:2:9 1/1:15,3,0:1:7 1/1:30,6,0:2:9 + PGSC0003DMB000000038 1756646 . G A 3.69 . DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31 GT:PL:DP:GQ 0/1:20,3,0:1:6 0/1:9,0,67:7:8 0/0:0,15,82:5:17 0/1:16,3,0:1:5 + PGSC0003DMB000000064 1916664 . T C 8.12 . DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5 GT:PL:DP:GQ 1/1:14,3,0:1:5 1/1:0,0,0:0:3 1/1:13,3,0:1:5 1/1:15,3,0:1:5 + + +**output** + + +:: + + PGSC0003DMB000000010 samtools SNP 2042429 2042430 44.6 . . ID=PGSC0003DMB000000010:SAMTOOLS:SNP:2042429;Variant_seq=A;Reference_seq=C;DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1 + PGSC0003DMB000000038 samtools SNP 1756646 1756647 3.69 . . ID=PGSC0003DMB000000038:SAMTOOLS:SNP:1756646;Variant_seq=A;Reference_seq=G;DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31 + PGSC0003DMB000000064 samtools SNP 1916664 1916665 8.12 . . ID=PGSC0003DMB000000064:SAMTOOLS:SNP:1916664;Variant_seq=C;Reference_seq=T;DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5 + + + +----------------------- + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + + + +