Mercurial > repos > john-mccallum > pcr_markers
view find_CAPS.py @ 4:be070a68521e draft
Uploaded workflow from vcf file to CAPS
author | john-mccallum |
---|---|
date | Thu, 18 Oct 2012 19:47:07 -0400 |
parents | 21053f7f9ed1 |
children |
line wrap: on
line source
#!/usr/bin/python2.6 ##find snps that condition CAPS ##usage find_CAPS.py <reference file> <gff file> #Copyright 2012 John McCallum & Leshi Chen #New Zealand Institute for Plant and Food Research #This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. import sys from Bio import SeqIO from BCBio import GFF from Bio.Restriction import * ###This list is limited to economical enzymes performing well in PCR buffer rest_batch = RestrictionBatch( [AluI, ApaI, BamHI, BbrPI, BfrI, ClaI, DdeI, DpnII, DraI, EcoRI, HaeIII, HindII, HinfI, HpaI, PvuII, RsaI, SacI, Sau3AI, SmaI, TaqI]) in_file=sys.argv[1] gff_file=sys.argv[2] in_seq_handle = open(in_file) in_gff_handle=open(gff_file) ##use iterator for myrec in SeqIO.parse(in_seq_handle, "fasta"): ##create single-entry dictionary to accept gff annotations from parser seq_dict = {myrec.id:myrec} ##note that this filters out only SNP features limit_info = dict(gff_id = [myrec.id] ,gff_type = ['SNP']) in_gff_handle.seek(0) ##parse annotations into annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict, limit_info=limit_info)] ##if there are any for this sequence, proceed if annotations: rec=annotations[0] for feat in rec.features: fstart=feat.location.start.position fend=feat.location.end.position if 20 < fstart < len(rec) - 20: #just work with +/- 20 bp, ignoring SNPS within this #distance from ends fseq=rec.seq[fstart-20:fstart+20] ref_seq = rec.seq[fstart-20:fstart+20] variant_seq = ref_seq.tomutable() #mutate the variant variant_seq[20]= feat.qualifiers['Variant_seq'][0] variant_seq = variant_seq.toseq() #digest the sequences ref_cuts = rest_batch.search(ref_seq) var_cuts = rest_batch.search(variant_seq) #print for enz in ref_cuts: kr = set(ref_cuts[enz]) km = set(var_cuts[enz]) outputstr=[rec.id, fstart +1,fend+1,feat.id,enz] if len(kr) > len(km): outputstr.append("reference") print('\t'.join(map(str,outputstr))) elif len(kr) < len(km): outputstr.append("variant") print('\t'.join(map(str,outputstr))) in_gff_handle.close() in_seq_handle.close()