# HG changeset patch
# User cpt
# Date 1655470003 0
# Node ID a68f32350196cc1f151960e4e94f7148eb610028
Uploaded
diff -r 000000000000 -r a68f32350196 cpt_gbk_to_gff/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_gff/cpt-macros.xml Fri Jun 17 12:46:43 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r a68f32350196 cpt_gbk_to_gff/cpt_gbkToGff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_gff/cpt_gbkToGff3.xml Fri Jun 17 12:46:43 2022 +0000
@@ -0,0 +1,49 @@
+
+
+ CPT made Biobython-based solution
+
+ macros.xml
+ cpt-macros.xml
+
+
+ $default]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
diff -r 000000000000 -r a68f32350196 cpt_gbk_to_gff/gbk_to_gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_gff/gbk_to_gff3.py Fri Jun 17 12:46:43 2022 +0000
@@ -0,0 +1,274 @@
+#!/usr/bin/env python
+
+import argparse
+import sys
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import FeatureLocation
+from CPT_GFFParser import gffSeqFeature, gffWrite
+
+bottomFeatTypes = ["exon", "RBS", "CDS"]
+
+def makeGffFeat(inFeat, num, recName, identifier):
+ if inFeat.type == "RBS" or (inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"):
+ inFeat.type = "Shine_Dalgarno_sequence"
+ if "codon_start" in inFeat.qualifiers.keys():
+ shift = int(inFeat.qualifiers["codon_start"][0]) - 1
+ else:
+ shift = "."
+ if identifier in inFeat.qualifiers.keys():
+ name = inFeat.qualifiers[identifier][0] + "." + inFeat.type
+ if num > 0:
+ name += "." + str(num)
+ else:
+ name = recName + "." + inFeat.type + "." + str(num)
+
+ outFeat = gffSeqFeature(inFeat.location, inFeat.type, '', inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff")
+ outFeat.qualifiers["ID"] = [name]
+ return outFeat
+
+def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):
+
+ ofh = sys.stdout
+ if outFile:
+ ofh = outFile
+
+ outRec = []
+ failed = 0
+ for rec in SeqIO.parse(inFile, "genbank"):
+ recID = rec.name
+
+ if len(str(rec.seq)) > 0:
+ seqs_pending_writes = True
+ outSeq = str(rec.seq)
+ seqLen = len(outSeq)
+
+ locBucket = {}
+ outFeats = []
+ topTypeDict = {}
+ seekingParent = []
+ geneNum = 0
+ autoGeneNum = 0
+ for feat in rec.features:
+ if identifier not in feat.qualifiers.keys(): #Allow metadata features and other features with no ID (Output warning?) - AJC
+ if feat.type in bottomFeatTypes:
+ seekingParent.append([feat, [], []]) # [Feature, all parent candidates, strongest parent candidates]
+ continue
+ elif feat.type not in topTypeDict.keys():
+ topTypeDict[feat.type] = 1
+ else:
+ topTypeDict[feat.type] += 1
+ outFeats.append(makeGffFeat(feat, topTypeDict[feat.type], recID, identifier))
+ continue
+ elif feat.qualifiers[identifier][0] not in locBucket.keys():
+ locBucket[feat.qualifiers[identifier][0]] = []
+ locBucket[feat.qualifiers[identifier][0]].append(feat)
+
+ for locus in locBucket.keys():
+ minLoc = locBucket[locus][0].location.start
+ maxLoc = locBucket[locus][0].location.end
+ for feat in locBucket[locus]:
+ minLoc = min(minLoc, feat.location.start)
+ maxLoc = max(maxLoc, feat.location.end)
+ for x in seekingParent:
+ if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:
+ x[1].append(locus)
+ if x[0].location.start == minLoc or x[0].location.end == maxLoc:
+ x[2].append(locus)
+
+ for x in seekingParent: #Reformat to [Feature, Locus, Unused/Free]
+ if len(x[2]) == 1:
+ finList = ""
+ if len(x[1]) > 1:
+ for loc in x[1]:
+ if loc != x[2][0]:
+ finList += loc + ", "
+ finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + finList[0:-2] + "."
+ else:
+ finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived."
+ if "Notes" not in x[0].qualifiers.keys():
+ x[0].qualifiers["Notes"] = []
+ x[0].qualifiers["Notes"].append(finList)
+ x[1] = x[2][0]
+ elif len(x[2]) > 1:
+ candidate = x[2][0] #Arbitrarily choose first one
+ finList = ""
+ strongList = ""
+ for loc in x[2]:
+ if loc != candidate:
+ finList += loc + ", "
+ strongList += loc + ", "
+ for loc in x[1]:
+ if loc not in x[2]:
+ finList += loc + ", "
+ finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " + finList[0:-2] + " (Equally strong candidate(s): " + strongList[0:-2] + ")."
+ if "Notes" not in x[0].qualifiers.keys():
+ x[0].qualifiers["Notes"] = []
+ x[0].qualifiers["Notes"].append(finList)
+ x[1] = candidate
+ elif len(x[1]) == 1:
+ x[1] = x[1][0]
+ if "Notes" not in x[0].qualifiers.keys():
+ x[0].qualifiers["Notes"] = []
+ finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived."
+ x[0].qualifiers["Notes"].append(finList)
+ elif len(x[1]) > 1:
+ candidate = x[1][0] #Arbitrarily choose first one
+ finList = ""
+ for loc in x[1]:
+ if loc != candidate:
+ finList += loc + ", "
+ finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + finList[0:-2] + "."
+ if "Notes" not in x[0].qualifiers.keys():
+ x[0].qualifiers["Notes"] = []
+ x[0].qualifiers["Notes"].append(finList)
+ x[1] = candidate
+ else:
+ if makeGene:
+ sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "], creating standalone gene.\n")
+ autoGeneNum += 1
+ x[0].source = "GbkToGff"
+ x[0].score = 0
+ x[0].shift = 0
+ if "ID" not in x[0].qualifiers.keys():
+ x[0].qualifiers["ID"] = [recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)]
+ tempName = recID + ".derived_Gene." + str(autoGeneNum)
+ tempQuals = {"ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]}
+ tempGene = gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'gene', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
+ if makeMRNA:
+ tempName = recID + ".derived_mRNA." + str(autoGeneNum)
+ tempQuals = {"ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
+ tempGene.sub_features.append(gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'mRNA', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff"))
+ tempGene.sub_features[-1].sub_features.append(x[0])
+ else:
+ tempGene.sub_features.append(x[0])
+
+
+ outFeats.append(tempGene)
+ else:
+ sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "].\n")
+ if x[0].type not in topTypeDict.keys():
+ topTypeDict[x[0].type] = 1
+ else:
+ topTypeDict[x[0].type] += 1
+ outFeats.append(makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier))
+
+ for locus in locBucket.keys():
+ if len(locBucket[locus]) == 1: # No heirarchy to be made
+ outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier))
+ continue
+ topFeat = None
+ midFeat = None
+ bottomFeats = []
+ typeDict = {}
+ minLoc = locBucket[locus][0].location.start
+ maxLoc = locBucket[locus][0].location.end
+ geneNum += 1
+ for feat in locBucket[locus]:
+ # If we want to make our own top-level feat?
+ minLoc = min(minLoc, feat.location.start)
+ maxLoc = max(maxLoc, feat.location.end)
+
+ # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot
+ if feat.type in ['gene']:
+ if not topFeat:
+ topFeat = feat
+ # Else handle multiple top features
+ elif feat.type in ['mRNA', 'tRNA', 'rRNA']:
+ if not midFeat:
+ midFeat = feat
+ # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats)
+ else:
+ if feat.type not in typeDict.keys():
+ typeDict[feat.type] = 1
+ else:
+ typeDict[feat.type] += 1
+ bottomFeats.append(feat)
+
+ for x in seekingParent:
+ if type(x[1]) != "list" and locus == x[1]:
+ x[0].qualifiers[identifier] = [locus]
+ bottomFeats.append(x[0])
+ if x[0].type not in typeDict.keys():
+ typeDict[x[0].type] = 1
+ else:
+ typeDict[x[0].type] += 1
+
+
+
+
+
+ #if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds
+
+ for x in typeDict.keys(): # If only 1, set it to 0 so we don't append a number to the name
+ if typeDict[x] == 1: # Else, set to 1 so that we count up as we encounter the features
+ typeDict[x] = 0
+ else:
+ typeDict[x] = 1
+
+ if not topFeat:
+ if makeGene:
+ if midFeat:
+ possibleStrand = midFeat.strand
+ else:
+ possibleStrand = bottomFeats[0].strand
+ tempName = recID + ".gene." + str(geneNum)
+ tempQuals = {identifier : [locus], "ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]}
+ topFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, possibleStrand), 'gene', '', possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
+ else:
+ sys.stderr.write("Unable to create a feature heirarchy at location [%d, %d] with features: \n" % (minLoc, maxLoc))
+ for x in locBucket[locus]:
+ sys.stderr.write(str(x))
+ sys.stderr.write('\n')
+ failed = 1
+ continue
+
+ outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))
+ if not midFeat and topFeat.type == "gene" and makeMRNA:
+ if identifier in topFeat.qualifiers.keys():
+ tempName = topFeat.qualifiers[identifier][0] + ".mRNA"
+ tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
+ else:
+ tempName = outFeats[-1].ID + ".mRNA"
+ tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
+ midFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, topFeat.strand), 'mRNA', '', topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
+
+ if midFeat: # Again, need a new if statement if we want to handle multiple mid-tier features
+ outFeats[-1].sub_features.append(makeGffFeat(midFeat, 0, recID, identifier))
+ outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
+ for x in bottomFeats:
+ typeDict[x.type] += 1
+ outFeats[-1].sub_features[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))
+ outFeats[-1].sub_features[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].sub_features[-1].id]
+ else: # No midFeat, append bottom feats directly to top feats
+ for x in bottomFeats:
+ typeDict[x.type] += 1
+ outFeats[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))
+ outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
+
+ outRec.append(SeqRecord(rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations))
+ SeqIO.write([outRec[-1]], fastaFile, "fasta")
+ gffWrite(outRec, ofh)
+ exit(failed) # 0 if all features handled, 1 if unable to handle some
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser( description='Biopython solution to Gbk to GFF conversion')
+
+ parser.add_argument('inFile', type=argparse.FileType("r"), help='Path to an input GBK file' )
+ parser.add_argument('--makeMRNA', action="store_true", required=False, help="Automatically create mRNA features")
+ parser.add_argument('--makeGene', action="store_true", required=False, help="Automatically create missing Gene features")
+ parser.add_argument('--identifier', type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from")
+ parser.add_argument('--fastaFile', type=argparse.FileType("w"), help='Fasta output for sequences' )
+ parser.add_argument('--outFile', type=argparse.FileType("w"), help='GFF feature output' )
+ args = parser.parse_args()
+ main(**vars(args))
+
+
+
+
+
+
+
+
diff -r 000000000000 -r a68f32350196 cpt_gbk_to_gff/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_gff/macros.xml Fri Jun 17 12:46:43 2022 +0000
@@ -0,0 +1,105 @@
+
+
+
+
+ python
+ biopython
+ cpt_gffparser
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+