# HG changeset patch
# User cpt
# Date 1685932984 0
# Node ID bb6332a85aa621e653cb43b73fc490d58818824f
# Parent a68f32350196cc1f151960e4e94f7148eb610028
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
diff -r a68f32350196 -r bb6332a85aa6 cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt-macros.xml Mon Jun 05 02:43:04 2023 +0000
@@ -0,0 +1,115 @@
+
+
+
+ python
+ biopython
+ requests
+ cpt_gffparser
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbkToGff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbkToGff3.xml Mon Jun 05 02:43:04 2023 +0000
@@ -0,0 +1,46 @@
+
+ CPT made Biobython-based solution
+
+ macros.xml
+ cpt-macros.xml
+
+
+ '$default']]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/cpt-macros.xml
--- a/cpt_gbk_to_gff/cpt-macros.xml Fri Jun 17 12:46:43 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,115 +0,0 @@
-
-
-
-
- python
- biopython
- requests
-
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {C. Ross},
- title = {CPT Galaxy Tools},
- year = {2020-},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {E. Mijalis, H. Rasche},
- title = {CPT Galaxy Tools},
- year = {2013-2017},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
-
-
- @unpublished{galaxyTools,
- author = {C. Maughmer},
- title = {CPT Galaxy Tools},
- year = {2017-2020},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
-
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/cpt_gbkToGff3.xml
--- a/cpt_gbk_to_gff/cpt_gbkToGff3.xml Fri Jun 17 12:46:43 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,49 +0,0 @@
-
-
- CPT made Biobython-based solution
-
- macros.xml
- cpt-macros.xml
-
-
- $default]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 10.1371/journal.pcbi.1008214
-
- @unpublished{galaxyTools,
- author = {A. Criscione},
- title = {CPT Galaxy Tools},
- year = {2019-2021},
- note = {https://github.com/tamu-cpt/galaxy-tools/}
- }
-
-
-
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/gbk_to_gff3.py
--- a/cpt_gbk_to_gff/gbk_to_gff3.py Fri Jun 17 12:46:43 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,274 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-import sys
-
-from Bio import SeqIO
-from Bio.SeqRecord import SeqRecord
-from Bio.SeqFeature import FeatureLocation
-from CPT_GFFParser import gffSeqFeature, gffWrite
-
-bottomFeatTypes = ["exon", "RBS", "CDS"]
-
-def makeGffFeat(inFeat, num, recName, identifier):
- if inFeat.type == "RBS" or (inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"):
- inFeat.type = "Shine_Dalgarno_sequence"
- if "codon_start" in inFeat.qualifiers.keys():
- shift = int(inFeat.qualifiers["codon_start"][0]) - 1
- else:
- shift = "."
- if identifier in inFeat.qualifiers.keys():
- name = inFeat.qualifiers[identifier][0] + "." + inFeat.type
- if num > 0:
- name += "." + str(num)
- else:
- name = recName + "." + inFeat.type + "." + str(num)
-
- outFeat = gffSeqFeature(inFeat.location, inFeat.type, '', inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff")
- outFeat.qualifiers["ID"] = [name]
- return outFeat
-
-def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):
-
- ofh = sys.stdout
- if outFile:
- ofh = outFile
-
- outRec = []
- failed = 0
- for rec in SeqIO.parse(inFile, "genbank"):
- recID = rec.name
-
- if len(str(rec.seq)) > 0:
- seqs_pending_writes = True
- outSeq = str(rec.seq)
- seqLen = len(outSeq)
-
- locBucket = {}
- outFeats = []
- topTypeDict = {}
- seekingParent = []
- geneNum = 0
- autoGeneNum = 0
- for feat in rec.features:
- if identifier not in feat.qualifiers.keys(): #Allow metadata features and other features with no ID (Output warning?) - AJC
- if feat.type in bottomFeatTypes:
- seekingParent.append([feat, [], []]) # [Feature, all parent candidates, strongest parent candidates]
- continue
- elif feat.type not in topTypeDict.keys():
- topTypeDict[feat.type] = 1
- else:
- topTypeDict[feat.type] += 1
- outFeats.append(makeGffFeat(feat, topTypeDict[feat.type], recID, identifier))
- continue
- elif feat.qualifiers[identifier][0] not in locBucket.keys():
- locBucket[feat.qualifiers[identifier][0]] = []
- locBucket[feat.qualifiers[identifier][0]].append(feat)
-
- for locus in locBucket.keys():
- minLoc = locBucket[locus][0].location.start
- maxLoc = locBucket[locus][0].location.end
- for feat in locBucket[locus]:
- minLoc = min(minLoc, feat.location.start)
- maxLoc = max(maxLoc, feat.location.end)
- for x in seekingParent:
- if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:
- x[1].append(locus)
- if x[0].location.start == minLoc or x[0].location.end == maxLoc:
- x[2].append(locus)
-
- for x in seekingParent: #Reformat to [Feature, Locus, Unused/Free]
- if len(x[2]) == 1:
- finList = ""
- if len(x[1]) > 1:
- for loc in x[1]:
- if loc != x[2][0]:
- finList += loc + ", "
- finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + finList[0:-2] + "."
- else:
- finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived."
- if "Notes" not in x[0].qualifiers.keys():
- x[0].qualifiers["Notes"] = []
- x[0].qualifiers["Notes"].append(finList)
- x[1] = x[2][0]
- elif len(x[2]) > 1:
- candidate = x[2][0] #Arbitrarily choose first one
- finList = ""
- strongList = ""
- for loc in x[2]:
- if loc != candidate:
- finList += loc + ", "
- strongList += loc + ", "
- for loc in x[1]:
- if loc not in x[2]:
- finList += loc + ", "
- finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " + finList[0:-2] + " (Equally strong candidate(s): " + strongList[0:-2] + ")."
- if "Notes" not in x[0].qualifiers.keys():
- x[0].qualifiers["Notes"] = []
- x[0].qualifiers["Notes"].append(finList)
- x[1] = candidate
- elif len(x[1]) == 1:
- x[1] = x[1][0]
- if "Notes" not in x[0].qualifiers.keys():
- x[0].qualifiers["Notes"] = []
- finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived."
- x[0].qualifiers["Notes"].append(finList)
- elif len(x[1]) > 1:
- candidate = x[1][0] #Arbitrarily choose first one
- finList = ""
- for loc in x[1]:
- if loc != candidate:
- finList += loc + ", "
- finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + finList[0:-2] + "."
- if "Notes" not in x[0].qualifiers.keys():
- x[0].qualifiers["Notes"] = []
- x[0].qualifiers["Notes"].append(finList)
- x[1] = candidate
- else:
- if makeGene:
- sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "], creating standalone gene.\n")
- autoGeneNum += 1
- x[0].source = "GbkToGff"
- x[0].score = 0
- x[0].shift = 0
- if "ID" not in x[0].qualifiers.keys():
- x[0].qualifiers["ID"] = [recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)]
- tempName = recID + ".derived_Gene." + str(autoGeneNum)
- tempQuals = {"ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]}
- tempGene = gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'gene', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
- if makeMRNA:
- tempName = recID + ".derived_mRNA." + str(autoGeneNum)
- tempQuals = {"ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
- tempGene.sub_features.append(gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'mRNA', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff"))
- tempGene.sub_features[-1].sub_features.append(x[0])
- else:
- tempGene.sub_features.append(x[0])
-
-
- outFeats.append(tempGene)
- else:
- sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "].\n")
- if x[0].type not in topTypeDict.keys():
- topTypeDict[x[0].type] = 1
- else:
- topTypeDict[x[0].type] += 1
- outFeats.append(makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier))
-
- for locus in locBucket.keys():
- if len(locBucket[locus]) == 1: # No heirarchy to be made
- outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier))
- continue
- topFeat = None
- midFeat = None
- bottomFeats = []
- typeDict = {}
- minLoc = locBucket[locus][0].location.start
- maxLoc = locBucket[locus][0].location.end
- geneNum += 1
- for feat in locBucket[locus]:
- # If we want to make our own top-level feat?
- minLoc = min(minLoc, feat.location.start)
- maxLoc = max(maxLoc, feat.location.end)
-
- # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot
- if feat.type in ['gene']:
- if not topFeat:
- topFeat = feat
- # Else handle multiple top features
- elif feat.type in ['mRNA', 'tRNA', 'rRNA']:
- if not midFeat:
- midFeat = feat
- # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats)
- else:
- if feat.type not in typeDict.keys():
- typeDict[feat.type] = 1
- else:
- typeDict[feat.type] += 1
- bottomFeats.append(feat)
-
- for x in seekingParent:
- if type(x[1]) != "list" and locus == x[1]:
- x[0].qualifiers[identifier] = [locus]
- bottomFeats.append(x[0])
- if x[0].type not in typeDict.keys():
- typeDict[x[0].type] = 1
- else:
- typeDict[x[0].type] += 1
-
-
-
-
-
- #if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds
-
- for x in typeDict.keys(): # If only 1, set it to 0 so we don't append a number to the name
- if typeDict[x] == 1: # Else, set to 1 so that we count up as we encounter the features
- typeDict[x] = 0
- else:
- typeDict[x] = 1
-
- if not topFeat:
- if makeGene:
- if midFeat:
- possibleStrand = midFeat.strand
- else:
- possibleStrand = bottomFeats[0].strand
- tempName = recID + ".gene." + str(geneNum)
- tempQuals = {identifier : [locus], "ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]}
- topFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, possibleStrand), 'gene', '', possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
- else:
- sys.stderr.write("Unable to create a feature heirarchy at location [%d, %d] with features: \n" % (minLoc, maxLoc))
- for x in locBucket[locus]:
- sys.stderr.write(str(x))
- sys.stderr.write('\n')
- failed = 1
- continue
-
- outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))
- if not midFeat and topFeat.type == "gene" and makeMRNA:
- if identifier in topFeat.qualifiers.keys():
- tempName = topFeat.qualifiers[identifier][0] + ".mRNA"
- tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
- else:
- tempName = outFeats[-1].ID + ".mRNA"
- tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
- midFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, topFeat.strand), 'mRNA', '', topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
-
- if midFeat: # Again, need a new if statement if we want to handle multiple mid-tier features
- outFeats[-1].sub_features.append(makeGffFeat(midFeat, 0, recID, identifier))
- outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
- for x in bottomFeats:
- typeDict[x.type] += 1
- outFeats[-1].sub_features[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))
- outFeats[-1].sub_features[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].sub_features[-1].id]
- else: # No midFeat, append bottom feats directly to top feats
- for x in bottomFeats:
- typeDict[x.type] += 1
- outFeats[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))
- outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
-
- outRec.append(SeqRecord(rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations))
- SeqIO.write([outRec[-1]], fastaFile, "fasta")
- gffWrite(outRec, ofh)
- exit(failed) # 0 if all features handled, 1 if unable to handle some
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser( description='Biopython solution to Gbk to GFF conversion')
-
- parser.add_argument('inFile', type=argparse.FileType("r"), help='Path to an input GBK file' )
- parser.add_argument('--makeMRNA', action="store_true", required=False, help="Automatically create mRNA features")
- parser.add_argument('--makeGene', action="store_true", required=False, help="Automatically create missing Gene features")
- parser.add_argument('--identifier', type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from")
- parser.add_argument('--fastaFile', type=argparse.FileType("w"), help='Fasta output for sequences' )
- parser.add_argument('--outFile', type=argparse.FileType("w"), help='GFF feature output' )
- args = parser.parse_args()
- main(**vars(args))
-
-
-
-
-
-
-
-
diff -r a68f32350196 -r bb6332a85aa6 cpt_gbk_to_gff/macros.xml
--- a/cpt_gbk_to_gff/macros.xml Fri Jun 17 12:46:43 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,105 +0,0 @@
-
-
-
-
- python
- biopython
- cpt_gffparser
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r a68f32350196 -r bb6332a85aa6 gbk_to_gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gbk_to_gff3.py Mon Jun 05 02:43:04 2023 +0000
@@ -0,0 +1,477 @@
+#!/usr/bin/env python
+
+import argparse
+import sys
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import FeatureLocation
+from CPT_GFFParser import gffSeqFeature, gffWrite
+
+bottomFeatTypes = ["exon", "RBS", "CDS"]
+
+
+def makeGffFeat(inFeat, num, recName, identifier):
+ if inFeat.type == "RBS" or (
+ inFeat.type == "regulatory"
+ and "regulatory_class" in inFeat.qualifiers.keys()
+ and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"
+ ):
+ inFeat.type = "Shine_Dalgarno_sequence"
+ if "codon_start" in inFeat.qualifiers.keys():
+ shift = int(inFeat.qualifiers["codon_start"][0]) - 1
+ else:
+ shift = "."
+ if identifier in inFeat.qualifiers.keys():
+ name = inFeat.qualifiers[identifier][0] + "." + inFeat.type
+ if num > 0:
+ name += "." + str(num)
+ else:
+ name = recName + "." + inFeat.type + "." + str(num)
+
+ outFeat = gffSeqFeature(
+ inFeat.location,
+ inFeat.type,
+ "",
+ inFeat.strand,
+ name,
+ inFeat.qualifiers,
+ None,
+ None,
+ None,
+ shift,
+ 0,
+ "GbkToGff",
+ )
+ outFeat.qualifiers["ID"] = [name]
+ return outFeat
+
+
+def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):
+
+ ofh = sys.stdout
+ if outFile:
+ ofh = outFile
+
+ outRec = []
+ failed = 0
+ for rec in SeqIO.parse(inFile, "genbank"):
+ recID = rec.name
+
+ if len(str(rec.seq)) > 0:
+ seqs_pending_writes = True
+ outSeq = str(rec.seq)
+ seqLen = len(outSeq)
+
+ locBucket = {}
+ outFeats = []
+ topTypeDict = {}
+ seekingParent = []
+ geneNum = 0
+ autoGeneNum = 0
+ for feat in rec.features:
+ if (
+ identifier not in feat.qualifiers.keys()
+ ): # Allow metadata features and other features with no ID (Output warning?) - AJC
+ if feat.type in bottomFeatTypes:
+ seekingParent.append(
+ [feat, [], []]
+ ) # [Feature, all parent candidates, strongest parent candidates]
+ continue
+ elif feat.type not in topTypeDict.keys():
+ topTypeDict[feat.type] = 1
+ else:
+ topTypeDict[feat.type] += 1
+ outFeats.append(
+ makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)
+ )
+ continue
+ elif feat.qualifiers[identifier][0] not in locBucket.keys():
+ locBucket[feat.qualifiers[identifier][0]] = []
+ locBucket[feat.qualifiers[identifier][0]].append(feat)
+
+ for locus in locBucket.keys():
+ minLoc = locBucket[locus][0].location.start
+ maxLoc = locBucket[locus][0].location.end
+ for feat in locBucket[locus]:
+ minLoc = min(minLoc, feat.location.start)
+ maxLoc = max(maxLoc, feat.location.end)
+ for x in seekingParent:
+ if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:
+ x[1].append(locus)
+ if x[0].location.start == minLoc or x[0].location.end == maxLoc:
+ x[2].append(locus)
+
+ for x in seekingParent: # Reformat to [Feature, Locus, Unused/Free]
+ if len(x[2]) == 1:
+ finList = ""
+ if len(x[1]) > 1:
+ for loc in x[1]:
+ if loc != x[2][0]:
+ finList += loc + ", "
+ finList = (
+ str(x[0].type)
+ + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were "
+ + finList[0:-2]
+ + "."
+ )
+ else:
+ finList = (
+ str(x[0].type)
+ + " had no locus tag set in .gbk file, automatically derived."
+ )
+ if "Notes" not in x[0].qualifiers.keys():
+ x[0].qualifiers["Notes"] = []
+ x[0].qualifiers["Notes"].append(finList)
+ x[1] = x[2][0]
+ elif len(x[2]) > 1:
+ candidate = x[2][0] # Arbitrarily choose first one
+ finList = ""
+ strongList = ""
+ for loc in x[2]:
+ if loc != candidate:
+ finList += loc + ", "
+ strongList += loc + ", "
+ for loc in x[1]:
+ if loc not in x[2]:
+ finList += loc + ", "
+ finList = (
+ str(x[0].type)
+ + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were "
+ + finList[0:-2]
+ + " (Equally strong candidate(s): "
+ + strongList[0:-2]
+ + ")."
+ )
+ if "Notes" not in x[0].qualifiers.keys():
+ x[0].qualifiers["Notes"] = []
+ x[0].qualifiers["Notes"].append(finList)
+ x[1] = candidate
+ elif len(x[1]) == 1:
+ x[1] = x[1][0]
+ if "Notes" not in x[0].qualifiers.keys():
+ x[0].qualifiers["Notes"] = []
+ finList = (
+ str(x[0].type)
+ + " had no locus tag set in .gbk file, automatically derived."
+ )
+ x[0].qualifiers["Notes"].append(finList)
+ elif len(x[1]) > 1:
+ candidate = x[1][0] # Arbitrarily choose first one
+ finList = ""
+ for loc in x[1]:
+ if loc != candidate:
+ finList += loc + ", "
+ finList = (
+ str(x[0].type)
+ + " had no locus tag set in .gbk file, automatically derived. Other candidates were "
+ + finList[0:-2]
+ + "."
+ )
+ if "Notes" not in x[0].qualifiers.keys():
+ x[0].qualifiers["Notes"] = []
+ x[0].qualifiers["Notes"].append(finList)
+ x[1] = candidate
+ else:
+ if makeGene:
+ sys.stderr.write(
+ "Warning: Unable to find potential parent for feature with no "
+ + identifier
+ + " of type "
+ + str(x[0].type)
+ + " at location ["
+ + str(x[0].location.start + 1)
+ + ", "
+ + str(x[0].location.end)
+ + "], creating standalone gene.\n"
+ )
+ autoGeneNum += 1
+ x[0].source = "GbkToGff"
+ x[0].score = 0
+ x[0].shift = 0
+ if "ID" not in x[0].qualifiers.keys():
+ x[0].qualifiers["ID"] = [
+ recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)
+ ]
+ tempName = recID + ".derived_Gene." + str(autoGeneNum)
+ tempQuals = {
+ "ID": [tempName],
+ "Notes": [
+ "Gene feature automatically generated by Gbk to GFF conversion"
+ ],
+ }
+ tempGene = gffSeqFeature(
+ FeatureLocation(
+ x[0].location.start, x[0].location.end, x[0].location.strand
+ ),
+ "gene",
+ "",
+ x[0].strand,
+ tempName,
+ tempQuals,
+ None,
+ None,
+ None,
+ ".",
+ 0,
+ "GbkToGff",
+ )
+ if makeMRNA:
+ tempName = recID + ".derived_mRNA." + str(autoGeneNum)
+ tempQuals = {
+ "ID": [tempName],
+ "Notes": [
+ "mRNA feature automatically generated by Gbk to GFF conversion"
+ ],
+ }
+ tempGene.sub_features.append(
+ gffSeqFeature(
+ FeatureLocation(
+ x[0].location.start,
+ x[0].location.end,
+ x[0].location.strand,
+ ),
+ "mRNA",
+ "",
+ x[0].strand,
+ tempName,
+ tempQuals,
+ None,
+ None,
+ None,
+ ".",
+ 0,
+ "GbkToGff",
+ )
+ )
+ tempGene.sub_features[-1].sub_features.append(x[0])
+ else:
+ tempGene.sub_features.append(x[0])
+
+ outFeats.append(tempGene)
+ else:
+ sys.stderr.write(
+ "Warning: Unable to find potential parent for feature with no "
+ + identifier
+ + " of type "
+ + str(x[0].type)
+ + " at location ["
+ + str(x[0].location.start + 1)
+ + ", "
+ + str(x[0].location.end)
+ + "].\n"
+ )
+ if x[0].type not in topTypeDict.keys():
+ topTypeDict[x[0].type] = 1
+ else:
+ topTypeDict[x[0].type] += 1
+ outFeats.append(
+ makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier)
+ )
+
+ for locus in locBucket.keys():
+ if len(locBucket[locus]) == 1: # No heirarchy to be made
+ outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier))
+ continue
+ topFeat = None
+ midFeat = None
+ bottomFeats = []
+ typeDict = {}
+ minLoc = locBucket[locus][0].location.start
+ maxLoc = locBucket[locus][0].location.end
+ geneNum += 1
+ for feat in locBucket[locus]:
+ # If we want to make our own top-level feat?
+ minLoc = min(minLoc, feat.location.start)
+ maxLoc = max(maxLoc, feat.location.end)
+
+ # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot
+ if feat.type in ["gene"]:
+ if not topFeat:
+ topFeat = feat
+ # Else handle multiple top features
+ elif feat.type in ["mRNA", "tRNA", "rRNA"]:
+ if not midFeat:
+ midFeat = feat
+ # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats)
+ else:
+ if feat.type not in typeDict.keys():
+ typeDict[feat.type] = 1
+ else:
+ typeDict[feat.type] += 1
+ bottomFeats.append(feat)
+
+ for x in seekingParent:
+ if type(x[1]) != "list" and locus == x[1]:
+ x[0].qualifiers[identifier] = [locus]
+ bottomFeats.append(x[0])
+ if x[0].type not in typeDict.keys():
+ typeDict[x[0].type] = 1
+ else:
+ typeDict[x[0].type] += 1
+
+ # if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds
+
+ for (
+ x
+ ) in (
+ typeDict.keys()
+ ): # If only 1, set it to 0 so we don't append a number to the name
+ if (
+ typeDict[x] == 1
+ ): # Else, set to 1 so that we count up as we encounter the features
+ typeDict[x] = 0
+ else:
+ typeDict[x] = 1
+
+ if not topFeat:
+ if makeGene:
+ if midFeat:
+ possibleStrand = midFeat.strand
+ else:
+ possibleStrand = bottomFeats[0].strand
+ tempName = recID + ".gene." + str(geneNum)
+ tempQuals = {
+ identifier: [locus],
+ "ID": [tempName],
+ "Notes": [
+ "Gene feature automatically generated by Gbk to GFF conversion"
+ ],
+ }
+ topFeat = gffSeqFeature(
+ FeatureLocation(minLoc, maxLoc, possibleStrand),
+ "gene",
+ "",
+ possibleStrand,
+ tempName,
+ tempQuals,
+ None,
+ None,
+ None,
+ ".",
+ 0,
+ "GbkToGff",
+ )
+ else:
+ sys.stderr.write(
+ "Unable to create a feature heirarchy at location [%d, %d] with features: \n"
+ % (minLoc, maxLoc)
+ )
+ for x in locBucket[locus]:
+ sys.stderr.write(str(x))
+ sys.stderr.write("\n")
+ failed = 1
+ continue
+
+ outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))
+ if not midFeat and topFeat.type == "gene" and makeMRNA:
+ if identifier in topFeat.qualifiers.keys():
+ tempName = topFeat.qualifiers[identifier][0] + ".mRNA"
+ tempQuals = {
+ identifier: topFeat.qualifiers[identifier],
+ "ID": [tempName],
+ "Notes": [
+ "mRNA feature automatically generated by Gbk to GFF conversion"
+ ],
+ }
+ else:
+ tempName = outFeats[-1].ID + ".mRNA"
+ tempQuals = {
+ identifier: topFeat.qualifiers[identifier],
+ "ID": [tempName],
+ "Notes": [
+ "mRNA feature automatically generated by Gbk to GFF conversion"
+ ],
+ }
+ midFeat = gffSeqFeature(
+ FeatureLocation(minLoc, maxLoc, topFeat.strand),
+ "mRNA",
+ "",
+ topFeat.strand,
+ tempName,
+ tempQuals,
+ None,
+ None,
+ None,
+ ".",
+ 0,
+ "GbkToGff",
+ )
+
+ if (
+ midFeat
+ ): # Again, need a new if statement if we want to handle multiple mid-tier features
+ outFeats[-1].sub_features.append(
+ makeGffFeat(midFeat, 0, recID, identifier)
+ )
+ outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
+ for x in bottomFeats:
+ typeDict[x.type] += 1
+ outFeats[-1].sub_features[-1].sub_features.append(
+ makeGffFeat(x, typeDict[x.type], recID, identifier)
+ )
+ outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[
+ "Parent"
+ ] = [outFeats[-1].sub_features[-1].id]
+ else: # No midFeat, append bottom feats directly to top feats
+ for x in bottomFeats:
+ typeDict[x.type] += 1
+ outFeats[-1].sub_features.append(
+ makeGffFeat(x, typeDict[x.type], recID, identifier)
+ )
+ outFeats[-1].sub_features[-1].qualifiers["Parent"] = [
+ outFeats[-1].id
+ ]
+
+ outRec.append(
+ SeqRecord(
+ rec.seq,
+ recID,
+ rec.name,
+ rec.description,
+ rec.dbxrefs,
+ sorted(outFeats, key=lambda x: x.location.start),
+ rec.annotations,
+ rec.letter_annotations,
+ )
+ )
+ SeqIO.write([outRec[-1]], fastaFile, "fasta")
+ gffWrite(outRec, ofh)
+ exit(failed) # 0 if all features handled, 1 if unable to handle some
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Biopython solution to Gbk to GFF conversion"
+ )
+
+ parser.add_argument(
+ "inFile", type=argparse.FileType("r"), help="Path to an input GBK file"
+ )
+ parser.add_argument(
+ "--makeMRNA",
+ action="store_true",
+ required=False,
+ help="Automatically create mRNA features",
+ )
+ parser.add_argument(
+ "--makeGene",
+ action="store_true",
+ required=False,
+ help="Automatically create missing Gene features",
+ )
+ parser.add_argument(
+ "--identifier",
+ type=str,
+ default="locus_tag",
+ required=False,
+ help="Qualifier to derive ID property from",
+ )
+ parser.add_argument(
+ "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences"
+ )
+ parser.add_argument(
+ "--outFile", type=argparse.FileType("w"), help="GFF feature output"
+ )
+ args = parser.parse_args()
+ main(**vars(args))
diff -r a68f32350196 -r bb6332a85aa6 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jun 05 02:43:04 2023 +0000
@@ -0,0 +1,74 @@
+
+
+
+ progressivemauve
+
+ bcbiogff
+
+
+
+ 2.4.0
+
+ 10.1371/journal.pone.0011147
+
+
+ 10.1093/bioinformatics/btm039
+
+
+ '$xmfa'
+
+
+
+
+
+ '$sequences'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ '$gff3_data'
+
+
+ #if str($reference_genome.reference_genome_source) == 'cached':
+ '${reference_genome.fasta_indexes.fields.path}'
+ #else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+ #end if
+
+
+ #if $reference_genome.reference_genome_source == 'history':
+ ln -s '$reference_genome.genome_fasta' genomeref.fa;
+ #end if
+
+
+ #if str($reference_genome.reference_genome_source) == 'cached':
+ '${reference_genome.fasta_indexes.fields.path}'
+ #else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+ #end if
+
+