Mercurial > repos > jjjjia > cpo_prediction
changeset 18:596bf8a792de draft
planemo upload
author | jjjjia |
---|---|
date | Tue, 28 Aug 2018 15:15:09 -0400 |
parents | ed3b291693fc |
children | ab2a037ad69c |
files | cpo_galaxy_prediction.py cpo_galaxy_predictions.xml cpo_galaxy_tree.py cpo_mash.xml |
diffstat | 4 files changed, 160 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- a/cpo_galaxy_prediction.py Mon Aug 27 19:58:24 2018 -0400 +++ b/cpo_galaxy_prediction.py Tue Aug 28 15:15:09 2018 -0400 @@ -42,6 +42,7 @@ parser.add_option("-e", "--expected", dest="expectedSpecies", default="NA/NA/NA", type="string", help="expected species of the isolate") parser.add_option("-s", "--mlst-scheme", dest="mlstScheme", default= "./scheme_species_map.tab", type="string", help="absolute file path to mlst scheme") parser.add_option("-p", "--plasmidfinder", dest="plasmidfinder", type="string", help="absolute file path to plasmidfinder ") + parser.add_options("-d", "--mash", dest="mash", type="string", help="absolute file path to mash plasmiddb result") #parallelization, useless, these are hard coded to 8cores/64G RAM #parser.add_option("-t", "--threads", dest="threads", default=8, type="int", help="number of cpu to use") @@ -60,6 +61,7 @@ expectedSpecies = str(options.expectedSpecies).lstrip().rstrip() mlstScheme = str(options.mlstScheme).lstrip().rstrip() plasmidfinder = str(options.plasmidfinder).lstrip().rstrip() + mash = str(options.mash).lstrip().rstrip() outputDir = "./" print(mlst) print(mobfindercontig) @@ -68,6 +70,8 @@ print(rgi) print(expectedSpecies) print(mlstScheme) + print(mash) + else: curDir = os.getcwd() ID = "BC11" @@ -79,6 +83,7 @@ expectedSpecies = "Escherichia coli" mlstScheme = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\scheme_species_map.tab" plasmidfinder = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\BC11-Kpn005_S2.origins" + mash = "D:\OneDrive\ProjectCDC\ProjectCDCInPython\ProjectCDCInPython\pipelineTest\predictions\mash.tsv" outputDir = "./" #region result objects @@ -190,6 +195,26 @@ self.source = "" self.row = "" +class MashResult(object): + def __init__(self): + self.size = 0.0 + self.depth = 0.0 + self.identity = 0.0 + self.sharedHashes = "" + self.medianMultiplicity = 0 + self.pvalue = 0.0 + self.queryID= "" + self.queryComment = "" + self.species = "" + self.row = "" + self.accession = "" + self.gcf="" + self.assembly="" + + def toDict(self): #doesnt actually work + return dict((name, getattr(self, name)) for name in dir(self) if not name.startswith('__')) + + #endregion #region useful functions @@ -476,6 +501,22 @@ #plasmidFinderContigs.append(str(plasmidFinder.iloc[i,1])) #origins.append(str(plasmidFinder.iloc[i,4][:plasmidFinder.iloc[i,4].index("_")])) return _pFinder + +def ParseMashResult(pathToMashScreen): + mashScreen = pandas.read_csv(pathToMashScreen, delimiter='\t', header=None) + + _mashPlasmidHits = {} #*********************** + #parse what the species are. + for i in (range(len(mashScreen.index))): + mr = MashResult() + mr.identity = float(mashScreen.ix[i, 0]) + mr.sharedHashes = mashScreen.ix[i, 1] + mr.medianMultiplicity = int(mashScreen.ix[i, 2]) + mr.pvalue = float(mashScreen.ix[i, 3]) + mr.name = mashScreen.ix[i, 4] #accession + mr.row = "\t".join(str(x) for x in mashScreen.ix[i].tolist()) + _mashPlasmidHits[mr.name] = mr + return _mashPlasmidHits #endregion def Main(): @@ -531,6 +572,9 @@ rgiAMR = ParseRGIResult(rgi, plasmidContigs, likelyPlasmidContigs) # outputDir + "/predictions/" + ID + ".rgi.txt", plasmidContigs, likelyPlasmidContigs)#*********************** ToJson(rgiAMR, "rgi.json") #************* + plasmidFamily = ParseMashResult(mash) + ToJson(plasmidFamily, "mash.json") + carbapenamases = [] resfinderCarbas = [] #list of rfinder objects for lindaout list amrGenes = [] @@ -613,20 +657,27 @@ lindaTemp += "\t\t" #sero and kcap #resfinderCarbas + index = 0 for carbs in resfinderCarbas: if (carbs.source == "plasmid"): # - lindaTemp += "\t\t\t\t\t" #plasmid 1 rflp plasmid 1 family information. PLASMID_1_FAMILY\tPLASMID_1_BEST_MATCH\tPLASMID_1_COVERAGE\tPLASMID_1_SNVS_TO_BEST_MATCH + lindaTemp += "\t" + plasmid = plasmidFamily[list(plasmidFamily.keys())[index]] + lindaTemp += plasmid.name + "\t" + lindaTemp += str(plasmid.identity) + "\t" + lindaTemp += plasmid.sharedHashes + "\t" lindaTemp += carbs.shortGene + "\t" #found an carbapenase contig = carbs.sequence[6:] #this is the contig number for i in mSuite.keys(): if (str(mSuite[i].contig_num) == str(contig)): #found the right plasmid - lindaTemp += mSuite[i].rep_type + clusterid = mSuite[i].cluster_id + rep_types = mSuitePlasmids["plasmid_" + str(clusterid) + ".fasta"].rep_types + lindaTemp += rep_types lindaOut.append(lindaTemp) out = open("summary.linda.tsv", 'w') for item in lindaOut: out.write("%s\n" % item) - tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") + tsvOut.append("new\tID\tExpected Species\tMLST Species\tSequence Type\tMLST Scheme\tCarbapenem Resistance Genes\tOther AMR Genes\tPlasmid Best Match\tTotal Plasmids\tPlasmids ID\tNum_Contigs\tPlasmid Length\tPlasmid RepType\tPlasmid Mobility\tNearest Reference\tDefinitely Plasmid Contigs\tLikely Plasmid Contigs") #start with ID temp = "\t" temp += (ID + "\t") @@ -642,6 +693,7 @@ temp += ";".join(amrGenes) + "\t" #lastly plasmids + temp += str(plasmidFamily[list(plasmidFamily.keys())[0]].name) temp+= str(len(mSuitePlasmids)) + "\t" plasmidID = "" contigs = ""
--- a/cpo_galaxy_predictions.xml Mon Aug 27 19:58:24 2018 -0400 +++ b/cpo_galaxy_predictions.xml Tue Aug 28 15:15:09 2018 -0400 @@ -11,11 +11,12 @@ '-m $mlst' '-c $mobsuitecontig' '-f $mobsuiteaggregate' - '-a $abricate' + '-a $resfinder' '-r $rgi' '-e $expected' '-s $__tool_directory__/scheme_species_map.tab' '-p $plasmidfinder' + '-d $mash' ]]> </command> <inputs> @@ -23,15 +24,16 @@ <param type="data" name="mlst" format="tabular" /> <param type="data" name="mobsuitecontig" format="tabular" /> <param type="data" name="mobsuiteaggregate" format="tabular" /> - <param type="data" name="abricate" format="tabular" /> + <param type="data" name="resfinder" format="tabular" /> <param type="data" name="rgi" format="tabular" /> <param type="data" name="plasmidfinder" format="tabular" /> + <param type="data" name="mash" format ="tabular"/> <param type="text" name="expected" optional ="false"/> </inputs> <outputs> <data name="tsvSummary" format="tabular" from_work_dir="summary.tsv"/> - <data name="tsvSummaryExistingFormat" format="tabular" from_work_dir="summary.linda.tsv"/> - <data name="txtSummary" format="txt" from_work_dir="summary.txt"/> + <data name="tsvSummaryExistingFormat" format="tabular" from_work_dir="summary.linda.tsv"/> + <data name="txtSummary" format="txt" from_work_dir="summary.txt"/> </outputs> <tests> <test>
--- a/cpo_galaxy_tree.py Mon Aug 27 19:58:24 2018 -0400 +++ b/cpo_galaxy_tree.py Tue Aug 28 15:15:09 2018 -0400 @@ -39,18 +39,53 @@ #parses some parameters parser = optparse.OptionParser("Usage: %prog [options] arg1 arg2 ...") -parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="identifier of the isolate") -parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path forward read (R1)") -parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to reverse read (R2)") +parser.add_option("-t", "--tree", dest="treePath", type="string", default="./pipelineTest/tree.txt", help="absolute file path to phylip tree") +parser.add_option("-d", "--distance", dest="distancePath", type="string", default="./pipelineTest/distance.tab", help="absolute file path to distance matrix") +parser.add_option("-m", "--metadata", dest="metadataPath", type="string", default="./pipelineTest/metadata.tsv",help="absolute file path to metadata file") +parser.add_option("-o", "--output_file", dest="outputFile", type="string", default="tree.png", help="Output graphics file. Use ending 'png', 'pdf' or 'svg' to specify file format.") + +# sensitive data adder +parser.add_option("-p", "--sensitive_data", dest="sensitivePath", type="string", default="", help="Spreadsheet (CSV) with sensitive metadata") +parser.add_option("-c", "--sensitive_cols", dest="sensitiveCols", type="string", default="", help="CSV list of column names from sensitive metadata spreadsheet to use as labels on dendrogram") +parser.add_option("-b", "--bcid_column", dest="bcidCol", type="string", default="BCID", help="Column name of BCID in sensitive metadata file") +parser.add_option("-n", "--missing_value", dest="naValue", type="string", default="NA", help="Value to write for missing data.") + (options,args) = parser.parse_args() treePath = str(options.treePath).lstrip().rstrip() distancePath = str(options.distancePath).lstrip().rstrip() metadataPath = str(options.metadataPath).lstrip().rstrip() +sensitivePath = str(options.sensitivePath).lstrip().rstrip() +sensitiveCols = str(options.sensitiveCols).lstrip().rstrip() +outputFile = str(options.outputFile).lstrip().rstrip() +bcidCol = str( str(options.bcidCol).lstrip().rstrip() ) +naValue = str( str(options.naValue).lstrip().rstrip() ) + #region result objects #define some objects to store values from results #//TODO this is not the proper way of get/set private object variables. every value has manually assigned defaults intead of specified in init(). Also, use property(def getVar, def setVar). + +class SensitiveMetadata(object): + def __init__(self): + x = pandas.read_csv( sensitivePath ) + col_names = [ s for s in sensitiveCols.split(',')] # convert to 0 offset + if not bcidCol in col_names: + col_names.append( bcidCol ) + all_cols = [ str(col) for col in x.columns ] + col_idxs = [ all_cols.index(col) for col in col_names ] + self.sensitive_data = x.iloc[:, col_idxs] + def get_columns(self): + cols = [ str(x) for x in self.sensitive_data.columns ] + return cols + def get_value( self, bcid, column_name ): # might be nice to get them all in single call via an input list of bcids ... for later + bcids= list( self.sensitive_data.loc[:, bcidCol ] ) # get the list of all BCIDs in sensitive metadata + if not bcid in bcids: + return naValue + else: + row_idx = bcids.index( bcid ) # lookup the row for this BCID + return self.sensitive_data.loc[ row_idx, column_name ] # return the one value based on the column (col_idx) and this row + class workflowResult(object): def __init__(self): self.new = False @@ -60,6 +95,7 @@ self.SequenceType = "?" self.MLSTScheme = "?" self.CarbapenemResistanceGenes ="?" + self.plasmidBestMatch ="?" self.OtherAMRGenes="?" self.TotalPlasmids = -1 self.plasmids = [] @@ -145,6 +181,7 @@ _results.CarbapenemResistanceGenes = (str(r.loc[r.index[i], 'Carbapenem Resistance Genes'])) _results.OtherAMRGenes = (str(r.loc[r.index[i], 'Other AMR Genes'])) _results.TotalPlasmids = int(r.loc[r.index[i], 'Total Plasmids']) + _results.plasmidBestMatch = str(r.loc[r.index[i], 'Plasmid Best Match']) for j in range(0,_results.TotalPlasmids): _plasmid = plasmidObj() _plasmid.PlasmidsID =(((str(r.loc[r.index[i], 'Plasmids ID'])).split(";"))[j]) @@ -163,6 +200,9 @@ #endregion def Main(): + if len(sensitivePath)>0: + sensitive_meta_data = SensitiveMetadata() + metadata = ParseWorkflowResults(metadataPath) distance = read(distancePath) treeFile = "".join(read(treePath)) @@ -212,6 +252,12 @@ if (n.is_leaf() and n.name == "Reference"): #if its the reference branch, populate the faces with column headers index = 0 + + if len(sensitivePath)>0: #sensitive metadat @ chris + for sensitive_data_column in sensitive_meta_data.get_columns(): + (t&"Reference").add_face(addFace(sensitive_data_column), index, "aligned") + index = index + 1 + (t&"Reference").add_face(addFace("SampleID"), index, "aligned") index = index + 1 (t&"Reference").add_face(addFace("New?"), index, "aligned") @@ -225,12 +271,28 @@ index = index + 1 (t&"Reference").add_face(addFace("Carbapenamases"), index, "aligned") index = index + 1 + (t&"Reference").add_face(addFace("Plasmid Best Match"), index, "aligned") + index = index + 1 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds the distance matrix (t&"Reference").add_face(addFace(distanceDict[list(distanceDict.keys())[0]][i]), index + i, "aligned") index = index + len(distanceDict[list(distanceDict.keys())[0]]) elif (n.is_leaf() and not n.name == "Reference"): #not reference branches, populate with metadata index = 0 + + if len(sensitivePath)>0: #sensitive metadata @ chris + # pushing in sensitive data + for sensitive_data_column in sensitive_meta_data.get_columns(): + # tree uses bcids like BC18A021A_S12 + # while sens meta-data uses BC18A021A + # trim the "_S.*" if present + bcid = str(mData.ID) + if bcid.find( "_S" ) != -1: + bcid = bcid[ 0:bcid.find( "_S" ) ] + sens_col_val = sensitive_meta_data.get_value(bcid=bcid, column_name=sensitive_data_column ) + n.add_face(addFace(sens_col_val), index, "aligned") + index = index + 1 + if (n.name.replace(".fa","") in metadata.keys()): mData = metadata[n.name.replace(".fa","")] else: @@ -262,11 +324,13 @@ index = index + 1 n.add_face(addFace(mData.CarbapenemResistanceGenes), index, "aligned") index = index + 1 + n.add_face(addFace(mData.plasmidBestMatch), index, "aligned") + index = index + 1 for i in range(len(distanceDict[list(distanceDict.keys())[0]])): #this loop adds distance matrix if (n.name in distanceDict): #make sure the column is in the distance matrice n.add_face(addFace(list(distanceDict[n.name])[i]), index + i, "aligned") - t.render("./tree.pdf", w=5000,units="mm", tree_style=ts) #save it as a png. or an phyloxml + t.render(outputFile, w=5000,units="mm", tree_style=ts) #save it as a png, pdf, svg or an phyloxml #endregion #endregion
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpo_mash.xml Tue Aug 28 15:15:09 2018 -0400 @@ -0,0 +1,31 @@ +<tool id="cpo_clustalw" name="cpo_clustalw" version="2.1"> + <description>Modified version of mash 2.0 with custom database</description> + <requirements> + <requirement type="package" version="2.0">mash</requirement> + </requirements> + <command detect_errors="exit_code"> + <![CDATA[ + mash screen $__tool_directory__/bcPlasmidDB.msh $input | sort -gr | head -10 > mashresult.tsv + ]]> + </command> + <inputs> + <param name="input" type="data" format="fasta" label="Input" help="FASTA file with contig(s)"/> + </inputs> + <outputs> + <data name="mashResult" format="tabular" from_work_dir="mashresult.tsv"/> + </outputs> + <help> + mash screen bcPlasmidDB.msh $input | sort -gr | head -10 > mashresult.tsv + </help> + <citations> + <citation type="bibtex"> + @misc{githubmob-suite, + author = {Robertson J, Nash J}, + title = {MOB-Suite: Software tools for clustering, reconstruction and typing of plasmids from draft assemblies.}, + publisher = {GitHub}, + journal = {GitHub repository}, + doi = {10.1099/mgen.0.000206}, + url = {https://github.com/phac-nml/mob-suite} + }</citation> + </citations> +</tool> \ No newline at end of file