Mercurial > repos > johnheap > vapper
comparison Tryp_V.py @ 0:36cb22bd911d draft
planemo upload for repository https://github.com/johnheap/VAPPER-Galaxy
| author | johnheap |
|---|---|
| date | Wed, 04 Jul 2018 16:39:13 -0400 |
| parents | |
| children | 4432e4183ebd |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:36cb22bd911d |
|---|---|
| 1 """ | |
| 2 * Copyright 2018 University of Liverpool | |
| 3 * Author: John Heap, Computational Biology Facility, UoL | |
| 4 * Based on original scripts of Sara Silva Pereira, Institute of Infection and Global Health, UoL | |
| 5 * | |
| 6 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 * you may not use this file except in compliance with the License. | |
| 8 * You may obtain a copy of the License at | |
| 9 * | |
| 10 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 * | |
| 12 * Unless required by applicable law or agreed to in writing, software | |
| 13 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 * See the License for the specific language governing permissions and | |
| 16 * limitations under the License. | |
| 17 * | |
| 18 """ | |
| 19 | |
| 20 import matplotlib as mpl | |
| 21 mpl.use('Agg') | |
| 22 | |
| 23 import subprocess | |
| 24 import shutil | |
| 25 import re | |
| 26 import pandas as pd | |
| 27 import os | |
| 28 | |
| 29 import sys | |
| 30 | |
| 31 import matplotlib.pyplot as plt | |
| 32 from matplotlib.patches import Patch | |
| 33 import seaborn as sns | |
| 34 | |
| 35 def assembleWithVelvet(name, kmers, inslen, covcut, fastq1name,fastq2name): | |
| 36 #argString = "velveth " + name + "_k65 65 -shortPaired -fastq " + name + "_R1.fastq " + name + "_R2.fastq" | |
| 37 argString = "velveth " + name + "_k"+ kmers+" "+ kmers + " -shortPaired -fastq " + fastq1name+" "+fastq2name | |
| 38 print(argString) | |
| 39 returncode = subprocess.call(argString, shell=True) | |
| 40 if returncode != 0: | |
| 41 return "Error in velveth" | |
| 42 argString = "velvetg " + name + "_k"+kmers+" -exp_cov auto -ins_length "+inslen+" -clean yes -ins_length_sd 50 -min_pair_count 20" | |
| 43 #argString = "velvetg " + name + "_k"+kmers+" -exp_cov auto -ins_length "+inslen+" -cov_cutoff "+covcut+" -clean yes -ins_length_sd 50 -min_pair_count 20" | |
| 44 #argString = "velvetg " + name + "_k65 -exp_cov auto -ins_length 400 -cov_cutoff 5 -clean yes -ins_length_sd 50 -min_pair_count 20"+quietString | |
| 45 print(argString) | |
| 46 returncode = subprocess.call(argString, shell = True) | |
| 47 if returncode != 0: | |
| 48 return "Error in velvetg" | |
| 49 shutil.copyfile(name + "_k"+kmers+"//contigs.fa",name + ".fa") # my $namechange = "mv ".$input."_k65/contigs.fa ".$input.".fa"; | |
| 50 return "ok" | |
| 51 | |
| 52 | |
| 53 def blastContigs(test_name,database): | |
| 54 print(test_name) | |
| 55 print(database) | |
| 56 #db_path = os.path.dirname(os.path.realpath(__file__))+database | |
| 57 db_path = database | |
| 58 argString = "blastn -db "+db_path+" -query "+test_name+".fa -outfmt 10 -out "+test_name+"_blast.txt" | |
| 59 print (argString) | |
| 60 returncode = subprocess.call(argString, shell = True) | |
| 61 if returncode != 0: | |
| 62 return "Error in blastall" | |
| 63 blast_df = pd.read_csv(""+test_name+"_blast.txt") | |
| 64 #print (blast_df) | |
| 65 #if ($temp[2] >= 98 & & $temp[3] > 100 & & $temp[10] < 0.001){ | |
| 66 #'qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore' | |
| 67 blast_df.columns = ['qaccver', 'saccver', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue','bitscore'] | |
| 68 blastResult_df = blast_df[(blast_df['pident']>=98) & (blast_df['length'] > 100) & (blast_df['evalue']<0.001) ] | |
| 69 blastResult_df = blastResult_df[['qaccver', 'saccver', 'pident']] #query accession.version, subject accession.version, Percentage of identical matches | |
| 70 | |
| 71 return blastResult_df | |
| 72 | |
| 73 | |
| 74 | |
| 75 def getCogsPresent(blastResult_df,strain,cogOrBinList): | |
| 76 blastResult_df.sort_values('pident',axis = 0, ascending=False, inplace=True) | |
| 77 nodeList = blastResult_df['qaccver'].tolist() | |
| 78 cogList = blastResult_df['saccver'].tolist() | |
| 79 cogSet = set(cogList) #get unique values | |
| 80 cogList = sorted(cogSet) #sort them | |
| 81 | |
| 82 #print (cogList) | |
| 83 #print (len(cogList)) | |
| 84 | |
| 85 thereList = [] | |
| 86 dataList = [] | |
| 87 #dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 88 fname = cogOrBinList | |
| 89 cnt = 0 | |
| 90 with open (fname) as f: | |
| 91 for line in f: | |
| 92 dataList.append(line.rstrip('\n\r ')) | |
| 93 if line.rstrip('\n\r ') in cogList: | |
| 94 thereList.append('1') | |
| 95 cnt = cnt+1 | |
| 96 else: | |
| 97 thereList.append('0') | |
| 98 | |
| 99 #print (thereList) | |
| 100 #print (cnt) | |
| 101 data = {'Cog': dataList, strain: thereList} | |
| 102 presence_df = pd.DataFrame(data) | |
| 103 #print (presence_df) | |
| 104 return presence_df | |
| 105 | |
| 106 def addToCurrentData(cog_df, name): | |
| 107 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 108 j_fname = dir_path + r"/data/vivax/TvDatabase.csv" | |
| 109 tv_df = pd.read_csv(j_fname) | |
| 110 | |
| 111 cogList = cog_df[name].tolist() | |
| 112 #cogList.insert(0,'Test') | |
| 113 #print (len(tv_df)) | |
| 114 #print(len(cogList)) | |
| 115 | |
| 116 #print(cogList) | |
| 117 tv_df.loc[:,name]=cogList | |
| 118 return tv_df | |
| 119 | |
| 120 | |
| 121 | |
| 122 | |
| 123 def createClusterMap(tv_df,name,html_path,pdfExport): | |
| 124 #Retrieve Data | |
| 125 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 126 j_fname = dir_path+r"/data/vivax/geoTv.csv" | |
| 127 geo_df = pd.read_csv(j_fname) | |
| 128 geo_df.loc[len(geo_df)] =[name,name,'k'] | |
| 129 print(geo_df) | |
| 130 | |
| 131 myStrains = tv_df.columns.values.tolist() #beware first entry is COG | |
| 132 myStrains = myStrains[1:] | |
| 133 print(myStrains) | |
| 134 myPal = [] | |
| 135 for s in myStrains: | |
| 136 col = geo_df[(geo_df['Strain'] == s)]['colour'].tolist() | |
| 137 myPal.append(col[0]) | |
| 138 print(myPal) | |
| 139 mycogmap = ['skyblue', 'orangered'] # blue absent,red present | |
| 140 tv_df.set_index('COG', inplace=True) | |
| 141 tv_df = tv_df[tv_df.columns].astype(float) | |
| 142 | |
| 143 cg = sns.clustermap(tv_df, method='ward', col_colors=myPal, cmap=mycogmap, yticklabels = 1500, row_cluster=False, linewidths = 0) | |
| 144 #cg = sns.clustermap(tv_df, method='ward', row_cluster=False, linewidths = 0) | |
| 145 ax = cg.ax_heatmap | |
| 146 #xasix ticks and labels. | |
| 147 ax.xaxis.tick_top() #set ticks at top | |
| 148 newlabs = [] | |
| 149 | |
| 150 labs = ax.xaxis.get_ticklabels() | |
| 151 for i in range(0, len(labs)): | |
| 152 print(labs[i]) | |
| 153 # labs[i].set_text(" "+labs[i].get_text()) #make enough room so label sits atop of col_color bars | |
| 154 newlabs.append(" " + labs[i].get_text()) | |
| 155 ax.xaxis.set_ticklabels(newlabs) | |
| 156 | |
| 157 #labs = ax.xaxis.get_ticklabels() | |
| 158 #for i in range(0, len(labs)): | |
| 159 # print(labs[i]) | |
| 160 # labs[i].set_text(" "+labs[i].get_text()) #make enough room so label sits atop of col_color bars | |
| 161 # print(labs[i]) | |
| 162 #ax.xaxis.set_ticklabels(labs) | |
| 163 plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90) # get x labels printed vertically | |
| 164 | |
| 165 cg.cax.set_visible(False) | |
| 166 ax = cg.ax_heatmap | |
| 167 ax.set_yticklabels("") | |
| 168 ax.set_ylabel("") | |
| 169 ax = cg.ax_heatmap | |
| 170 # ax.set_title("Variant antigen profiles of T. vivax genomes.\nDendrogram reflects the VSG repertoire relationships of each strain inferred by the presence and absence of non-universal T. vivax VSG orthologs.", va = "top", wrap = "True") | |
| 171 b = len(tv_df) | |
| 172 print(b) | |
| 173 title = "Figure Legend: The Variant Antigen Profiles of $\itTrypanosoma$ $\itvivax$ " \ | |
| 174 "showing the \ncombination of present and absent diagnostic cluster of VSG orthologs " \ | |
| 175 "across the sample cohort. \nDendrogram reflects the relationships amongst the VSG" \ | |
| 176 " repertoires of each strain. " \ | |
| 177 "Strains were isolated \nfrom multiple African countries as shown in the key.\nData was produced with the " \ | |
| 178 "'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018)." | |
| 179 | |
| 180 ax.text(-1.5, len(tv_df) + 8, | |
| 181 title, | |
| 182 ha="left", va="top", wrap="True") | |
| 183 col = cg.ax_col_dendrogram.get_position() | |
| 184 cg.ax_col_dendrogram.set_position([col.x0, col.y0*1.08, col.width, col.height*1.1]) | |
| 185 | |
| 186 | |
| 187 legend_elements = [Patch(facecolor='orangered', label='COG Present'), | |
| 188 Patch (facecolor='skyblue', label='COG Absent'), | |
| 189 Patch(facecolor='w', label=''), | |
| 190 Patch (facecolor='b', label='Nigeria'), | |
| 191 Patch(facecolor = 'g', label='Uganda'), | |
| 192 Patch (facecolor='c', label='Gambia'), | |
| 193 Patch (facecolor='r', label='Ivory Coast'), | |
| 194 Patch(facecolor='m', label='Brazil'), | |
| 195 Patch(facecolor='k', label=name)] | |
| 196 #legend_test = [[Patch(facecolor='orangered'),Patch(facecolor='r')],["test","test2"]] | |
| 197 ax.legend(handles = legend_elements, bbox_to_anchor=(-0.3,1.2),loc = 'upper left') | |
| 198 | |
| 199 | |
| 200 | |
| 201 | |
| 202 | |
| 203 | |
| 204 | |
| 205 #plt.setp(cg.ax_heatmap.yaxis.get_ticklabels(), rotation=0 ) # get y labels printed horizontally | |
| 206 # cg.dendrogram_col.linkage # linkage matrix for columns | |
| 207 # cg.dendrogram_row.linkage # linkage matrix for rows | |
| 208 # plt.savefig(r"results/" + name + "_heatmap.png") | |
| 209 #plt.savefig(htmlresource + "/heatmap.png") | |
| 210 #if pdf == 'PDF_Yes': | |
| 211 # plt.savefig(htmlresource + "/heatmap.pdf") | |
| 212 # shutil.copyfile("heatmap.pdf",heatmapfn) # | |
| 213 #plt.legend() | |
| 214 fname = html_path+"/"+name+"_clustermap.png" | |
| 215 cg.savefig(fname) | |
| 216 if pdfExport == 'PDF_Yes': | |
| 217 fname = html_path + "/" + name + "_clustermap.pdf" | |
| 218 cg.savefig(fname) | |
| 219 #plt.show() | |
| 220 | |
| 221 def createHTML(name,htmlfn,htmlPath): | |
| 222 #assumes imgs are heatmap.png, dheatmap.png, vapPCA.png and already in htmlresource | |
| 223 htmlString = r"<html><title>T.vivax VAP</title><body><div style='text-align:center'><h2><i>Trypanosoma vivax</i> Variant Antigen Profile</h2><h3>" | |
| 224 htmlString += name | |
| 225 | |
| 226 | |
| 227 htmlString += r'<p> <h3>The Heat Map and Dendrogram</h3></p>' | |
| 228 imgString = r"<img src = '"+name+"_clustermap.png' alt='Cog Clustermap' style='max-width:100%'><br><br>" | |
| 229 htmlString += imgString | |
| 230 print(htmlString) | |
| 231 | |
| 232 with open(htmlfn, "w") as htmlfile: | |
| 233 htmlfile.write(htmlString) | |
| 234 | |
| 235 | |
| 236 def vivax_assemble(args,dict): | |
| 237 #argdict = {'name': 2, 'pdfexport': 3, 'kmers': 4, 'inslen': 5, 'covcut': 6, 'forward': 7, 'reverse': 8, 'html_file': 9,'html_resource': 10} | |
| 238 #assembleWithVelvet("V2_Test", '65', '400', '5', "data/TviBrRp.1.clean", "data/TviBrRp.2.clean") | |
| 239 | |
| 240 vivaxPath = os.path.dirname(os.path.realpath(__file__))+r"/data/vivax" | |
| 241 assembleWithVelvet(args[dict['name']], args[dict['kmers']], args[dict['inslen']], args[dict['covcut']], | |
| 242 args[dict['forward']], args[dict['reverse']]) | |
| 243 blastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/COGs.fas") | |
| 244 orthPresence_df = getCogsPresent(blastResult_df, args[dict['name']], vivaxPath+r"/Database/COGlist.txt") | |
| 245 | |
| 246 binBlastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/Bin_2.fas") | |
| 247 binPresence_df = getCogsPresent(binBlastResult_df, args[dict['name']], vivaxPath+r"/Database/binlist.txt") | |
| 248 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True) | |
| 249 | |
| 250 fname = args[dict['html_resource']] +args[dict['name']]+"_cogspresent.csv" | |
| 251 cogPresence_df.to_csv(fname) | |
| 252 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | |
| 253 createClusterMap(current_df, args[dict['name']],args[dict['html_resource']],args[dict['pdfexport']]) | |
| 254 createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']]) | |
| 255 | |
| 256 def test_cluster(args,dict): | |
| 257 print ("name: %s",args[dict['name']]) | |
| 258 cogPresence_df = pd.read_csv("test_cogspresent.csv") | |
| 259 print(cogPresence_df) | |
| 260 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | |
| 261 createClusterMap(current_df, args[dict['name']], args[dict['html_resource']], args[dict['pdfexport']]) | |
| 262 | |
| 263 def vivax_contigs(args,dict): | |
| 264 # argdict = {'name': 2, 'pdfexport': 3, 'contigs': 4, 'html_file': 5, 'html_resource': 6} | |
| 265 | |
| 266 #test_cluster(args,dict) | |
| 267 #return; | |
| 268 | |
| 269 #subprocess.call('echo $PATH',shell = True) | |
| 270 #sys.exit(1) | |
| 271 | |
| 272 vivaxPath = os.path.dirname(os.path.realpath(__file__))+r"/data/vivax" | |
| 273 shutil.copyfile(args[dict['contigs']], args[dict['name']]+".fa") | |
| 274 blastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/COGs.fas") | |
| 275 orthPresence_df = getCogsPresent(blastResult_df, args[dict['name']], vivaxPath+r"/Database/COGlist.txt") | |
| 276 | |
| 277 binBlastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/Bin_2.fas") | |
| 278 binPresence_df = getCogsPresent(binBlastResult_df, args[dict['name']], vivaxPath+r"/Database/binlist.txt") | |
| 279 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True) | |
| 280 | |
| 281 fname = args[dict['html_resource']]+r"/"+ args[dict['name']]+"_cogspresent.csv" | |
| 282 cogPresence_df.to_csv(fname) | |
| 283 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | |
| 284 createClusterMap(current_df, args[dict['name']], args[dict['html_resource']], args[dict['pdfexport']]) | |
| 285 createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']]) | |
| 286 | |
| 287 if __name__ == "__main__": | |
| 288 #assembleWithVelvet("V2_Test",'65','400', '5',"data/TviBrRp.1.clean","data/TviBrRp.2.clean") | |
| 289 #assembleWithVelvet("V2_Test",'65','400', '5',"data/Tv493.1","data/Tv493.2") | |
| 290 #blastResult_df=blastContigs("V2_Test",r"/Database/COGs.fas") | |
| 291 cogPresence_df = pd.read_csv("test_cogspresent.csv") | |
| 292 print(cogPresence_df) | |
| 293 current_df = addToCurrentData(cogPresence_df,"vTest") # load in Tvdatabase and add cogPresence column to it. | |
| 294 createClusterMap(current_df, "vTest", "sausages","no") | |
| 295 createHTML("vTest","vTest.html","sausages") | |
| 296 sys.exit() | |
| 297 | |
| 298 blastResult_df=blastContigs("Tv493",r"/Database/COGs.fas") | |
| 299 orthPresence_df = getCogsPresent(blastResult_df,"Tv493",r"/Database/COGlist.txt") | |
| 300 | |
| 301 #binBlastResult_df=blastContigs("V2_Test",r"/Database/Bin_2.fas") | |
| 302 | |
| 303 binBlastResult_df=blastContigs("Tv493",r"/Database/Bin_2.fas") | |
| 304 binPresence_df = getCogsPresent(binBlastResult_df,"Tv493",r"/Database/binlist.txt") | |
| 305 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True) | |
| 306 #now do the next bit? | |
| 307 current_df = addToCurrentData(cogPresence_df) # load in Tvdatabase and add cogPresence column to it. | |
| 308 createClusterMap(current_df,'Test',dict['html_resource'] ,dict['pdfexport']) | |
| 309 | |
| 310 | |
| 311 #print(cogPresence_df) | |
| 312 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 313 fname = dir_path+r"/results/V2_TestPresence.csv" | |
| 314 #fnameb = dir_path+r"/results/V2_Test_blastOrth.csv" | |
| 315 #fnameb_bin = dir_path+r"/results/V2_Test_blastBin.csv" | |
| 316 #binBlastResult_df.to_csv(fnameb_bin) | |
| 317 #blastResult_df.to_csv(fnameb) | |
| 318 | |
| 319 #cogPresence_df.to_csv(fname) | |
| 320 cogPresence_df = pd.read_csv(fname) | |
| 321 | |
| 322 current_df = addToCurrentData(cogPresence_df) #load in Tvdatabase and add cogPresence column to it. | |
| 323 | |
| 324 | |
| 325 | |
| 326 |
