comparison data_manager/resource_building.py @ 4:e967a99d66b3 draft

"planemo upload commit 540dd383c0617193db43bf11457011888751b022-dirty"
author proteore
date Thu, 23 Jan 2020 08:38:02 -0500
parents af0250fd023c
children b05fa99ddda2
comparing 3:af0250fd023c with 4:e967a99d66b3

--- a/data_manager/resource_building.py
+++ b/data_manager/resource_building.py
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 """
 The purpose of this script is to create source files from different databases to be used in other proteore tools
 """
 
-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+import os, shutil, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess
 from io import BytesIO
 from zipfile import ZipFile
 from galaxy.util.json import from_json_string, to_json_string
 
 #######################################################################################################
@@ -129,58 +129,66 @@
 # 3. ID mapping file
 #######################################################################################################
 import ftplib, gzip
 csv.field_size_limit(sys.maxsize) # to handle big files
 
-def id_mapping_sources (data_manager_dict, species, target_directory) :
+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
 
     human = species == "Human"
     species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+    archive = os.path.join(tool_data_path, "id_mapping/ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
+    if os.path.isdir(archive) is False : os.mkdir(archive)
 
     #header
-    if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
-    else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+    if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
+    else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
 
     #get selected.tab and keep only ids of interest
     selected_tab_file=species_dict[species]+"_"+files[0]
     tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
     with gzip.open(tab_path,"rt") as select :
         tab_reader = csv.reader(select,delimiter="\t")
         for line in tab_reader :
-            tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
-    os.remove(tab_path)
-
+            tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
+    if os.path.exists(os.path.join(archive,tab_path.split("/")[-1])) : os.remove(os.path.join(archive,tab_path.split("/")[-1]))
+    shutil.move(tab_path, archive)
     #print("selected_tab ok")
 
     #get uniprot-AC reviewed
     organism = species_dict[species].split("_")[1]
     query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
 
     with requests.Session() as s:
         download = s.get(query)
         decoded_content = download.content.decode('utf-8')
         uniprot_reviewed_list = decoded_content.splitlines()
 
+    #save reviewed list
+    reviewed_list_path = os.path.join(archive,'uniprot_reviewed_list.txt')
+    with open(reviewed_list_path,'w') as reviewed_list_file:
+        for id in uniprot_reviewed_list:
+            reviewed_list_file.write(id+"\n")
+
+    #remove unreviewed uniprot-AC
     for line in tab[1:]:
-        UniProtAC = line[0]
+        UniProtAC = line[1]
         if UniProtAC not in uniprot_reviewed_list :
-            line[0]=""
             line[1]=""
 
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
     -NextProt,BioGrid,STRING,KEGG
     """
 
     #there's more id type for human
-    if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file
-    else : ids = ['BioGrid','STRING','KEGG' ]
+    if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ] #ids to get from dat_file
+    else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ]
     unidict = {}
 
     #keep only ids of interest in dictionaries
-    dat_file=species_dict[species]+"_"+files[1]
+    dat_file = species_dict[species]+"_"+files[1]
     dat_path = download_from_uniprot_ftp(dat_file,target_directory)
     with gzip.open(dat_path,"rt") as dat :
         dat_reader = csv.reader(dat,delimiter="\t")
         for line in dat_reader :
             uniprotID=line[0] #UniProtID as key
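
The `if os.path.exists(...) : os.remove(...)` / `shutil.move(...)` pair introduced in this hunk recurs once per downloaded source file (selected.tab here, idmapping.dat and the neXtProt list in later hunks). A minimal sketch of a helper that factors the pattern out, assuming the same overwrite-on-collision semantics; the name archive_file is hypothetical, not part of this commit:

import os, shutil

def archive_file(path, archive_dir):
    # Remove any stale copy first: shutil.move raises when a file of the
    # same name already exists inside the destination directory.
    dest = os.path.join(archive_dir, os.path.basename(path))
    if os.path.exists(dest):
        os.remove(dest)
    # Move the downloaded file into the dated archive directory.
    shutil.move(path, archive_dir)
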
@@ -192,11 +200,12 @@
                         unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionnary
                     else :
                         unidict[uniprotID].update({ id_type : cor_id })
                 elif id_type in ids :
                     unidict[uniprotID]={id_type : cor_id}
-    os.remove(dat_path)
+    if os.path.exists(os.path.join(archive,dat_path.split("/")[-1])) : os.remove(os.path.join(archive,dat_path.split("/")[-1]))
+    shutil.move(dat_path, archive)
 
     #print("dat_file ok")
 
     #add ids from idmapping.dat to the final tab
     for line in tab[1:] :
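
The nested branches at the top of that hunk accumulate unidict as {UniProt ID -> {ID type -> ";"-joined values}}. A sketch of the same accumulation written with dict.setdefault, equivalent to the if/elif shown above; the helper name add_mapping is hypothetical:

def add_mapping(unidict, uniprotID, id_type, cor_id, ids):
    # Keep only the ID types of interest; join repeated values with ";"
    # exactly as the nested if/elif does.
    if id_type in ids:
        entry = unidict.setdefault(uniprotID, {})
        entry[id_type] = ";".join([entry[id_type], cor_id]) if id_type in entry else cor_id
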
@@ -204,44 +213,51 @@
         if human :
             if uniprotID in unidict :
                 nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
                 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
                 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-                             access_dictionary(unidict,uniprotID,'KEGG')])
+                             access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
             else :
-                line.extend(["","","",""])
+                line.extend(["","","","",""])
         else :
             if uniprotID in unidict :
                 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-                             access_dictionary(unidict,uniprotID,'KEGG')])
+                             access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
             else :
-                line.extend(["","",""])
+                line.extend(["","","",""])
 
     #print ("tab ok")
 
     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        with open(nextprot_path,'r') as nextprot_ids :
+            nextprot_ids = nextprot_ids.read().splitlines()
+        if os.path.exists(os.path.join(archive,nextprot_path.split("/")[-1])) : os.remove(os.path.join(archive,nextprot_path.split("/")[-1]))
+        shutil.move(nextprot_path,archive)
         next_dict = {}
         for nextid in nextprot_ids :
            next_dict[nextid.replace("NX_","")] = nextid
-        os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
+        #os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
 
         #add missing nextprot ID
         for line in tab[1:] :
             uniprotID=line[0]
-            nextprotID=line[13]
+            nextprotID=line[14]
             if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
-                line[13]=next_dict[uniprotID]
+                line[14]=next_dict[uniprotID]
 
     output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
     path = os.path.join(target_directory,output_file)
 
     with open(path,"w") as out :
         w = csv.writer(out,delimiter='\t')
         w.writerows(tab)
+
+    subprocess.call(['tar', '-czvf', archive+".tar.gz", archive])
+    shutil.rmtree(archive, ignore_errors=True)
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
     release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
     id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by <filter> in xml only in descending order
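
The new archive is packed by shelling out to tar via subprocess.call. A sketch of the same step using the standard-library tarfile module, which avoids depending on a tar binary being on PATH; this is an alternative under that assumption, not the commit's approach:

import os, shutil, tarfile

with tarfile.open(archive + ".tar.gz", "w:gz") as tar:
    # arcname stores only the directory's basename inside the tarball,
    # rather than its full absolute path
    tar.add(archive, arcname=os.path.basename(archive))
shutil.rmtree(archive, ignore_errors=True)
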
@@ -265,13 +281,12 @@
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous")
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+
+    return (path)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
     if key1 in dico :
         if key2 in dico[key1] :
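
The function now returns the downloaded file's path instead of the parsed ID list; the caller (the id_mapping_sources hunk above) reads and archives the file itself. Note that open(path, 'wb').write hands retrbinary a handle that is never explicitly closed. A sketch of the same download with the handle managed by a with-block; fetch_from_ftp is a hypothetical stand-in, not the script's function:

import ftplib, os

def fetch_from_ftp(host, ftp_dir, filename, target_directory):
    # Download one file over anonymous FTP and return its local path,
    # mirroring what id_list_from_nextprot_ftp now returns.
    path = os.path.join(target_directory, filename)
    ftp = ftplib.FTP(host)
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        with open(path, "wb") as out:
            ftp.retrbinary("RETR " + filename, out.write)
    finally:
        ftp.quit()
    return path
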
@@ -371,11 +386,11 @@
 
     ##Bioplex
     elif interactome=="bioplex":
 
         with requests.Session() as s:
-            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv',verify=False)
+            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
             r = r.content.decode('utf-8')
             bioplex = csv.reader(r.splitlines(), delimiter='\t')
 
         dico_network = {}
         dico_network["GeneID"]={}
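
Dropping verify=False is harmless cleanup here: the URL is plain HTTP, where certificate verification never applies. A sketch of the same fetch hardened with a timeout and an HTTP status check, neither of which the commit adds:

import csv, requests

with requests.Session() as s:
    # timeout and raise_for_status are illustrative additions; the commit
    # itself only removes the unused verify=False flag.
    r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv', timeout=300)
    r.raise_for_status()
    bioplex = csv.reader(r.content.decode('utf-8').splitlines(), delimiter='\t')
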
@@ -595,10 +610,11 @@
     parser.add_argument("--interactome", metavar = ("PPI"))
     parser.add_argument("--species")
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
     parser.add_argument("--database")
+    parser.add_argument("--tool_data_path")
     args = parser.parse_args()
 
     data_manager_dict = {}
     # Extract json file params
     filename = args.output
@@ -629,17 +645,17 @@
         for pa_tissue in peptide_atlas:
             peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
 
     ## Download ID_mapping source file from Uniprot
     try:
-        id_mapping=args.id_mapping
+        id_mapping = args.id_mapping
     except NameError:
         id_mapping = None
     if id_mapping is not None:
         id_mapping = id_mapping .split(",")
         for species in id_mapping :
-            id_mapping_sources(data_manager_dict, species, target_directory)
+            id_mapping_sources(data_manager_dict, species, target_directory, args.tool_data_path)
 
     ## Download PPI ref files from biogrid/bioplex/humap
     try:
         interactome=args.interactome
         if interactome == "biogrid" :
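
With --tool_data_path wired through to id_mapping_sources, the dated archive directory lands under Galaxy's tool-data folder. A hypothetical invocation of the data manager script, with all paths and values purely illustrative:

python resource_building.py --id_mapping Human,Mouse --tool_data_path /srv/galaxy/tool-data -o run.json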