comparison data_manager/resource_building.py @ 0:9e31ea9fc7ea draft

planemo upload commit 567ba7934c0ca55529dfeb5e7ca0935ace260ad7-dirty
author proteore
date Wed, 13 Mar 2019 06:30:42 -0400
parents
children f3507260b30f
# -*- coding: utf-8 -*-
"""
Create the reference (source) files used by the other ProteoRE tools, from several public databases.
"""

import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
from io import BytesIO
from zipfile import ZipFile
from galaxy.util.json import from_json_string, to_json_string

#######################################################################################################
# General functions
#######################################################################################################
def unzip(url, output_file):
    """
    Download a zip archive from `url` and extract its first member to `output_file`.
    """
    content = requests.get(url)
    zip_file = ZipFile(BytesIO(content.content))    # avoid shadowing the zipfile module
    output_content = zip_file.open(zip_file.namelist()[0]).read()    # bytes
    with open(output_file, "wb") as output:
        output.write(output_content)

def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict
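
# Sketch of the structure built above (illustrative, not from the original source):
# after registering two entries for the same table, data_manager_dict looks like
#   {"data_tables": {"proteore_protein_atlas_normal_tissue": [
#       {"id": "...", "name": "...", "tissue": "...", "value": "/path/to/file.tsv"},
#       {"id": "...", "name": "...", "tissue": "...", "value": "/path/to/other.tsv"}]}}
# i.e. one list of entry dicts per Galaxy tool data table.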

#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
        table = "proteore_protein_atlas_normal_tissue"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
        table = "proteore_protein_atlas_tumor_tissue"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
        table = "proteore_protein_full_atlas"

    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)    # download and extract the TSV
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    tissue_id = tissue_name.replace(" ", "_").replace("/", "-")

    data_table_entry = dict(id=tissue_id, name=tissue_name, tissue=tissue, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, table)
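
# Example entry (illustrative) for tissue="HPA_normal_tissue" run on 13/03/2019:
#   dict(id="HPA_normal_tissue_13-03-2019", name="HPA normal tissue 13/03/2019",
#        tissue="HPA_normal_tissue", value="<target_directory>/HPA_normal_tissue_13-03-2019.tsv")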

#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, date, target_directory):
    # Define organism_id (here Human) - to be upgraded when other organisms are added to the project
    organism_id = "2"
    # Extract sample_category_id and output filename
    tissue = tissue.split(".")
    sample_category_id = tissue[0]
    tissue_name = tissue[1]
    output_file = tissue_name + "_" + date + ".tsv"

    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id=" + \
            sample_category_id + "&display_options=ShowAbundances&organism_id=" + organism_id + \
            "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf" + \
            "&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY"

    with requests.Session() as s:
        download = s.get(query)
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    uni_dict = build_dictionary(cr)

    # columns of the peptide_atlas data table
    tissue_id = tissue_name + "_" + date
    name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=name, value=path, tissue=tissue_name)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")

# Count the number of observations per UniProt accession
def build_dictionary(csv_reader):
    uni_dict = {}
    for line in csv_reader:
        if "-" not in line[0] and check_uniprot_access(line[0]):
            if line[0] in uni_dict:
                uni_dict[line[0]] += int(line[5])
            else:
                uni_dict[line[0]] = int(line[5])

    return uni_dict

# Check whether an id looks like a UniProt accession number: return True or False
def check_uniprot_access(id):
    uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    if uniprot_pattern.match(id):
        return True
    else:
        return False

def check_entrez_geneid(id):
    entrez_pattern = re.compile("[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+")
    if entrez_pattern.match(id):
        return True
    else:
        return False
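
# Illustrative checks (not part of the original source):
#   check_uniprot_access("P31946")    -> True   (matches the UniProt AC pattern)
#   check_uniprot_access("NX_P31946") -> False  (neXtProt-style accession)
#   check_entrez_geneid("7529")       -> True   (plain numeric Entrez GeneID)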

#######################################################################################################
# 3. ID mapping file
#######################################################################################################
import ftplib, gzip
csv.field_size_limit(sys.maxsize)    # to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):

    human = species == "Human"
    species_dict = {"Human": "HUMAN_9606", "Mouse": "MOUSE_10090", "Rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    # header
    if human:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "neXtProt", "BioGrid", "STRING", "KEGG"]]
    else:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "BioGrid", "STRING", "KEGG"]]

    # get idmapping_selected.tab and keep only the ids of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
    os.remove(tab_path)

    # Supplementary ids to get from <SPECIES>_idmapping.dat:
    # neXtProt (human only), BioGrid, STRING, KEGG

    # more id types are available for human
    if human:
        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']    # ids to get from the dat file
    else:
        ids = ['BioGrid', 'STRING', 'KEGG']
    unidict = {}

    # keep only the ids of interest, in dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]    # UniProt AC, used as key
            id_type = line[1]      # id type of the corresponding id, key of the sub-dictionary
            cor_id = line[2]       # corresponding id
            if "-" not in id_type:    # skip isoforms
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        # there is already a value for this id type: append to it
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)

    # add the ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '':
                    nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    # add missing neXtProt IDs for human
    if human:
        # build next_dict from the neXtProt accession list
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        # fill in the missing neXtProt IDs
        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"Human": "Homo sapiens", "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}
    name = species + " (" + name_dict[species] + " " + time.strftime("%d/%m/%Y") + ")"
    id = species + "_id_mapping_" + time.strftime("%d-%m-%Y")

    data_table_entry = dict(id=id, name=name, species=species, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_" + species)

def download_from_uniprot_ftp(file, target_directory):
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as handle:    # close the file handle once the transfer is done
        ftp.retrbinary("RETR " + file, handle.write)
    ftp.quit()
    return path

def id_list_from_nextprot_ftp(file, target_directory):
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as handle:
        ftp.retrbinary("RETR " + file, handle.write)
    ftp.quit()
    with open(path, 'r') as nextprot_ids:
        nextprot_ids = nextprot_ids.read().splitlines()
    return nextprot_ids
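
# The downloaded accession list is expected to hold one "NX_"-prefixed id per line,
# e.g. (illustrative):
#   NX_P31946
#   NX_P62258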

# Return '' when a key is missing from a dictionary, to avoid a KeyError
def access_dictionary(dico, key1, key2):
    if key1 in dico:
        if key2 in dico[key1]:
            return dico[key1][key2]
        else:
            return ''
    else:
        return ''

# If there are several neXtProt IDs for one UniProt AC, return the UniProt-like one
def clean_nextprot_id(next_id, uniprotAc):
    if len(next_id.split(";")) > 1:
        tmp = next_id.split(";")
        if "NX_" + uniprotAc in tmp:
            return "NX_" + uniprotAc
        else:
            return tmp[1]
    else:
        return next_id
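
# Illustrative behaviour (placeholder accessions):
#   clean_nextprot_id("NX_P31946;NX_A0A024R161", "P31946") -> "NX_P31946"
#   clean_nextprot_id("NX_A;NX_B", "P99999")               -> "NX_B"  (second entry kept)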

#######################################################################################################
# 4. Build protein interaction maps files
#######################################################################################################

def get_interactant_name(line, dico):

    if line[0] in dico:
        interactant_A = dico[line[0]]
    else:
        interactant_A = "NA"

    if line[1] in dico:
        interactant_B = dico[line[1]]
    else:
        interactant_B = "NA"

    return interactant_A, interactant_B

def PPI_ref_files(data_manager_dict, species, interactome, target_directory):

    species_dict = {'Human': 'Homo sapiens', "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}

    ## BioGRID
    if interactome == "biogrid":

        tab2_link = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"

        # download zip file
        r = requests.get(tab2_link)
        with open("BioGRID.zip", "wb") as code:
            code.write(r.content)

        # unzip files
        with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
            if not os.path.exists("tmp_BioGRID"):
                os.makedirs("tmp_BioGRID")
            zip_ref.extractall("tmp_BioGRID")

        # import the file of interest and build the network dictionary
        file_path = "tmp_BioGRID/BIOGRID-ORGANISM-" + species_dict[species].replace(" ", "_") + "-3.5.167.tab2.txt"
        with open(file_path, "r") as handle:
            tab_file = csv.reader(handle, delimiter="\t")
            dico_network = {}
            GeneID_index = 1
            network_cols = [1, 2, 7, 8, 11, 12, 14, 18, 20]
            for line in tab_file:
                if line[GeneID_index] not in dico_network:
                    dico_network[line[GeneID_index]] = [[line[i] for i in network_cols]]
                else:
                    dico_network[line[GeneID_index]].append([line[i] for i in network_cols])

        # delete the temporary BioGRID files
        os.remove("BioGRID.zip")
        shutil.rmtree("tmp_BioGRID", ignore_errors=True)

        # download the NCBI2Reactome.txt file and build the nodes dictionary
        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')

        dico_nodes = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[geneid_index] in dico_nodes:
                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
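
        # Sketch of dico_network at this point (illustrative): each Entrez GeneID maps to
        # the list of its interactions, each reduced to the columns in network_cols, e.g.
        #   {"<GeneID_A>": [["<GeneID_A>", "<GeneID_B>", "<symbol_A>", "<symbol_B>", ...], ...]}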

    ## BioPlex
    elif interactome == "bioplex":

        with requests.Session() as s:
            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
            r = r.content.decode('utf-8')
            bioplex = csv.reader(r.splitlines(), delimiter='\t')

        dico_network = {}
        dico_network["GeneID"] = {}
        network_geneid_cols = [0, 1, 4, 5, 8]
        dico_network["UniProt-AC"] = {}
        network_uniprot_cols = [2, 3, 4, 5, 8]
        dico_GeneID_to_UniProt = {}
        for line in bioplex:
            if line[0] not in dico_network["GeneID"]:
                dico_network["GeneID"][line[0]] = [[line[i] for i in network_geneid_cols]]
            else:
                dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols])
            if line[2] not in dico_network["UniProt-AC"]:    # keyed on the UniProt-AC of interactant A (column 2)
                dico_network["UniProt-AC"][line[2]] = [[line[i] for i in network_uniprot_cols]]
            else:
                dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols])
            dico_GeneID_to_UniProt[line[0]] = line[2]

        with requests.Session() as s:
            r = s.get('https://reactome.org/download/current/UniProt2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')

        dico_nodes_uniprot = {}
        uniProt_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[uniProt_index] in dico_nodes_uniprot:
                    dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index])
                else:
                    dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]]

        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')

        dico_nodes_geneid = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[geneid_index] in dico_nodes_geneid:
                    dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico_nodes = {}
        dico_nodes['GeneID'] = dico_nodes_geneid
        dico_nodes['UniProt-AC'] = dico_nodes_uniprot
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['convert'] = dico_GeneID_to_UniProt

    ## Hu.MAP
    elif interactome == "humap":

        with requests.Session() as s:
            r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt')
            r = r.content.decode('utf-8')
            humap_nodes = csv.reader(r.splitlines(), delimiter=',')

        dico_geneid_to_gene_name = {}
        dico_protein_name = {}
        for line in humap_nodes:
            if check_entrez_geneid(line[4]):
                if line[4] not in dico_geneid_to_gene_name:
                    dico_geneid_to_gene_name[line[4]] = line[3]
                if line[4] not in dico_protein_name:
                    dico_protein_name[line[4]] = line[5]

        with requests.Session() as s:
            r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt')
            r = r.content.decode('utf-8')
            humap = csv.reader(r.splitlines(), delimiter='\t')

        dico_network = {}
        for line in humap:
            if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]):

                interactant_A, interactant_B = get_interactant_name(line, dico_geneid_to_gene_name)

                # first interactant (first column)
                if line[0] not in dico_network:
                    dico_network[line[0]] = [line[:2] + [interactant_A, interactant_B, line[2]]]
                else:
                    dico_network[line[0]].append(line[:2] + [interactant_A, interactant_B, line[2]])

                # second interactant (second column)
                if line[1] not in dico_network:
                    dico_network[line[1]] = [[line[1], line[0], interactant_B, interactant_A, line[2]]]
                else:
                    dico_network[line[1]].append([line[1], line[0], interactant_B, interactant_A, line[2]])

        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')

        dico_nodes = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                # fill the dictionary with pathways
                if line[geneid_index] in dico_nodes:
                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['gene_name'] = dico_geneid_to_gene_name
        dico['protein_name'] = dico_protein_name

    # write the output
    output_file = species + '_' + interactome + '_' + time.strftime("%d-%m-%Y") + ".json"
    path = os.path.join(target_directory, output_file)
    name = species + " (" + species_dict[species] + ") " + time.strftime("%d/%m/%Y")
    id = species + "_" + interactome + "_" + time.strftime("%d-%m-%Y")

    with open(path, 'w') as handle:
        json.dump(dico, handle, sort_keys=True)

    data_table_entry = dict(id=id, name=name, species=species, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_" + interactome + "_dictionaries")
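
# Shape of the JSON written above (illustrative, Hu.MAP case; keys differ per interactome):
#   {"network": {"<GeneID>": [["<GeneID_A>", "<GeneID_B>", "<name_A>", "<name_B>", "<prob>"], ...]},
#    "nodes": {"<GeneID>": ["<pathway description>", ...]},
#    "gene_name": {"<GeneID>": "<gene name>"},
#    "protein_name": {"<GeneID>": "<protein name>"}}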

#######################################################################################################
# 5. neXtProt (add protein features)
#######################################################################################################

def Build_nextprot_ref_file(data_manager_dict, target_directory):
    nextprot_ids_file = "nextprot_ac_list_all.txt"
    ids = id_list_from_nextprot_ftp(nextprot_ids_file, target_directory)

    nextprot_file = [["NextprotID", "MW", "SeqLength", "IsoPoint", "Chr", "SubcellLocations", "Diseases", "TMDomains", "ProteinExistence"]]
    for id in ids:
        query = "https://api.nextprot.org/entry/" + id + ".json"
        resp = requests.get(url=query)
        data = resp.json()

        # get info from the json dictionary
        mass_mol = data["entry"]["isoforms"][0]["massAsString"]
        seq_length = data['entry']["isoforms"][0]["sequenceLength"]
        iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
        chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
        protein_existence = "PE" + str(data['entry']["overview"]['proteinExistence']['level'])

        # put all subcellular locations in a set
        if "subcellular-location" in data['entry']["annotationsByCategory"].keys():
            subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
            all_subcell_locs = set()
            for loc in subcell_locs:
                all_subcell_locs.add(loc['cvTermName'])
            all_subcell_locs.discard("")
            all_subcell_locs = ";".join(all_subcell_locs)
        else:
            all_subcell_locs = "NA"

        # put all diseases in a set
        if 'disease' in data['entry']['annotationsByCategory'].keys():
            diseases = data['entry']['annotationsByCategory']['disease']
            all_diseases = set()
            for disease in diseases:
                if disease['cvTermName'] is not None and disease['cvTermName'] != "":
                    all_diseases.add(disease['cvTermName'])
            if len(all_diseases) > 0:
                all_diseases = ";".join(all_diseases)
            else:
                all_diseases = "NA"
        else:
            all_diseases = "NA"

        # count the transmembrane (helical) domains
        nb_domains = 0
        if "domain" in data['entry']['annotationsByCategory'].keys():
            tm_domains = data['entry']['annotationsByCategory']["domain"]
            for tm_domain in tm_domains:
                if "properties" in tm_domain.keys() and tm_domain['properties'] != []:
                    domains = tm_domain["properties"]
                    for domain in domains:
                        if domain["name"] == "region structure" and domain["value"] == "Helical":
                            nb_domains += 1

        nextprot_file.append([id, mass_mol, str(seq_length), iso_elec_point, chr_loc, all_subcell_locs, all_diseases, str(nb_domains), protein_existence])

    output_file = 'nextprot_ref_' + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    name = "neXtProt release " + time.strftime("%d-%m-%Y")
    id = "nextprot_ref_" + time.strftime("%d-%m-%Y")

    with open(path, 'w') as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(nextprot_file)

    data_table_entry = dict(id=id, name=name, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
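
# Example output row (illustrative placeholders for the nine columns above):
#   NX_<AC>  <MW>  <SeqLength>  <pI>  <chr loc>  <loc1>;<loc2>  <disease1>;<disease2>  <TM count>  PE1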

#######################################################################################################
# Main function
#######################################################################################################
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar=("HPA_OPTION"))
    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
    parser.add_argument("--id_mapping", metavar=("ID_MAPPING_SPECIES"))
    parser.add_argument("--interactome", metavar=("PPI"))
    parser.add_argument("--species")
    parser.add_argument("--date")
    parser.add_argument("-o", "--output")
    parser.add_argument("--database")
    args = parser.parse_args()

    data_manager_dict = {}

    # extract the json params passed by Galaxy
    filename = args.output
    with open(filename) as handle:
        params = from_json_string(handle.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    ## Download source files from HPA
    hpa = args.hpa    # argparse returns None when the option is not set
    if hpa is not None:
        hpa = hpa.split(",")
        for hpa_tissue in hpa:
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source files from a PeptideAtlas query
    peptide_atlas = args.peptideatlas
    date = args.date
    if peptide_atlas is not None:
        peptide_atlas = peptide_atlas.split(",")
        for pa_tissue in peptide_atlas:
            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)

    ## Download the ID mapping source files from UniProt
    id_mapping = args.id_mapping
    if id_mapping is not None:
        id_mapping = id_mapping.split(",")
        for species in id_mapping:
            id_mapping_sources(data_manager_dict, species, target_directory)

    ## Download the PPI reference files from BioGRID/BioPlex/Hu.MAP
    interactome = args.interactome
    if interactome == "biogrid":
        species = args.species
    else:
        species = "Human"
    if interactome is not None and species is not None:
        PPI_ref_files(data_manager_dict, species, interactome, target_directory)

    ## Build the neXtProt reference file for "add protein features"
    database = args.database
    if database is not None:
        Build_nextprot_ref_file(data_manager_dict, target_directory)

    # save info to the json file
    with open(filename, 'w') as handle:
        handle.write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()
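
# Example invocation (illustrative; in production the Galaxy data-manager wrapper
# generates the -o json params file, which this script both reads and overwrites):
#   python resource_building.py --hpa HPA_normal_tissue,HPA_pathology -o output.json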