comparison data_manager/resource_building.py @ 2:9ec42cb35abd draft

planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty
author proteore
date Wed, 19 Jun 2019 04:42:03 -0400
parents f3507260b30f
children af0250fd023c
comparison
equal deleted inserted replaced
1:f3507260b30f 2:9ec42cb35abd
51 51
52 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv" 52 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv"
53 path = os.path.join(target_directory, output_file) 53 path = os.path.join(target_directory, output_file)
54 unzip(url, path) #download and save file 54 unzip(url, path) #download and save file
55 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y") 55 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
56 tissue_id = tissue_name.replace(" ","_").replace("/","-") 56 release = tissue_name.replace(" ","_").replace("/","-")
57 57 id = str(10000000000 - int(time.strftime("%Y%m%d")))
58 58
59 data_table_entry = dict(id=tissue_id, name = tissue_name, tissue = tissue, value = path) 59
60 data_table_entry = dict(id=id, release=release, name = tissue_name, tissue = tissue, value = path)
60 _add_data_table_entry(data_manager_dict, data_table_entry, table) 61 _add_data_table_entry(data_manager_dict, data_table_entry, table)
61 62
62 63
63 ####################################################################################################### 64 #######################################################################################################
64 # 2. Peptide Atlas 65 # 2. Peptide Atlas
138 139
139 #header 140 #header
140 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] 141 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
141 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] 142 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
142 143
143 #print("header ok")
144
145 #get selected.tab and keep only ids of interest 144 #get selected.tab and keep only ids of interest
146 selected_tab_file=species_dict[species]+"_"+files[0] 145 selected_tab_file=species_dict[species]+"_"+files[0]
147 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) 146 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
148 with gzip.open(tab_path,"rt") as select : 147 with gzip.open(tab_path,"rt") as select :
149 tab_reader = csv.reader(select,delimiter="\t") 148 tab_reader = csv.reader(select,delimiter="\t")
150 for line in tab_reader : 149 for line in tab_reader :
151 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) 150 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
152 os.remove(tab_path) 151 os.remove(tab_path)
153 152
154 #print("selected_tab ok") 153 #print("selected_tab ok")
154
155 #get uniprot-AC reviewed
156 organism = species_dict[species].split("_")[1]
157 query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
158
159 with requests.Session() as s:
160 download = s.get(query)
161 decoded_content = download.content.decode('utf-8')
162 uniprot_reviewed_list = decoded_content.splitlines()
163
164 for line in tab[1:]:
165 UniProtAC = line[0]
166 if UniProtAC not in uniprot_reviewed_list :
167 line[0]=""
168 line[1]=""
155 169
156 """ 170 """
157 Supplementary ID to get from HUMAN_9606_idmapping.dat : 171 Supplementary ID to get from HUMAN_9606_idmapping.dat :
158 -NextProt,BioGrid,STRING,KEGG 172 -NextProt,BioGrid,STRING,KEGG
159 """ 173 """
202 else : 216 else :
203 line.extend(["","",""]) 217 line.extend(["","",""])
204 218
205 #print ("tab ok") 219 #print ("tab ok")
206 220
207 #add missing nextprot ID for human 221 #add missing nextprot ID for human or replace old ones
208 if human : 222 if human :
209 #build next_dict 223 #build next_dict
210 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) 224 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
211 next_dict = {} 225 next_dict = {}
212 for nextid in nextprot_ids : 226 for nextid in nextprot_ids :
215 229
216 #add missing nextprot ID 230 #add missing nextprot ID
217 for line in tab[1:] : 231 for line in tab[1:] :
218 uniprotID=line[0] 232 uniprotID=line[0]
219 nextprotID=line[13] 233 nextprotID=line[13]
220 if nextprotID == '' and uniprotID in next_dict : 234 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
221 line[13]=next_dict[uniprotID] 235 line[13]=next_dict[uniprotID]
222 236
223 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" 237 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
224 path = os.path.join(target_directory,output_file) 238 path = os.path.join(target_directory,output_file)
225 239
227 w = csv.writer(out,delimiter='\t') 241 w = csv.writer(out,delimiter='\t')
228 w.writerows(tab) 242 w.writerows(tab)
229 243
230 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} 244 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
231 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" 245 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
232 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") 246 release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
233 247 id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by <filter> in xml only in descending order
234 data_table_entry = dict(id=id, name = name, species = species, value = path) 248
249 data_table_entry = dict(id=id, release=release , name = name, species = species, value = path)
235 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) 250 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
236 251
237 def download_from_uniprot_ftp(file,target_directory) : 252 def download_from_uniprot_ftp(file,target_directory) :
238 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" 253 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
239 path = os.path.join(target_directory, file) 254 path = os.path.join(target_directory, file)
481 dico['nodes']=dico_nodes 496 dico['nodes']=dico_nodes
482 dico['gene_name']=dico_geneid_to_gene_name 497 dico['gene_name']=dico_geneid_to_gene_name
483 dico['protein_name']=dico_protein_name 498 dico['protein_name']=dico_protein_name
484 499
485 #writing output 500 #writing output
486 output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json" 501 output_file = species+'_'+interactome+'_'+ time.strftime("%Y-%m-%d") + ".json"
487 path = os.path.join(target_directory,output_file) 502 path = os.path.join(target_directory,output_file)
488 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") 503 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y")
489 id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y") 504 release = species+"_"+interactome+"_"+ time.strftime("%Y-%m-%d")
505 id = str(10000000000 - int(time.strftime("%Y%m%d")))
490 506
491 with open(path, 'w') as handle: 507 with open(path, 'w') as handle:
492 json.dump(dico, handle, sort_keys=True) 508 json.dump(dico, handle, sort_keys=True)
493 509
494 data_table_entry = dict(id=id, name = name, species = species, value = path) 510 data_table_entry = dict(id=id, release=release, name = name, species = species, value = path)
495 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") 511 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
496 512
497 ####################################################################################################### 513 #######################################################################################################
498 # 5. nextprot (add protein features) 514 # 5. nextprot (add protein features)
499 ####################################################################################################### 515 #######################################################################################################
500 516
501 def Build_nextprot_ref_file(data_manager_dict,target_directory): 517 def Build_nextprot_ref_file(data_manager_dict,target_directory):
502 nextprot_ids_file = "nextprot_ac_list_all.txt" 518 nextprot_ids_file = "nextprot_ac_list_all.txt"
503 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory) 519 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
504 520
521 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
522 path = os.path.join(target_directory,output_file)
523 name = "neXtProt release "+time.strftime("%d-%m-%Y")
524 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
525
526 output = open(path, 'w')
527 writer = csv.writer(output,delimiter="\t")
528
505 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] 529 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
530 writer.writerows(nextprot_file)
531
506 for id in ids : 532 for id in ids :
507 #print (id) 533 #print (id)
508 query="https://api.nextprot.org/entry/"+id+".json" 534 query="https://api.nextprot.org/entry/"+id+".json"
509 resp = requests.get(url=query) 535 resp = requests.get(url=query)
510 data = resp.json() 536 data = resp.json()
545 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"] 571 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
546 all_tm_domains = set() 572 all_tm_domains = set()
547 for tm in tm_domains : 573 for tm in tm_domains :
548 all_tm_domains.add(tm['cvTermName']) 574 all_tm_domains.add(tm['cvTermName'])
549 nb_domains+=1 575 nb_domains+=1
550 print "nb domains ++" 576 #print "nb domains ++"
551 print (nb_domains) 577 #print (nb_domains)
552 578 nextprot_file[:] = []
553 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) 579 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
554
555 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
556 path = os.path.join(target_directory,output_file)
557 name = "neXtProt release "+time.strftime("%d-%m-%Y")
558 id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
559
560 with open(path, 'w') as output:
561 writer = csv.writer(output,delimiter="\t")
562 writer.writerows(nextprot_file) 580 writer.writerows(nextprot_file)
563 581
564 data_table_entry = dict(id=id, name = name, value = path) 582 id = str(10000000000 - int(time.strftime("%Y%m%d")))
583
584 data_table_entry = dict(id=id, release=release_id, name = name, value = path)
565 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") 585 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
566 586
567 ####################################################################################################### 587 #######################################################################################################
568 # Main function 588 # Main function
569 ####################################################################################################### 589 #######################################################################################################