# HG changeset patch
# User proteore
# Date 1552473042 14400
# Node ID 9e31ea9fc7ea07968e56b3a40032a08f4e704d27
planemo upload commit 567ba7934c0ca55529dfeb5e7ca0935ace260ad7-dirty
diff -r 000000000000 -r 9e31ea9fc7ea data_manager/resource_building.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/resource_building.py Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,649 @@
+# -*- coding: utf-8 -*-
+"""
+This script builds reference source files from several public databases, for use by other ProteoRE tools
+"""
+
+import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+from io import BytesIO
+from zipfile import ZipFile
+from galaxy.util.json import from_json_string, to_json_string
+
+#######################################################################################################
+# General functions
+#######################################################################################################
+def unzip(url, output_file):
+    """
+    Download a zip archive from a URL and write its first member to output_file
+    """
+    content = requests.get(url)
+    zip_file = ZipFile(BytesIO(content.content))    #local name kept distinct from the imported zipfile module
+    output_content = zip_file.open(zip_file.namelist()[0]).read()
+    with open(output_file, "wb") as output :    #bytes are written as-is, no decoding needed
+        output.write(output_content)
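+# e.g. unzip("https://www.proteinatlas.org/download/normal_tissue.tsv.zip", "/tmp/normal_tissue.tsv")  (illustrative call)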
+
+def _add_data_table_entry(data_manager_dict, data_table_entry,data_table):
+ data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+ data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
+ data_manager_dict['data_tables'][data_table].append(data_table_entry)
+ return data_manager_dict
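+# The structure built above looks like this (illustrative values, not from a real run):
+#   {"data_tables": {"<table_name>": [{"id": "...", "name": "...", "value": "/path/to/file.tsv", ...}]}}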
+
+#######################################################################################################
+# 1. Human Protein Atlas
+# - Normal tissue
+# - Pathology
+# - Full Atlas
+#######################################################################################################
+def HPA_sources(data_manager_dict, tissue, target_directory):
+ if tissue == "HPA_normal_tissue":
+ tissue_name = "HPA normal tissue"
+ url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
+ table = "proteore_protein_atlas_normal_tissue"
+ elif tissue == "HPA_pathology":
+ tissue_name = "HPA pathology"
+ url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
+ table = "proteore_protein_atlas_tumor_tissue"
+ elif tissue == "HPA_full_atlas":
+ tissue_name = "HPA full atlas"
+ url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
+ table = "proteore_protein_full_atlas"
+
+ output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv"
+ path = os.path.join(target_directory, output_file)
+ unzip(url, path) #download and save file
+ tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
+ tissue_id = tissue_name.replace(" ","_").replace("/","-")
+
+
+ data_table_entry = dict(id=tissue_id, name = tissue_name, tissue = tissue, value = path)
+ _add_data_table_entry(data_manager_dict, data_table_entry, table)
+
+
+#######################################################################################################
+# 2. Peptide Atlas
+#######################################################################################################
+def peptide_atlas_sources(data_manager_dict, tissue, date, target_directory):
+    # Define organism_id (here Human) - to be updated when other organisms are added to the project
+ organism_id = "2"
+ # Extract sample_category_id and output filename
+ tissue=tissue.split(".")
+ sample_category_id = tissue[0]
+ tissue_name = tissue[1]
+ output_file = tissue_name+"_"+date + ".tsv"
+
+ query="https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id="+ \
+ sample_category_id+"&display_options=ShowAbundances&organism_id="+organism_id+ \
+ "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf\
+ &QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY"
+
+ with requests.Session() as s:
+ download = s.get(query)
+ decoded_content = download.content.decode('utf-8')
+ cr = csv.reader(decoded_content.splitlines(), delimiter='\t')
+
+ uni_dict = build_dictionary(cr)
+
+ #columns of data table peptide_atlas
+ tissue_id = tissue_name+"_"+date
+ name = tissue_id.replace("-","/").replace("_"," ")
+ path = os.path.join(target_directory,output_file)
+
+ with open(path,"w") as out :
+ w = csv.writer(out,delimiter='\t')
+ w.writerow(["Uniprot_AC","nb_obs"])
+ w.writerows(uni_dict.items())
+
+ data_table_entry = dict(id=tissue_id, name=name, value = path, tissue = tissue_name)
+ _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")
+
+#function to count the number of observations per UniProt accession
+def build_dictionary (reader) :    #renamed from "csv" to avoid shadowing the csv module
+    uni_dict = {}
+    for line in reader :
+        if "-" not in line[0] and check_uniprot_access(line[0]) :
+            if line[0] in uni_dict :
+                uni_dict[line[0]] += int(line[5])
+            else :
+                uni_dict[line[0]] = int(line[5])
+
+    return uni_dict
+
+#function to check whether an id looks like a UniProt accession number: returns True or False
+def check_uniprot_access (id) :
+    uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
+    return uniprot_pattern.match(id) is not None
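+# e.g. check_uniprot_access("P04637") -> True ; check_uniprot_access("1ABC2") -> False  (illustrative ids)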
+
+#function to check whether an id looks like an Entrez GeneID (or a RefSeq-style accession): returns True or False
+def check_entrez_geneid (id) :
+    entrez_pattern = re.compile("[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+")
+    return entrez_pattern.match(id) is not None
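+# e.g. check_entrez_geneid("7157") -> True ; check_entrez_geneid("geneid") -> False  (illustrative ids)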
+
+#######################################################################################################
+# 3. ID mapping file
+#######################################################################################################
+import ftplib, gzip
+csv.field_size_limit(sys.maxsize) # to handle big files
+
+def id_mapping_sources (data_manager_dict, species, target_directory) :
+
+ human = species == "Human"
+ species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
+ files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+
+ #header
+ if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
+ else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+
+ #print("header ok")
+
+ #get selected.tab and keep only ids of interest
+ selected_tab_file=species_dict[species]+"_"+files[0]
+ tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
+ with gzip.open(tab_path,"rt") as select :
+ tab_reader = csv.reader(select,delimiter="\t")
+ for line in tab_reader :
+ tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
+ os.remove(tab_path)
+
+ #print("selected_tab ok")
+
+ """
+ Supplementary ID to get from HUMAN_9606_idmapping.dat :
+ -NextProt,BioGrid,STRING,KEGG
+ """
+
+    #there are more id types to collect for human
+    if human : ids = ['neXtProt','BioGrid','STRING','KEGG']    #ids to get from the dat file
+    else : ids = ['BioGrid','STRING','KEGG']
+ unidict = {}
+
+ #keep only ids of interest in dictionaries
+ dat_file=species_dict[species]+"_"+files[1]
+ dat_path = download_from_uniprot_ftp(dat_file,target_directory)
+    with gzip.open(dat_path,"rt") as dat :
+        dat_reader = csv.reader(dat,delimiter="\t")
+        for line in dat_reader :
+            uniprotID = line[0]    #UniProt AC, used as key
+            id_type = line[1]      #ID type of the corresponding id, key of the sub-dictionary
+            cor_id = line[2]       #corresponding id
+            if "-" not in uniprotID :    #skip isoform-specific accessions (e.g. P04637-2)
+                if id_type in ids and uniprotID in unidict :
+                    if id_type in unidict[uniprotID] :
+                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type],cor_id])    #append when this id type already has a value
+                    else :
+                        unidict[uniprotID].update({ id_type : cor_id })
+                elif id_type in ids :
+                    unidict[uniprotID] = {id_type : cor_id}
+    os.remove(dat_path)
+
+ #print("dat_file ok")
+
+ #add ids from idmapping.dat to the final tab
+ for line in tab[1:] :
+ uniprotID=line[0]
+ if human :
+ if uniprotID in unidict :
+ nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
+ if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
+ line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
+ access_dictionary(unidict,uniprotID,'KEGG')])
+ else :
+ line.extend(["","","",""])
+ else :
+ if uniprotID in unidict :
+ line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
+ access_dictionary(unidict,uniprotID,'KEGG')])
+ else :
+ line.extend(["","",""])
+
+ #print ("tab ok")
+
+ #add missing nextprot ID for human
+ if human :
+ #build next_dict
+ nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+ next_dict = {}
+ for nextid in nextprot_ids :
+ next_dict[nextid.replace("NX_","")] = nextid
+ os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
+
+ #add missing nextprot ID
+ for line in tab[1:] :
+ uniprotID=line[0]
+ nextprotID=line[13]
+ if nextprotID == '' and uniprotID in next_dict :
+ line[13]=next_dict[uniprotID]
+
+ output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
+ path = os.path.join(target_directory,output_file)
+
+ with open(path,"w") as out :
+ w = csv.writer(out,delimiter='\t')
+ w.writerows(tab)
+
+ name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
+ name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
+ id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
+
+ data_table_entry = dict(id=id, name = name, species = species, value = path)
+ _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
+
+def download_from_uniprot_ftp(file,target_directory) :
+    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
+    path = os.path.join(target_directory, file)
+    ftp = ftplib.FTP("ftp.uniprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    with open(path, 'wb') as handle :    #ensure the file is closed once the download is done
+        ftp.retrbinary("RETR " + file, handle.write)
+    ftp.quit()
+    return path
+
+def id_list_from_nextprot_ftp(file,target_directory) :
+    ftp_dir = "pub/current_release/ac_lists/"
+    path = os.path.join(target_directory, file)
+    ftp = ftplib.FTP("ftp.nextprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    with open(path, 'wb') as handle :
+        ftp.retrbinary("RETR " + file, handle.write)
+    ftp.quit()
+    with open(path,'r') as handle :
+        nextprot_ids = handle.read().splitlines()
+    return nextprot_ids
+
+#return '' when a key is missing from the nested dictionary, to avoid KeyError
+def access_dictionary (dico,key1,key2) :
+    if key1 in dico and key2 in dico[key1] :
+        return dico[key1][key2]
+    return ''
+
+#if there are several neXtProt IDs for one UniProt AC, return the one derived from the UniProt AC
+def clean_nextprot_id (next_id,uniprotAc) :
+    tmp = next_id.split(";")
+    if len(tmp) > 1 :
+        if "NX_"+uniprotAc in tmp :
+            return "NX_"+uniprotAc
+        else :
+            return tmp[1]
+    else :
+        return next_id
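+# e.g. clean_nextprot_id("NX_A0A024R1R8;NX_P04637", "P04637") -> "NX_P04637"  (illustrative accessions)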
+
+
+#######################################################################################################
+# 4. Build protein interaction maps files
+#######################################################################################################
+
+def get_interactant_name(line,dico):
+
+ if line[0] in dico :
+ interactant_A = dico[line[0]]
+ else :
+ interactant_A = "NA"
+
+ if line[1] in dico :
+ interactant_B = dico[line[1]]
+ else :
+ interactant_B = "NA"
+
+ return interactant_A, interactant_B
+
+def PPI_ref_files(data_manager_dict, species, interactome, target_directory):
+
+ species_dict={'Human':'Homo sapiens',"Mouse":"Mus musculus","Rat":"Rattus norvegicus"}
+
+ ##BioGRID
+ if interactome=="biogrid":
+
+ tab2_link="https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"
+
+ #download zip file
+ r = requests.get(tab2_link)
+ with open("BioGRID.zip", "wb") as code:
+ code.write(r.content)
+
+ #unzip files
+ with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
+ if not os.path.exists("tmp_BioGRID"): os.makedirs("tmp_BioGRID")
+ zip_ref.extractall("tmp_BioGRID")
+
+ #import file of interest and build dictionary
+ file_path="tmp_BioGRID/BIOGRID-ORGANISM-"+species_dict[species].replace(" ","_")+"-3.5.167.tab2.txt"
+ with open(file_path,"r") as handle :
+ tab_file = csv.reader(handle,delimiter="\t")
+ dico_network = {}
+ GeneID_index=1
+ network_cols=[1,2,7,8,11,12,14,18,20]
+ for line in tab_file :
+ if line[GeneID_index] not in dico_network:
+ dico_network[line[GeneID_index]]=[[line[i] for i in network_cols]]
+ else:
+ dico_network[line[GeneID_index]].append([line[i] for i in network_cols])
+
+ #delete tmp_BioGRID directory
+ os.remove("BioGRID.zip")
+ shutil.rmtree("tmp_BioGRID", ignore_errors=True)
+
+ #download NCBI2Reactome.txt file and build dictionary
+ with requests.Session() as s:
+            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
+            r.encoding = "utf-8"
+            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')    #r.text decodes the response body
+
+ dico_nodes = {}
+ geneid_index=0
+ pathway_description_index=3
+ species_index=5
+ for line in tab_file :
+ if line[species_index]==species_dict[species]:
+ if line[geneid_index] in dico_nodes :
+ dico_nodes[line[geneid_index]].append(line[pathway_description_index])
+ else :
+ dico_nodes[line[geneid_index]] = [line[pathway_description_index]]
+
+ dico={}
+ dico['network']=dico_network
+ dico['nodes']=dico_nodes
+
+ ##Bioplex
+ elif interactome=="bioplex":
+
+ with requests.Session() as s:
+ r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
+ r = r.content.decode('utf-8')
+ bioplex = csv.reader(r.splitlines(), delimiter='\t')
+
+ dico_network = {}
+ dico_network["GeneID"]={}
+ network_geneid_cols=[0,1,4,5,8]
+ dico_network["UniProt-AC"]={}
+ network_uniprot_cols=[2,3,4,5,8]
+ dico_GeneID_to_UniProt = {}
+        for line in bioplex :
+            if line[0] not in dico_network["GeneID"]:
+                dico_network["GeneID"][line[0]] = [[line[i] for i in network_geneid_cols]]
+            else :
+                dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols])
+            if line[2] not in dico_network["UniProt-AC"]:    #line[2] holds the UniProt AC of the first interactant
+                dico_network["UniProt-AC"][line[2]] = [[line[i] for i in network_uniprot_cols]]
+            else:
+                dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols])
+            dico_GeneID_to_UniProt[line[0]] = line[2]
+
+ with requests.Session() as s:
+            r = s.get('https://reactome.org/download/current/UniProt2Reactome.txt')
+            r.encoding = "utf-8"
+            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')
+
+ dico_nodes_uniprot = {}
+ uniProt_index=0
+ pathway_description_index=3
+ species_index=5
+ for line in tab_file :
+ if line[species_index]==species_dict[species]:
+ if line[uniProt_index] in dico_nodes_uniprot :
+ dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index])
+ else :
+ dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]]
+
+ with requests.Session() as s:
+            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
+            r.encoding = "utf-8"
+            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')
+
+ dico_nodes_geneid = {}
+ geneid_index=0
+ pathway_description_index=3
+ species_index=5
+ for line in tab_file :
+ if line[species_index]==species_dict[species]:
+ if line[geneid_index] in dico_nodes_geneid :
+ dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index])
+ else :
+ dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]]
+
+ dico={}
+ dico_nodes={}
+ dico_nodes['GeneID']=dico_nodes_geneid
+ dico_nodes['UniProt-AC']=dico_nodes_uniprot
+ dico['network']=dico_network
+ dico['nodes']=dico_nodes
+ dico['convert']=dico_GeneID_to_UniProt
+
+ ##Humap
+ elif interactome=="humap":
+
+ with requests.Session() as s:
+ r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt')
+ r = r.content.decode('utf-8')
+ humap_nodes = csv.reader(r.splitlines(), delimiter=',')
+
+ dico_geneid_to_gene_name={}
+ dico_protein_name={}
+ for line in humap_nodes :
+ if check_entrez_geneid(line[4]):
+ if line[4] not in dico_geneid_to_gene_name:
+ dico_geneid_to_gene_name[line[4]]=line[3]
+ if line[4] not in dico_protein_name:
+ dico_protein_name[line[4]]=line[5]
+
+ with requests.Session() as s:
+ r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt')
+ r = r.content.decode('utf-8')
+ humap = csv.reader(r.splitlines(), delimiter='\t')
+
+ dico_network = {}
+ for line in humap :
+ if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]):
+
+ interactant_A, interactant_B = get_interactant_name(line,dico_geneid_to_gene_name)
+
+ #first interactant (first column)
+ if line[0] not in dico_network:
+ dico_network[line[0]]=[line[:2]+[interactant_A,interactant_B,line[2]]]
+ else :
+ dico_network[line[0]].append(line[:2]+[interactant_A,interactant_B,line[2]])
+
+ #second interactant (second column)
+ if line[1] not in dico_network:
+ dico_network[line[1]]=[[line[1],line[0],interactant_B,interactant_A,line[2]]]
+ else :
+ dico_network[line[1]].append([line[1],line[0],interactant_B,interactant_A,line[2]])
+
+ with requests.Session() as s:
+            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
+            r.encoding = "utf-8"
+            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')
+
+ dico_nodes = {}
+ geneid_index=0
+ pathway_description_index=3
+ species_index=5
+ for line in tab_file :
+ if line[species_index]==species_dict[species]:
+ #Fill dictionary with pathways
+ if line[geneid_index] in dico_nodes :
+ dico_nodes[line[geneid_index]].append(line[pathway_description_index])
+ else :
+ dico_nodes[line[geneid_index]] = [line[pathway_description_index]]
+
+ dico={}
+ dico['network']=dico_network
+ dico['nodes']=dico_nodes
+ dico['gene_name']=dico_geneid_to_gene_name
+ dico['protein_name']=dico_protein_name
+
+ #writing output
+ output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json"
+ path = os.path.join(target_directory,output_file)
+ name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y")
+ id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y")
+
+ with open(path, 'w') as handle:
+ json.dump(dico, handle, sort_keys=True)
+
+ data_table_entry = dict(id=id, name = name, species = species, value = path)
+ _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
+
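+# Shape of the JSON written by PPI_ref_files above (illustrative):
+#   {"network": {<id>: [[interactant_A, interactant_B, ...], ...]},
+#    "nodes": {<id>: [pathway_description, ...]},
+#    plus "convert" (bioplex) or "gene_name"/"protein_name" (humap) lookup dictionaries}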
+#######################################################################################################
+# 5. nextprot (add protein features)
+#######################################################################################################
+
+def Build_nextprot_ref_file(data_manager_dict,target_directory):
+ nextprot_ids_file = "nextprot_ac_list_all.txt"
+ ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
+
+ nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
+ for id in ids :
+ query="https://api.nextprot.org/entry/"+id+".json"
+ resp = requests.get(url=query)
+ data = resp.json()
+
+ #get info from json dictionary
+ mass_mol = data["entry"]["isoforms"][0]["massAsString"]
+ seq_length = data['entry']["isoforms"][0]["sequenceLength"]
+ iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
+ chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
+ protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
+
+ #put all subcell loc in a set
+ if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
+ subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
+ all_subcell_locs = set()
+ for loc in subcell_locs :
+ all_subcell_locs.add(loc['cvTermName'])
+ all_subcell_locs.discard("")
+ all_subcell_locs = ";".join(all_subcell_locs)
+ else :
+ all_subcell_locs = "NA"
+
+        #put all the diseases in a set
+ if ('disease') in data['entry']['annotationsByCategory'].keys() :
+ diseases = data['entry']['annotationsByCategory']['disease']
+ all_diseases = set()
+ for disease in diseases :
+ if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
+ all_diseases.add(disease['cvTermName'])
+ if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
+ else : all_diseases="NA"
+ else :
+ all_diseases="NA"
+
+        #count the transmembrane helical domains
+        nb_domains = 0
+        if "domain" in data['entry']['annotationsByCategory'].keys():
+            tm_domains = data['entry']['annotationsByCategory']["domain"]
+            for tm_domain in tm_domains :
+                if "properties" in tm_domain.keys() and tm_domain['properties'] != []:
+                    for prop in tm_domain["properties"] :    #properties of this single domain annotation
+                        if prop["name"] == "region structure" and prop["value"] == "Helical" :
+                            nb_domains += 1
+
+
+ nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+
+ output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
+ path = os.path.join(target_directory,output_file)
+ name = "neXtProt release "+time.strftime("%d-%m-%Y")
+ id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
+
+ with open(path, 'w') as output:
+ writer = csv.writer(output,delimiter="\t")
+ writer.writerows(nextprot_file)
+
+ data_table_entry = dict(id=id, name = name, value = path)
+ _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
+
+#######################################################################################################
+# Main function
+#######################################################################################################
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--hpa", metavar = ("HPA_OPTION"))
+ parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
+ parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES"))
+ parser.add_argument("--interactome", metavar = ("PPI"))
+ parser.add_argument("--species")
+ parser.add_argument("--date")
+ parser.add_argument("-o", "--output")
+ parser.add_argument("--database")
+ args = parser.parse_args()
+
+ data_manager_dict = {}
+ # Extract json file params
+ filename = args.output
+ params = from_json_string(open(filename).read())
+ target_directory = params[ 'output_data' ][0]['extra_files_path']
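+    # The Galaxy-supplied JSON read above looks roughly like this (illustrative):
+    #   {"output_data": [{"extra_files_path": "/path/to/extra/files", ...}], "param_dict": {...}}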
+ os.mkdir(target_directory)
+
+ ## Download source files from HPA
+ try:
+ hpa = args.hpa
+ except NameError:
+ hpa = None
+ if hpa is not None:
+ hpa = hpa.split(",")
+ for hpa_tissue in hpa:
+ HPA_sources(data_manager_dict, hpa_tissue, target_directory)
+
+ ## Download source file from Peptide Atlas query
+ try:
+ peptide_atlas = args.peptideatlas
+ date = args.date
+ except NameError:
+ peptide_atlas = None
+ if peptide_atlas is not None:
+ peptide_atlas = peptide_atlas.split(",")
+ for pa_tissue in peptide_atlas:
+ peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
+
+ ## Download ID_mapping source file from Uniprot
+ try:
+ id_mapping=args.id_mapping
+ except NameError:
+ id_mapping = None
+ if id_mapping is not None:
+        id_mapping = id_mapping.split(",")
+ for species in id_mapping :
+ id_mapping_sources(data_manager_dict, species, target_directory)
+
+ ## Download PPI ref files from biogrid/bioplex/humap
+ try:
+ interactome=args.interactome
+ if interactome == "biogrid" :
+ species=args.species
+ else :
+ species="Human"
+ except NameError:
+ interactome=None
+ species=None
+ if interactome is not None and species is not None:
+ PPI_ref_files(data_manager_dict, species, interactome, target_directory)
+
+ ## Build nextprot ref file for add protein features
+ try:
+ database=args.database
+ except NameError:
+ database=None
+ if database is not None :
+ Build_nextprot_ref_file(data_manager_dict,target_directory)
+
+    #save info to json file
+    filename = args.output
+    with open(filename, 'w') as out :
+        out.write(to_json_string(data_manager_dict))
+
+if __name__ == "__main__":
+ main()
diff -r 000000000000 -r 9e31ea9fc7ea data_manager/resource_building.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/resource_building.xml Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,223 @@
+<!-- Galaxy data manager tool wrapper; most of the XML markup (tool, requirements, command, inputs, outputs) was lost in extraction -->
+<description>to create or update reference files for ProteoRE tools</description>
+<help><![CDATA[
+
+For 'Human Protein Atlas':
+
+* `Normal tissue <https://www.proteinatlas.org/download/normal_tissue.tsv.zip>`_.
+* `Pathology <https://www.proteinatlas.org/download/pathology.tsv.zip>`_.
+* `Full Atlas <https://www.proteinatlas.org/download/proteinatlas.tsv.zip>`_.
+
+For 'Peptide Atlas':
+
+
+* Human Adrenal gland proteome
+* Human Brain proteome
+* Human Breast proteome
+* Human CSF proteome
+* Human Digestive system proteome
+* Human Female reproductive system proteome
+* Human Heart proteome
+* Human Kidney proteome
+* Human Liver proteome
+* Human Lung proteome
+* Human Male reproductive system proteome
+* Human Pancreas proteome
+* Human Plasma Non-Glyco proteome
+* Human Spleen proteome
+* Human Testis proteome
+* Human Urinary bladder proteome
+* Human Urine proteome
+
+For ID mapping:
+
+* `HUMAN_9606_idmapping_selected.tab.gz <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz>`_.
+* `HUMAN_9606_idmapping.dat.gz <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz>`_.
+* `nextprot_ac_list_all.txt <ftp://ftp.nextprot.org/pub/current_release/ac_lists/nextprot_ac_list_all.txt>`_.
+* `MOUSE_10090_idmapping_selected.tab.gz <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/MOUSE_10090_idmapping_selected.tab.gz>`_.
+* `MOUSE_10090_idmapping.dat.gz <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/MOUSE_10090_idmapping.dat.gz>`_.
+* `RAT_10116_idmapping_selected.tab.gz <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/RAT_10116_idmapping_selected.tab.gz>`_.
+* `RAT_10116_idmapping.dat.gz <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/RAT_10116_idmapping.dat.gz>`_.
+
+For 'Build protein interaction maps':
+
+* `BIOGRID-ORGANISM-3.5.167.tab2.zip <https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip>`_.
+* `NCBI2Reactome.txt <https://www.reactome.org/download/current/NCBI2Reactome.txt>`_.
+* `BioPlex_interactionList_v4a.tsv <http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv>`_.
+* `UniProt2Reactome.txt <https://reactome.org/download/current/UniProt2Reactome.txt>`_.
+
+-----
+
+.. class:: infomark
+
+**Authors**
+
+David Christiany, Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck (CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR)
+
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux (INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform)
+
+This work has been partially funded through the French National Agency for Research (ANR) IFB project.
+
+Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
+
+]]></help>
diff -r 000000000000 -r 9e31ea9fc7ea data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,152 @@
+<!-- data manager definitions for the data tables listed below (XML markup lost in extraction) -->
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_biogrid_dictionaries.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_biogrid_dictionaries.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,4 @@
+#id name species value
+#biogrid_human_08-01-2019 Human (Homo sapiens) 08/01/2019 Human PPI_dictionaries/Human_biogrid.json
+#biogrid_mouse_08-01-2019 Mouse (Mus musculus) 08/01/2019 Mouse PPI_dictionaries/Mouse_biogrid.json
+#biogrid_rat_08-01-2019 Rat (Rattus norvegicus) 08/01/2019 Rat PPI_dictionaries/Rat_biogrid.json
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_bioplex_dictionaries.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_bioplex_dictionaries.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,2 @@
+#id name species value
+#bioplex_human_08-01-2019 Human (Homo sapiens) 08/01/2019 Human PPI_dictionaries/human_bioplex.json
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_humap_dictionaries.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_humap_dictionaries.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,2 @@
+#id name species value
+#humap_human_01-02-2019 Human (Homo sapiens) 01/02/19 Human PPI_dictionaries/Human_humap_01-02-2019.json
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_id_mapping_Human.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_id_mapping_Human.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,3 @@
+#This file lists the locations of reference files for the id_converter tool
+#
+#human_id_mapping_01-01-2018	Human (Homo sapiens)	human_id_mapping	tool-data/human_id_mapping.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_id_mapping_Mouse.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_id_mapping_Mouse.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,3 @@
+#This file lists the locations of reference files for the id_converter tool
+#
+#mouse_id_mapping_01-01-2018 Mouse (Mus musculus) mouse_id_mapping tool-data/mouse_id_mapping.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_id_mapping_Rat.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_id_mapping_Rat.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,3 @@
+#This file lists the locations of reference files for the id_converter tool
+#
+#rat_id_mapping_01-01-2018 Rat (Rattus norvegicus) rat_id_mapping tool-data/rat_id_mapping.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_nextprot_ref.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_nextprot_ref.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,2 @@
+#
+#nextprot_ref_09-03-2019 neXtProt release 09-03-2019 tool-data/nextprot_ref_09-03-2019.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_protein_atlas_normal_tissue.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_protein_atlas_normal_tissue.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,4 @@
+#This file lists the names, locations and values of reference files for the Get expression data tool
+#This is a tab-separated file (TAB, not 4 spaces!)
+#
+#HPA_normal_tissue_19-07-2018 HPA normal tissue 19/07/2018 HPA_normal_tissue /projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19159/dataset_39307_files/HPA_normal_tissue_19-07-2018.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_protein_atlas_tumor_tissue.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_protein_atlas_tumor_tissue.loc.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,4 @@
+#This file lists the names, locations and values of reference files for the Get expression data tool
+#This is a tab-separated file (TAB, not 4 spaces!)
+#
+#HPA_pathology_19-07-2018 HPA pathology 19/07/2018 HPA_pathology /projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_pathology_19-07-2018.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,43 @@
+<!-- tool data table definitions; the XML markup was lost in extraction. Surviving column lists: -->
+<!-- three tissue-keyed tables: id, name, tissue, value -->
+<!-- six species-keyed tables: id, name, species, value -->