# HG changeset patch
# User proteore
# Date 1552473042 14400
# Node ID 9e31ea9fc7ea07968e56b3a40032a08f4e704d27
planemo upload commit 567ba7934c0ca55529dfeb5e7ca0935ace260ad7-dirty

diff -r 000000000000 -r 9e31ea9fc7ea data_manager/resource_building.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/resource_building.py	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,649 @@
+# -*- coding: utf-8 -*-
+"""
+The purpose of this script is to create source files from different databases, to be used by other ProteoRE tools
+"""
+
+import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+from io import BytesIO
+from zipfile import ZipFile
+from galaxy.util.json import from_json_string, to_json_string
+
+#######################################################################################################
+# General functions
+#######################################################################################################
+def unzip(url, output_file):
+    """
+    Download a zip archive from a link and write its first member to output_file
+    """
+    content = requests.get(url)
+    zip_file = ZipFile(BytesIO(content.content))    #do not shadow the zipfile module
+    output_content = zip_file.open(zip_file.namelist()[0]).read()
+    with open(output_file, "wb") as output:
+        output.write(output_content)
+
+def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
+    data_manager_dict['data_tables'][data_table].append(data_table_entry)
+    return data_manager_dict
+
+#######################################################################################################
+# 1. Human Protein Atlas
+#    - Normal tissue
+#    - Pathology
+#    - Full Atlas
+#######################################################################################################
+def HPA_sources(data_manager_dict, tissue, target_directory):
+    if tissue == "HPA_normal_tissue":
+        tissue_name = "HPA normal tissue"
+        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
+        table = "proteore_protein_atlas_normal_tissue"
+    elif tissue == "HPA_pathology":
+        tissue_name = "HPA pathology"
+        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
+        table = "proteore_protein_atlas_tumor_tissue"
+    elif tissue == "HPA_full_atlas":
+        tissue_name = "HPA full atlas"
+        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
+        table = "proteore_protein_full_atlas"
+    else:
+        raise ValueError("Unknown HPA option: " + tissue)
+
+    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
+    path = os.path.join(target_directory, output_file)
+    unzip(url, path)    #download and save file
+    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
+    tissue_id = tissue_name.replace(" ", "_").replace("/", "-")
+
+    data_table_entry = dict(id=tissue_id, name=tissue_name, tissue=tissue, value=path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, table)
+
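+#Illustration only (not part of the tool logic, dates and paths hypothetical):
+#after one HPA_sources() call, data_manager_dict is expected to look like
+#    {'data_tables': {'proteore_protein_atlas_normal_tissue':
+#        [{'id': 'HPA_normal_tissue_13-03-2019',
+#          'name': 'HPA normal tissue 13/03/2019',
+#          'tissue': 'HPA_normal_tissue',
+#          'value': '/path/to/HPA_normal_tissue_13-03-2019.tsv'}]}}
+#Galaxy reads this JSON back from the output file to update the matching data table / .loc file.
+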
+#######################################################################################################
+# 2. Peptide Atlas
+#######################################################################################################
+def peptide_atlas_sources(data_manager_dict, tissue, date, target_directory):
+    #Define organism_id (here human) - to be upgraded when other organisms are added to the project
+    organism_id = "2"
+    #Extract sample_category_id and output filename
+    tissue = tissue.split(".")
+    sample_category_id = tissue[0]
+    tissue_name = tissue[1]
+    output_file = tissue_name + "_" + date + ".tsv"
+
+    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id=" + \
+            sample_category_id + "&display_options=ShowAbundances&organism_id=" + organism_id + \
+            "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf" + \
+            "&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY"
+
+    with requests.Session() as s:
+        download = s.get(query)
+        decoded_content = download.content.decode('utf-8')
+        cr = csv.reader(decoded_content.splitlines(), delimiter='\t')
+
+    uni_dict = build_dictionary(cr)
+
+    #columns of the peptide_atlas data table
+    tissue_id = tissue_name + "_" + date
+    name = tissue_id.replace("-", "/").replace("_", " ")
+    path = os.path.join(target_directory, output_file)
+
+    with open(path, "w") as out:
+        w = csv.writer(out, delimiter='\t')
+        w.writerow(["Uniprot_AC", "nb_obs"])
+        w.writerows(uni_dict.items())
+
+    data_table_entry = dict(id=tissue_id, name=name, value=path, tissue=tissue_name)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")
+
+#count the number of observations per UniProt accession
+def build_dictionary(csv_content):
+    uni_dict = {}
+    for line in csv_content:
+        if "-" not in line[0] and check_uniprot_access(line[0]):
+            if line[0] in uni_dict:
+                uni_dict[line[0]] += int(line[5])
+            else:
+                uni_dict[line[0]] = int(line[5])
+
+    return uni_dict
+
+#check whether an id is a UniProt accession number: return True or False
+def check_uniprot_access(id):
+    uniprot_pattern = re.compile("(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})$")    #anchored so that longer strings do not match
+    return uniprot_pattern.match(id) is not None
+
+def check_entrez_geneid(id):
+    entrez_pattern = re.compile("(?:[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$")
+    return entrez_pattern.match(id) is not None
+
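+#Illustration only: expected behaviour of the anchored accession pattern above
+#    check_uniprot_access("P31946")       -> True   (canonical 6-character accession)
+#    check_uniprot_access("P31946XYZ")    -> False  (would wrongly pass without the trailing anchor)
+#    check_uniprot_access("A0A024R161")   -> True   (extended 10-character accession, second alternative repeated twice)
+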
+#######################################################################################################
+# 3. ID mapping file
+#######################################################################################################
+import ftplib, gzip
+csv.field_size_limit(sys.maxsize)    #to handle big files
+
+def id_mapping_sources(data_manager_dict, species, target_directory):
+
+    human = species == "Human"
+    species_dict = {"Human": "HUMAN_9606", "Mouse": "MOUSE_10090", "Rat": "RAT_10116"}
+    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]
+
+    #header
+    if human:
+        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "neXtProt", "BioGrid", "STRING", "KEGG"]]
+    else:
+        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "BioGrid", "STRING", "KEGG"]]
+
+    #get idmapping_selected.tab and keep only the ids of interest
+    selected_tab_file = species_dict[species] + "_" + files[0]
+    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
+    with gzip.open(tab_path, "rt") as select:
+        tab_reader = csv.reader(select, delimiter="\t")
+        for line in tab_reader:
+            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
+    os.remove(tab_path)
+
+    """
+    Supplementary IDs to get from HUMAN_9606_idmapping.dat:
+    neXtProt, BioGrid, STRING, KEGG
+    """
+
+    #there are more ID types for human
+    if human:
+        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']    #ids to get from the dat file
+    else:
+        ids = ['BioGrid', 'STRING', 'KEGG']
+    unidict = {}
+
+    #keep only ids of interest in dictionaries
+    dat_file = species_dict[species] + "_" + files[1]
+    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
+    with gzip.open(dat_path, "rt") as dat:
+        dat_reader = csv.reader(dat, delimiter="\t")
+        for line in dat_reader:
+            uniprotID = line[0]    #UniProt accession, used as key
+            id_type = line[1]      #ID type of the corresponding id, key of the sub-dictionary
+            cor_id = line[2]       #corresponding id
+            if "-" not in uniprotID:    #we don't keep isoforms
+                if id_type in ids and uniprotID in unidict:
+                    if id_type in unidict[uniprotID]:
+                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])    #several ids of the same type for one accession
+                    else:
+                        unidict[uniprotID].update({id_type: cor_id})
+                elif id_type in ids:
+                    unidict[uniprotID] = {id_type: cor_id}
+    os.remove(dat_path)
+
+    #add ids from idmapping.dat to the final tab
+    for line in tab[1:]:
+        uniprotID = line[0]
+        if human:
+            if uniprotID in unidict:
+                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
+                if nextprot != '':
+                    nextprot = clean_nextprot_id(nextprot, line[0])
+                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'), access_dictionary(unidict, uniprotID, 'STRING'),
+                             access_dictionary(unidict, uniprotID, 'KEGG')])
+            else:
+                line.extend(["", "", "", ""])
+        else:
+            if uniprotID in unidict:
+                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'), access_dictionary(unidict, uniprotID, 'STRING'),
+                             access_dictionary(unidict, uniprotID, 'KEGG')])
+            else:
+                line.extend(["", "", ""])
+
+    #add missing neXtProt IDs for human
+    if human:
+        #build next_dict
+        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
+        next_dict = {}
+        for nextid in nextprot_ids:
+            next_dict[nextid.replace("NX_", "")] = nextid
+        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))
+
+        #add missing neXtProt ID
+        for line in tab[1:]:
+            uniprotID = line[0]
+            nextprotID = line[13]
+            if nextprotID == '' and uniprotID in next_dict:
+                line[13] = next_dict[uniprotID]
+
+    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
+    path = os.path.join(target_directory, output_file)
+
+    with open(path, "w") as out:
+        w = csv.writer(out, delimiter='\t')
+        w.writerows(tab)
+
+    name_dict = {"Human": "Homo sapiens", "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}
+    name = species + " (" + name_dict[species] + " " + time.strftime("%d/%m/%Y") + ")"
+    id = species + "_id_mapping_" + time.strftime("%d-%m-%Y")
+
+    data_table_entry = dict(id=id, name=name, species=species, value=path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_" + species)
+
+def download_from_uniprot_ftp(file, target_directory):
+    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
+    path = os.path.join(target_directory, file)
+    ftp = ftplib.FTP("ftp.uniprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    with open(path, 'wb') as out:    #close the file handle once the download is done
+        ftp.retrbinary("RETR " + file, out.write)
+    ftp.quit()
+    return path
+
+def id_list_from_nextprot_ftp(file, target_directory):
+    ftp_dir = "pub/current_release/ac_lists/"
+    path = os.path.join(target_directory, file)
+    ftp = ftplib.FTP("ftp.nextprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    with open(path, 'wb') as out:
+        ftp.retrbinary("RETR " + file, out.write)
+    ftp.quit()
+    with open(path, 'r') as nextprot_ids:
+        nextprot_ids = nextprot_ids.read().splitlines()
+    return nextprot_ids
+
+#return '' if there is no value in a dictionary, avoids a KeyError
+def access_dictionary(dico, key1, key2):
+    if key1 in dico:
+        if key2 in dico[key1]:
+            return dico[key1][key2]
+        else:
+            return ''
+    else:
+        return ''
+
+#if there are several neXtProt IDs for one UniProt accession, return the uniprot-like ID
+def clean_nextprot_id(next_id, uniprotAc):
+    if len(next_id.split(";")) > 1:
+        tmp = next_id.split(";")
+        if "NX_" + uniprotAc in tmp:
+            return "NX_" + uniprotAc
+        else:
+            return tmp[1]
+    else:
+        return next_id
+
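+#Illustration only: shape of the intermediate dictionaries built above (accessions and ids hypothetical)
+#    unidict   = {'P31946': {'BioGrid': '113361', 'STRING': '9606.ENSP00000361930', 'KEGG': 'hsa:7529'}}
+#    next_dict = {'P31946': 'NX_P31946'}
+#access_dictionary(unidict, 'P31946', 'neXtProt') then returns '' instead of raising a KeyError.
+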
+#######################################################################################################
+# 4. Build protein interaction map files
+#######################################################################################################
+
+def get_interactant_name(line, dico):
+
+    if line[0] in dico:
+        interactant_A = dico[line[0]]
+    else:
+        interactant_A = "NA"
+
+    if line[1] in dico:
+        interactant_B = dico[line[1]]
+    else:
+        interactant_B = "NA"
+
+    return interactant_A, interactant_B
+
+def PPI_ref_files(data_manager_dict, species, interactome, target_directory):
+
+    species_dict = {'Human': 'Homo sapiens', "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}
+
+    ##BioGRID
+    if interactome == "biogrid":
+
+        tab2_link = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"
+
+        #download zip file
+        r = requests.get(tab2_link)
+        with open("BioGRID.zip", "wb") as code:
+            code.write(r.content)
+
+        #unzip files
+        with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
+            if not os.path.exists("tmp_BioGRID"):
+                os.makedirs("tmp_BioGRID")
+            zip_ref.extractall("tmp_BioGRID")
+
+        #import the file of interest and build a dictionary
+        file_path = "tmp_BioGRID/BIOGRID-ORGANISM-" + species_dict[species].replace(" ", "_") + "-3.5.167.tab2.txt"
+        with open(file_path, "r") as handle:
+            tab_file = csv.reader(handle, delimiter="\t")
+            dico_network = {}
+            GeneID_index = 1
+            network_cols = [1, 2, 7, 8, 11, 12, 14, 18, 20]
+            for line in tab_file:
+                if line[GeneID_index] not in dico_network:
+                    dico_network[line[GeneID_index]] = [[line[i] for i in network_cols]]
+                else:
+                    dico_network[line[GeneID_index]].append([line[i] for i in network_cols])
+
+        #delete the temporary BioGRID files
+        os.remove("BioGRID.zip")
+        shutil.rmtree("tmp_BioGRID", ignore_errors=True)
+
+        #download the NCBI2Reactome.txt file and build a dictionary
+        with requests.Session() as s:
+            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
+            r.encoding = "utf-8"
+            tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
+
+        dico_nodes = {}
+        geneid_index = 0
+        pathway_description_index = 3
+        species_index = 5
+        for line in tab_file:
+            if line[species_index] == species_dict[species]:
+                if line[geneid_index] in dico_nodes:
+                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
+                else:
+                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]
+
+        dico = {}
+        dico['network'] = dico_network
+        dico['nodes'] = dico_nodes
+
+    ##BioPlex
+    elif interactome == "bioplex":
+
+        with requests.Session() as s:
+            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
+            r = r.content.decode('utf-8')
+            bioplex = csv.reader(r.splitlines(), delimiter='\t')
+
+        dico_network = {}
+        dico_network["GeneID"] = {}
+        network_geneid_cols = [0, 1, 4, 5, 8]
+        dico_network["UniProt-AC"] = {}
+        network_uniprot_cols = [2, 3, 4, 5, 8]
+        dico_GeneID_to_UniProt = {}
+        for line in bioplex:
+            if line[0] not in dico_network["GeneID"]:
+                dico_network["GeneID"][line[0]] = [[line[i] for i in network_geneid_cols]]
+            else:
+                dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols])
+            if line[2] not in dico_network["UniProt-AC"]:    #line[2] is the UniProt-AC of the first interactant, and is also the dictionary key
+                dico_network["UniProt-AC"][line[2]] = [[line[i] for i in network_uniprot_cols]]
+            else:
+                dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols])
+            dico_GeneID_to_UniProt[line[0]] = line[2]
+
+        with requests.Session() as s:
+            r = s.get('https://reactome.org/download/current/UniProt2Reactome.txt')
+            r.encoding = "utf-8"
+            tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
+
+        dico_nodes_uniprot = {}
+        uniProt_index = 0
+        pathway_description_index = 3
+        species_index = 5
+        for line in tab_file:
+            if line[species_index] == species_dict[species]:
+                if line[uniProt_index] in dico_nodes_uniprot:
+                    dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index])
+                else:
+                    dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]]
+
+        with requests.Session() as s:
+            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
+            r.encoding = "utf-8"
+            tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
+
+        dico_nodes_geneid = {}
+        geneid_index = 0
+        pathway_description_index = 3
+        species_index = 5
+        for line in tab_file:
+            if line[species_index] == species_dict[species]:
+                if line[geneid_index] in dico_nodes_geneid:
+                    dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index])
+                else:
+                    dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]]
+
+        dico = {}
+        dico_nodes = {}
+        dico_nodes['GeneID'] = dico_nodes_geneid
+        dico_nodes['UniProt-AC'] = dico_nodes_uniprot
+        dico['network'] = dico_network
+        dico['nodes'] = dico_nodes
+        dico['convert'] = dico_GeneID_to_UniProt
+
+    ##Hu.MAP
+    elif interactome == "humap":
+
+        with requests.Session() as s:
+            r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt')
+            r = r.content.decode('utf-8')
+            humap_nodes = csv.reader(r.splitlines(), delimiter=',')
+
+        dico_geneid_to_gene_name = {}
+        dico_protein_name = {}
+        for line in humap_nodes:
+            if check_entrez_geneid(line[4]):
+                if line[4] not in dico_geneid_to_gene_name:
+                    dico_geneid_to_gene_name[line[4]] = line[3]
+                if line[4] not in dico_protein_name:
+                    dico_protein_name[line[4]] = line[5]
+
+        with requests.Session() as s:
+            r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt')
+            r = r.content.decode('utf-8')
+            humap = csv.reader(r.splitlines(), delimiter='\t')
+
+        dico_network = {}
+        for line in humap:
+            if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]):
+
+                interactant_A, interactant_B = get_interactant_name(line, dico_geneid_to_gene_name)
+
+                #first interactant (first column)
+                if line[0] not in dico_network:
+                    dico_network[line[0]] = [line[:2] + [interactant_A, interactant_B, line[2]]]
+                else:
+                    dico_network[line[0]].append(line[:2] + [interactant_A, interactant_B, line[2]])
+
+                #second interactant (second column)
+                if line[1] not in dico_network:
+                    dico_network[line[1]] = [[line[1], line[0], interactant_B, interactant_A, line[2]]]
+                else:
+                    dico_network[line[1]].append([line[1], line[0], interactant_B, interactant_A, line[2]])
+
+        with requests.Session() as s:
+            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
+            r.encoding = "utf-8"
+            tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
+
+        dico_nodes = {}
+        geneid_index = 0
+        pathway_description_index = 3
+        species_index = 5
+        for line in tab_file:
+            if line[species_index] == species_dict[species]:
+                #fill the dictionary with pathways
+                if line[geneid_index] in dico_nodes:
+                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
+                else:
+                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]
+
+        dico = {}
+        dico['network'] = dico_network
+        dico['nodes'] = dico_nodes
+        dico['gene_name'] = dico_geneid_to_gene_name
+        dico['protein_name'] = dico_protein_name
+
+    #write the output
+    output_file = species + '_' + interactome + '_' + time.strftime("%d-%m-%Y") + ".json"
+    path = os.path.join(target_directory, output_file)
+    name = species + " (" + species_dict[species] + ") " + time.strftime("%d/%m/%Y")
+    id = species + "_" + interactome + "_" + time.strftime("%d-%m-%Y")
+
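+    #Illustration only: overall shape of the JSON dictionary serialized below
+    #(BioGRID case; GeneIDs and pathway names hypothetical)
+    #    {'network': {'7529': [[...9 selected BioGRID columns...], ...]},
+    #     'nodes':   {'7529': ['Signal Transduction', ...]}}
+    #BioPlex additionally carries 'convert' (GeneID -> UniProt-AC);
+    #Hu.MAP additionally carries 'gene_name' and 'protein_name'.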
+    with open(path, 'w') as handle:
+        json.dump(dico, handle, sort_keys=True)
+
+    data_table_entry = dict(id=id, name=name, species=species, value=path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_" + interactome + "_dictionaries")
+
+#######################################################################################################
+# 5. neXtProt (add protein features)
+#######################################################################################################
+
+def Build_nextprot_ref_file(data_manager_dict, target_directory):
+    nextprot_ids_file = "nextprot_ac_list_all.txt"
+    ids = id_list_from_nextprot_ftp(nextprot_ids_file, target_directory)
+
+    nextprot_file = [["NextprotID", "MW", "SeqLength", "IsoPoint", "Chr", "SubcellLocations", "Diseases", "TMDomains", "ProteinExistence"]]
+    for id in ids:
+        query = "https://api.nextprot.org/entry/" + id + ".json"
+        resp = requests.get(url=query)
+        data = resp.json()
+
+        #get info from the json dictionary
+        mass_mol = data["entry"]["isoforms"][0]["massAsString"]
+        seq_length = data['entry']["isoforms"][0]["sequenceLength"]
+        iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
+        chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
+        protein_existence = "PE" + str(data['entry']["overview"]['proteinExistence']['level'])
+
+        #put all subcellular locations in a set
+        if "subcellular-location" in data['entry']["annotationsByCategory"].keys():
+            subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
+            all_subcell_locs = set()
+            for loc in subcell_locs:
+                all_subcell_locs.add(loc['cvTermName'])
+            all_subcell_locs.discard("")
+            all_subcell_locs = ";".join(all_subcell_locs)
+        else:
+            all_subcell_locs = "NA"
+
+        #put all diseases in a set
+        if 'disease' in data['entry']['annotationsByCategory'].keys():
+            diseases = data['entry']['annotationsByCategory']['disease']
+            all_diseases = set()
+            for disease in diseases:
+                if disease['cvTermName'] is not None and disease['cvTermName'] != "":
+                    all_diseases.add(disease['cvTermName'])
+            if len(all_diseases) > 0:
+                all_diseases = ";".join(all_diseases)
+            else:
+                all_diseases = "NA"
+        else:
+            all_diseases = "NA"
+
+        #count transmembrane (helical) domains
+        nb_domains = 0
+        if "domain" in data['entry']['annotationsByCategory'].keys():
+            tm_domains = data['entry']['annotationsByCategory']["domain"]
+            for tm_domain in tm_domains:
+                if "properties" in tm_domain.keys() and tm_domain['properties'] != []:
+                    domains = tm_domain["properties"]    #properties of the current domain, not of the whole list
+                    for domain in domains:
+                        if domain["name"] == "region structure" and domain["value"] == "Helical":
+                            nb_domains += 1
+
+        nextprot_file.append([id, mass_mol, str(seq_length), iso_elec_point, chr_loc, all_subcell_locs, all_diseases, str(nb_domains), protein_existence])
+
+    output_file = 'nextprot_ref_' + time.strftime("%d-%m-%Y") + ".tsv"
+    path = os.path.join(target_directory, output_file)
+    name = "neXtProt release " + time.strftime("%d-%m-%Y")
+    id = "nextprot_ref_" + time.strftime("%d-%m-%Y")
+
+    with open(path, 'w') as output:
+        writer = csv.writer(output, delimiter="\t")
+        writer.writerows(nextprot_file)
+
+    data_table_entry = dict(id=id, name=name, value=path)
+    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
+
+#######################################################################################################
+# Main function
+#######################################################################################################
+def main():
+    parser = argparse.ArgumentParser()
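+    #Illustration only, a hypothetical invocation of this data manager from the command line:
+    #    python resource_building.py --hpa HPA_normal_tissue,HPA_pathology -o galaxy_params.json
+    #where galaxy_params.json is the parameter file Galaxy passes to data manager tools.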
+    parser.add_argument("--hpa", metavar="HPA_OPTION")
+    parser.add_argument("--peptideatlas", metavar="SAMPLE_CATEGORY_ID")
+    parser.add_argument("--id_mapping", metavar="ID_MAPPING_SPECIES")
+    parser.add_argument("--interactome", metavar="PPI")
+    parser.add_argument("--species")
+    parser.add_argument("--date")
+    parser.add_argument("-o", "--output")
+    parser.add_argument("--database")
+    args = parser.parse_args()
+
+    data_manager_dict = {}
+    #extract the params from the json file Galaxy passes to the tool
+    filename = args.output
+    params = from_json_string(open(filename).read())
+    target_directory = params['output_data'][0]['extra_files_path']
+    os.mkdir(target_directory)
+
+    ## Download source files from HPA
+    hpa = args.hpa    #argparse returns None for options that were not set
+    if hpa is not None:
+        hpa = hpa.split(",")
+        for hpa_tissue in hpa:
+            HPA_sources(data_manager_dict, hpa_tissue, target_directory)
+
+    ## Download source file from a Peptide Atlas query
+    peptide_atlas = args.peptideatlas
+    date = args.date
+    if peptide_atlas is not None:
+        peptide_atlas = peptide_atlas.split(",")
+        for pa_tissue in peptide_atlas:
+            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
+
+    ## Download the ID mapping source files from UniProt
+    id_mapping = args.id_mapping
+    if id_mapping is not None:
+        id_mapping = id_mapping.split(",")
+        for species in id_mapping:
+            id_mapping_sources(data_manager_dict, species, target_directory)
+
+    ## Download PPI reference files from BioGRID/BioPlex/Hu.MAP
+    interactome = args.interactome
+    if interactome is not None:
+        if interactome == "biogrid":
+            species = args.species
+        else:
+            species = "Human"
+        if species is not None:
+            PPI_ref_files(data_manager_dict, species, interactome, target_directory)
+
+    ## Build the neXtProt reference file for the 'add protein features' tool
+    database = args.database
+    if database is not None:
+        Build_nextprot_ref_file(data_manager_dict, target_directory)
+
+    #save info to the json file
+    filename = args.output
+    with open(filename, 'w') as out:
+        out.write(to_json_string(data_manager_dict))
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 9e31ea9fc7ea data_manager/resource_building.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/resource_building.xml	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,223 @@
+<!-- [tool wrapper markup (tool attributes, requirements, command, inputs, outputs) not recoverable from this extraction; only the description and help text survive] -->
+    <description>to create or update reference files for proteore tools</description>
+    <help><![CDATA[
+
+This data manager creates or updates the reference files used by the ProteoRE tools, from the following sources.
+
+For 'Human Protein Atlas':
+
+* `Normal tissue <https://www.proteinatlas.org/download/normal_tissue.tsv.zip>`_.
+* `Pathology <https://www.proteinatlas.org/download/pathology.tsv.zip>`_.
+* `Full Atlas <https://www.proteinatlas.org/download/proteinatlas.tsv.zip>`_.
+
+For 'Peptide Atlas':
+
+* `Human Adrenal gland proteome `_.
+* `Human Brain proteome `_.
+* `Human Breast Proteome `_.
+* `Human CSF proteome `_.
+* `Human Digestive System proteome `_.
+* `Human Female Reproductive System proteome `_.
+* `Human Heart proteome `_.
+* `Human Kidney proteome `_.
+* `Human Liver proteome `_.
+* `Human Lung proteome `_.
+* `Human Male Reproductive System proteome `_.
+* `Human Pancreas proteome `_.
+* `Human Plasma Non-Glyco proteome `_.
+* `Human Spleen proteome `_.
+* `Human Testis proteome `_.
+* `Human Urinary Bladder proteome `_.
+* `Human Urine proteome `_.
+
+For ID mapping:
+
+* `HUMAN_9606_idmapping_selected.tab <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz>`_.
+* `HUMAN_9606_idmapping.dat <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz>`_.
+* `nextprot_ac_list_all.txt <ftp://ftp.nextprot.org/pub/current_release/ac_lists/nextprot_ac_list_all.txt>`_.
+* `MOUSE_10090_idmapping_selected.tab <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/MOUSE_10090_idmapping_selected.tab.gz>`_.
+* `MOUSE_10090_idmapping.dat <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/MOUSE_10090_idmapping.dat.gz>`_.
+* `RAT_10116_idmapping_selected.tab <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/RAT_10116_idmapping_selected.tab.gz>`_.
+* `RAT_10116_idmapping.dat <ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/RAT_10116_idmapping.dat.gz>`_.
+
+For building protein interaction maps:
+
+* `BIOGRID-ORGANISM <https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip>`_.
+* `NCBI2Reactome.txt <https://www.reactome.org/download/current/NCBI2Reactome.txt>`_.
+* `BioPlex_interactionList_v4a.tsv <http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv>`_.
+* `UniProt2Reactome.txt <https://reactome.org/download/current/UniProt2Reactome.txt>`_.
+
+-----
+
+.. class:: infomark
+
+**Authors**
+
+David Christiany, Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR
+
+Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
+
+This work has been partially funded through the French National Agency for Research (ANR) IFB project.
+
+Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool.
+
+    ]]></help>
+</tool>
diff -r 000000000000 -r 9e31ea9fc7ea data_manager_conf.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,152 @@
+<?xml version="1.0"?>
+<!-- [XML tags reconstructed from the surviving move/value_translation fragments, following the standard Galaxy data_manager_conf.xml layout] -->
+<data_managers>
+    <data_manager tool_file="data_manager/resource_building.xml" id="resource_building">
+        <data_table name="proteore_peptide_atlas">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="tissue" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">peptide_atlas/${id}.tsv</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/peptide_atlas/${id}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_protein_atlas_normal_tissue">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="tissue" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">protein_atlas/${id}.tsv</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/${id}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_protein_atlas_tumor_tissue">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="tissue" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">protein_atlas/${id}.tsv</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/${id}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_id_mapping_Human">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="species" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping/${id}.tsv</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${id}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_id_mapping_Mouse">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="species" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping/${id}.tsv</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${id}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_id_mapping_Rat">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="species" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">id_mapping/${id}.tsv</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${id}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_biogrid_dictionaries">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="species" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">PPI_dictionaries/${id}.json</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${id}.json</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_bioplex_dictionaries">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="species" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">PPI_dictionaries/${id}.json</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${id}.json</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_humap_dictionaries">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="species" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">PPI_dictionaries/${id}.json</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${id}.json</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="proteore_nextprot_ref">
+            <output>
+                <column name="id" />
+                <column name="name" />
+                <column name="value" output_ref="output">
+                    <move type="file">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">proteore_nextprot_ref/${id}.tsv</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/proteore_nextprot_ref/${id}.tsv</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_biogrid_dictionaries.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_biogrid_dictionaries.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,4 @@
+#id	name	species	value
+#biogrid_human_08-01-2019	Human (Homo sapiens) 08/01/2019	Human	PPI_dictionaries/Human_biogrid.json
+#biogrid_mouse_08-01-2019	Mouse (Mus musculus) 08/01/2019	Mouse	PPI_dictionaries/Mouse_biogrid.json
+#biogrid_rat_08-01-2019	Rat (Rattus norvegicus) 08/01/2019	Rat	PPI_dictionaries/Rat_biogrid.json
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_bioplex_dictionaries.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_bioplex_dictionaries.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,2 @@
+#id	name	species	value
+#bioplex_human_08-01-2019	Human (Homo sapiens) 08/01/2019	Human	PPI_dictionaries/human_bioplex.json
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_humap_dictionaries.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_humap_dictionaries.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,2 @@
+#id	name	species	value
+#humap_human_01-02-2019	Human (Homo sapiens) 01/02/19	Human	PPI_dictionaries/Human_humap_01-02-2019.json
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_id_mapping_Human.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_id_mapping_Human.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,3 @@
+#This file lists the locations of the reference file for the id_converter tool
+#
+#human_id_mapping_01-01-2018	Human (Homo sapiens)	human_id_mapping	tool-data/human_id_mapping.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_id_mapping_Mouse.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_id_mapping_Mouse.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,3 @@
+#This file lists the locations of the reference file for the id_converter tool
+#
+#mouse_id_mapping_01-01-2018	Mouse (Mus musculus)	mouse_id_mapping	tool-data/mouse_id_mapping.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_id_mapping_Rat.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_id_mapping_Rat.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,3 @@
+#This file lists the locations of the reference file for the id_converter tool
+#
+#rat_id_mapping_01-01-2018	Rat (Rattus norvegicus)	rat_id_mapping	tool-data/rat_id_mapping.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_nextprot_ref.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_nextprot_ref.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,2 @@
+#
+#nextprot_ref_09-03-2019	neXtProt release 09-03-2019	tool-data/nextprot_ref_09-03-2019.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_protein_atlas_normal_tissue.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_protein_atlas_normal_tissue.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,4 @@
+#This file lists the location names and values of reference files for the Get expression data tool
+#This is a tab separated file (TAB, not 4 spaces !)
+#
+#HPA_normal_tissue_19-07-2018	HPA normal tissue 19/07/2018	HPA_normal_tissue	/projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19159/dataset_39307_files/HPA_normal_tissue_19-07-2018.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool-data/proteore_protein_atlas_tumor_tissue.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_protein_atlas_tumor_tissue.loc.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,4 @@
+#This file lists the location names and values of reference files for the Get expression data tool
+#This is a tab separated file (TAB, not 4 spaces !)
+#
+#HPA_pathology_19-07-2018	HPA pathology 19/07/2018	HPA_pathology	/projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_pathology_19-07-2018.tsv
diff -r 000000000000 -r 9e31ea9fc7ea tool_data_table_conf.xml.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Mar 13 06:30:42 2019 -0400
@@ -0,0 +1,43 @@
+<!-- [table tags reconstructed from the surviving column lists and the loc.sample file names] -->
+<tables>
+    <table name="proteore_peptide_atlas" comment_char="#">
+        <columns>id, name, tissue, value</columns>
+        <file path="tool-data/proteore_peptide_atlas.loc" />
+    </table>
+    <table name="proteore_protein_atlas_normal_tissue" comment_char="#">
+        <columns>id, name, tissue, value</columns>
+        <file path="tool-data/proteore_protein_atlas_normal_tissue.loc" />
+    </table>
+    <table name="proteore_protein_atlas_tumor_tissue" comment_char="#">
+        <columns>id, name, tissue, value</columns>
+        <file path="tool-data/proteore_protein_atlas_tumor_tissue.loc" />
+    </table>
+    <table name="proteore_id_mapping_Human" comment_char="#">
+        <columns>id, name, species, value</columns>
+        <file path="tool-data/proteore_id_mapping_Human.loc" />
+    </table>
+    <table name="proteore_id_mapping_Mouse" comment_char="#">
+        <columns>id, name, species, value</columns>
+        <file path="tool-data/proteore_id_mapping_Mouse.loc" />
+    </table>
+    <table name="proteore_id_mapping_Rat" comment_char="#">
+        <columns>id, name, species, value</columns>
+        <file path="tool-data/proteore_id_mapping_Rat.loc" />
+    </table>
+    <table name="proteore_biogrid_dictionaries" comment_char="#">
+        <columns>id, name, species, value</columns>
+        <file path="tool-data/proteore_biogrid_dictionaries.loc" />
+    </table>
+    <table name="proteore_bioplex_dictionaries" comment_char="#">
+        <columns>id, name, species, value</columns>
+        <file path="tool-data/proteore_bioplex_dictionaries.loc" />
+    </table>
+    <table name="proteore_humap_dictionaries" comment_char="#">
+        <columns>id, name, species, value</columns>
+        <file path="tool-data/proteore_humap_dictionaries.loc" />
+    </table>
+    <table name="proteore_nextprot_ref" comment_char="#">
+        <columns>id, name, value</columns>
+        <file path="tool-data/proteore_nextprot_ref.loc" />
+    </table>
+</tables>