# HG changeset patch # User proteore # Date 1560933723 14400 # Node ID 9ec42cb35abd6d996b2ea83b37e6c21cee2e6f32 # Parent f3507260b30f24b105b9435b6050bd703efbacf6 planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty diff -r f3507260b30f -r 9ec42cb35abd data_manager/resource_building.py --- a/data_manager/resource_building.py Tue Apr 16 07:46:59 2019 -0400 +++ b/data_manager/resource_building.py Wed Jun 19 04:42:03 2019 -0400 @@ -53,10 +53,11 @@ path = os.path.join(target_directory, output_file) unzip(url, path) #download and save file tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y") - tissue_id = tissue_name.replace(" ","_").replace("/","-") + release = tissue_name.replace(" ","_").replace("/","-") + id = str(10000000000 - int(time.strftime("%Y%m%d"))) - data_table_entry = dict(id=tissue_id, name = tissue_name, tissue = tissue, value = path) + data_table_entry = dict(id=id, release=release, name = tissue_name, tissue = tissue, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, table) @@ -140,8 +141,6 @@ if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] - #print("header ok") - #get selected.tab and keep only ids of interest selected_tab_file=species_dict[species]+"_"+files[0] tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) @@ -153,6 +152,21 @@ #print("selected_tab ok") + #get uniprot-AC reviewed + organism = species_dict[species].split("_")[1] + query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list" + + with requests.Session() as s: + download = s.get(query) + decoded_content = download.content.decode('utf-8') + uniprot_reviewed_list = decoded_content.splitlines() + + for line in tab[1:]: + UniProtAC = line[0] + if UniProtAC not in uniprot_reviewed_list : + line[0]="" + line[1]="" + """ Supplementary ID to get from HUMAN_9606_idmapping.dat : -NextProt,BioGrid,STRING,KEGG @@ -204,7 +218,7 @@ #print ("tab ok") - #add missing nextprot ID for human + #add missing nextprot ID for human or replace old ones if human : #build next_dict nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) @@ -217,7 +231,7 @@ for line in tab[1:] : uniprotID=line[0] nextprotID=line[13] - if nextprotID == '' and uniprotID in next_dict : + if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : line[13]=next_dict[uniprotID] output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" @@ -229,9 +243,10 @@ name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" - id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by in xml only in descending order - data_table_entry = dict(id=id, name = name, species = species, value = path) + data_table_entry = dict(id=id, release=release , name = name, species = species, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) def download_from_uniprot_ftp(file,target_directory) : @@ -483,15 +498,16 @@ dico['protein_name']=dico_protein_name #writing output - output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json" + output_file = species+'_'+interactome+'_'+ time.strftime("%Y-%m-%d") + ".json" path = os.path.join(target_directory,output_file) name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") - id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y") + release = species+"_"+interactome+"_"+ time.strftime("%Y-%m-%d") + id = str(10000000000 - int(time.strftime("%Y%m%d"))) with open(path, 'w') as handle: json.dump(dico, handle, sort_keys=True) - data_table_entry = dict(id=id, name = name, species = species, value = path) + data_table_entry = dict(id=id, release=release, name = name, species = species, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") ####################################################################################################### @@ -501,8 +517,18 @@ def Build_nextprot_ref_file(data_manager_dict,target_directory): nextprot_ids_file = "nextprot_ac_list_all.txt" ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory) - + + output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" + path = os.path.join(target_directory,output_file) + name = "neXtProt release "+time.strftime("%d-%m-%Y") + release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") + + output = open(path, 'w') + writer = csv.writer(output,delimiter="\t") + nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] + writer.writerows(nextprot_file) + for id in ids : #print (id) query="https://api.nextprot.org/entry/"+id+".json" @@ -547,21 +573,15 @@ for tm in tm_domains : all_tm_domains.add(tm['cvTermName']) nb_domains+=1 - print "nb domains ++" - print (nb_domains) - - nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) - - output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" - path = os.path.join(target_directory,output_file) - name = "neXtProt release "+time.strftime("%d-%m-%Y") - id = "nextprot_ref_"+time.strftime("%d-%m-%Y") - - with open(path, 'w') as output: - writer = csv.writer(output,delimiter="\t") + #print "nb domains ++" + #print (nb_domains) + nextprot_file[:] = [] + nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) writer.writerows(nextprot_file) - data_table_entry = dict(id=id, name = name, value = path) + id = str(10000000000 - int(time.strftime("%Y%m%d"))) + + data_table_entry = dict(id=id, release=release_id, name = name, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") ####################################################################################################### diff -r f3507260b30f -r 9ec42cb35abd data_manager/resource_building.xml --- a/data_manager/resource_building.xml Tue Apr 16 07:46:59 2019 -0400 +++ b/data_manager/resource_building.xml Wed Jun 19 04:42:03 2019 -0400 @@ -1,4 +1,4 @@ - + to create or update reference files for proteore tools @@ -94,6 +94,7 @@ + diff -r f3507260b30f -r 9ec42cb35abd data_manager_conf.xml --- a/data_manager_conf.xml Tue Apr 16 07:46:59 2019 -0400 +++ b/data_manager_conf.xml Wed Jun 19 04:42:03 2019 -0400 @@ -19,13 +19,14 @@ + protein_atlas/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/${id}.tsv + ${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/${release}.tsv abspath @@ -33,13 +34,14 @@ + protein_atlas/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/${id}.tsv + ${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/${release}.tsv abspath @@ -47,6 +49,7 @@ + @@ -54,7 +57,7 @@ id_mapping/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${id}.tsv + ${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${release}.tsv abspath @@ -62,6 +65,7 @@ + @@ -69,7 +73,7 @@ id_mapping/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${id}.tsv + ${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${release}.tsv abspath @@ -77,6 +81,7 @@ + @@ -84,7 +89,7 @@ id_mapping/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${id}.tsv + ${GALAXY_DATA_MANAGER_DATA_PATH}/id_mapping/${release}.tsv abspath @@ -92,6 +97,7 @@ + @@ -99,7 +105,7 @@ PPI_dictionaries/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${id}.json + ${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${release}.json abspath @@ -107,6 +113,7 @@ + @@ -114,7 +121,7 @@ PPI_dictionaries/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${id}.json + ${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${release}.json abspath @@ -122,6 +129,7 @@ + @@ -129,7 +137,7 @@ PPI_dictionaries/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${id}.json + ${GALAXY_DATA_MANAGER_DATA_PATH}/PPI_dictionaries/${release}.json abspath @@ -137,13 +145,14 @@ + proteore_nextprot_ref/ - ${GALAXY_DATA_MANAGER_DATA_PATH}/proteore_nextprot_ref/${id}.tsv + ${GALAXY_DATA_MANAGER_DATA_PATH}/proteore_nextprot_ref/${release}.tsv abspath diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_biogrid_dictionaries.loc.sample --- a/tool-data/proteore_biogrid_dictionaries.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_biogrid_dictionaries.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,4 +1,4 @@ -#id name species value -#biogrid_human_08-01-2019 Human (Homo sapiens) 08/01/2019 Human PPI_dictionaries/Human_biogrid.json -#biogrid_mouse_08-01-2019 Mouse (Mus musculus) 08/01/2019 Mouse PPI_dictionaries/Mouse_biogrid.json -#biogrid_rat_08-01-2019 Rat (Rattus norvegicus) 08/01/2019 Rat PPI_dictionaries/Rat_biogrid.json +#id release name species value +#9979809892 biogrid_human_2019-01-08 Human (Homo sapiens) 08/01/2019 Human PPI_dictionaries/Human_biogrid.json +#9979809892 biogrid_mouse_2019-01-08 Mouse (Mus musculus) 08/01/2019 Mouse PPI_dictionaries/Mouse_biogrid.json +#9979809892 biogrid_rat_2019-01-08 Rat (Rattus norvegicus) 08/01/2019 Rat PPI_dictionaries/Rat_biogrid.json diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_bioplex_dictionaries.loc.sample --- a/tool-data/proteore_bioplex_dictionaries.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_bioplex_dictionaries.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,2 +1,2 @@ -#id name species value -#bioplex_human_08-01-2019 Human (Homo sapiens) 08/01/2019 Human PPI_dictionaries/human_bioplex.json +#id release name species value +#9979809892 bioplex_human_2019-01-08 Human (Homo sapiens) 08/01/2019 Human PPI_dictionaries/human_bioplex.json diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_humap_dictionaries.loc.sample --- a/tool-data/proteore_humap_dictionaries.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_humap_dictionaries.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,2 +1,2 @@ -#id name species value -#humap_human_01-02-2019 Human (Homo sapiens) 01/02/19 Human PPI_dictionaries/Human_humap_01-02-2019.json +#id release name species value +#9979809799 humap_human_2019-01-02 Human (Homo sapiens) 01/02/19 Human PPI_dictionaries/Human_humap_01-02-2019.json diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_id_mapping_Human.loc.sample --- a/tool-data/proteore_id_mapping_Human.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_id_mapping_Human.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,3 +1,3 @@ #This file lists the locations of reference file for id_converter tool -# -#human_id_mapping_01-01-2018 Human (homo sapiens) human_id_mapping tool-data/human_id_mapping.tsv +# +#9979818977 human_id_mapping_23-10-2018 Human (homo sapiens 23/10/2018) Human tool-data/human_id_mapping_23-10-2018.tsv \ No newline at end of file diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_id_mapping_Mouse.loc.sample --- a/tool-data/proteore_id_mapping_Mouse.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_id_mapping_Mouse.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,3 +1,3 @@ #This file lists the locations of reference file for id_converter tool -# -#mouse_id_mapping_01-01-2018 Mouse (Mus musculus) mouse_id_mapping tool-data/mouse_id_mapping.tsv +# +#9979818977 mouse_id_mapping_23-10-2018 Mouse (Mus musculus 23-10-2018) Mouse tool-data/mouse_id_mapping_23-10-2018.tsv diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_id_mapping_Rat.loc.sample --- a/tool-data/proteore_id_mapping_Rat.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_id_mapping_Rat.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,3 +1,3 @@ #This file lists the locations of reference file for id_converter tool -# -#rat_id_mapping_01-01-2018 Rat (Rattus norvegicus) rat_id_mapping tool-data/rat_id_mapping.tsv +# +#9979818977 rat_id_mapping_23-10-2018 Rat (Rattus norvegicus 23-10-2018) Rat tool-data/rat_id_mapping_23-10-2018.tsv diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_nextprot_ref.loc.sample --- a/tool-data/proteore_nextprot_ref.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_nextprot_ref.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,2 +1,2 @@ -# -#nextprot_ref_09-03-2019 neXtProt release 09-03-2019 tool-data/nextprot_ref_09-03-2019.tsv +# +#9979809691 nextprot_ref_09-03-2019 neXtProt release 09-03-2019 tool-data/nextprot_ref_09-03-2019.tsv diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_peptide_atlas.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/proteore_peptide_atlas.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -0,0 +1,10 @@ +#This file lists the locations name and values of reference files for number of MS/MS observations in a tissue +#This is a tab separated file (TAB, not 4 spaces !) +# +#Human_Heart_20-07-2018 Human Heart 20/07/2018 Human_Heart /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Heart_20-07-2018.tsv +#Human_Liver_20-07-2018 Human Liver 20/07/2018 Human_Liver /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Liver_20-07-2018.tsv +#Human_Urine_20-07-2018 Human Urine 20/07/2018 Human_Urine /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Urine_20-07-2018.tsv +#Human_Brain_20-07-2018 Human Brain 20/07/2018 Human_Brain /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Brain_20-07-2018.tsv +#Human_Kidney_20-07-2018 Human Kidney 20/07/2018 Human_Kidney /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Kidney_20-07-2018.tsv +#Human_Plasma_20-07-2018 Human Plasma 20/07/2018 Human_Plasma /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_Plasma_20-07-2018.tsv +#Human_CSF_20-07-2018 Human CSF 20/07/2018 Human_CSF /projet/galaxydev/galaxy/tool-data/peptide_atlas/Human_CSF_20-07-2018.tsv diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_protein_atlas_normal_tissue.loc.sample --- a/tool-data/proteore_protein_atlas_normal_tissue.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_protein_atlas_normal_tissue.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,4 +1,4 @@ #This file lists the locations name and values of reference files for Get expression data tool #This is a tab separated file (TAB, not 4 spaces !) -# -#HPA_normal_tissue_19-07-2018 HPA normal tissue 19/07/2018 HPA_normal_tissue /projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19159/dataset_39307_files/HPA_normal_tissue_19-07-2018.tsv +# +#9979819281 HPA_normal_tissue_19-07-2018 HPA normal tissue 19/07/2018 HPA_normal_tissue /projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19159/dataset_39307_files/HPA_normal_tissue_19-07-2018.tsv diff -r f3507260b30f -r 9ec42cb35abd tool-data/proteore_protein_atlas_tumor_tissue.loc.sample --- a/tool-data/proteore_protein_atlas_tumor_tissue.loc.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool-data/proteore_protein_atlas_tumor_tissue.loc.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,4 +1,4 @@ #This file lists the locations name and values of reference files for Get expression data tool #This is a tab separated file (TAB, not 4 spaces !) -# -#HPA_pathology_19-07-2018 HPA pathology 19/07/2018 HPA_pathology /projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_pathology_19-07-2018.tsv +# +#9979819281 HPA_pathology_19-07-2018 HPA pathology 19/07/2018 HPA_pathology /projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_pathology_19-07-2018.tsv diff -r f3507260b30f -r 9ec42cb35abd tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample Tue Apr 16 07:46:59 2019 -0400 +++ b/tool_data_table_conf.xml.sample Wed Jun 19 04:42:03 2019 -0400 @@ -1,43 +1,43 @@ - - id, name, tissue, value - -
- - id, name, tissue, value - -
- - id, name, tissue, value - -
- - id, name, species, value - -
- - id, name, species, value - -
- - id, name, species, value - -
- - id, name, species, value - -
- - id, name, species, value - -
- - id, name, species, value - -
- - id, name, value - -
+ + id, name, tissue, value + +
+ + id, release, name, tissue, value + +
+ + id, release, name, tissue, value + +
+ + id, release, name, species, value + +
+ + id, release, name, species, value + +
+ + id, release, name, species, value + +
+ + id, release, name, species, value + +
+ + id, release, name, species, value + +
+ + id, release, name, species, value + +
+ + id, release, name, value + +