proteore_data_manager: changeset 4:e967a99d66b3 (draft)
"planemo upload commit 540dd383c0617193db43bf11457011888751b022-dirty"

author:    proteore
date:      Thu, 23 Jan 2020 08:38:02 -0500
parents:   af0250fd023c
children:  b05fa99ddda2
files:     data_manager/resource_building.py
           data_manager/resource_building.xml
           data_manager_conf.xml
           tool-data/proteore_protein_full_atlas.loc.sample
           tool_data_table_conf.xml.sample
diffstat:  5 files changed, 69 insertions(+), 29 deletions(-)
--- a/data_manager/resource_building.py	Thu Sep 05 07:45:16 2019 -0400
+++ b/data_manager/resource_building.py	Thu Jan 23 08:38:02 2020 -0500
@@ -3,7 +3,7 @@
 The purpose of this script is to create source files from different databases to be used in other proteore tools
 """
-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+import os, shutil, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess
 from io import BytesIO
 from zipfile import ZipFile
 from galaxy.util.json import from_json_string, to_json_string
 
@@ -131,15 +131,17 @@
 import ftplib, gzip
 csv.field_size_limit(sys.maxsize) # to handle big files
 
-def id_mapping_sources (data_manager_dict, species, target_directory) :
+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
 
     human = species == "Human"
     species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+    archive = os.path.join(tool_data_path, "id_mapping/ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
+    if os.path.isdir(archive) is False : os.mkdir(archive)
 
     #header
-    if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
-    else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+    if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
+    else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
 
     #get selected.tab and keep only ids of interest
     selected_tab_file=species_dict[species]+"_"+files[0]
@@ -147,9 +149,9 @@
     with gzip.open(tab_path,"rt") as select :
         tab_reader = csv.reader(select,delimiter="\t")
         for line in tab_reader :
-            tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
-    os.remove(tab_path)
-
+            tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
+    if os.path.exists(os.path.join(archive,tab_path.split("/")[-1])) : os.remove(os.path.join(archive,tab_path.split("/")[-1]))
+    shutil.move(tab_path, archive)
     #print("selected_tab ok")
 
     #get uniprot-AC reviewed
@@ -161,10 +163,16 @@
     decoded_content = download.content.decode('utf-8')
     uniprot_reviewed_list = decoded_content.splitlines()
 
+    #save reviewed list
+    reviewed_list_path = os.path.join(archive,'uniprot_reviewed_list.txt')
+    with open(reviewed_list_path,'w') as reviewed_list_file:
+        for id in uniprot_reviewed_list:
+            reviewed_list_file.write(id+"\n")
+
+
     #remove unreviewed uniprot-AC
     for line in tab[1:]:
-        UniProtAC = line[0]
+        UniProtAC = line[1]
         if UniProtAC not in uniprot_reviewed_list :
-            line[0]=""
             line[1]=""
     """
@@ -173,12 +181,12 @@
     """
 
     #there's more id type for human
-    if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ]   #ids to get from dat_file
-    else : ids = ['BioGrid','STRING','KEGG' ]
+    if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ]   #ids to get from dat_file
+    else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ]
     unidict = {}
 
     #keep only ids of interest in dictionaries
-    dat_file=species_dict[species]+"_"+files[1]
+    dat_file = species_dict[species]+"_"+files[1]
     dat_path = download_from_uniprot_ftp(dat_file,target_directory)
     with gzip.open(dat_path,"rt") as dat :
         dat_reader = csv.reader(dat,delimiter="\t")
@@ -194,7 +202,8 @@
                 unidict[uniprotID].update({ id_type : cor_id })
             elif id_type in ids :
                 unidict[uniprotID]={id_type : cor_id}
-    os.remove(dat_path)
+    if os.path.exists(os.path.join(archive,dat_path.split("/")[-1])) : os.remove(os.path.join(archive,dat_path.split("/")[-1]))
+    shutil.move(dat_path, archive)
 
     #print("dat_file ok")
 
@@ -206,33 +215,37 @@
                 nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
                 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
                 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-                access_dictionary(unidict,uniprotID,'KEGG')])
+                access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
             else :
-                line.extend(["","","",""])
+                line.extend(["","","","",""])
         else :
             if uniprotID in unidict :
                 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-                access_dictionary(unidict,uniprotID,'KEGG')])
+                access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
            else :
-                line.extend(["","",""])
+                line.extend(["","","",""])
 
     #print ("tab ok")
 
     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        with open(nextprot_path,'r') as nextprot_ids :
+            nextprot_ids = nextprot_ids.read().splitlines()
+        if os.path.exists(os.path.join(archive,nextprot_path.split("/")[-1])) : os.remove(os.path.join(archive,nextprot_path.split("/")[-1]))
+        shutil.move(nextprot_path,archive)
         next_dict = {}
         for nextid in nextprot_ids :
             next_dict[nextid.replace("NX_","")] = nextid
-        os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
+        #os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
 
         #add missing nextprot ID
         for line in tab[1:] :
             uniprotID=line[0]
-            nextprotID=line[13]
+            nextprotID=line[14]
             if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
-                line[13]=next_dict[uniprotID]
+                line[14]=next_dict[uniprotID]
 
     output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
     path = os.path.join(target_directory,output_file)
@@ -240,6 +253,9 @@
     with open(path,"w") as out :
         w = csv.writer(out,delimiter='\t')
         w.writerows(tab)
+
+    subprocess.call(['tar', '-czvf', archive+".tar.gz", archive])
+    shutil.rmtree(archive, ignore_errors=True)
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
@@ -267,9 +283,8 @@
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+
+    return (path)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
@@ -373,7 +388,7 @@
 
     elif interactome=="bioplex":
         with requests.Session() as s:
-            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv',verify=False)
+            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
            r = r.content.decode('utf-8')
            bioplex = csv.reader(r.splitlines(), delimiter='\t')
@@ -597,6 +612,7 @@
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
     parser.add_argument("--database")
+    parser.add_argument("--tool_data_path")
     args = parser.parse_args()
 
     data_manager_dict = {}
@@ -631,13 +647,13 @@
 
     ## Download ID_mapping source file from Uniprot
     try:
-        id_mapping=args.id_mapping
+        id_mapping = args.id_mapping
     except NameError:
         id_mapping = None
     if id_mapping is not None:
         id_mapping = id_mapping .split(",")
         for species in id_mapping :
-            id_mapping_sources(data_manager_dict, species, target_directory)
+            id_mapping_sources(data_manager_dict, species, target_directory, args.tool_data_path)
 
     ## Download PPI ref files from biogrid/bioplex/humap
     try:
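Note on the archiving step added above: it shells out to tar with the archive directory's absolute path, so the tarball typically stores the directory hierarchy from the filesystem root (GNU tar strips the leading "/" but keeps the rest of the path). A minimal sketch of the same step using only the standard library's tarfile module is shown below; the function name and its arguments are hypothetical illustrations, not part of this changeset.

# Sketch only: a tarfile-based equivalent of the archiving done in
# id_mapping_sources(). Names below (archive_source_files, downloaded_files)
# are hypothetical; the changeset itself uses subprocess.call(['tar', ...]).
import os
import shutil
import tarfile
import time

def archive_source_files(tool_data_path, species, downloaded_files):
    # dated archive directory, mirroring the diff's naming scheme
    archive = os.path.join(tool_data_path, "id_mapping",
                           "ID_mapping_archive_" + species + "_" + time.strftime("%Y%m%d"))
    os.makedirs(archive, exist_ok=True)
    for path in downloaded_files:
        dest = os.path.join(archive, os.path.basename(path))
        if os.path.exists(dest):
            os.remove(dest)          # same overwrite guard as the diff
        shutil.move(path, dest)
    # store only the directory's basename in the tarball, instead of the
    # absolute path that `tar -czvf <abs>.tar.gz <abs>` would record
    with tarfile.open(archive + ".tar.gz", "w:gz") as tar:
        tar.add(archive, arcname=os.path.basename(archive))
    shutil.rmtree(archive, ignore_errors=True)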
--- a/data_manager/resource_building.xml	Thu Sep 05 07:45:16 2019 -0400
+++ b/data_manager/resource_building.xml	Thu Jan 23 08:38:02 2020 -0500
@@ -1,4 +1,4 @@
-<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.09.05" tool_type="manage_data">
+<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2020.01.21" tool_type="manage_data">
     <description>
         to create or update reference files for proteore tools
     </description>
@@ -27,6 +27,7 @@
         --database=$database.database
     #end if
     --output "$output"
+    --tool_data_path=$__tool_data_path__
 
 ]]></command>
 
@@ -43,7 +44,7 @@
         <param name="tissues" type="select" multiple="false" label="Please select tissue">
             <option value="HPA_normal_tissue">Normal tissue</option>
             <option value="HPA_pathology">Pathology</option>
-            <!--option value="HPA_full_atlas">Full Atlas</option-->
+            <option value="HPA_full_atlas">Full Atlas</option>
         </param>
     </when>
     <when value="peptide_atlas">
--- a/data_manager_conf.xml	Thu Sep 05 07:45:16 2019 -0400
+++ b/data_manager_conf.xml	Thu Jan 23 08:38:02 2020 -0500
@@ -46,6 +46,21 @@
             </column>
         </output>
     </data_table>
+    <data_table name="proteore_protein_full_atlas">
+        <output>
+            <column name="id"/>
+            <column name="release"/>
+            <column name="name" />
+            <column name="tissue" />
+            <column name="value" output_ref="output" >
+                <move type="file">
+                    <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">protein_atlas/</target>
+                </move>
+                <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/${release}.tsv</value_translation>
+                <value_translation type="function">abspath</value_translation>
+            </column>
+        </output>
+    </data_table>
     <data_table name="proteore_id_mapping_Human">
         <output>
             <column name="id" />
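The new <data_table> block consumes the JSON document that resource_building.py writes to its --output file: Galaxy moves the listed file under ${GALAXY_DATA_MANAGER_DATA_PATH}/protein_atlas/ and appends a row to the table's .loc file. A minimal sketch of such a JSON payload follows; all values are hypothetical, modeled on the .loc.sample line further down, not output of an actual run.

# Sketch of a data-manager JSON entry for the new table; id/release/name
# values are hypothetical examples.
import json

data_manager_dict = {
    "data_tables": {
        "proteore_protein_full_atlas": [{
            "id": "9979819281",                        # hypothetical unique id
            "release": "HPA_full_atlas_23-01-2020",    # hypothetical dated release
            "name": "HPA Full Protein Atlas 23/01/2020",
            "tissue": "HPA_full_atlas",
            "value": "HPA_full_atlas_23-01-2020.tsv",  # file Galaxy moves under protein_atlas/
        }]
    }
}

# In the real tool this dictionary is serialized to the --output file.
with open("output.json", "w") as out:
    json.dump(data_manager_dict, out)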
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/proteore_protein_full_atlas.loc.sample	Thu Jan 23 08:38:02 2020 -0500
@@ -0,0 +1,4 @@
+#This file lists the locations name and values of reference files for Get expression data tool
+#This is a tab separated file (TAB, not 4 spaces !)
+#<id>	<release>	<name>	<tissue>	<value>
+#9979819281	HPA_full_atlas_19-07-2018	HPA Full Protein Atlas 19/07/2018	HPA_full_atlas	/projet/galaxydev/galaxy/tool-data/protein_atlas/projet/galaxydev/galaxy/database/jobs_directory/019/19160/dataset_39308_files/HPA_full_atlas_19-07-2018.tsv
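As the sample's comments stress, the .loc format is strictly tab-separated. A small sketch of how a consuming tool could read it; the read_loc helper is illustrative only, not part of the repository.

# Sketch: parse the tab-separated .loc file into (id, release, name, tissue, value)
# tuples, skipping '#' comment lines.
import csv

def read_loc(path="tool-data/proteore_protein_full_atlas.loc"):
    rows = []
    with open(path, newline="") as loc:
        for fields in csv.reader(loc, delimiter="\t"):
            if not fields or fields[0].startswith("#"):
                continue  # skip comments and blank lines
            rows.append(tuple(fields[:5]))
    return rows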
--- a/tool_data_table_conf.xml.sample	Thu Sep 05 07:45:16 2019 -0400
+++ b/tool_data_table_conf.xml.sample	Thu Jan 23 08:38:02 2020 -0500
@@ -12,6 +12,10 @@
         <columns>id, release, name, tissue, value</columns>
         <file path="tool-data/proteore_protein_atlas_tumor_tissue.loc" />
     </table>
+    <table name="proteore_protein_full_atlas" comment_char="#">
+        <columns>id, release, name, tissue, value</columns>
+        <file path="tool-data/proteore_protein_full_atlas.loc" />
+    </table>
     <table name="proteore_id_mapping_Human" comment_char="#">
         <columns>id, release, name, species, value</columns>
         <file path="tool-data/proteore_id_mapping_Human.loc" />