comparison data_manager/resource_building.py @ 7:b8565596bb25 draft default tip

"planemo upload commit 7afd4b3ee25f024257ccbac6e51076d25b2a04e7"
author proteore
date Thu, 20 Aug 2020 03:33:35 -0400
parents 8f33a6e6e36c
children
comparison
equal deleted inserted replaced
6:8f33a6e6e36c 7:b8565596bb25
552 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" 552 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
553 path = os.path.join(target_directory,output_file) 553 path = os.path.join(target_directory,output_file)
554 name = "neXtProt release "+time.strftime("%d-%m-%Y") 554 name = "neXtProt release "+time.strftime("%d-%m-%Y")
555 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") 555 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
556 556
557 output = open(path, 'w') 557 output = open('test.csv', 'w')
558 writer = csv.writer(output,delimiter="\t") 558 writer = csv.writer(output,delimiter="\t")
559 559
560 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] 560 nextprot_file=[["NextprotID","ProteinName","SeqLength","MW","IsoPoint","TMDomains","SubcellLocations","Diseases","Function","PostTranslationalModifications","ProteinFamily","Pathway","ProteinExistence","Chr"]]
561 writer.writerows(nextprot_file) 561 writer.writerows(nextprot_file)
562 562
563 for id in ids : 563 for id in ids :
564 query="https://api.nextprot.org/entry/"+id+".json" 564 query="https://api.nextprot.org/entry/"+id+".json"
565 try: 565 try:
566 resp = requests.get(url=query) 566 resp = requests.get(url=query)
567 except : 567 except :
568 print ("wainting 1 hour before trying again") 568 print ("waiting 15 minutes before trying again")
569 time.sleep(3600) 569 time.sleep(900)
570 resp = requests.get(url=query) 570 resp = requests.get(url=query)
571 data = resp.json() 571 data = resp.json()
572 572
573 #get info from json dictionary 573 #get info from json dictionary
574 mass_mol = data["entry"]["isoforms"][0]["massAsString"] 574 mass_mol = data["entry"]["isoforms"][0]["massAsString"]
575 seq_length = data['entry']["isoforms"][0]["sequenceLength"] 575 seq_length = data['entry']["isoforms"][0]["sequenceLength"]
576 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"] 576 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
577 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"] 577 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
578 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level']) 578 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
579 protein_name = data['entry']["overview"]['proteinNames'][0]['name']
580
581 #get families description
582 if 'families' in data['entry']["overview"] and len(data['entry']["overview"]['families']) > 0:
583 families = data['entry']["overview"]['families']
584 families = [entry['description'] for entry in families]
585 protein_family = ";".join(families)
586 else:
587 protein_family = 'NA'
588
589 #get Protein function
590 if 'function-info' in data['entry']['annotationsByCategory'].keys():
591 function_info = data['entry']['annotationsByCategory']['function-info']
592 function_info = [entry['description'] for entry in function_info if entry['qualityQualifier'] == 'GOLD']
593 function = ';'.join(function_info)
594 else :
595 function = 'NA'
596
597 #Get ptm-info
598 post_trans_mod = 'NA'
599 if 'ptm-info' in data['entry']['annotationsByCategory'].keys():
600 ptm_info = data['entry']['annotationsByCategory']['ptm-info']
601 infos = [entry['description'] for entry in ptm_info if entry['qualityQualifier'] == 'GOLD']
602 post_trans_mod = ";".join(infos)
603
604 #Get pathway(s)
605 if 'pathway' in data['entry']['annotationsByCategory'].keys():
606 pathways = data['entry']['annotationsByCategory']['pathway']
607 pathways = [entry['description'] for entry in pathways if entry['qualityQualifier'] == 'GOLD']
608 pathway = ";".join(pathways)
609 else :
610 pathway = 'NA'
579 611
580 #put all subcell loc in a set 612 #put all subcell loc in a set
581 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() : 613 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
582 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"] 614 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
583 all_subcell_locs = set() 615 all_subcell_locs = set()
608 for tm in tm_domains : 640 for tm in tm_domains :
609 all_tm_domains.add(tm['cvTermName']) 641 all_tm_domains.add(tm['cvTermName'])
610 nb_domains+=1 642 nb_domains+=1
611 #print "nb domains ++" 643 #print "nb domains ++"
612 #print (nb_domains) 644 #print (nb_domains)
645
613 nextprot_file[:] = [] 646 nextprot_file[:] = []
614 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) 647 nextprot_file.append([id,protein_name,str(seq_length),mass_mol,iso_elec_point,str(nb_domains),all_subcell_locs,all_diseases,function,post_trans_mod,protein_family,pathway,protein_existence,chr_loc])
615 writer.writerows(nextprot_file) 648 writer.writerows(nextprot_file)
616 649
617 id = str(10000000000 - int(time.strftime("%Y%m%d"))) 650 id = str(10000000000 - int(time.strftime("%Y%m%d")))
618 651
619 data_table_entry = dict(id=id, release=release_id, name = name, value = path) 652 data_table_entry = dict(id=id, release=release_id, name = name, value = path)
620 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") 653 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
621 654
622 ####################################################################################################### 655 #######################################################################################################