Mercurial > repos > bitlab > plidflow
view PLIDflow/scripts/scrapPubChem.py @ 6:795e11fac81b draft default tip
Included new tools for standardization
author | bitlab |
---|---|
date | Wed, 22 Apr 2020 06:12:00 -0400 |
parents | |
children |
line wrap: on
line source
"""Standardize a SMILES string through the PubChem standardization web form.

Usage:
    python scrapPubChem.py "<SMILES>"

The SMILES string must be passed as a single, double-quoted argument: SMILES
such as 'Nc1nc(NC2CC2)c2ncn([C@@H]3C[C@H](CO)C=C3)c2n1' contain characters
(parentheses, brackets) that the shell would otherwise interpret.

On success the result page markup (wrapper tags and newlines removed) is
printed to stdout. If the page contains an 'Output Log:' section, everything
from that marker onwards is written to stderr instead.
"""
import csv
import gc
import json
import random
import re
import sys
import time
from pprint import pprint
from time import sleep

from bs4 import BeautifulSoup
from requests import Session
from robobrowser import RoboBrowser

# Endpoint hosting the standardization form.
STANDARDIZE_URL = 'https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi'

# Retry policy: every failed attempt is followed by a pause, and the script
# gives up (instead of hanging forever) once MAX_ATTEMPTS have failed.
MAX_ATTEMPTS = 20
RETRY_DELAY_SECONDS = 5

# Document wrapper tags stripped from the result page, in removal order.
_WRAPPER_TAGS = ("<html>", "<head>", "</head>", "</html>", "<body>", "</body>")

# Marker that introduces the log section of the result page.
_LOG_MARKER = 'Output Log:'


def _standardize(chain):
    """Submit *chain* as SMILES to the standardization form; return the page.

    Returns the parsed result page rendered as a string. Any exception raised
    while opening the page, locating the form or submitting it propagates to
    the caller, which decides whether to retry.
    """
    session = Session()
    try:
        browser = RoboBrowser(session=session, history=False,
                              parser="html5lib")
        browser.open(STANDARDIZE_URL)

        # The form is located by its action attribute rather than by position.
        form = browser.get_form(action=re.compile(r'standardize'))
        form['structure'] = 'smiles'
        form['structuresmiles'] = chain

        # NOTE(review): the submit value 'Authorize' is carried over unchanged
        # from the original script; confirm it is what the server expects.
        submit_field = form['submitjob']
        submit_field.value = 'Authorize'
        browser.submit_form(form, submit=submit_field)

        return str(browser.parsed)
    finally:
        # Release the connection pool even when an attempt fails, so retries
        # do not accumulate open sessions.
        session.close()


def _extract_result(page):
    """Split the result page markup into ``(is_log, text)``.

    The document wrapper tags are removed first. If the 'Output Log:' marker
    is present, ``is_log`` is True and ``text`` is everything from the marker
    to the end, newlines preserved. Otherwise ``is_log`` is False and ``text``
    is the stripped markup with all newlines removed.
    """
    for tag in _WRAPPER_TAGS:
        page = page.replace(tag, "")

    log_start = page.find(_LOG_MARKER)
    if log_start != -1:
        return True, page[log_start:]
    return False, page.replace("\n", "")


def main():
    """Read the SMILES chain from ``sys.argv`` and print its standardization.

    Exits with status 1 when the argument count is wrong or when every one of
    the MAX_ATTEMPTS submission attempts raised an exception.
    """
    # REMEMBER TO PUT THE INPUT ARGUMENT WITH DOUBLE QUOTES
    if len(sys.argv) != 2:
        # sys.stderr.write works on both Python 2 and 3, unlike the
        # ``print >> sys.stderr`` statement, which raises TypeError on 3.
        sys.stderr.write("ERROR: Missing input chain\n")
        sys.exit(1)

    chain = str(sys.argv[1])

    last_error = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            page = _standardize(chain)
        except Exception as ex:
            # Best-effort retry: any failure of the web round trip is retried
            # after a pause, but the error is remembered so it can be
            # reported if the attempts run out.
            last_error = ex
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)
            continue

        is_log, text = _extract_result(page)
        if is_log:
            sys.stderr.write(text + "\n")
        else:
            print(text)
        return

    sys.stderr.write(
        "ERROR: standardization failed after %d attempts: %r\n"
        % (MAX_ATTEMPTS, last_error)
    )
    sys.exit(1)


if __name__ == "__main__":
    main()