Mercurial > repos > bitlab > plidflow
diff PLIDflow/scripts/scrapPubChem.py @ 6:795e11fac81b draft default tip
Included new tools for standardization
author | bitlab |
---|---|
date | Wed, 22 Apr 2020 06:12:00 -0400 |
parents | |
children |
line wrap: on
line diff
"""Submit a SMILES string to the PubChem standardize web form and print the result.

Usage: scrapPubChem.py "<SMILES>"

Remember to wrap the input argument in double quotes so the shell passes
it through as a single, unmodified argument.
"""
from requests import Session
from robobrowser import RoboBrowser
from pprint import pprint
from bs4 import BeautifulSoup
from time import sleep
import random
import json
import csv
import time
import re
import gc
import time
import sys

_STANDARDIZE_URL = 'https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi'

# Seconds to wait before retrying after a failed request/form interaction.
_RETRY_DELAY_SECONDS = 5

# When this text appears in the response page, everything from it onwards is
# sent to stderr instead of printing the page to stdout.
_LOG_MARKER = 'Output Log:'

# Document wrapper tags removed from the parsed page, in this order.
_WRAPPER_TAGS = ("<html>", "<head>", "</head>", "</html>", "<body>", "</body>")


def _fetch_result_page(chain):
    """Submit *chain* through the standardize form and return the parsed page as a string."""
    session = Session()
    browser = RoboBrowser(session=session, history=False, parser="html5lib")
    browser.open(_STANDARDIZE_URL)

    form = browser.get_form(action=re.compile(r'standardize'))
    form['structure'] = 'smiles'
    form['structuresmiles'] = chain

    submit_field = form['submitjob']
    submit_field.value = 'Authorize'
    browser.submit_form(form, submit=submit_field)

    return str(browser.parsed)


def _strip_wrapper_tags(page):
    """Return *page* with the html/head/body wrapper tags removed."""
    for tag in _WRAPPER_TAGS:
        page = page.replace(tag, "")
    return page


def _report(page):
    """Write the log tail to stderr if the marker is present, else print the page on one line."""
    marker_pos = page.find(_LOG_MARKER)
    if marker_pos != -1:
        # sys.stderr.write behaves the same on Python 2 and 3; the former
        # "print >> sys.stderr, ..." raises TypeError on Python 3.
        sys.stderr.write(page[marker_pos:] + "\n")
    else:
        print(page.replace("\n", ""))


def main():
    """Read the input chain from the command line, standardize it and report the result.

    Exits with status 1 when the script is not given exactly one argument.
    """
    if len(sys.argv) != 2:
        sys.stderr.write("ERROR: Missing input chain\n")
        # Non-zero status so callers can detect the usage error.
        sys.exit(1)

    chain = str(sys.argv[1])

    # Only the web interaction is retried; output happens once, afterwards,
    # so an output failure can never trigger another submission.
    while True:
        try:
            page = _fetch_result_page(chain)
        except Exception:
            # NOTE(review): any failure is retried indefinitely, as before.
            # A persistent error (e.g. the form no longer exists) will loop
            # forever; consider bounding the number of attempts.
            time.sleep(_RETRY_DELAY_SECONDS)
        else:
            break

    _report(_strip_wrapper_tags(page))


if __name__ == "__main__":
    main()