Mercurial > repos > bitlab > plidflow
comparison PLIDflow/scripts/scrapPubChem.py @ 6:795e11fac81b draft default tip
Included new tools for standardization
| author | bitlab |
|---|---|
| date | Wed, 22 Apr 2020 06:12:00 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:97f12f7cc852 | 6:795e11fac81b |
|---|---|
| 1 from requests import Session | |
| 2 from robobrowser import RoboBrowser | |
| 3 from pprint import pprint | |
| 4 from bs4 import BeautifulSoup | |
| 5 from time import sleep | |
| 6 import random | |
| 7 import json | |
| 8 import csv | |
| 9 import time | |
| 10 import re | |
| 11 import gc | |
| 12 import time | |
| 13 import sys | |
| 14 | |
def _fetch_standardized_page(chain):
    """Submit a SMILES string to the PubChem standardizer and return the reply.

    Opens the standardization form, selects the ``smiles`` structure type,
    fills in *chain*, submits the form through its ``submitjob`` field and
    returns the parsed response document converted to a string.

    Any exception raised while talking to the service (or while locating the
    form and its fields) propagates to the caller, which decides on retries.
    """
    url = 'https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi'

    # A fresh session and browser are created for every attempt.
    browser = RoboBrowser(session=Session(), history=False, parser="html5lib")
    browser.open(url)

    form = browser.get_form(action=re.compile(r'standardize'))
    form['structure'] = 'smiles'
    form['structuresmiles'] = chain

    submit_field = form['submitjob']
    submit_field.value = 'Authorize'
    browser.submit_form(form, submit=submit_field)

    return str(browser.parsed)


def main():
    """Standardize the SMILES chain given on the command line via PubChem.

    Expects exactly one command-line argument (the SMILES string; remember to
    wrap it in double quotes in the shell).  The request is retried every
    5 seconds, without an upper bound, until one attempt completes without
    raising.

    Output:
        * If the reply contains ``Output Log:``, everything from that marker
          onwards is written to stderr.
        * Otherwise the reply, with the html/head/body wrapper tags and all
          newlines removed, is printed to stdout.

    Exits with status 1 when the argument count is wrong.
    """
    # REMEMBER TO PUT THE INPUT ARGUMENT WITH DOUBLE QUOTES
    if len(sys.argv) != 2:
        # sys.stderr.write works on both Python 2 and 3, unlike the
        # Python-2-only ``print >> sys.stderr`` statement.
        sys.stderr.write("ERROR: Missing input chain\n")
        sys.exit(1)

    chain = str(sys.argv[1])

    # Only the network round-trip is retried; reporting the result happens
    # outside the try so an output failure is not mistaken for a service
    # failure and does not trigger another submission.
    while True:
        try:
            page = _fetch_standardized_page(chain)
        except Exception:
            time.sleep(5)
        else:
            break

    # Strip the document wrapper tags, in the same order as before.
    for tag in ("<html>", "<head>", "</head>", "</html>", "<body>", "</body>"):
        page = page.replace(tag, "")

    log_start = page.find('Output Log:')
    if log_start != -1:
        sys.stderr.write(page[log_start:] + "\n")
    else:
        print(page.replace("\n", ""))
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
| 94 |
