comparison PLIDflow/scripts/scrapPubChem.py @ 6:795e11fac81b draft default tip

Included new tools for standardization
author bitlab
date Wed, 22 Apr 2020 06:12:00 -0400
parents
children
comparison
equal deleted inserted replaced
5:97f12f7cc852 6:795e11fac81b
1 from requests import Session
2 from robobrowser import RoboBrowser
3 from pprint import pprint
4 from bs4 import BeautifulSoup
5 from time import sleep
6 import random
7 import json
8 import csv
9 import time
10 import re
11 import gc
12 import time
13 import sys
14
def _fetch_standardized_page(chain):
    """Submit *chain* as SMILES to the PubChem standardize form.

    Opens the standardize.cgi page, fills the form located by its
    ``action`` attribute, submits it and returns the parsed reply as text.
    The HTTP session is closed whether or not the submission succeeds.
    """
    url = 'https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi'

    session = Session()
    try:
        browser = RoboBrowser(session=session, history=False, parser="html5lib")
        browser.open(url)

        form = browser.get_form(action=re.compile(r'standardize'))
        form['structure'] = 'smiles'
        form['structuresmiles'] = chain

        # NOTE(review): the submit value 'Authorize' is carried over unchanged
        # from the original script -- confirm the server actually requires it.
        submit_field = form['submitjob']
        submit_field.value = 'Authorize'
        browser.submit_form(form, submit=submit_field)

        return str(browser.parsed)
    finally:
        session.close()


def _strip_page_wrappers(page):
    """Remove the html/head/body wrapper tags from *page*."""
    # Same tags, same order as the original chain of str.replace() calls.
    for tag in ("<html>", "<head>", "</head>", "</html>", "<body>", "</body>"):
        page = page.replace(tag, "")
    return page


def main(argv=None, max_attempts=None, retry_delay=5):
    """Standardize one SMILES string through the PubChem web form.

    The reply is printed on stdout as a single line. If the reply contains
    the marker ``Output Log:`` (presumably the service's error report --
    confirm against the PubChem page), the text from that marker onwards is
    written to stderr instead.

    REMEMBER TO PUT THE INPUT ARGUMENT WITH DOUBLE QUOTES on the command
    line, since SMILES strings contain shell metacharacters.

    Parameters:
        argv: argument vector, program name first; defaults to ``sys.argv``.
        max_attempts: give up after this many failed submissions; ``None``
            (the default) retries until one succeeds, as the script always
            did.
        retry_delay: seconds to wait between failed submissions.

    Raises:
        SystemExit: with status 1 when exactly one argument is not supplied,
            or when ``max_attempts`` submissions have all failed.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 2:
        # sys.stderr.write works on Python 2 and 3 alike, unlike the
        # Python-2-only ``print >> sys.stderr`` statement.
        sys.stderr.write("ERROR: Missing input chain\n")
        sys.exit(1)

    chain = str(argv[1])

    attempt = 0
    while True:
        attempt += 1
        try:
            page = _fetch_standardized_page(chain)
            break
        except Exception as err:
            # Best-effort retry: any failure while talking to the service is
            # retried after a pause. Failures are only reported once the
            # caller-supplied attempt limit (if any) is exhausted.
            if max_attempts is not None and attempt >= max_attempts:
                sys.stderr.write(
                    "ERROR: standardization failed after %d attempt(s): %s\n"
                    % (attempt, err)
                )
                sys.exit(1)
            time.sleep(retry_delay)

    # Output happens outside the retry loop so that a failure while printing
    # can never trigger another submission to the service.
    page = _strip_page_wrappers(page)
    badpos = page.find('Output Log:')
    if badpos != -1:
        sys.stderr.write(page[badpos:] + "\n")
    else:
        print(page.replace("\n", ""))
85
86
87
88
89
90
91
# Script entry point: perform the standardization only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
94