annotate PLIDflow/scripts/scrapPubChem.py @ 6:795e11fac81b draft default tip

Included new tools for standardization
author bitlab
date Wed, 22 Apr 2020 06:12:00 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
1 from requests import Session
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
2 from robobrowser import RoboBrowser
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
3 from pprint import pprint
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
4 from bs4 import BeautifulSoup
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
5 from time import sleep
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
6 import random
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
7 import json
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
8 import csv
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
9 import time
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
10 import re
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
11 import gc
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
12 import time
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
13 import sys
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
14
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
def _strip_page_wrapper(html):
    """Return *html* with the outer html/head/body tags removed."""
    for tag in ("<html>", "<head>", "</head>", "</html>", "<body>", "</body>"):
        html = html.replace(tag, "")
    return html


def _request_standardization(url, chain):
    """Submit *chain* as a SMILES structure to the standardize form at *url*.

    Opens the page, fills the form whose action matches ``standardize`` and
    submits it through its ``submitjob`` field.

    Returns the serialized response page (``str(browser.parsed)``) with the
    html/head/body wrapper tags stripped. Any error raised by the HTTP or
    form handling propagates to the caller.
    """
    browser = RoboBrowser(session=Session(), history=False, parser="html5lib")
    browser.open(url)

    form = browser.get_form(action=re.compile(r'standardize'))
    form['structure'] = 'smiles'
    form['structuresmiles'] = chain

    submit_field = form['submitjob']
    submit_field.value = 'Authorize'
    browser.submit_form(form, submit=submit_field)

    return _strip_page_wrapper(str(browser.parsed))


def main(max_attempts=12, retry_delay=5):
    """Send the SMILES string given on the command line to PubChem standardize.

    Reads exactly one command-line argument (``sys.argv[1]``), submits it to
    the PubChem standardize web form and prints the response page on stdout
    with newlines removed. If the response contains ``Output Log:``, the text
    from that marker onwards is written to stderr instead.

    Args:
        max_attempts: how many times the request is tried before giving up.
        retry_delay: seconds to wait between two failed attempts.

    Exits with status 1 when the argument is missing or when every attempt
    failed.
    """
    # REMEMBER TO PUT THE INPUT ARGUMENT WITH DOUBLE QUOTES
    if len(sys.argv) != 2:
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write("ERROR: Missing input chain\n")
        sys.exit(1)

    chain = str(sys.argv[1])
    url = 'https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi'

    page = None
    last_error = None
    for attempt in range(max_attempts):
        try:
            page = _request_standardization(url, chain)
            break
        except Exception as ex:
            # Stay silent while retrying so a later success leaves stderr
            # clean; the last error is reported only if every attempt fails.
            last_error = ex
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)

    if page is None:
        sys.stderr.write(
            "ERROR: PubChem standardization failed after %d attempts: %s\n"
            % (max_attempts, last_error)
        )
        sys.exit(1)

    # Output is produced outside the retried region so that a failure while
    # printing can never trigger another submission.
    badpos = page.find('Output Log:')
    if badpos != -1:
        sys.stderr.write(page[badpos:] + "\n")
    else:
        print(page.replace("\n", ""))
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
85
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
86
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
87
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
88
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
89
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
90
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
91
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
# Script entry point: run the scraper only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
795e11fac81b Included new tools for standardization
bitlab
parents:
diff changeset
94