Mercurial > repos > bitlab > plidflow
comparison PLIDflow/scripts/scrapPubChem.py @ 6:795e11fac81b draft default tip
Included new tools for standardization
author | bitlab |
---|---|
date | Wed, 22 Apr 2020 06:12:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:97f12f7cc852 | 6:795e11fac81b |
---|---|
1 from requests import Session | |
2 from robobrowser import RoboBrowser | |
3 from pprint import pprint | |
4 from bs4 import BeautifulSoup | |
5 from time import sleep | |
6 import random | |
7 import json | |
8 import csv | |
9 import time | |
10 import re | |
11 import gc | |
12 import time | |
13 import sys | |
14 | |
# Address of the PubChem standardization form that the script submits to.
_PUBCHEM_STANDARDIZE_URL = 'https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi'

# Document-level wrapper tags stripped from the response, in removal order.
_WRAPPER_TAGS = ("<html>", "<head>", "</head>", "</html>", "<body>", "</body>")


def _fetch_standardized_page(chain):
    """Submit *chain* through the standardization form and return the parsed
    response page as a string.

    Any exception raised while opening the page, locating the form or
    submitting it propagates to the caller, which decides whether to retry.
    """
    session = Session()
    try:
        browser = RoboBrowser(session=session, history=False, parser="html5lib")
        browser.open(_PUBCHEM_STANDARDIZE_URL)

        form = browser.get_form(action=re.compile(r'standardize'))

        # Declare the input as SMILES and fill in the structure itself.
        form['structure'] = 'smiles'
        form['structuresmiles'] = chain

        submit_field = form['submitjob']
        submit_field.value = 'Authorize'
        browser.submit_form(form, submit=submit_field)

        return str(browser.parsed)
    finally:
        # Release the HTTP connections even when the attempt failed, so that
        # repeated retries do not accumulate open sessions.
        session.close()


def _strip_wrapper_tags(page):
    """Return *page* with the html/head/body wrapper tags removed."""
    for tag in _WRAPPER_TAGS:
        page = page.replace(tag, "")
    return page


def main(max_attempts=None, retry_delay=5):
    """Standardize the structure given on the command line and print the result.

    Exactly one command-line argument is expected (quote it in the shell). It
    is submitted as the SMILES field of the standardization form.

    Output:
      * If the response contains 'Output Log:', the text from that marker
        onwards is written to stderr.
      * Otherwise the response, with wrapper tags and newlines removed, is
        printed to stdout.

    Args:
        max_attempts: maximum number of submission attempts before the last
            error is re-raised. None (the default) retries indefinitely.
        retry_delay: seconds to wait between failed attempts.

    Raises:
        SystemExit: with status 1 when the argument count is wrong.
    """
    # REMEMBER TO PUT THE INPUT ARGUMENT WITH DOUBLE QUOTES
    if len(sys.argv) != 2:
        # sys.stderr.write works on both Python 2 and 3, unlike the
        # `print >> sys.stderr` statement.
        sys.stderr.write("ERROR: Missing input chain\n")
        sys.exit(1)

    chain = sys.argv[1]

    # Only the network round-trip is retried; output errors are not something
    # that re-querying the service could fix.
    attempt = 0
    while True:
        attempt += 1
        try:
            page = _fetch_standardized_page(chain)
            break
        except Exception:
            # NOTE(review): every failure is treated as transient, so a
            # permanent one (e.g. the form no longer exists) retries forever
            # unless max_attempts is set.
            if max_attempts is not None and attempt >= max_attempts:
                raise
            time.sleep(retry_delay)

    page = _strip_wrapper_tags(page)

    badpos = page.find('Output Log:')
    if badpos != -1:
        sys.stderr.write(page[badpos:] + "\n")
    else:
        print(page.replace("\n", ""))
85 | |
86 | |
87 | |
88 | |
89 | |
90 | |
91 | |
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
94 |