diff PLIDflow/scripts/scrapPubChem.py @ 6:795e11fac81b draft default tip

Included new tools for standardization
author bitlab
date Wed, 22 Apr 2020 06:12:00 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PLIDflow/scripts/scrapPubChem.py	Wed Apr 22 06:12:00 2020 -0400
@@ -0,0 +1,94 @@
+from requests import Session
+from robobrowser import RoboBrowser
+from pprint import pprint
+from bs4 import BeautifulSoup
+from time import sleep
+import random
+import json
+import csv
+import time
+import re
+import gc
+import time
+import sys
+
+def main():
+	# User information
+
+
+	# REMEMBER TO PUT THE INPUT ARGUMENT WITH DOUBLE QUOTES
+	if (len(sys.argv) != 2):
+                print >> sys.stderr, "ERROR: Missing input chain"
+		exit()
+
+	chain = str(sys.argv[1])
+
+	finished = 0
+
+	while finished == 0:
+
+		try:
+	
+			#chain = 'Nc1nc(NC2CC2)c2ncn([C@@H]3C[C@H](CO)C=C3)c2n1'
+			url = 'https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi'
+
+			# Create session and browser
+			session = Session()
+			browser = RoboBrowser(session=session,history=False, parser="html5lib")
+			browser.open(url)
+
+			form = browser.get_form(action=re.compile(r'standardize'))
+
+			#print(form.fields)
+
+			form['structure'] = 'smiles'
+			form['structuresmiles'] = chain
+
+			#print(len(list(form.submit_fields.items(multi=True))))
+			#print(list(form.submit_fields.items(multi=True)))
+			#print(' FORM action1 name: ', list(form.submit_fields.items(multi=True))[0])
+			#print(' FORM action1 name: ', list(form.submit_fields.items(multi=True))[1])
+
+			submit_field = form['submitjob']
+			submit_field.value = 'Authorize'
+			#print(form)
+			res = browser.submit_form(form, submit=submit_field)
+			parsedbrowser = str(browser.parsed)
+			#print(parsedbrowser)
+	
+			parsedbrowser = parsedbrowser.replace("<html>", "")
+			parsedbrowser = parsedbrowser.replace("<head>", "")
+			parsedbrowser = parsedbrowser.replace("</head>", "")
+			parsedbrowser = parsedbrowser.replace("</html>", "")
+			parsedbrowser = parsedbrowser.replace("<body>", "")
+			parsedbrowser = parsedbrowser.replace("</body>", "")
+
+			badpos = parsedbrowser.find('Output Log:')
+			if(badpos != -1):
+				print >> sys.stderr, parsedbrowser[badpos:]
+			else:
+				parsedbrowser = parsedbrowser.replace("\n", "")
+				print(parsedbrowser)
+
+
+
+			del browser
+			gc.collect()
+
+			finished = 1
+
+
+		except Exception as ex:
+
+			finished = 0
+			time.sleep(5)
+
+
+
+
+	
+
+
+# Run the scraper only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+	main()
+