# view PLIDflow/scripts/scrapPubChem.py @ 6:795e11fac81b draft default tip
#
# Included new tools for standardization
# author bitlab
# date Wed, 22 Apr 2020 06:12:00 -0400
# parents
# children
# line wrap: on
# line source

from requests import Session
from robobrowser import RoboBrowser
from pprint import pprint
from bs4 import BeautifulSoup
from time import sleep
import random
import json
import csv
import time
import re
import gc
import time
import sys

def _strip_document_tags(markup):
    """Return *markup* with the bare html/head/body wrapper tags removed.

    Only the exact, attribute-less lowercase tags are removed; anything else
    in the document is left untouched.
    """
    for tag in ("<html>", "<head>", "</head>", "</html>", "<body>", "</body>"):
        markup = markup.replace(tag, "")
    return markup


def _standardize(chain):
    """Submit *chain* to the PubChem standardization form and return the reply.

    The reply is the parsed response page rendered as a string, with the
    document wrapper tags stripped.  Any failure (network error, missing
    form or field) propagates to the caller as an exception.
    """
    url = 'https://pubchem.ncbi.nlm.nih.gov/standardize/standardize.cgi'

    # The context manager closes the HTTP session even when a step fails, so
    # repeated attempts do not accumulate open connections.
    with Session() as session:
        browser = RoboBrowser(session=session, history=False, parser="html5lib")
        browser.open(url)

        form = browser.get_form(action=re.compile(r'standardize'))
        if form is None:
            # NOTE(review): get_form appears to return None when no form
            # matches -- fail with a readable message instead of a TypeError.
            raise RuntimeError("standardize form not found at " + url)

        form['structure'] = 'smiles'
        form['structuresmiles'] = chain

        submit_field = form['submitjob']
        submit_field.value = 'Authorize'
        browser.submit_form(form, submit=submit_field)

        return _strip_document_tags(str(browser.parsed))


def main(max_attempts=20, retry_delay=5):
    """Standardize the SMILES chain given on the command line via PubChem.

    Expects exactly one command-line argument, the input chain, e.g.
    'Nc1nc(NC2CC2)c2ncn([C@@H]3C[C@H](CO)C=C3)c2n1'.  Remember to wrap the
    argument in double quotes on the command line.

    On success the reply is printed to stdout with its newlines removed.  If
    the reply contains an 'Output Log:' section, that section is written to
    stderr instead and nothing is printed to stdout (the exit status is left
    unchanged in that case).

    Args:
        max_attempts: How many times the request is tried before giving up;
            values below 1 are treated as 1.
        retry_delay: Seconds to wait between two attempts.

    Exits with status 1 when the argument count is wrong or when every
    attempt failed.
    """
    if len(sys.argv) != 2:
        sys.stderr.write("ERROR: Missing input chain\n")
        sys.exit(1)

    chain = str(sys.argv[1])
    max_attempts = max(1, max_attempts)

    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            reply = _standardize(chain)
        except Exception as ex:
            # Failures are retried because the remote service may be
            # temporarily unavailable, but the last error is kept so a
            # permanent failure is reported instead of looping forever.
            last_error = ex
            if attempt < max_attempts:
                time.sleep(retry_delay)
            continue
        break
    else:
        sys.stderr.write(
            "ERROR: standardization failed after %d attempts: %s\n"
            % (max_attempts, last_error)
        )
        sys.exit(1)

    badpos = reply.find('Output Log:')
    if badpos != -1:
        sys.stderr.write(reply[badpos:] + "\n")
    else:
        print(reply.replace("\n", ""))



	


# Run the scraper only when this file is executed as a script; importing it
# as a module defines main() without calling it.
if __name__ == "__main__":
	main()