Mercurial > repos > galaxyp > uniprotxml_downloader
annotate uniprotxml_downloader.py @ 0:0bd2688166a5 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
author | galaxyp |
---|---|
date | Tue, 08 Mar 2016 12:03:49 -0500 |
parents | |
children | e1abc9a35c64 |
rev | line source |
---|---|
0
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
1 #!/usr/bin/env python |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
2 """ |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
3 # |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
4 #------------------------------------------------------------------------------ |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
5 # University of Minnesota |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
6 # Copyright 2016, Regents of the University of Minnesota |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
7 #------------------------------------------------------------------------------ |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
8 # Author: |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
9 # |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
10 # James E Johnson |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
11 # |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
12 #------------------------------------------------------------------------------ |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
13 """ |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
14 import sys |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
15 import re |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
16 import optparse |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
17 import urllib |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
18 |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
19 |
0bd2688166a5
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
galaxyp
parents:
diff
changeset
|
def _build_query(taxids, reviewed=None):
    """Return the UniProt query text for the given taxon IDs.

    taxids   -- list of NCBI taxon ID strings; the terms are OR'ed together.
    reviewed -- optional value appended as a ``reviewed:<value>`` filter.
    """
    taxon_query = ' OR '.join('taxonomy:"%s"' % taxid for taxid in taxids)
    if not reviewed:
        return taxon_query
    if len(taxids) > 1:
        # Group the OR'ed taxon terms so the reviewed filter unambiguously
        # applies to every taxon, not just the term it happens to follow.
        taxon_query = '(%s)' % taxon_query
    return '%s reviewed:%s' % (taxon_query, reviewed)


def _parse_headers(msg):
    """Parse the response headers returned by urlretrieve into a dict.

    Header names are lower-cased so lookups are case-insensitive; values are
    stripped.  Lines without a ':' and indented continuation lines are
    skipped instead of raising.
    """
    headers = {}
    for line in str(msg).strip().splitlines():
        if line[:1].isspace():
            continue
        name, sep, value = line.partition(':')
        if sep:
            headers[name.strip().lower()] = value.strip()
    return headers


def _is_uniprot_xml(path, debug=False):
    """Return True if the file at *path* starts with a uniprot root element.

    Leading blank lines and XML declarations / processing instructions are
    skipped; the first remaining content must be ``<uniprot`` or
    ``<ns:uniprot`` for any namespace prefix.  When *debug* is true every
    line that is inspected is echoed to stderr.
    """
    import io

    leading_pis = re.compile(r'^(\s*<\?.*?\?>)*\s*')
    # pattern match <root or <ns:root for any ns string
    root = re.compile(r'<(\w*:)?uniprot')
    # Decode explicitly so the check does not depend on the locale encoding.
    with io.open(path, 'r', encoding='utf-8', errors='replace') as contents:
        for line in contents:
            if debug:
                sys.stderr.write(line)
            remainder = leading_pis.sub('', line).strip()
            if not remainder or remainder.startswith('<?'):
                continue
            return root.match(remainder) is not None
    # Reached end of file without finding any element.
    return False


def __main__():
    """Download UniProt XML for one or more NCBI taxon IDs.

    Command line options:
      -t/--taxon     NCBI taxon ID, may be repeated (default: 9606)
      -r/--reviewed  value for the UniProt "reviewed:" query filter
      -o/--output    destination path (default: uniprot_<taxids>.xml)
      -v/--verbose   print taxon IDs, release/entry headers and URL to stdout
      -d/--debug     print the URL and inspected lines to stderr

    Exits with status 1 when the response is empty, is not a uniprot XML
    document, or the download raises an exception.
    """
    # Function-scope import keeps the script usable on Python 2 and 3.
    try:  # Python 3
        from urllib.parse import quote
        from urllib.request import urlretrieve
    except ImportError:  # Python 2
        from urllib import quote, urlretrieve

    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
    parser.add_option('-r', '--reviewed', dest='reviewed', help='value for the UniProt "reviewed:" query filter, e.g. yes or no')
    parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info')
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
    (options, args) = parser.parse_args()

    taxids = options.taxon if options.taxon else ['9606']
    if options.output:
        dest_path = options.output
    else:
        dest_path = "uniprot_%s.xml" % '_'.join(taxids)

    query = _build_query(taxids, options.reviewed)
    # NOTE(review): this is the legacy www.uniprot.org query endpoint;
    # UniProt appears to have moved to rest.uniprot.org - confirm it still answers.
    url_template = 'http://www.uniprot.org/uniprot/?query=%s&force=yes&format=xml'
    # Human-readable form, used only for messages.
    url = url_template % query
    # Percent-encode spaces and quotes; Python 3's urlretrieve does not do it.
    request_url = url_template % quote(query, safe=':()')
    if options.debug:
        sys.stderr.write("%s\n" % url)

    try:
        _, msg = urlretrieve(request_url, dest_path)
        headers = _parse_headers(msg)
        # Header values are strings, so compare against '0' rather than 0.
        if headers.get('content-length') == '0':
            sys.stderr.write("%s\n" % url)
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)
        if not _is_uniprot_xml(dest_path, options.debug):
            sys.stderr.write("failed: Not a uniprot xml file\n")
            sys.exit(1)

        if options.verbose:
            sys.stdout.write("NCBI Taxon ID:%s\n" % taxids)
            if 'x-uniprot-release' in headers:
                sys.stdout.write("UniProt-Release:%s\n" % headers['x-uniprot-release'])
            if 'x-total-results' in headers:
                sys.stdout.write("Entries:%s\n" % headers['x-total-results'])
            sys.stdout.write("%s\n" % url)
    except Exception as e:
        # sys.exit() above raises SystemExit, which is not caught here.
        sys.stderr.write("failed: %s\n" % e)
        # Report the failure through the exit status as well as stderr.
        sys.exit(1)


if __name__ == "__main__":
    __main__()