Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:965517909457 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Wed, 22 Jan 2020 08:41:44 -0500 |
parents | |
children | 0cbed1c0a762 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:965517909457 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # -*- coding: utf-8 -*- | |
4 | |
5 ''' | |
6 Adapted from: | |
7 https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py | |
8 mickaelsilva | |
9 ''' | |
10 | |
11 import urllib2 | |
12 import sys | |
13 import urllib | |
14 import xml.etree.ElementTree as ET | |
15 import time | |
16 import argparse | |
17 import os | |
18 | |
19 | |
def _fetch_xml_tree(url):
    """Download ``url`` and return the root element of the parsed XML reply.

    The HTTP response is closed even when reading or parsing fails.
    """
    response = urllib2.urlopen(url)
    try:
        return ET.fromstring(response.read())
    finally:
        response.close()


def _collect_experiment_metadata(experiment_tree, record, want_omics, want_library):
    """Copy experiment details from an experiment XML tree into ``record``.

    ``record`` is updated in place so that values found before a parsing
    error are kept by the caller. Keys written: ``model`` (text of
    INSTRUMENT_MODEL), ``prjid`` (accession of STUDY_REF), ``omics`` (text of
    LIBRARY_SOURCE, only if ``want_omics``) and ``libraryType`` (tag of the
    first child of LIBRARY_LAYOUT, only if ``want_library``).
    """
    for experiment in experiment_tree:
        for section in experiment:
            if section.tag == 'PLATFORM':
                for platform in section:
                    for field in platform:
                        if field.tag == 'INSTRUMENT_MODEL':
                            record['model'] = field.text
            elif section.tag == 'STUDY_REF':
                record['prjid'] = section.get('accession')
            elif section.tag == 'DESIGN' and (want_omics or want_library):
                for descriptor in section:
                    if descriptor.tag != 'LIBRARY_DESCRIPTOR':
                        continue
                    for field in descriptor:
                        if field.tag == 'LIBRARY_SOURCE' and want_omics:
                            record['omics'] = field.text
                        elif field.tag == 'LIBRARY_LAYOUT' and want_library:
                            # The layout is read from the tag of the first
                            # child; an empty element raises IndexError,
                            # which the caller reports as 'not found'.
                            record['libraryType'] = field[0].tag


def runSeqFromWebTaxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True):
    """Write the run accessions the ENA service lists for a taxon name.

    The taxon name is resolved to a taxon ID, the read runs under that ID are
    requested, and one line per run is written to ``outputfile`` after a
    ``#dd/mm/YYYY`` header line. Each run line holds five tab-separated
    columns: run accession, instrument model, project accession, omics data
    type and library type. When none of the ``get*`` flags is ``True`` only
    the accession column is filled and no per-experiment request is made.

    Parameters:
        taxonname: taxon name to search for (URL-quoted before use).
        outputfile: path of the file to (over)write.
        getmachine: ``True`` to look up the experiment of each run.
        getOmicsDataType: ``True`` to also record LIBRARY_SOURCE.
        getLibraryType: ``True`` to also record the library layout.
        print_True: when truthy, a one-line progress report per run is
            written to stderr.

    Raises:
        Whatever the first request or its XML parsing raises; a hint is
        printed before the exception is re-raised. Failures of later
        requests propagate unchanged.
    """
    print('\n' + 'Searching RunIDs for ' + taxonname)

    taxonname = urllib.quote(taxonname)
    url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + taxonname + "&display=xml"
    try:
        tree = _fetch_xml_tree(url)
    except Exception:
        print("Ooops!There might be a problem with the ena service, try later or check if the xml is well formated at " + url)
        raise

    # The taxon ID is taken from the top-level elements; the last one wins.
    taxonid = ''
    for child in tree:
        taxonid = child.get('taxId')
    if not taxonid:
        print("taxon name does not exist")
        return

    print("\n" + "Taxon ID found: " + taxonid)
    url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + taxonid + "%29%22&result=read_run&display=xml"
    tree = _fetch_xml_tree(url)

    want_omics = getOmicsDataType is True
    want_library = getLibraryType is True
    want_details = getmachine is True or want_omics or want_library

    n = 0
    length_line = 0
    with open(outputfile, "wb") as f:
        f.write('#' + str(time.strftime("%d/%m/%Y")) + "\n")
        for child in tree:
            runid = child.get('accession')
            n += 1

            if want_details:
                # Start from empty values for every run so that a run with
                # missing metadata does not inherit the previous run's values.
                record = {'model': '', 'prjid': '', 'omics': '', 'libraryType': ''}
                for child2 in child:
                    if child2.tag != 'EXPERIMENT_REF':
                        continue
                    expid = child2.get('accession')
                    tree2 = _fetch_xml_tree("http://www.ebi.ac.uk/ena/data/view/" + expid + "&display=xml")
                    try:
                        _collect_experiment_metadata(tree2, record, want_omics, want_library)
                    except Exception:
                        record['model'] = 'not found'
                        record['omics'] = 'not found'
                        record['libraryType'] = 'not found'
                f.write(str(runid) + "\t" + record['model'] + "\t" + record['prjid'] + "\t" + record['omics'] + "\t" + record['libraryType'] + "\n")
                line = "run acession %s sequenced on %s from project %s for %s %s end data" % (runid, record['model'], record['prjid'], record['omics'], record['libraryType'])
            else:
                f.write(str(runid) + '\t' * 4 + "\n")
                # Only the accession is known here; passing a second value to
                # a single-placeholder format string raises TypeError.
                line = "run acession %s" % (runid,)

            if print_True:
                # Pad with spaces so a shorter line fully overwrites the
                # longer one previously drawn on the same terminal row.
                length_line = max(length_line, len(line))
                sys.stderr.write("\r" + line + ' ' * (length_line - len(line)))
                sys.stderr.flush()
    print("\n")
    print("\nfound %s run id's" % n)
108 | |
109 | |
def main():
    """Command-line entry point.

    Parses the options, makes sure the directory of the output file exists
    and runs the taxon search with progress reporting switched on.
    """
    cli = argparse.ArgumentParser(
        description="This program gets a list of sequencing runs and machine were the sequencing was performed, given a taxon name accepted by the European nucleotide Archive")
    cli.add_argument('-i', nargs=1, type=str, required=True,
                     metavar='"Streptococcus agalactiae"', help='taxon name')
    cli.add_argument('-o', nargs=1, type=str, required=True,
                     help='output file name')
    cli.add_argument('-g', action='store_true', required=False,
                     help='True to include sequencing machine in the output')
    cli.add_argument('--getOmicsDataType', action='store_true',
                     help='Informs the programme to include OMICS data type (examples: GENOMIC / TRANSCRIPTOMIC / SYNTHETIC) in the output')
    cli.add_argument('--getLibraryType', action='store_true',
                     help='Informs the programme to include library type (examples: PAIRED / SINGLE) in the output')
    options = cli.parse_args()

    # Resolve the output path once and create its parent directory if needed.
    output_path = os.path.abspath(options.o[0])
    parent_dir = os.path.dirname(output_path)
    if not os.path.isdir(parent_dir):
        os.makedirs(parent_dir)

    runSeqFromWebTaxon(
        options.i[0],
        output_path,
        options.g,
        options.getOmicsDataType,
        options.getLibraryType,
        True,
    )


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()