comparison scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:965517909457 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Wed, 22 Jan 2020 08:41:44 -0500
parents
children 0cbed1c0a762
comparison
equal deleted inserted replaced
-1:000000000000 0:965517909457
1 #!/usr/bin/env python
2
3 # -*- coding: utf-8 -*-
4
5 '''
6 Adapted from:
7 https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py
8 mickaelsilva
9 '''
10
11 import urllib2
12 import sys
13 import urllib
14 import xml.etree.ElementTree as ET
15 import time
16 import argparse
17 import os
18
19
def _fetch_xml_root(url):
    """Download *url* and return the root element of the XML it serves."""
    response = urllib2.urlopen(url)
    try:
        return ET.fromstring(response.read())
    finally:
        # Release the connection even when the payload is not well-formed XML.
        response.close()


def _report_progress(line, widest):
    """Rewrite the current stderr line with *line* and return the new pad width.

    The line is padded with spaces up to the widest line written so far so
    that leftovers from a longer, earlier message are blanked out.
    """
    widest = max(widest, len(line))
    sys.stderr.write("\r" + line + ' ' * (widest - len(line)))
    sys.stderr.flush()
    return widest


def _collect_experiment_fields(experiment_root, fields, want_omics, want_library):
    """Copy instrument, study and library details from an experiment tree.

    *fields* is updated in place so that values gathered before an
    unexpected document shape raises are still visible to the caller.
    """
    for experiment in experiment_root:
        for section in experiment:
            if section.tag == 'PLATFORM':
                for platform_child in section:
                    for item in platform_child:
                        if item.tag == 'INSTRUMENT_MODEL':
                            fields['model'] = item.text
            elif section.tag == 'STUDY_REF':
                fields['prjid'] = section.get('accession')
            elif section.tag == 'DESIGN' and (want_omics or want_library):
                for descriptor in section:
                    if descriptor.tag != 'LIBRARY_DESCRIPTOR':
                        continue
                    for item in descriptor:
                        if item.tag == 'LIBRARY_SOURCE' and want_omics:
                            fields['omics'] = item.text
                        elif item.tag == 'LIBRARY_LAYOUT' and want_library:
                            # The layout is encoded as the tag of the first child.
                            fields['libraryType'] = item[0].tag


def runSeqFromWebTaxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True):
    """Write the ENA run accessions found for a taxon name to *outputfile*.

    The taxon name is resolved to a taxon id through the ENA taxon view, the
    read runs under that id are then queried, and one tab-separated row per
    run is written after a ``#dd/mm/YYYY`` header line:
    ``run <TAB> model <TAB> project <TAB> omics <TAB> library type``.

    Parameters
    ----------
    taxonname : str
        Taxon name to look up; it is URL-quoted before being sent.
    outputfile : str
        Path of the file to (over)write.
    getmachine, getOmicsDataType, getLibraryType : bool
        When any of them ``is True`` the experiment record of every run is
        downloaded and the instrument model and project accession are
        written; the omics and library-type columns are filled only when
        their own flag ``is True``. Otherwise the four columns stay empty.
    print_True : bool
        When truthy, a progress line per run is written to stderr.

    Raises
    ------
    Exception
        Whatever the taxon lookup raised is re-raised after a hint is
        printed. Failures of the later requests propagate unchanged.
    """
    print('\n' + 'Searching RunIDs for ' + taxonname)

    taxonname = urllib.quote(taxonname)
    url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + taxonname + "&display=xml"
    try:
        tree = _fetch_xml_root(url)
    except Exception:
        print("Ooops!There might be a problem with the ena service, try later or check if the xml is well formated at " + url)
        raise

    # The taxId of the last child element wins, as in the original lookup.
    taxonid = ''
    for child in tree:
        taxonid = child.get('taxId')

    if not taxonid:
        print("taxon name does not exist")
        return

    print("\n" + "Taxon ID found: " + taxonid)
    url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + taxonid + "%29%22&result=read_run&display=xml"
    tree = _fetch_xml_root(url)

    want_omics = getOmicsDataType is True
    want_library = getLibraryType is True
    want_details = getmachine is True or want_omics or want_library

    n = 0
    length_line = 0
    with open(outputfile, "wb") as f:
        f.write('#' + time.strftime("%d/%m/%Y") + "\n")
        for child in tree:
            runid = child.get('accession')
            n += 1

            if not want_details:
                f.write(str(runid) + '\t' * 4 + "\n")
                if print_True:
                    # One placeholder, one argument: the original passed two
                    # and raised TypeError on every run.
                    length_line = _report_progress("run acession %s" % (runid,), length_line)
                continue

            # Start every run from empty values so that a run lacking a tag
            # does not inherit the previous run's metadata.
            fields = {'model': '', 'prjid': '', 'omics': '', 'libraryType': ''}
            for child2 in child:
                if child2.tag != 'EXPERIMENT_REF':
                    continue
                expid = child2.get('accession')
                url2 = "http://www.ebi.ac.uk/ena/data/view/" + expid + "&display=xml"
                tree2 = _fetch_xml_root(url2)
                try:
                    _collect_experiment_fields(tree2, fields, want_omics, want_library)
                except Exception:
                    # Best effort: an unexpected document shape must not
                    # abort the whole listing.
                    fields['model'] = 'not found'
                    fields['omics'] = 'not found'
                    fields['libraryType'] = 'not found'

            model = fields['model']
            prjid = fields['prjid']
            omics = fields['omics']
            libraryType = fields['libraryType']
            f.write(str(runid) + "\t" + model + "\t" + prjid + "\t" + omics + "\t" + libraryType + "\n")
            if print_True:
                line = "run acession %s sequenced on %s from project %s for %s %s end data" % (runid, model, prjid, omics, libraryType)
                length_line = _report_progress(line, length_line)

    print("\n")
    print("\nfound %s run id's" % n)
108
109
def main():
    """Parse the command line and run the taxon search.

    The directory that will hold the output file is created when it does
    not exist yet; progress reporting is always enabled from the CLI.
    """
    cli = argparse.ArgumentParser(description="This program gets a list of sequencing runs and machine were the sequencing was performed, given a taxon name accepted by the European nucleotide Archive")
    cli.add_argument('-i', nargs=1, type=str, help='taxon name', metavar='"Streptococcus agalactiae"', required=True)
    cli.add_argument('-o', nargs=1, type=str, help='output file name', required=True)
    cli.add_argument('-g', help='True to include sequencing machine in the output', action='store_true', required=False)
    cli.add_argument('--getOmicsDataType', help='Informs the programme to include OMICS data type (examples: GENOMIC / TRANSCRIPTOMIC / SYNTHETIC) in the output', action='store_true')
    cli.add_argument('--getLibraryType', help='Informs the programme to include library type (examples: PAIRED / SINGLE) in the output', action='store_true')
    options = cli.parse_args()

    # Resolve the output path once and make sure its parent directory exists.
    output_path = os.path.abspath(options.o[0])
    parent_dir = os.path.dirname(output_path)
    if not os.path.isdir(parent_dir):
        os.makedirs(parent_dir)

    runSeqFromWebTaxon(
        options.i[0],
        output_path,
        options.g,
        options.getOmicsDataType,
        options.getLibraryType,
        True,
    )
132
133
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()