Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
diff scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:965517909457 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Wed, 22 Jan 2020 08:41:44 -0500 |
parents | |
children | 0cbed1c0a762 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/ReMatCh/modules/seqFromWebTaxon.py Wed Jan 22 08:41:44 2020 -0500 @@ -0,0 +1,135 @@ +#!/usr/bin/env python + +# -*- coding: utf-8 -*- + +''' +Adapted from: +https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py +mickaelsilva +''' + +import urllib2 +import sys +import urllib +import xml.etree.ElementTree as ET +import time +import argparse +import os + + +def runSeqFromWebTaxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True): + print '\n' + 'Searching RunIDs for ' + taxonname + + taxonname = urllib.quote(taxonname) + url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + taxonname + "&display=xml" + try: + content = urllib2.urlopen(url) + xml = content.read() + tree = ET.fromstring(xml) + taxonid = '' + except: + print "Ooops!There might be a problem with the ena service, try later or check if the xml is well formated at " + url + raise + for child in tree: + taxonid = child.get('taxId') + if (taxonid): + print "\n" + "Taxon ID found: " + taxonid + url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + taxonid + "%29%22&result=read_run&display=xml" + + content = urllib2.urlopen(url) + xml = content.read() + tree = ET.fromstring(xml) + + runid = '' + n = 0 + with open(outputfile, "wb") as f: + f.write('#' + str(time.strftime("%d/%m/%Y")) + "\n") + model = '' + prjid = '' + length_line = 0 + omics = '' + libraryType = '' + for child in tree: + runid = child.get('accession') + + n += 1 + + if getmachine is True or getOmicsDataType is True or getLibraryType is True: + for child2 in child: + if child2.tag == 'EXPERIMENT_REF': + expid = child2.get('accession') + url2 = "http://www.ebi.ac.uk/ena/data/view/" + expid + "&display=xml" + content = urllib2.urlopen(url2) + xml = content.read() + tree2 = ET.fromstring(xml) + try: + for child3 in tree2: + for child4 in child3: + if child4.tag == 'PLATFORM': + for child5 in child4: + for child6 in child5: + if child6.tag == 'INSTRUMENT_MODEL': + model = child6.text + elif child4.tag == 'STUDY_REF': + prjid = child4.get('accession') + elif child4.tag == 'DESIGN': + if getOmicsDataType is True or getLibraryType is True: + for child5 in child4: + if child5.tag == 'LIBRARY_DESCRIPTOR': + for child6 in child5: + if child6.tag == 'LIBRARY_SOURCE' and getOmicsDataType is True: + omics = child6.text + elif child6.tag == 'LIBRARY_LAYOUT' and getLibraryType is True: + libraryType = child6[0].tag + except: + model = 'not found' + omics = 'not found' + libraryType = 'not found' + f.write(str(runid) + "\t" + model + "\t" + prjid + "\t" + omics + "\t" + libraryType + "\n") + if print_True: + line = "run acession %s sequenced on %s from project %s for %s %s end data" % (runid, model, prjid, omics, libraryType) + if length_line < len(line): + length_line = len(line) + sys.stderr.write("\r" + line + str(' ' * (length_line - len(line)))) + sys.stderr.flush() + else: + f.write(str(runid) + '\t' * 4 + "\n") + if print_True: + line = "run acession %s" % (runid, prjid) + if length_line < len(line): + length_line = len(line) + sys.stderr.write("\r" + line + str(' ' * (length_line - len(line)))) + sys.stderr.flush() + print "\n" + print "\nfound %s run id's" % n + + else: + print "taxon name does not exist" + + +def main(): + parser = argparse.ArgumentParser(description="This program gets a list of sequencing runs and machine were the sequencing was performed, given a taxon name accepted by the European nucleotide Archive") + parser.add_argument('-i', nargs=1, type=str, help='taxon name', metavar='"Streptococcus agalactiae"', required=True) + parser.add_argument('-o', nargs=1, type=str, help='output file name', required=True) + parser.add_argument('-g', help='True to include sequencing machine in the output', action='store_true', required=False) + parser.add_argument('--getOmicsDataType', help='Informs the programme to include OMICS data type (examples: GENOMIC / TRANSCRIPTOMIC / SYNTHETIC) in the output', action='store_true') + parser.add_argument('--getLibraryType', help='Informs the programme to include library type (examples: PAIRED / SINGLE) in the output', action='store_true') + + args = parser.parse_args() + + getmachine = args.g + taxonname = args.i[0] + + outdir = os.path.dirname(os.path.abspath(args.o[0])) + if not os.path.isdir(outdir): + os.makedirs(outdir) + outputfile = os.path.abspath(args.o[0]) + + getOmicsDataType = args.getOmicsDataType + getLibraryType = args.getLibraryType + + runSeqFromWebTaxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, True) + + +if __name__ == "__main__": + main()