diff scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:965517909457 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Wed, 22 Jan 2020 08:41:44 -0500
parents
children 0cbed1c0a762
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/ReMatCh/modules/seqFromWebTaxon.py	Wed Jan 22 08:41:44 2020 -0500
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+# -*- coding: utf-8 -*-
+
+'''
+Adapted from:
+https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py
+mickaelsilva
+'''
+
+import urllib2
+import sys
+import urllib
+import xml.etree.ElementTree as ET
+import time
+import argparse
+import os
+
+
+def runSeqFromWebTaxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True):
+    '''
+    Write to outputfile a '#dd/mm/YYYY' header and one tab-separated line per ENA read run of taxonname.
+
+    Columns: run accession, INSTRUMENT_MODEL, STUDY_REF accession, LIBRARY_SOURCE (only with
+    getOmicsDataType) and first child tag of LIBRARY_LAYOUT (only with getLibraryType). Unless
+    one of the three get* flags is True, no experiment is fetched and only the first
+    column is filled. print_True reports progress on stderr; network/XML errors propagate.
+    '''
+    print '\n' + 'Searching RunIDs for ' + taxonname
+
+    taxonname = urllib.quote(taxonname)
+    url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + taxonname + "&display=xml"
+    try:
+        content = urllib2.urlopen(url)
+        xml = content.read()
+        tree = ET.fromstring(xml)
+        taxonid = ''
+    except:
+        print "Ooops!There might be a problem with the ena service, try later or check if the xml is well formated at " + url
+        raise
+    for child in tree:
+        taxonid = child.get('taxId')
+    if (taxonid):
+        print "\n" + "Taxon ID found: " + taxonid
+        url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + taxonid + "%29%22&result=read_run&display=xml"
+        content = urllib2.urlopen(url)
+        xml = content.read()
+        tree = ET.fromstring(xml)
+
+        n = 0
+        with open(outputfile, "wb") as f:
+            f.write('#' + str(time.strftime("%d/%m/%Y")) + "\n")
+            length_line = 0
+            for child in tree:
+                runid = child.get('accession')
+                n += 1
+                # Reset per run so a missing field never inherits the previous run's value
+                model = ''
+                prjid = ''
+                omics = ''
+                libraryType = ''
+                if getmachine is True or getOmicsDataType is True or getLibraryType is True:
+                    for child2 in child:
+                        if child2.tag == 'EXPERIMENT_REF':
+                            expid = child2.get('accession')
+                            url2 = "http://www.ebi.ac.uk/ena/data/view/" + expid + "&display=xml"
+                            content = urllib2.urlopen(url2)
+                            xml = content.read()
+                            tree2 = ET.fromstring(xml)
+                            try:
+                                for child3 in tree2:
+                                    for child4 in child3:
+                                        if child4.tag == 'PLATFORM':
+                                            for child5 in child4:
+                                                for child6 in child5:
+                                                    if child6.tag == 'INSTRUMENT_MODEL':
+                                                        model = child6.text
+                                        elif child4.tag == 'STUDY_REF':
+                                            prjid = child4.get('accession')
+                                        elif child4.tag == 'DESIGN':
+                                            if getOmicsDataType is True or getLibraryType is True:
+                                                for child5 in child4:
+                                                    if child5.tag == 'LIBRARY_DESCRIPTOR':
+                                                        for child6 in child5:
+                                                            if child6.tag == 'LIBRARY_SOURCE' and getOmicsDataType is True:
+                                                                omics = child6.text
+                                                            elif child6.tag == 'LIBRARY_LAYOUT' and getLibraryType is True:
+                                                                libraryType = child6[0].tag
+                            except Exception:
+                                model = 'not found'
+                                omics = 'not found'
+                                libraryType = 'not found'
+                    f.write(str(runid) + "\t" + model + "\t" + prjid + "\t" + omics + "\t" + libraryType + "\n")
+                    line = "run acession %s sequenced on %s from project %s for %s %s end data" % (runid, model, prjid, omics, libraryType)
+                else:
+                    f.write(str(runid) + '\t' * 4 + "\n")
+                    line = "run acession %s" % (runid,)
+                if print_True:
+                    # Pad to the longest line printed so far so "\r" fully overwrites it
+                    if length_line < len(line):
+                        length_line = len(line)
+                    sys.stderr.write("\r" + line + str(' ' * (length_line - len(line))))
+                    sys.stderr.flush()
+        print "\n"
+        print "\nfound %s run id's" % n
+    else:
+        print "taxon name does not exist"
+
+
+def main():
+    '''Parse the command line, make sure the output folder exists and run the taxon search.'''
+    cli = argparse.ArgumentParser(description="This program gets a list of sequencing runs and machine were the sequencing was performed, given a taxon name accepted by the European nucleotide Archive")
+    cli.add_argument('-i', nargs=1, type=str, help='taxon name', metavar='"Streptococcus agalactiae"', required=True)
+    cli.add_argument('-o', nargs=1, type=str, help='output file name', required=True)
+    cli.add_argument('-g', help='True to include sequencing machine in the output', action='store_true', required=False)
+    cli.add_argument('--getOmicsDataType', help='Informs the programme to include OMICS data type (examples: GENOMIC / TRANSCRIPTOMIC / SYNTHETIC) in the output', action='store_true')
+    cli.add_argument('--getLibraryType', help='Informs the programme to include library type (examples: PAIRED / SINGLE) in the output', action='store_true')
+    options = cli.parse_args()
+
+    # Resolve the output path once; its parent folder is created when missing
+    outputfile = os.path.abspath(options.o[0])
+    outdir = os.path.dirname(outputfile)
+    if not os.path.isdir(outdir):
+        os.makedirs(outdir)
+
+    runSeqFromWebTaxon(taxonname=options.i[0],
+                       outputfile=outputfile,
+                       getmachine=options.g,
+                       getOmicsDataType=options.getOmicsDataType,
+                       getLibraryType=options.getLibraryType,
+                       print_True=True)
+
+
+if __name__ == "__main__":  # run the command-line entry point only when executed as a script, not on import
+    main()