Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:965517909457 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
| author | cstrittmatter |
|---|---|
| date | Wed, 22 Jan 2020 08:41:44 -0500 |
| parents | |
| children | 0cbed1c0a762 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:965517909457 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # -*- coding: utf-8 -*- | |
| 4 | |
| 5 ''' | |
| 6 Adapted from: | |
| 7 https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py | |
| 8 mickaelsilva | |
| 9 ''' | |
| 10 | |
| 11 import urllib2 | |
| 12 import sys | |
| 13 import urllib | |
| 14 import xml.etree.ElementTree as ET | |
| 15 import time | |
| 16 import argparse | |
| 17 import os | |
| 18 | |
| 19 | |
def _fetch_xml(url):
    """Download *url* and return the response body parsed as an XML element."""
    content = urllib2.urlopen(url)
    try:
        return ET.fromstring(content.read())
    finally:
        # urllib2 responses are not context managers on Python 2, so the
        # connection is released explicitly.
        content.close()


def _parse_experiment(experiment_xml, getOmicsDataType, getLibraryType):
    """Extract (model, prjid, omics, libraryType) from an experiment XML tree.

    Every field starts out as an empty string, so a value missing from this
    experiment is reported as empty instead of inheriting whatever the
    previously processed run had. If walking the tree fails (for example a
    LIBRARY_LAYOUT element without children), model, omics and libraryType
    are reported as 'not found'; prjid keeps whatever was read before the
    failure.
    """
    model = prjid = omics = libraryType = ''
    try:
        for experiment in experiment_xml:
            for section in experiment:
                if section.tag == 'PLATFORM':
                    for platform in section:
                        for field in platform:
                            if field.tag == 'INSTRUMENT_MODEL':
                                # An empty element has text None, which
                                # cannot be concatenated into the row.
                                model = field.text or ''
                elif section.tag == 'STUDY_REF':
                    prjid = section.get('accession') or ''
                elif section.tag == 'DESIGN':
                    if getOmicsDataType is True or getLibraryType is True:
                        for design_item in section:
                            if design_item.tag == 'LIBRARY_DESCRIPTOR':
                                for field in design_item:
                                    if field.tag == 'LIBRARY_SOURCE' and getOmicsDataType is True:
                                        omics = field.text or ''
                                    elif field.tag == 'LIBRARY_LAYOUT' and getLibraryType is True:
                                        # The layout is encoded as the tag of
                                        # the first child element.
                                        libraryType = field[0].tag
    except Exception:
        # Best effort: a malformed experiment record must not abort the run
        # listing, but KeyboardInterrupt/SystemExit are no longer swallowed.
        model = omics = libraryType = 'not found'
    return model, prjid, omics, libraryType


def _show_progress(line, widest):
    """Overwrite the current stderr line with *line*.

    The line is padded with spaces up to the widest line written so far so
    that leftovers of a longer previous line are erased. Returns the updated
    widest length.
    """
    widest = max(widest, len(line))
    sys.stderr.write("\r" + line + ' ' * (widest - len(line)))
    sys.stderr.flush()
    return widest


def runSeqFromWebTaxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True):
    """Write the ENA run accessions found for a taxon name to *outputfile*.

    The taxon name is resolved to a taxon ID through the ENA taxon view, the
    read runs under that taxon tree are then listed, and one tab-separated
    row per run is written after a '#dd/mm/YYYY' header line. The columns are
    run accession, instrument model, project accession, omics data type and
    library type.

    Parameters:
        taxonname: taxon name to search for (URL-quoted before use).
        outputfile: path of the file to create or overwrite.
        getmachine, getOmicsDataType, getLibraryType: when any of them is
            exactly True, the experiment record of every run is downloaded
            and its metadata columns are filled in; otherwise the four
            metadata columns are left empty. The omics and library columns
            are only filled when their own flag is True.
        print_True: when true, a progress line is rewritten on stderr for
            every row written.

    Returns:
        None. If no taxon ID is found, a message is printed and no file is
        written.

    Raises:
        Any exception raised while downloading or parsing the ENA responses.
        A failure of the initial taxon lookup is reported on stdout before
        being re-raised.
    """
    # Single-argument print(...) parses the same way as the Python 2 print
    # statement this file otherwise relies on.
    print('\n' + 'Searching RunIDs for ' + taxonname)

    taxonname = urllib.quote(taxonname)
    url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + taxonname + "&display=xml"
    try:
        tree = _fetch_xml(url)
    except Exception:
        print("Ooops!There might be a problem with the ena service, try later or check if the xml is well formated at " + url)
        raise

    # The taxon ID is taken from the last child element of the response.
    taxonid = ''
    for child in tree:
        taxonid = child.get('taxId')

    if not taxonid:
        print("taxon name does not exist")
        return

    print("\n" + "Taxon ID found: " + taxonid)
    url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + taxonid + "%29%22&result=read_run&display=xml"
    tree = _fetch_xml(url)

    # Loop-invariant: whether the per-run experiment records are needed.
    want_metadata = getmachine is True or getOmicsDataType is True or getLibraryType is True

    n = 0
    length_line = 0
    with open(outputfile, "wb") as f:
        f.write('#' + str(time.strftime("%d/%m/%Y")) + "\n")
        for child in tree:
            runid = child.get('accession')
            n += 1

            if not want_metadata:
                f.write(str(runid) + '\t' * 4 + "\n")
                if print_True:
                    # Only the run accession is known here; formatting two
                    # values into this one-placeholder string used to raise
                    # TypeError on the first run.
                    length_line = _show_progress("run acession %s" % runid, length_line)
                continue

            # NOTE(review): a run without an EXPERIMENT_REF child produces no
            # row at all in this mode, although it is still counted in n.
            for child2 in child:
                if child2.tag != 'EXPERIMENT_REF':
                    continue
                expid = child2.get('accession')
                tree2 = _fetch_xml("http://www.ebi.ac.uk/ena/data/view/" + expid + "&display=xml")
                model, prjid, omics, libraryType = _parse_experiment(tree2, getOmicsDataType, getLibraryType)
                f.write(str(runid) + "\t" + model + "\t" + prjid + "\t" + omics + "\t" + libraryType + "\n")
                if print_True:
                    line = "run acession %s sequenced on %s from project %s for %s %s end data" % (runid, model, prjid, omics, libraryType)
                    length_line = _show_progress(line, length_line)

    print("\n")
    print("\nfound %s run id's" % n)
| 108 | |
| 109 | |
def main():
    """Command-line entry point.

    Reads the taxon name, the output file name and the optional metadata
    switches from the command line, creates the output directory when it is
    missing, and runs the search with progress reporting enabled.
    """
    cli = argparse.ArgumentParser(description="This program gets a list of sequencing runs and machine were the sequencing was performed, given a taxon name accepted by the European nucleotide Archive")
    cli.add_argument('-i', nargs=1, type=str, help='taxon name', metavar='"Streptococcus agalactiae"', required=True)
    cli.add_argument('-o', nargs=1, type=str, help='output file name', required=True)
    cli.add_argument('-g', help='True to include sequencing machine in the output', action='store_true', required=False)
    cli.add_argument('--getOmicsDataType', help='Informs the programme to include OMICS data type (examples: GENOMIC / TRANSCRIPTOMIC / SYNTHETIC) in the output', action='store_true')
    cli.add_argument('--getLibraryType', help='Informs the programme to include library type (examples: PAIRED / SINGLE) in the output', action='store_true')

    options = cli.parse_args()

    # nargs=1 stores each value as a one-element list.
    target = os.path.abspath(options.o[0])
    parent = os.path.dirname(target)
    if not os.path.isdir(parent):
        os.makedirs(parent)

    runSeqFromWebTaxon(
        options.i[0],
        target,
        options.g,
        options.getOmicsDataType,
        options.getLibraryType,
        True,
    )


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
