Mercurial > repos > ieguinoa > data_manager_fetch_tx2gene
changeset 3:d71f65b854de draft
Uploaded
author | ieguinoa |
---|---|
date | Fri, 19 Oct 2018 07:36:02 -0400 |
parents | 7d3ffe28ff3f |
children | bacd91d8b05a |
files | data_manager/data_manager_fetch_tx2gene.py data_manager/data_manager_fetch_tx2gene.xml data_manager/get_tx2gene_table.R tool-data/tx2gene.loc.sample tool_data_table_conf.xml.sample |
diffstat | 5 files changed, 67 insertions(+), 28 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/data_manager_fetch_tx2gene.py Wed Oct 10 11:44:17 2018 -0400 +++ b/data_manager/data_manager_fetch_tx2gene.py Fri Oct 19 07:36:02 2018 -0400 @@ -11,6 +11,7 @@ import zipfile import gzip import bz2 +import subprocess try: # For Python 3.0 and later from urllib.request import urlopen @@ -93,20 +94,35 @@ return [ bz2.BZ2File( fh.name, 'rb') ] -def convert_tx2gene( fasta_filename, file_type, params ): - if file_type is 'tx2gene': +def convert_to_tx2gene( rscript_gff_to_tx2gene, fasta_filename, file_type, params ): + if file_type == 'tx2gene': return #no need to extract tx2gene table + #print file_type #If the file is actually a GFF/GTF file then extract the tx2gene gff_temp_filename = tempfile.NamedTemporaryFile().name shutil.move(fasta_filename, gff_temp_filename) args= ['Rscript'] - args.append(RSCRIPT_GFF_TO_TX2GENE) - args.append(gff_temp_filename) - args.append(fasta_filename) + args.append(rscript_gff_to_tx2gene) + args.extend(['-x',gff_temp_filename]) + args.extend(['-o',fasta_filename]) + args.extend(['-t',file_type]) + tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-stderr" ) + return_code = subprocess.call( args=args, shell=False, stderr=tmp_stderr.fileno() ) + #return_code = subprocess.call( args=args, shell=False, stderr=None) + if return_code: + tmp_stderr.flush() + tmp_stderr.seek(0) + print >> sys.stderr, "Error in process call" + while True: + chunk = tmp_stderr.read( CHUNK_SIZE ) + if not chunk: + break + sys.stderr.write( chunk ) + sys.exit( return_code ) + tmp_stderr.close() - #assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." % sort_method ) - #return SORTING_METHODS[ sort_method ]( fasta_filename, params ) - + + def _download_file(start, fh): tmp = tempfile.NamedTemporaryFile() tmp.write(start) @@ -143,29 +159,29 @@ -def add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params): - for data_table_name, data_table_entry in _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ): +def add_fasta_to_table(rscript_gff_to_tx2gene, data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params): + for data_table_name, data_table_entry in _stream_fasta_to_file(rscript_gff_to_tx2gene, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ): if data_table_entry: _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ) -def download_from_url( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): +def download_from_url(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) ) fasta_readers = [ get_stream_reader(urlopen( url ), tmp_dir) for url in urls ] - add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id,sequence_name, params) + add_fasta_to_table(rscript_gff_to_tx2gene,data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id,sequence_name, params) -def download_from_history( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): +def download_from_history(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): #TODO: allow multiple FASTA input files input_filename = params['param_dict']['reference_source']['input_fasta'] if isinstance( input_filename, list ): fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ] else: fasta_readers = get_stream_reader(open(input_filename), tmp_dir) - add_fasta_to_table(data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params) + add_fasta_to_table(rscript_gff_to_tx2gene,data_manager_dict, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params) -def copy_from_directory( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): +def copy_from_directory(rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir ): input_filename = params['param_dict']['reference_source']['fasta_filename'] create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink' if create_symlink: @@ -175,7 +191,7 @@ fasta_readers = [ get_stream_reader(open(filename, 'rb'), tmp_dir) for filename in input_filename ] else: fasta_readers = get_stream_reader(open(input_filename), tmp_dir) - data_table_entries = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ) + data_table_entries = _stream_fasta_to_file(rscript_gff_to_tx2gene, fasta_readers, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params ) for data_table_name, data_table_entry in data_table_entries: if data_table_entry: _add_data_table_entry( data_manager_dict, data_table_entry, data_table_name ) @@ -188,7 +204,7 @@ return data_manager_dict -def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ): +def _stream_fasta_to_file( rscript_gff_to_tx2gene, fasta_stream, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, params, close_stream=True ): fasta_base_filename = "%s_tx2gene.tab" % sequence_id fasta_filename = os.path.join( target_directory, fasta_base_filename ) with open( fasta_filename, 'wb+' ) as fasta_writer: @@ -220,7 +236,7 @@ if close_stream: fasta_stream.close() - convert_to_tx2gene( fasta_filename, params['param_dict']['file_type'], params ) + convert_to_tx2gene( rscript_gff_to_tx2gene,fasta_filename, params['param_dict']['file_type'], params ) return [ ( DATA_TABLE_NAME, dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) ) ] @@ -271,17 +287,17 @@ #Parse Command Line parser = optparse.OptionParser() parser.add_option( '-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description' ) + parser.add_option( '-b', '--base_dir', dest='base_dir', action='store', type='string', default=None, help='base_dir') parser.add_option( '-t', '--type', dest='file_type', action='store', type='string', default=None, help='file_type') (options, args) = parser.parse_args() filename = args[0] #global DATA_TABLE_NAME - global RSCRIPT_GFF_TO_TX2GENE= os.path.join( options.base_dir, 'tximport.r') - + rscript_gff_to_tx2gene=os.path.join( options.base_dir, 'get_tx2gene_table.R') - if options.file_type == 'gff_gtf': - #DATA_TABLE_NAME= 'representative_gff' - else: #file_type='tx2gene' + #input_type='gff_gtf' + #if options.file_type != 'gff_gtf': + # file_type='tx2gene' params = loads( open( filename ).read() ) target_directory = params[ 'output_data' ][0]['extra_files_path'] @@ -297,7 +313,7 @@ tmp_dir = tempfile.mkdtemp() #Fetch the input file try: - REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir) + REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( rscript_gff_to_tx2gene, data_manager_dict, params, target_directory, dbkey, dbkey_name, sequence_id, sequence_name, tmp_dir) finally: cleanup_before_exit(tmp_dir) #save info to json file
--- a/data_manager/data_manager_fetch_tx2gene.xml Wed Oct 10 11:44:17 2018 -0400 +++ b/data_manager/data_manager_fetch_tx2gene.xml Fri Oct 19 07:36:02 2018 -0400 @@ -1,5 +1,10 @@ <tool id="data_manager_fetch_tx2gene" name="Create entries in tx2gene data table" version="0.0.1" tool_type="manage_data"> <description>fetching</description> + <requirements> + <requirement type="package" version="1.26.4">bioconductor-genomicfeatures</requirement> + <requirement type="package">r-getopt</requirement> + </requirements> + <command><![CDATA[ python "$__tool_directory__"/data_manager_fetch_tx2gene.py "${out_file}" --type $file_type @@ -14,9 +19,10 @@ <param type="text" name="sequence_id" value="" label="ID for sequence" /> <param name="file_type" type="select" label="Select input type: GFF/GTF file(features will be extracted to create tx2gene table) or transcript to gene table file(tab separated)"> - <option value="gff_gtf">GFF/GTF file</option> + <option value="gtf">GTF file</option> + <option value="gff3">GFF3 file</option> <option value="tx2gene">tx2gene</option> - </param> + </param> <conditional name="reference_source"> <param name="reference_source_selector" type="select" label="Choose the source for the reference genome"> <option value="url">URL</option>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/get_tx2gene_table.R Fri Oct 19 07:36:02 2018 -0400 @@ -0,0 +1,17 @@ +library(getopt) + +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "input_type", "t", 1, "character", + "outfile", "o", 1, "character", + "gtfFile", "x", 1, "character"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +suppressPackageStartupMessages({library("GenomicFeatures")}) +txdb <- makeTxDbFromGFF(opt$gtfFile, format=opt$input_type) +k <- keys(txdb, keytype = "GENEID") +df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME") +tx2gene <- df[, 2:1] # tx ID, then gene ID +write.table(tx2gene,file = opt$outfile, quote = FALSE, sep = " ",row.names = FALSE,col.names = FALSE) +
--- a/tool-data/tx2gene.loc.sample Wed Oct 10 11:44:17 2018 -0400 +++ b/tool-data/tx2gene.loc.sample Fri Oct 19 07:36:02 2018 -0400 @@ -1,3 +1,3 @@ #The tx2gene.loc file has this format: # -#<unique_build_id> <dbkey> <display_name> <path_to_gff_file> +#<unique_build_id> <dbkey> <display_name> <path_to_tx2gene_file>
--- a/tool_data_table_conf.xml.sample Wed Oct 10 11:44:17 2018 -0400 +++ b/tool_data_table_conf.xml.sample Fri Oct 19 07:36:02 2018 -0400 @@ -1,4 +1,4 @@ <?xml version="1.0"?> <tables> - <table name="tx2gene_table" comment_char="#" allow_duplicate_entries="False"><columns>value, dbkey, name, path</columns><file path="tool-data/tx2gene.loc" /></table> + <table name="tx2gene" comment_char="#" allow_duplicate_entries="False"><columns>value, dbkey, name, path</columns><file path="tool-data/tx2gene.loc" /></table> </tables>