Mercurial > repos > fabio > sbtas_se
view retrieve.py @ 2:4291c9d1ff07 draft
Uploaded 20180124
author | fabio |
---|---|
date | Wed, 24 Jan 2018 11:26:24 -0500 |
parents | 00d6e82d74e9 |
children |
line wrap: on
line source
#!/usr/bin/env python # NCBI SRA Tools # https://galaxyproject.org/tutorials/upload/ import os import optparse from subprocess import Popen, PIPE db_key = "?"; sra_instant_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/"; def convertSRA(tmp_dir, accession_number, data_format): absolute_tmp_dir = os.path.abspath(tmp_dir); sra_file_path = os.path.join(absolute_tmp_dir, accession_number+".sra"); if os.path.isdir(absolute_tmp_dir) and os.path.exists(sra_file_path): process = None; if data_format == ".fasta.gz": process = Popen(["fastq-dump", "--fasta", "--gzip", sra_file_path, "--outdir", absolute_tmp_dir], stdout=PIPE); elif data_format == ".fastq.gz": process = Popen(["fastq-dump", "--gzip", sra_file_path, "--outdir", absolute_tmp_dir], stdout=PIPE); elif data_format == ".fasta": process = Popen(["fastq-dump", "--fasta", sra_file_path, "--outdir", absolute_tmp_dir], stdout=PIPE); elif data_format == ".fastq": process = Popen(["fastq-dump", sra_file_path, "--outdir", absolute_tmp_dir], stdout=PIPE); else: process = None; if process is not None: (output, err) = process.communicate(); if err: # kill the process # kill_process(process.pid); # remove any trace of the output file an_file_path = os.path.join(tmp_dir, accession_number+data_format); if os.path.exists(an_file_path): os.unlink(an_file_path); # try to restart the process return downloadAccessionData(tmp_dir, accession_number, data_format); #exit_code = process.wait(); return os.path.join(tmp_dir, accession_number+data_format); return ""; def downloadAccessionData(accession_number, accession_path, appdata_path, data_format, limit=10): split = accession_number[:6]; srr_path = sra_instant_url+split+"/"+accession_number+"/"+accession_number+".sra"; sra_file_path = os.path.join(appdata_path, accession_number+".sra"); process = Popen(['wget', srr_path, "--output-document="+sra_file_path], stdout=PIPE); (output, err) = process.communicate(); if err: # remove any trace of the output file if os.path.exists(an_file_path): os.unlink(an_file_path); # try to restart the process if limit > 0: return downloadAccessionData(accession_number, accession_path, appdata_path, data_format, limit-1); return -1; if os.path.exists(sra_file_path): converted_file_path = convertSRA(appdata_path, accession_number, data_format); if os.path.exists(converted_file_path): os.rename(converted_file_path, accession_path); os.unlink(sra_file_path); return 0; def process_accessions( options, args ): # create appdata dir if it does not exist appdata_path = options.appdata; if not os.path.exists(appdata_path): os.makedirs(appdata_path); data_format = options.dataformat; ''' # Collection test test_file_name = "Test Collection" + "_" + "SRRtest" + "_" + data_format[1:] + "_" + db_key; test_file_path = os.path.join(appdata_path, test_file_name); file = open(test_file_path, "w"); file.write("Hello World"); file.close(); ''' # read inputs comma_sep_file_paths = options.files; #print("files: "+str(comma_sep_file_paths)+" - "+str(type(comma_sep_file_paths))); # check if options.files contains at least one file path if comma_sep_file_paths is not None: # split file paths file_paths = comma_sep_file_paths.split(","); # split file names comma_sep_file_names = str(options.names); #print("names: "+str(comma_sep_file_names)); file_names = comma_sep_file_names.split(","); # populate a dictionary with the files containing the sequences to query for idx, file_path in enumerate(file_paths): file_name = file_names[idx]; #print(file_name + ": " + file_path); with open(file_path) as accessions: for line in accessions: if line.strip() != "" and not line.startswith(">"): accession_number = line.strip(); filename_with_collection_prefix = file_name + "_" + accession_number + "_" + data_format[1:] + "_" + db_key; accession_path = os.path.join(appdata_path, filename_with_collection_prefix) # download fastq filte related to accession_number downloadAccessionData( accession_number, accession_path, appdata_path, data_format ); return 0; def __main__(): # Parse the command line options usage = "Usage: retrieve.py --files comma_sep_file_paths --names comma_seq_file_names --format data_format --appdata folder_name"; parser = optparse.OptionParser(usage = usage); parser.add_option("-f", "--files", type="string", action="store", dest="files", help="comma separated files path"); parser.add_option("-n", "--names", type="string", action="store", dest="names", help="comma separated names associated to the files specified in --files"); parser.add_option("-e", "--format", type="string", action="store", dest="dataformat", help="data format"); parser.add_option("-a", "--appdata", type="string", action="store", dest="appdata", help="appdata folder name"); (options, args) = parser.parse_args(); return process_accessions( options, args ); if __name__ == "__main__": __main__()