Mercurial > repos > fabio > srase
comparison retrieve.py @ 0:854be3d51221 draft
Uploaded 20171204
| author | fabio | 
|---|---|
| date | Mon, 04 Dec 2017 16:05:45 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:854be3d51221 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # NCBI SRA Tools | |
| 4 # https://galaxyproject.org/tutorials/upload/ | |
| 5 | |
| 6 import os | |
| 7 import optparse | |
| 8 from subprocess import Popen, PIPE | |
| 9 | |
# Last field of every generated output file name; "?" is presumably
# Galaxy's "unspecified" database key -- TODO confirm.
db_key = "?"

# Base URL of the per-run SRA archives. The download URL is built as
# <base>/<first six characters of the accession>/<accession>/<accession>.sra
# NOTE(review): confirm that NCBI still serves this "sra-instant" FTP tree.
sra_instant_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/"
| 12 | |
def convertSRA(tmp_dir, accession_number, data_format):
    """Convert ``<tmp_dir>/<accession_number>.sra`` with ``fastq-dump``.

    Parameters:
        tmp_dir: directory that holds the ``.sra`` archive; the converted
            file is written to the same directory.
        accession_number: run accession, used as the file base name.
        data_format: one of ``".fasta.gz"``, ``".fastq.gz"``, ``".fasta"``
            or ``".fastq"``.

    Returns:
        ``os.path.join(tmp_dir, accession_number + data_format)`` when
        ``fastq-dump`` exits successfully, or ``""`` when the directory or
        archive is missing, the format is unsupported, or the conversion
        fails.
    """
    # fastq-dump flags for every supported output format.
    format_flags = {
        ".fasta.gz": ["--fasta", "--gzip"],
        ".fastq.gz": ["--gzip"],
        ".fasta": ["--fasta"],
        ".fastq": [],
    }

    absolute_tmp_dir = os.path.abspath(tmp_dir)
    sra_file_path = os.path.join(absolute_tmp_dir, accession_number + ".sra")
    if not (os.path.isdir(absolute_tmp_dir) and os.path.exists(sra_file_path)):
        return ""

    flags = format_flags.get(data_format)
    if flags is None:
        return ""

    converted_file_path = os.path.join(tmp_dir, accession_number + data_format)
    command = ["fastq-dump"] + flags + [sra_file_path, "--outdir", absolute_tmp_dir]
    process = Popen(command, stdout=PIPE)
    process.communicate()

    # Only stdout is piped, so the value communicate() returns for stderr is
    # always None; the exit status is the reliable failure signal.
    if process.returncode != 0:
        # Do not leave a truncated output file behind. Re-downloading is the
        # caller's decision: this function does not know the final
        # destination path that downloadAccessionData requires.
        if os.path.exists(converted_file_path):
            os.unlink(converted_file_path)
        return ""
    return converted_file_path
| 42 | |
def downloadAccessionData(accession_number, accession_path, appdata_path, data_format, limit=10):
    """Download one SRA run with ``wget`` and convert it to ``data_format``.

    The archive is saved as ``<appdata_path>/<accession_number>.sra``,
    converted through ``convertSRA`` and the converted file is moved to
    ``accession_path``. The ``.sra`` archive is deleted afterwards.

    Parameters:
        accession_number: run accession to fetch.
        accession_path: final destination of the converted file.
        appdata_path: working directory for the download and conversion.
        data_format: output format, passed through to ``convertSRA``.
        limit: number of retries after a failed download, so at most
            ``limit + 1`` attempts are made.

    Returns:
        ``0`` when the converted file was moved to ``accession_path``,
        ``-1`` when the download kept failing or no converted file was
        produced.
    """
    srr_path = (sra_instant_url + accession_number[:6] + "/" +
                accession_number + "/" + accession_number + ".sra")
    sra_file_path = os.path.join(appdata_path, accession_number + ".sra")

    retries_left = limit
    while True:
        process = Popen(["wget", srr_path, "--output-document=" + sra_file_path], stdout=PIPE)
        process.communicate()
        # stderr is not piped, so only the exit status can reveal a failure.
        if process.returncode == 0:
            break
        # Drop whatever a failed attempt left behind before retrying.
        if os.path.exists(sra_file_path):
            os.unlink(sra_file_path)
        if retries_left <= 0:
            return -1
        retries_left -= 1

    if not os.path.exists(sra_file_path):
        return -1

    converted_file_path = convertSRA(appdata_path, accession_number, data_format)
    converted = bool(converted_file_path) and os.path.exists(converted_file_path)
    if converted:
        os.rename(converted_file_path, accession_path)
    # The archive is only an intermediate artifact; remove it either way.
    os.unlink(sra_file_path)
    return 0 if converted else -1
| 63 | |
def process_accessions( options, args ):
    """Download every accession listed in the input files.

    Parameters:
        options: parsed command line options; reads ``appdata`` (working
            and output directory, created when missing), ``dataformat``,
            ``files`` (comma separated paths of accession lists) and
            ``names`` (comma separated names, one per entry in ``files``).
        args: positional command line arguments (unused).

    Each non-empty line that does not start with ``>`` is taken as an
    accession number. Its output is stored in the appdata directory as
    ``<name>_<accession>_<format without leading dot>_<db_key>``.

    Returns:
        ``0`` once all input files have been processed.

    Raises:
        ValueError: when ``names`` has fewer entries than ``files``.
    """
    # create appdata dir if it does not exist
    appdata_path = options.appdata
    if not os.path.exists(appdata_path):
        os.makedirs(appdata_path)
    data_format = options.dataformat

    comma_sep_file_paths = options.files
    if comma_sep_file_paths is None:
        # nothing to download
        return 0

    file_paths = comma_sep_file_paths.split(",")
    file_names = str(options.names).split(",")
    # Fail before any download starts instead of hitting an IndexError
    # halfway through the list of files.
    if len(file_names) < len(file_paths):
        raise ValueError(
            "--names provides %d name(s) for %d file(s) given with --files"
            % (len(file_names), len(file_paths)))

    for file_name, file_path in zip(file_names, file_paths):
        with open(file_path) as accessions:
            for line in accessions:
                accession_number = line.strip()
                # skip blank lines and ">" header lines (also when indented)
                if not accession_number or accession_number.startswith(">"):
                    continue
                filename_with_collection_prefix = "_".join(
                    [file_name, accession_number, data_format[1:], db_key])
                accession_path = os.path.join(appdata_path, filename_with_collection_prefix)
                # download the sequence file related to accession_number
                downloadAccessionData( accession_number, accession_path, appdata_path, data_format )
    return 0
| 102 | |
def __main__():
    """Parse the command line and run the accession download.

    With ``--version`` the module's ``__version__`` is printed, or
    ``unknown`` when the module does not define one. Otherwise
    ``process_accessions`` is called and its result returned.
    ``--appdata`` is required for a download run, and ``--format`` is
    required whenever ``--files`` is given; a missing one ends the program
    with a usage error.
    """
    # Parse the command line options
    usage = "Usage: retrieve.py --files comma_sep_file_paths --names comma_seq_file_names --format data_format --appdata folder_name"
    parser = optparse.OptionParser(usage = usage)
    parser.add_option("-f", "--files", type="string",
                      action="store", dest="files", help="comma separated files path")
    parser.add_option("-n", "--names", type="string",
                      action="store", dest="names", help="comma separated names associated to the files specified in --files")
    parser.add_option("-e", "--format", type="string",
                      action="store", dest="dataformat", help="data format")
    parser.add_option("-a", "--appdata", type="string",
                      action="store", dest="appdata", help="appdata folder name")
    parser.add_option("-v", "--version", action="store_true", dest="version",
                      default=False, help="display version and exit")
    (options, args) = parser.parse_args()

    if options.version:
        # Function-call form works on Python 2 and 3; the lookup avoids a
        # NameError when the module does not define __version__.
        print(globals().get("__version__", "unknown"))
        return None

    # Report missing options as usage errors instead of failing later with
    # a TypeError on a None value.
    if options.appdata is None:
        parser.error("--appdata is required")
    if options.files is not None and options.dataformat is None:
        parser.error("--format is required when --files is given")
    return process_accessions( options, args )


if __name__ == "__main__":
    __main__()
