Mercurial > repos > mingchen0919 > rmarkdown_bdss_client
diff bdss_client_sra.Rmd @ 0:512d008295db draft default tip
planemo upload for repository https://github.com/statonlab/docker-GRReport/tree/master/my_tools/rmarkdown_bdss_client_main commit d9ab791a7ce12362dc6e28c0a518a3f23dd581fe-dirty
author | mingchen0919 |
---|---|
date | Tue, 17 Oct 2017 14:09:01 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bdss_client_sra.Rmd Tue Oct 17 14:09:01 2017 -0400 @@ -0,0 +1,105 @@ +--- +title: 'Download and extract single end fastq/fasta data with BDSS client from SRA accessions' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango +--- + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set( + echo = ECHO, + error=TRUE +) +``` + +# Command line arguments + +```{r 'command line arguments'} +str(opt) +``` + +# BDSS configuration file + +First, we create a bdss configuration file `bdss.cfg` in the current directory. + +```{r} +system('echo "[metadata_repository]" > bdss.cfg') +system('echo url=http://bdss.bioinfo.wsu.edu/ >> bdss.cfg') +``` + +# Download and extract reads + +```{r 'download and extract reads'} +# create two directories, one for single end and the other for paired end SRA reads. +dir.create('se_read_files_directory') +dir.create('pe_read_files_directory') +# download and extract reads (single end) +sra_ids_se = strsplit(gsub(',', ' ', 'SRA_IDS_SE'), ' ')[[1]] +sra_ids_se = sra_ids_se[sra_ids_se != ''] +# loop through SRA accessions to download and extract reads. +for(id in sra_ids_se) { + # build URL from SRA id + url = paste0('ftp://ftp.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/', + substr(id, 1, 3), '/', + substr(id, 1, 6), '/', id, '/', id, '.sra') + # download sra file with bdss + bdss_command = paste0('/tool_deps/_conda/bin/bdss transfer -u ', url) + system(bdss_command, intern = TRUE) + # convert .sra to .fastq/.fasta + if('FORMAT' == 'fasta') { + command = paste0('fastq-dump --fasta -O se_read_files_directory ', id, '.sra') + } else { + command = paste0('fastq-dump -O se_read_files_directory ', id, '.sra') + } + cat('----convert SRA to fastq/fasta------\n') + print(system(command, intern = TRUE)) +} + +# download and extract reads (paired end) +sra_ids_pe = strsplit(gsub(',', ' ', 'SRA_IDS_PE'), ' ')[[1]] +sra_ids_pe = sra_ids_pe[sra_ids_pe != ''] +# loop through SRA accessions to download and extract reads. +for(id in sra_ids_pe) { + # build URL from SRA id + url = paste0('ftp://ftp.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/', + substr(id, 1, 3), '/', + substr(id, 1, 6), '/', id, '/', id, '.sra') + # download sra file with bdss + bdss_command = paste0('/tool_deps/_conda/bin/bdss transfer -u ', url) + system(bdss_command, intern = TRUE) + # convert .sra to .fastq/.fasta + if('FORMAT' == 'fasta') { + command = paste0('fastq-dump --fasta --split-files -O pe_read_files_directory ', id, '.sra') + } else { + command = paste0('fastq-dump --split-files -O pe_read_files_directory ', id, '.sra') + } + cat('----convert SRA to fastq/fasta------\n') + command_stdout = system(command, intern = TRUE) + print(command_stdout) + if(!(paste0(id, '_2.FORMAT') %in% list.files('pe_read_files_directory'))) { + # this is not a paired end SRA file. The corresponding file will be deleted. + cat(paste0(id, ' is not paired end SRA, the corresponding fastq/fasta file will deleted.')) + system(paste0('rm pe_read_files_directory/', id, '_1.*'), intern = TRUE) + } + +} + +cat('-----single end files----\n') +list.files('./se_read_files_directory') +cat('-----paired end files----\n') +list.files('./pe_read_files_directory') + +cat('-----Renaming files------\n') +# rename files for paired end reads +old_files = paste0('./pe_read_files_directory/', list.files('./pe_read_files_directory')) +print(old_files) +new_files = gsub('_1', '_forward', old_files) +new_files = gsub('_2', '_reverse', new_files) +print(new_files) +file.rename(old_files, new_files) +``` + +