Mercurial > repos > jjohnson > barcode_splitter
changeset 0:bc23f6946bb8 default tip
Alternative barcode splitters that move selected results to the users history.
| author | Jim Johnson <jj@umn.edu> | 
|---|---|
| date | Tue, 19 Jul 2011 13:03:32 -0500 | 
| parents | |
| children | |
| files | fastx_barcode_splitter.xml fastx_barcode_splitter_galaxy_wrapper.py fastx_barcode_splitter_single.xml fastx_barcode_splitter_single_galaxy_wrapper.py | 
| diffstat | 4 files changed, 328 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_barcode_splitter.xml Tue Jul 19 13:03:32 2011 -0500 @@ -0,0 +1,88 @@ +<tool id="cshl_fastx_barcode_splitter" name="Barcode Splitter" force_history_refresh="True"> + <description></description> + <requirements><requirement type="package">fastx_toolkit</requirement></requirements> + <command interpreter="python">fastx_barcode_splitter_galaxy_wrapper.py + ## params for galaxy wrapper + $output + "$output.id" + "$input.ext" + "$__new_file_path__" + --barcodes='$barcodes' + $BARCODE $input "$input.name" "$output.extra_files_path" + ## params for fastx_barcode_splitter + --mismatches $mismatches --partial $partial $EOL + </command> + + <inputs> + <param format="txt" name="BARCODE" type="data" label="Barcodes to use" /> + <param format="fasta,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" /> + + <param name="EOL" type="select" label="Barcodes found at"> + <option value="--bol">Start of sequence (5' end)</option> + <option value="--eol">End of sequence (3' end)</option> + </param> + + <param name="mismatches" type="integer" size="3" value="2" label="Number of allowed mismatches" /> + + <param name="partial" type="integer" size="3" value="0" label="Number of allowed barcodes nucleotide deletions" /> + + <param name="barcodes" type="select" multiple="true" label="Select barcodes to add as new datasets to history"> + <options from_dataset="BARCODE"> + <column name="name" index="0"/> + <column name="value" index="0"/> + <filter type="unique_value" name="unq_bc" column="0" /> + <filter type="add_value" name="unmatched" value="unmatched"/> + </options> + </param> + </inputs> + + <outputs> + <data format="html" name="output" /> + </outputs> + + <tests> + <test> + <!-- Split a FASTQ file --> + <param name="BARCODE" value="fastx_barcode_splitter1.txt" /> + <param name="input" value="fastx_barcode_splitter1.fastq" ftype="fastqsolexa" /> + <param name="EOL" value="Start of sequence (5' 
end)" /> + <param name="mismatches" value="2" /> + <param name="partial" value="0" /> + <output name="output" file="fastx_barcode_splitter1.out" /> + </test> + </tests> + +<help> + +**What it does** + +This tool splits a Solexa library (FASTQ file) or a regular FASTA file into several files, using barcodes as the split criteria. + +-------- + +**Barcode file Format** + +Barcode files are simple text files. +Each line should contain an identifier (descriptive name for the barcode), and the barcode itself (A/C/G/T), separated by a TAB character. +Example:: + + #This line is a comment (starts with a 'number' sign) + BC1 GATCT + BC2 ATCGT + BC3 GTGAT + BC4 TGTCT + +For each barcode, a new FASTQ file will be created (with the barcode's identifier as part of the file name). +Sequences matching the barcode will be stored in the appropriate file. + +One additional FASTQ file will be created (the 'unmatched' file), where sequences not matching any barcode will be stored. + +The output of this tool is an HTML file, displaying the split counts and the file locations. + +**Output Example** + +.. image:: ./static/fastx_icons/barcode_splitter_output_example.png + +</help> +</tool> +<!-- FASTX-barcode-splitter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
# File: fastx_barcode_splitter_galaxy_wrapper.py (added in changeset 0:bc23f6946bb8)
"""Galaxy wrapper for fastx_barcode_splitter.pl.

Runs the splitter on one library, renders the splitter's tab-separated
report as an HTML table, and exposes the files of the selected barcodes
as new history datasets.
"""
import os
import os.path
import re
import shlex
import shutil
import subprocess
import sys
import tempfile

try:
    _shell_quote = shlex.quote  # Python 3.3+
except AttributeError:
    from pipes import quote as _shell_quote  # Python 2


def stop_err(msg):
    """Write *msg* to stderr and terminate with a non-zero exit status."""
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)


# transform fastx_barcode_splitter result to html
def results_to_html(results_path, html_path, basepath, print_stdout):
    """Render the splitter's tab-separated report as an HTML table.

    results_path -- report written by fastx_barcode_splitter.pl (one row per line)
    html_path    -- HTML file to create
    basepath     -- directory prefix stripped from file locations; the
                    remainder of the cell becomes a relative hyperlink
    print_stdout -- when true, echo the first two columns of every row to stdout
    """
    # basepath is a filesystem path, not a regex, so it must be escaped.
    # The capture stops at a newline as well as a tab so that the link
    # target of the last column does not swallow the line terminator.
    pat = re.compile('%s[/]?([^\t\n]*)' % re.escape(basepath))
    rep = r'<a href="\1">\1</a>'
    with open(results_path, 'r') as txt:
        with open(html_path, 'w') as html:
            html.write('<html><body><table border=1>\n')
            try:
                for line in txt:
                    line = line.rstrip('\r\n')
                    cells = pat.sub(rep, line).replace('\t', '</td><td>')
                    html.write('<tr><td>%s</td></tr>\n' % cells)
                    if print_stdout:
                        sys.stdout.write('\t'.join(line.split('\t')[:2]) + '\n')
            except Exception as e:
                # Best effort: a malformed row must not lose the table footer.
                print(str(e))
            html.write('</table></body></html>\n')


def _run_splitter(cmd_line, work_dir):
    """Run *cmd_line* through the shell in *work_dir*; return the path of the captured stdout."""
    result_path = os.path.join(work_dir, 'results.out')
    with open(result_path, 'wb') as result_file:
        proc = subprocess.Popen(args=cmd_line, shell=True, cwd=work_dir,
                                stderr=subprocess.PIPE, stdout=result_file)
        # communicate() drains stderr while waiting; wait() followed by
        # read() can deadlock once the pipe buffer fills up.
        stderr = proc.communicate()[1]
    if proc.returncode != 0:
        if not isinstance(stderr, str):
            stderr = stderr.decode('utf-8', 'replace')
        raise Exception(stderr)
    return result_path


def _link_or_copy(src, dst):
    """Hard-link *src* to *dst*, falling back to a copy when linking fails."""
    try:
        os.link(src, dst)
    except (OSError, AttributeError):
        shutil.copy2(src, dst)


def _publish_selected(select_barcodes, extra_files_path, new_file_path, output_id, file_ext):
    """Expose the split files of every selected barcode as new datasets."""
    # NOTE(review): an empty multi-select presumably arrives as the literal
    # string 'None' from the tool template -- confirm against Galaxy.
    if not select_barcodes or select_barcodes == 'None':
        return
    flist = os.listdir(extra_files_path)
    for barcode in select_barcodes.split(','):
        if not barcode:
            continue
        wanted = '_' + barcode + '.' + file_ext
        for fname in flist:
            # endswith (not find) so that barcode "BC1" cannot pick up a
            # file whose name merely contains "_BC1.<ext>" somewhere.
            if fname.endswith(wanted):
                fpath = os.path.join(extra_files_path, fname)
                # filename pattern required by galaxy
                fn = "%s_%s_%s_%s_%s" % ('primary', output_id, barcode, 'visible', file_ext)
                _link_or_copy(fpath, os.path.join(new_file_path, fn))


def __main__():
    """
    ##params for galaxy wrapper
    $output
    "$output.id"
    "$input.ext"
    "$__new_file_path__"
    --barcodes='$barcodes'
    $BARCODE $input "$input.name" "$output.extra_files_path"
    ## params for fastx_barcode_splitter
    --mismatches $mismatches --partial $partial $EOL
    """
    if len(sys.argv) < 10:
        stop_err('Error: expected at least 9 arguments, got %d' % (len(sys.argv) - 1))
    output = sys.argv[1]
    output_id = sys.argv[2]
    file_ext = sys.argv[3]
    new_file_path = sys.argv[4]
    select_barcodes = sys.argv[5].replace('--barcodes=', '')
    barcodes = sys.argv[6]
    fastx = sys.argv[7]
    fastx_name = sys.argv[8]
    extra_files_path = sys.argv[9]
    # Every pass-through option is quoted individually for the shell.
    script_args = ' '.join([_shell_quote(arg) for arg in sys.argv[10:]])
    # Sanitize library name, make sure we can create a file with this name
    # NOTE(review): '\.\W*$' only strips a trailing dot plus non-word
    # characters; '\.\w*$' (strip the extension) may have been intended.
    lib_name = re.sub(r'\W', '_', re.sub(r'\.\W*$', '', fastx_name)) + '_'
    prefix = os.path.join(extra_files_path, lib_name)
    # Check that input datasets exist
    if not os.path.isfile(fastx):
        stop_err('Error: Input file (%s) not found!' % fastx)
    if not os.path.isfile(barcodes):
        stop_err('Error: barcode file (%s) not found!' % barcodes)
    try:
        # Check that extra_files_path exists
        if not os.path.isdir(extra_files_path):
            os.makedirs(extra_files_path)
        cmd_line = 'zcat -f %s | fastx_barcode_splitter.pl --bcfile %s --prefix %s --suffix %s %s' % (
            _shell_quote(fastx), _shell_quote(barcodes), _shell_quote(prefix),
            _shell_quote('.' + file_ext), script_args)
        # Scratch directory for the report written to stdout; always removed.
        tmp_dir = tempfile.mkdtemp()
        try:
            result_path = _run_splitter(cmd_line, tmp_dir)
            # copy results to output
            results_to_html(result_path, output, extra_files_path, True)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)
        # make new datasets for selected barcodes
        _publish_selected(select_barcodes, extra_files_path, new_file_path, output_id, file_ext)
    except Exception as e:
        raise Exception('Exception caught attempting conversion: ' + str(e))


if __name__ == "__main__":
    __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_barcode_splitter_single.xml Tue Jul 19 13:03:32 2011 -0500 @@ -0,0 +1,63 @@ +<tool id="cshl_fastx_barcode_splitter_single" name="Barcode Splitter (Single)"> + <description></description> + <requirements><requirement type="package">fastx_toolkit</requirement></requirements> + <command interpreter="python">fastx_barcode_splitter_single_galaxy_wrapper.py + $matched_output + $unmatched_output + "$input.ext" + --barcodes='$barcode' + $input "$input.name" + --mismatches $mismatches --partial $partial $EOL + </command> + + <inputs> + <!-- <param format="txt" name="BARCODE" type="data" label="Barcodes to use" /> --> + <param format="fasta,fastqsanger,fastqsolexa,fastqillumina" name="input" type="data" label="Library to split" /> + + <param name="EOL" type="select" label="Barcodes found at"> + <option value="--bol">Start of sequence (5' end)</option> + <option value="--eol">End of sequence (3' end)</option> + </param> + + <param name="mismatches" type="integer" size="3" value="2" label="Number of allowed mismatches" /> + + <param name="partial" type="integer" size="3" value="0" label="Number of allowed barcodes nucleotide deletions" /> + + <param name="barcode" type="text" label="Barcode to extract" /> + + <!-- + <param name="barcodes" type="select" multiple="true" label="Select barcodes to add as new datasets to history"> + <options from_dataset="BARCODE"> + <column name="name" index="0"/> + <column name="value" index="0"/> + <filter type="unique_value" name="unq_bc" column="0" /> + <filter type="add_value" name="unmatched" value="unmatched"/> + </options> + </param> + --> + </inputs> + + <outputs> + <data format_source="input" name="matched_output" label="Barcode Splitter on ${input.name} (Matching sequences)" /> + <data format_source="input" name="unmatched_output" label="Barcode Splitter on ${input.name} (Unmatched sequences)" /> + </outputs> + + <tests> + </tests> + +<help> + +**What it does** + +This tool splits 
a Solexa library (FASTQ file) or a regular FASTA file into two files using a barcode as the split criteria. + +-------- + +A new FASTQ file will be created (with the barcode's identifier as part of the file name). +Sequences matching the barcode will be stored in the appropriate file. + +An additional FASTQ file will be created (the 'unmatched' file), where sequences not matching this barcode will be stored. + +</help> +</tool> +<!-- FASTX-barcode-splitter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
# File: fastx_barcode_splitter_single_galaxy_wrapper.py (added in changeset 0:bc23f6946bb8)
"""Galaxy wrapper that splits a library on a single barcode.

Runs fastx_barcode_splitter.pl with a one-line barcode file and copies the
matching and the unmatched sequences to the two output datasets.
"""
import os
import os.path
import re
import shlex
import shutil
import subprocess
import sys
import tempfile

try:
    _shell_quote = shlex.quote  # Python 3.3+
except AttributeError:
    from pipes import quote as _shell_quote  # Python 2

# Identifier written to the generated barcode file. The splitter names its
# output "<prefix><identifier><suffix>", so the same value is used to find
# the matched file afterwards.
_BARCODE_ID = 'barcode'


def stop_err(msg):
    """Write *msg* to stderr and terminate with a non-zero exit status."""
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)


# transform fastx_barcode_splitter result to html
def results_to_html(results_path, html_path, basepath, print_stdout):
    """Render the splitter's tab-separated report as an HTML table.

    results_path -- report written by fastx_barcode_splitter.pl (one row per line)
    html_path    -- HTML file to create
    basepath     -- directory prefix stripped from file locations; the
                    remainder of the cell becomes a relative hyperlink
    print_stdout -- when true, echo the first two columns of every row to stdout
    """
    # basepath is a filesystem path, not a regex, so it must be escaped.
    # The capture stops at a newline as well as a tab so that the link
    # target of the last column does not swallow the line terminator.
    pat = re.compile('%s[/]?([^\t\n]*)' % re.escape(basepath))
    rep = r'<a href="\1">\1</a>'
    with open(results_path, 'r') as txt:
        with open(html_path, 'w') as html:
            html.write('<html><body><table border=1>\n')
            try:
                for line in txt:
                    line = line.rstrip('\r\n')
                    cells = pat.sub(rep, line).replace('\t', '</td><td>')
                    html.write('<tr><td>%s</td></tr>\n' % cells)
                    if print_stdout:
                        sys.stdout.write('\t'.join(line.split('\t')[:2]) + '\n')
            except Exception as e:
                # Best effort: a malformed row must not lose the table footer.
                print(str(e))
            html.write('</table></body></html>\n')


def _run_splitter(cmd_line, work_dir):
    """Run *cmd_line* through the shell in *work_dir*; return the path of the captured stdout."""
    result_path = os.path.join(work_dir, 'results.out')
    with open(result_path, 'wb') as result_file:
        proc = subprocess.Popen(args=cmd_line, shell=True, cwd=work_dir,
                                stderr=subprocess.PIPE, stdout=result_file)
        # communicate() drains stderr while waiting; wait() followed by
        # read() can deadlock once the pipe buffer fills up.
        stderr = proc.communicate()[1]
    if proc.returncode != 0:
        if not isinstance(stderr, str):
            stderr = stderr.decode('utf-8', 'replace')
        raise Exception(stderr)
    return result_path


def __main__():
    """
    ##params for galaxy wrapper
    $matched_output
    $unmatched_output
    "$input.ext"
    --barcodes='$barcode'
    $input "$input.name"
    ## params for fastx_barcode_splitter
    --mismatches $mismatches --partial $partial $EOL
    """
    if len(sys.argv) < 7:
        stop_err('Error: expected at least 6 arguments, got %d' % (len(sys.argv) - 1))
    output = sys.argv[1]
    output_unmatched = sys.argv[2]
    file_ext = sys.argv[3]
    select_barcode = sys.argv[4].replace('--barcodes=', '').strip()
    if not select_barcode:
        stop_err('Error: no barcode given!')
    fastx = sys.argv[5]
    fastx_name = sys.argv[6]
    # Every pass-through option is quoted individually for the shell.
    script_args = ' '.join([_shell_quote(arg) for arg in sys.argv[7:]])
    # Sanitize library name, make sure we can create a file with this name
    # NOTE(review): '\.\W*$' only strips a trailing dot plus non-word
    # characters; '\.\w*$' (strip the extension) may have been intended.
    lib_name = re.sub(r'\W', '_', re.sub(r'\.\W*$', '', fastx_name)) + '_'
    # Check that input datasets exist
    if not os.path.isfile(fastx):
        stop_err('Error: Input file (%s) not found!' % fastx)
    # One-line barcode file: "<identifier>\t<sequence>"
    barcodes = os.path.abspath("barcodes")
    with open(barcodes, 'w') as f:
        f.write("%s\t%s\n" % (_BARCODE_ID, select_barcode))
    try:
        # The prefix is relative, so the split files land in the scratch
        # directory the command is run from.
        prefix = lib_name
        cmd_line = 'zcat -f %s | fastx_barcode_splitter.pl --bcfile %s --prefix %s --suffix %s %s' % (
            _shell_quote(fastx), _shell_quote(barcodes), _shell_quote(prefix),
            _shell_quote('.' + file_ext), script_args)
        tmp_dir = tempfile.mkdtemp()
        try:
            _run_splitter(cmd_line, tmp_dir)
            matched_suffix = '_' + _BARCODE_ID + '.' + file_ext
            unmatched_suffix = '_unmatched.' + file_ext
            for fname in os.listdir(tmp_dir):
                fpath = os.path.join(tmp_dir, fname)
                if fname.endswith(matched_suffix):
                    shutil.copy2(fpath, output)
                elif fname.endswith(unmatched_suffix):
                    shutil.copy2(fpath, output_unmatched)
        finally:
            # The scratch directory holds full copies of the split library.
            shutil.rmtree(tmp_dir, ignore_errors=True)
    except Exception as e:
        raise Exception('Exception caught attempting conversion: ' + str(e))


if __name__ == "__main__":
    __main__()
