Mercurial > repos > drosofff > msp_sr_bowtie_cascade
comparison sRbowtieCascade.py @ 0:0528fced93a9 draft default tip
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
| author | drosofff |
|---|---|
| date | Wed, 27 May 2015 17:31:35 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0528fced93a9 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # small RNA oriented bowtie wrapper in cascade for small RNA data set genome annotation | |
| 3 # version 0.9 13-6-2014 | |
| 4 # Usage sRbowtie_cascade.py see Parser() for valid arguments | |
| 5 # Christophe Antoniewski <drosofff@gmail.com> | |
| 6 | |
| 7 import sys, os, subprocess, tempfile, shutil, argparse | |
| 8 from collections import defaultdict | |
| 9 | |
def Parser(argv=None):
    """Parse the command-line options of the bowtie cascade wrapper.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` holding ``output``, ``num_threads``,
        ``mismatch``, ``indexing_flags``, ``index``, ``indexName``,
        ``input`` and ``label``. The ``nargs='+'`` options are lists of
        strings; any option that is not supplied is None.
    """
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument('--output', action="store", type=str,
                            help="output file")
    the_parser.add_argument('--num-threads', dest="num_threads",
                            action="store", type=str,
                            help="number of bowtie threads")
    the_parser.add_argument('--mismatch', action="store", type=str,
                            help="number of mismatches allowed")
    the_parser.add_argument('--indexing-flags', dest="indexing_flags",
                            nargs='+',
                            help="whether the index should be generated or "
                                 "not by bowtie-build")
    the_parser.add_argument('--index', nargs='+',
                            help="paths to indexed or fasta references")
    the_parser.add_argument('--indexName', nargs='+',
                            help="Names of the indexes")
    the_parser.add_argument('--input', nargs='+',
                            help="paths to multiple input files")
    the_parser.add_argument('--label', nargs='+',
                            help="labels of multiple input files")
    return the_parser.parse_args(argv)
| 22 | |
def stop_err(msg):
    """Write *msg* to stderr and terminate the program with a failure status.

    Args:
        msg: the error message; a newline is appended before writing.

    Raises:
        SystemExit: always, with exit status 1 so that the calling process
            sees the failure (a bare ``sys.exit()`` would report success).
    """
    sys.stderr.write('%s\n' % msg)
    sys.exit(1)
| 26 | |
def bowtie_squash(fasta):
    """Build a temporary bowtie index for *fasta* and return its base path.

    A fresh temporary directory is created, *fasta* is symlinked into it
    under a unique name, and ``bowtie-build -f`` is run on that symlink with
    the same path as the index base name.

    Args:
        fasta: path to the FASTA reference to index.

    Returns:
        The index base path (inside the temporary directory, without any
        index extension). The directory is NOT removed on success: the
        caller must delete it once the alignments are done.

    On any failure (symlink error, bowtie-build missing, non-zero exit
    status) the temporary directory is removed and ``stop_err`` is called.
    """
    tmp_index_dir = tempfile.mkdtemp()  # holds the symlink and index files
    # Reserve a unique file name in the directory: closing the
    # NamedTemporaryFile deletes the file but the name stays usable.
    ref_file = tempfile.NamedTemporaryFile(dir=tmp_index_dir)
    ref_file_name = ref_file.name
    ref_file.close()
    # Human-readable form of the command, echoed to stdout for the log.
    cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name)
    try:
        # An absolute target keeps the link valid from inside the temp dir
        # even when *fasta* was given as a relative path.
        os.symlink(os.path.abspath(fasta), ref_file_name)
        with open(os.devnull, 'w') as devnull:
            # Argument list (no shell) so unusual characters in the paths
            # cannot break the command; all bowtie-build output is dropped.
            returncode = subprocess.call(
                ['bowtie-build', '-f', ref_file_name, ref_file_name],
                cwd=tmp_index_dir, stderr=devnull, stdout=devnull)
        sys.stdout.write(cmd1 + "\n")
        if returncode != 0:
            raise RuntimeError(
                'bowtie-build exited with status %s' % returncode)
    except Exception as e:
        # clean up temp dir
        if os.path.exists(tmp_index_dir):
            shutil.rmtree(tmp_index_dir)
        stop_err('Error indexing reference sequence\n' + str(e))
    # No cleaning when indexing succeeded: tmp_index_dir has to be removed
    # by the caller after the alignments.
    # ref_file_name is already an absolute path inside tmp_index_dir.
    index_full_path = ref_file_name
    return index_full_path
| 51 | |
def make_working_dir():
    """Create a new temporary directory and return its path."""
    return tempfile.mkdtemp()
| 55 | |
def Clean_TempDir(directory):
    """Recursively delete *directory* if it exists; otherwise do nothing."""
    if not os.path.exists(directory):
        return
    shutil.rmtree(directory)
| 60 | |
def bowtie_alignment(command_line="None", working_dir=""):
    """Run an alignment command and count the reads written to al.fasta.

    Args:
        command_line: shell command to execute (run with ``shell=True``);
            its stdout and stderr are discarded and its exit status is
            ignored.
        working_dir: directory the command is run in and where ``al.fasta``
            is looked for. An empty string means the current directory.

    Returns:
        The number of lines of ``al.fasta`` divided by two, as an integer
        (this assumes two lines per read). 0 when the file does not exist.
    """
    with open(os.devnull, 'w') as devnull:
        # "" is not a valid cwd for subprocess; fall back to the current one.
        subprocess.call(command_line, cwd=working_dir or None, shell=True,
                        stderr=devnull, stdout=devnull)
    sys.stdout.write("%s\n" % command_line)
    aligned = 0  # number of LINES in al.fasta
    try:
        with open(os.path.join(working_dir, "al.fasta"), "r") as handle:
            for _ in handle:
                aligned += 1
    except IOError:
        # No al.fasta may be produced when nothing aligned: report zero.
        pass
    sys.stdout.write("Aligned: %s\n" % aligned)
    # Floor division keeps the count an integer on Python 2 and 3 alike.
    return aligned // 2
| 78 | |
def CommandLiner(v_mis="1", pslots="12", index="dum/my", input="dum/my",
                 working_dir=""):
    """Build the bowtie shell command line for one step of the cascade.

    Args:
        v_mis: value passed to bowtie ``-v``.
        pslots: value passed to bowtie ``-p``.
        index: bowtie index base path.
        input: path of the FASTA file passed after ``-f``.
        working_dir: directory receiving the ``--al`` file (``al.fasta``)
            and the ``--un`` file (``unal.fasta``).

    Returns:
        The command as a single string meant to be run through a shell.
        The four paths are shell-quoted, so paths containing spaces or
        shell metacharacters stay single arguments; ordinary paths are
        emitted unchanged.
    """
    try:
        from shlex import quote  # Python 3
    except ImportError:
        from pipes import quote  # Python 2
    template = ("bowtie -v %s -k 1 --best -p %s --al %s --un %s "
                "--suppress 1,2,3,4,5,6,7,8 %s -f %s")
    return template % (
        v_mis,
        pslots,
        quote("%s/al.fasta" % working_dir),
        quote("%s/unal.fasta" % working_dir),
        quote(index),
        quote(input),
    )
| 81 | |
def _count_fasta_reads(path):
    """Return the line count of *path* divided by two; 0 if it is missing."""
    try:
        with open(path, "r") as handle:
            return sum(1 for _ in handle) // 2
    except IOError:
        return 0


def _discard_file(path):
    """Delete *path*, ignoring the error raised when it does not exist."""
    try:
        os.unlink(path)
    except OSError:
        pass


def _stage_next_input(source, target):
    """Move *source* to *target*; write an empty *target* if it is absent."""
    if os.path.exists(source):
        os.rename(source, target)
    else:
        open(target, "w").close()


def _write_report(output_path, labels, index_names, results):
    """Write the tab-separated table of read counts per reference/sample."""
    with open(output_path, "w") as out:
        out.write("alignment reference\t%s\n" % "\t".join(labels))
        for i, reference in enumerate(index_names):
            out.write("%s" % reference)
            for sample in labels:
                out.write("\t%s" % "{:,}".format(results[sample][i]))
            out.write("\n")
        out.write("Remaining Unmatched")
        for sample in labels:
            out.write("\t%s" % "{:,}".format(results[sample][-1]))
        out.write("\n")


def __main__():
    """Run the bowtie cascade on every sample and write the count table.

    Steps:
      1. Collect the bowtie indexes; references flagged "history" are
         indexed on the fly with bowtie_squash() into temporary directories.
      2. For each (label, input) pair, align the input against the first
         index. The reads that ALIGNED there are the input of the second
         index; from the third index on, each step takes the reads left
         UNALIGNED by the previous step. One count is recorded per step,
         followed by the number of reads still unaligned at the end.
      3. Remove the temporary directories (also when a step fails) and
         write the table to ``--output``.
    """
    args = Parser()
    # (index base path, True when built here and to be deleted at the end)
    indexes = []
    results = defaultdict(list)
    try:
        for flag, reference in zip(args.indexing_flags, args.index):
            if flag == "history":
                indexes.append((bowtie_squash(reference), True))
            else:
                indexes.append((reference, False))
        index_paths = [path for path, _ in indexes]

        # The main cascade, iterating over samples and bowtie indexes.
        for label, reads in zip(args.label, args.input):
            working_dir = make_working_dir()
            try:
                aligned_path = os.path.join(working_dir, "al.fasta")
                unaligned_path = os.path.join(working_dir, "unal.fasta")
                next_input = os.path.join(working_dir, "toAlign.fasta")
                counts = results[label]

                # First step of the cascade, on the sample itself.
                cmd = CommandLiner(v_mis=args.mismatch,
                                   pslots=args.num_threads,
                                   index=index_paths[0], input=reads,
                                   working_dir=working_dir)
                counts.append(bowtie_alignment(command_line=cmd,
                                               working_dir=working_dir))

                for step, index_path in enumerate(index_paths[1:], 1):
                    # The second index receives the reads aligned at the
                    # first step; later indexes receive the reads left
                    # unaligned by the step just before them.
                    if step == 1:
                        source = aligned_path
                    else:
                        source = unaligned_path
                    _stage_next_input(source, next_input)
                    # Remove outputs of the previous step so that a file
                    # bowtie does not rewrite cannot be mistaken for the
                    # output of this step.
                    _discard_file(aligned_path)
                    _discard_file(unaligned_path)
                    cmd = CommandLiner(v_mis=args.mismatch,
                                       pslots=args.num_threads,
                                       index=index_path, input=next_input,
                                       working_dir=working_dir)
                    counts.append(bowtie_alignment(command_line=cmd,
                                                   working_dir=working_dir))

                # To finish, the number of reads that matched nothing.
                counts.append(_count_fasta_reads(unaligned_path))
            finally:
                Clean_TempDir(working_dir)  # sample working directory
    finally:
        # Temporary indexes live in their own directory: drop the file
        # name from the index path to get the directory to delete.
        for index_path, is_temporary in indexes:
            if is_temporary:
                Clean_TempDir(os.path.dirname(index_path))

    _write_report(args.output, args.label, args.indexName, results)
| 140 | |
# Run the cascade only when this file is executed as a script.
if __name__ == "__main__":
    __main__()
