Mercurial > repos > petr-novak > re_utils
annotate RM_custom_search.py @ 33:f1738f8649b0 draft
planemo upload commit 39094a128ea3dd2c39f4997c6de739c33c07e5f3-dirty
| author | petr-novak | 
|---|---|
| date | Fri, 04 Aug 2023 08:09:40 +0000 | 
| parents | 628b235d76c7 | 
| children | 
| rev | line source | 
|---|---|
| 14 | 1 #!/usr/bin/env python | 
| 0 | 2 ''' RepeatMasker search against custom database | 
| 3 input: | |
| 4 - archive with sequencing data | |
| 5 - custom repeat database | |
| 6 ''' | |
| 7 import zipfile | |
| 8 import shutil | |
| 9 import os | |
| 10 import subprocess | |
| 11 from parallel import parallel2 | |
| 12 import glob | |
| 13 | |
| 14 print(os.environ) | |
| 15 def extract_sequences(f): | |
| 16 # check archive: | |
| 17 try: | |
| 18 z=zipfile.ZipFile(f) | |
| 19 # extract only dirCLXXXX/reads.fas | |
| 20 seq_list = [] | |
| 21 for filein in z.namelist(): | |
| 11 | 22 c1 = filein.lower().startswith("seqclust/clustering/clusters/dir_cl") | 
| 23 c2 = filein.endswith("reads.fas") | |
| 24 c3 = filein.endswith("reads.fasta") # in newer RE2 versions | |
| 25 if c1 and (c2 or c3): | |
| 0 | 26 outdir = filein.split("/")[3] | 
| 27 outfile = outdir +"/reads.fas" | |
| 28 source = z.open(filein) | |
| 29 os.mkdir(outdir) | |
| 30 target = open(outfile, "wb") | |
| 31 shutil.copyfileobj(source, target) | |
| 32 seq_list.append(outfile) | |
| 33 if len(seq_list) == 0: | |
| 34 raise ValueError() | |
| 35 | |
| 36 except zipfile.BadZipfile as e: | |
| 37 print("unable to extract sequences from archive!") | |
| 38 raise e | |
| 39 | |
| 40 except IOError as e: | |
| 41 print("unable to extract sequences from archive!") | |
| 42 raise e | |
| 43 | |
| 44 except ValueError as e: | |
| 45 print("No sequences found in archive!") | |
| 46 raise e | |
| 47 | |
| 48 seq_list.sort() | |
| 49 return seq_list | |
| 50 | |
| 51 | |
| 52 def RepeatMasker(reads,database): | |
| 53 args = ["RepeatMasker", reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)] | |
| 54 print(args) | |
| 55 status=subprocess.call(args , stderr = open(os.devnull, 'wb')) | |
| 56 return status | |
| 57 | |
| 58 def summarizeRepeatMaskerOutput(htmlout = "summary.html"): | |
| 59 cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r" | |
| 32 
628b235d76c7
planemo upload commit 39094a128ea3dd2c39f4997c6de739c33c07e5f3-dirty
 petr-novak parents: 
14diff
changeset | 60 args = [ "Rscript", cmd, "dir_CL*/reads.fas", "dir_CL*/reads.fas.out", | 
| 
628b235d76c7
planemo upload commit 39094a128ea3dd2c39f4997c6de739c33c07e5f3-dirty
 petr-novak parents: 
14diff
changeset | 61 "RM-custom_output_table" ] | 
| 0 | 62 status=subprocess.call(args) | 
| 32 
628b235d76c7
planemo upload commit 39094a128ea3dd2c39f4997c6de739c33c07e5f3-dirty
 petr-novak parents: 
14diff
changeset | 63 cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R" | 
| 
628b235d76c7
planemo upload commit 39094a128ea3dd2c39f4997c6de739c33c07e5f3-dirty
 petr-novak parents: 
14diff
changeset | 64 args = ["Rscript", cmd, htmlout] | 
| 0 | 65 status=subprocess.call(args) | 
| 66 return status | |
| 67 | |
| 68 | |
| 69 def main(): | |
| 70 from optparse import OptionParser | |
| 71 | |
| 72 parser = OptionParser() | |
| 73 parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive") | |
| 74 parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database") | |
| 75 parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory") | |
| 76 parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html') | |
| 77 | |
| 78 options, args = parser.parse_args() | |
| 79 config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config" | |
| 80 | |
| 81 | |
| 82 seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING | |
| 83 parallel2(RepeatMasker, seq_files, [options.database]) | |
| 84 | |
| 85 status = summarizeRepeatMaskerOutput(options.report) | |
| 86 | |
| 87 | |
| 88 | |
| 89 if __name__== "__main__": | |
| 90 main() | |
| 91 | |
| 92 | 
