annotate piq_wrapper.py @ 5:f77a3f2876c3 draft

Uploaded
author sjung
date Wed, 24 May 2017 00:36:16 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
1 #!/usr/bin/python
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
2
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
3 import optparse, os, re, shutil, sys, tempfile, glob, shlex, vcf, pysam, tarfile, csv, operator
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
4 from subprocess import *
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
5 import subprocess
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
6 import multiprocessing
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
7 from joblib import Parallel, delayed
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
8 from itertools import izip
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
9
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
10 NCPU = multiprocessing.cpu_count()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
11 CHUNK_SIZE = 2**20 #1mb
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
12
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
13 def create_dict(files):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
14 tmp=dict()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
15 t1 = [os.path.basename(i) for i in files]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
16 t2 = ['-'.join(i.split('-')[0:2]) for i in t1 if 'RC' not in i]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
17 for i in t2:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
18 for j in files:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
19 if i in j and i in tmp:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
20 tmp[i].append(j)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
21 elif i in j and i not in tmp:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
22 tmp[i] = [j]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
23 else:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
24 continue
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
25 return tmp
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
26
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
27 def file_merge(file1, file2, out_dir):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
28 listF = (file1, file2)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
29 outF=os.path.splitext(os.path.basename(file1))[0]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
30 outExt = os.path.splitext(os.path.basename(file1))[1]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
31 with open("%s/%s_merged%s" % (out_dir,outF,outExt),"wb") as fw:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
32 for i in range(len(listF)):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
33 with open(listF[i],"rb") as f:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
34 writer = csv.writer(fw, delimiter=',', quotechar='', quoting=csv.QUOTE_NONE, escapechar='\\')
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
35 for i in range(len(listF)):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
36 with open(listF[i],"rb") as f:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
37 file_f=csv.reader(f, delimiter='\t')
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
38 file_f.next()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
39 for line in file_f:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
40 writer.writerow(line)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
41
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
42
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
43 def cleanup_before_exit( tmp_dir ):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
44 if tmp_dir and os.path.exists( tmp_dir ):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
45 shutil.rmtree( tmp_dir )
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
46
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
47 def parallel_jobs(i, piq_dir_path, tmp_dir, inputF, out_dir, output_dir):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
48 cmd=[]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
49 motif=i
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
50 piq_cmd1 = "Rscript %s/pwmmatch.exact.r %s/common.r %s/jasparVertebrateCore2016-519-matrices.txt %d %s/; " % (piq_dir_path, piq_dir_path, piq_dir_path, motif, tmp_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
51 #piq_cmd1 = "Rscript %s/pwmmatch.exact.r %s/common.r %s/jasparfix.txt %d %s/; " % (piq_dir_path, piq_dir_path, piq_dir_path, motif, tmp_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
52 piq_cmd2 = "Rscript %s/bam2rdata.r %s/common.r %s/bam.RData %s; " % (piq_dir_path, piq_dir_path, tmp_dir, inputF)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
53 piq_cmd3 = "Rscript %s/pertf.r %s/common.r %s/ %s %s/ %s/bam.RData %d; " % (piq_dir_path, piq_dir_path, tmp_dir, tmp_dir, out_dir, tmp_dir, motif)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
54 piq_cmd = piq_cmd1+piq_cmd2+piq_cmd3
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
55 cmd.append(piq_cmd)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
56 cmd="".join(cmd)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
57 #print "cmd:%s" % cmd
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
58 stdout = tempfile.NamedTemporaryFile( prefix="piq-stdout-", dir=tmp_dir )
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
59 stderr = tempfile.NamedTemporaryFile( prefix="piq-stderr-", dir=tmp_dir )
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
60 proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=output_dir )
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
61 return_code = proc.wait()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
62
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
63 if return_code:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
64 stderr_target = sys.stderr
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
65 else:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
66 stderr_target = sys.stdout
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
67 stderr.flush()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
68 stderr.seek(0)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
69 while True:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
70 chunk = stderr.read( CHUNK_SIZE )
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
71 if chunk:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
72 stderr_target.write( chunk )
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
73 else:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
74 break
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
75 stderr.close()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
76 stdout.close()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
77
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
78 def __main__():
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
79 piq_dir_path="/mnt/galaxyTools/tools/piq/1.3/thashim"
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
80 parser = optparse.OptionParser()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
81 parser.add_option('', '--input', dest="inputF", action='store', type="string", default=None, help='')
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
82 parser.add_option( '', '--output', dest='outputF', action='store', type="string", default=None, help='')
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
83 parser.add_option( '', '--out-dir', dest='output_dir', action='store', type="string", default=None, help='If specified, the output directory for extra files.' )
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
84
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
85 (options, args) = parser.parse_args()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
86
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
87 if not os.path.exists(options.output_dir):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
88 os.mkdir(options.output_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
89 out_dir = "%s/out" % options.output_dir;
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
90 if not os.path.exists(out_dir):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
91 os.mkdir(out_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
92 tmp_dir = "%s/tmp" % options.output_dir;
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
93 if not os.path.exists(tmp_dir):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
94 os.mkdir(tmp_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
95
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
96 maxMotif=519
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
97 Parallel(n_jobs=NCPU)(delayed(parallel_jobs)(i, piq_dir_path, tmp_dir, options.inputF, out_dir, options.output_dir) for i in range(1,maxMotif+1))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
98
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
99 #combine files
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
100 csvFiles =sorted(glob.glob("%s/*calls.all.csv" % out_dir))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
101 csvList=create_dict(csvFiles)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
102
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
103 for v in csvList.values():
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
104 if len(v) == 2:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
105 file_merge(v[0],v[1],out_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
106 else:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
107 outF=os.path.splitext(os.path.basename(v[0]))[0]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
108 outExt=os.path.splitext(os.path.basename(v[0]))[1]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
109 shutil.copy(v[0],"%s/%s_merged%s" % (out_dir, outF, outExt))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
110
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
111 bedFiles=sorted(glob.glob("%s/*calls.all.bed" % out_dir))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
112 bedList=create_dict(bedFiles)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
113
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
114 for v in bedList.values():
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
115 if len(v) == 2:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
116 file_merge(v[0],v[1],out_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
117 else:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
118 outF=os.path.splitext(os.path.basename(v[0]))[0]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
119 outExt=os.path.splitext(os.path.basename(v[0]))[1]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
120 shutil.copy(v[0],"%s/%s_merged%s" % (out_dir, outF, outExt))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
121
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
122 m_csvFiles=sorted(glob.glob("%s/*_merged.csv" % out_dir))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
123 m_bedFiles=sorted(glob.glob("%s/*_merged.bed" % out_dir))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
124
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
125 for i in range(len(m_csvFiles)):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
126 outF=os.path.splitext(os.path.basename(m_csvFiles[i]))[0]
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
127 #print "\noutF: %s\n" % outF
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
128 with open('%s/%s_all_tmp' % (out_dir,outF), 'wb') as res, open(m_bedFiles[i]) as f1, open(m_csvFiles[i]) as f2:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
129 for line1, line2 in zip(f1, f2):
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
130 res.write("{},{}\n".format(line1.rstrip(), line2.rstrip()))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
131 with open('%s/%s_all_tmp' % (out_dir,outF), "rb") as f3:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
132 rdr=csv.reader(f3)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
133
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
134 with open('%s/%s_final.bed' % (out_dir,outF),'wb') as result:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
135 wtr = csv.writer(result, delimiter='\t', quotechar='', quoting=csv.QUOTE_NONE)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
136 #for r in sorted_list:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
137 for r in rdr:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
138 if len(r) > 12:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
139 wtr.writerow((r[0],r[1],r[2],r[3],r[5],r[9].replace('\\',""),r[10].replace('\\',""),r[11].replace('\\',""),r[12]))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
140 filenames=sorted(glob.glob("%s/*_final.bed" % out_dir))
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
141 #shutil.copy('%s/*_final.bed' % out_dir, options.output_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
142 with open('%s/all_motif_comb_final.bed' % options.output_dir, 'w') as outfile:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
143 for fname in filenames:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
144 shutil.copy('%s' % fname, options.output_dir)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
145 with open(fname) as infile:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
146 for line in infile:
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
147 outfile.write(line)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
148
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
149 shutil.copy('%s/all_motif_comb_final.bed' % options.output_dir, options.outputF)
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
150
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
151 # concatenerate calls.all and RC.calls.all for csv and bed files
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
152 cleanup_before_exit( options.output_dir )
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
153
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
154 if __name__=="__main__": __main__()
f77a3f2876c3 Uploaded
sjung
parents:
diff changeset
155