diff mergeJobs.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mergeJobs.py	Tue May 15 02:39:53 2018 -0400
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+import os
+import argparse
+
+def getListOfFiles(inDir,pattern):
+	return [inDir+n for n in os.listdir(inDir) if (pattern in n and os.path.getsize(inDir+n))]
+
+TEMP_IND = 0
+def stripVCF_header(fn):
+	global TEMP_IND
+	f = open(fn,'r')
+	ftn = fn+'_temp'+str(TEMP_IND)
+	f_t = open(ftn,'w')
+	hasHeader = False
+	for line in f:
+		if line[0] == '#':
+			if not hasHeader:
+				TEMP_IND += 1
+			hasHeader = True
+		elif hasHeader:
+			f_t.write(line)
+		else:
+			break
+	f_t.close()
+	f.close()
+	if hasHeader:
+		return ftn
+	else:
+		os.system('rm '+ftn)
+		return fn
+
+def catListOfFiles(l,outName,gzipped=False):
+	for n in l:
+		if n[-3:] == '.gz' or n[-5:] == '.gzip':
+			gzipped = True
+	if gzipped:
+		for n in l:
+			if not n[-3:] == '.gz' and not n[-5:] == '.gzip':
+				print '\nError: Found a mixture of compressed and decompressed files with the specified prefix. Abandoning ship...\n'
+				for m in l:
+					print m
+				print ''
+				exit(1)
+		cmd = 'cat '+' '.join(sorted(l))+' > '+outName+'.gz'
+	else:
+		cmd = 'cat '+' '.join(sorted(l))+' > '+outName
+	print cmd
+	os.system(cmd)
+
+def catBams(l,outName,samtools_exe):
+	l_sort = sorted(l)
+	tmp = outName+'.tempHeader.sam'
+	os.system(samtools_exe+' view -H '+l_sort[0]+' > '+tmp)
+	cmd = samtools_exe+' cat -h '+tmp+' '+' '.join(l_sort)+' > '+outName
+	print cmd
+	os.system(cmd)
+	os.system('rm '+tmp)
+
+
+#####################################
+#				main()				#
+#####################################
+
+def main():
+
+	parser = argparse.ArgumentParser(description='mergeJobs.py')
+	parser.add_argument('-i', type=str, required=True, metavar='<str>', nargs='+', help="* input prefix: [prefix_1] [prefix_2] ...")
+	parser.add_argument('-o', type=str, required=True, metavar='<str>',            help="* output prefix")
+	parser.add_argument('-s', type=str, required=True, metavar='<str>',            help="* /path/to/samtools")
+
+	args = parser.parse_args()
+	(INP,OUP,SAMTOOLS) = (args.i,args.o,args.s)
+
+	inDir = '/'.join(INP[0].split('/')[:-1])+'/'
+	if inDir == '/':
+		inDir = './'
+	#print inDir
+
+	INP_LIST = []
+	for n in INP:
+		if n[-1] == '/':
+			n = n[:-1]
+		INP_LIST.append(n.split('/')[-1])
+	listing_r1 = []
+	listing_r2 = []
+	listing_b  = []
+	listing_v  = []
+	for n in INP_LIST:
+		listing_r1 += getListOfFiles(inDir,n+'_read1.fq.job')
+		listing_r2 += getListOfFiles(inDir,n+'_read2.fq.job')
+		listing_b  += getListOfFiles(inDir,n+'_golden.bam.job')
+		if len(listing_v):	# remove headers from vcf files that aren't the first being processed
+			initList   = getListOfFiles(inDir,n+'_golden.vcf.job')
+			listing_v += [stripVCF_header(n) for n in initList]
+		else:
+			listing_v  += getListOfFiles(inDir,n+'_golden.vcf.job')
+	
+	#
+	#	merge fq files
+	#
+	if len(listing_r1):
+		catListOfFiles(listing_r1,OUP+'_read1.fq')
+	if len(listing_r2):
+		catListOfFiles(listing_r2,OUP+'_read2.fq')
+
+	#
+	#	merge golden alignments, if present
+	#
+	if len(listing_b):
+		catBams(listing_b,OUP+'_golden.bam',SAMTOOLS)
+
+	#
+	#	merge golden vcfs, if present
+	#
+	if len(listing_v):
+		catListOfFiles(listing_v,OUP+'_golden.vcf')
+
+
+if __name__ == "__main__":
+	main()
+