Mercurial > repos > drosofff > repenrich
diff RepEnrich.py @ 0:1435d142041b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit d5ebd581fa3a22ca61ce07a31c01bb70610fbcf5
author | drosofff |
---|---|
date | Tue, 23 May 2017 18:37:22 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RepEnrich.py Tue May 23 18:37:22 2017 -0400 @@ -0,0 +1,382 @@ +#!/usr/bin/env python +import argparse +import csv +import numpy +import os +import shlex +import shutil +import subprocess +import sys + +parser = argparse.ArgumentParser(description='Part II: Conducting the alignments to the psuedogenomes. Before doing this step you will require 1) a bamfile of the unique alignments with index 2) a fastq file of the reads mapping to more than one location. These files can be obtained using the following bowtie options [EXAMPLE: bowtie -S -m 1 --max multimap.fastq mm9 mate1_reads.fastq] Once you have the unique alignment bamfile and the reads mapping to more than one location in a fastq file you can run this step. EXAMPLE: python master_output.py /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1 /users/nneretti/data/annotation/hg19/setup_folder HeLa_InputChIPseq_Rep1_multimap.fastq HeLa_InputChIPseq_Rep1.bam') +parser.add_argument('--version', action='version', version='%(prog)s 0.1') +parser.add_argument('annotation_file', action= 'store', metavar='annotation_file', help='List RepeatMasker.org annotation file for your organism. The file may be downloaded from the RepeatMasker.org website. Example: /data/annotation/hg19/hg19_repeatmasker.txt') +parser.add_argument('outputfolder', action= 'store', metavar='outputfolder', help='List folder to contain results. Example: /outputfolder') +parser.add_argument('outputprefix', action= 'store', metavar='outputprefix', help='Enter prefix name for data. Example: HeLa_InputChIPseq_Rep1') +parser.add_argument('setup_folder', action= 'store', metavar='setup_folder', help='List folder that contains the repeat element psuedogenomes. Example /data/annotation/hg19/setup_folder') +parser.add_argument('fastqfile', action= 'store', metavar='fastqfile', help='Enter file for the fastq reads that map to multiple locations. Example /data/multimap.fastq') +parser.add_argument('alignment_bam', action= 'store', metavar='alignment_bam', help='Enter bamfile output for reads that map uniquely. Example /bamfiles/old.bam') +parser.add_argument('--pairedend', action= 'store', dest='pairedend', default= 'FALSE', help='Designate this option for paired-end sequencing. Default FALSE change to TRUE') +parser.add_argument('--collapserepeat', action= 'store', dest='collapserepeat', metavar='collapserepeat', default= 'Simple_repeat', help='Designate this option to generate a collapsed repeat type. Uncollapsed output is generated in addition to collapsed repeat type. Simple_repeat is default to simplify downstream analysis. You can change the default to another repeat name to collapse a seperate specific repeat instead or if the name of Simple_repeat is different for your organism. Default Simple_repeat') +parser.add_argument('--fastqfile2', action= 'store', dest='fastqfile2', metavar='fastqfile2', default= 'none', help='Enter fastqfile2 when using paired-end option. Default none') +parser.add_argument('--cpus', action= 'store', dest='cpus', metavar='cpus', default= "1", type=int, help='Enter available cpus per node. The more cpus the faster RepEnrich performs. RepEnrich is designed to only work on one node. Default: "1"') +parser.add_argument('--allcountmethod', action= 'store', dest='allcountmethod', metavar='allcountmethod', default= "FALSE", help='By default the pipeline only outputs the fraction count method. Consdidered to be the best way to count multimapped reads. Changing this option will include the unique count method, a conservative count, and the total count method, a liberal counting strategy. Our evaluation of simulated data indicated fraction counting is best. Default = FALSE, change to TRUE') +parser.add_argument('--is_bed', action= 'store', dest='is_bed', metavar='is_bed', default= 'FALSE', help='Is the annotation file a bed file. This is also a compatible format. The file needs to be a tab seperated bed with optional fields. Ex. format chr\tstart\tend\tName_element\tclass\tfamily. The class and family should identical to name_element if not applicable. Default FALSE change to TRUE') +args = parser.parse_args() + +# parameters +annotation_file = args.annotation_file +outputfolder = args.outputfolder +outputfile_prefix = args.outputprefix +setup_folder = args.setup_folder +repeat_bed = setup_folder + os.path.sep + 'repnames.bed' +unique_mapper_bam = args.alignment_bam +fastqfile_1 = args.fastqfile +fastqfile_2 = args.fastqfile2 +cpus = args.cpus +b_opt = "-k1 -p " +str(1) +" --quiet" +simple_repeat = args.collapserepeat +paired_end = args.pairedend +allcountmethod = args.allcountmethod +is_bed = args.is_bed + +################################################################################ +# check that the programs we need are available +try: + subprocess.call(shlex.split("coverageBed -h"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) + subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) +except OSError: + print ("Error: Bowtie or BEDTools not loaded") + raise + +################################################################################ +# define a csv reader that reads space deliminated files +print ('Preparing for analysis using RepEnrich...') +csv.field_size_limit(sys.maxsize) +def import_text(filename, separator): + for line in csv.reader(open(filename), delimiter=separator, + skipinitialspace=True): + if line: + yield line + +################################################################################ +# build dictionaries to convert repclass and rep families' +if is_bed == "FALSE": + repeatclass = {} + repeatfamily = {} + fin = import_text(annotation_file, ' ') + x = 0 + for line in fin: + if x>2: + classfamily =[] + classfamily = line[10].split(os.path.sep) + line9 = line[9].replace("(","_").replace(")","_").replace("/","_") + repeatclass[line9] = classfamily[0] + if len(classfamily) == 2: + repeatfamily[line9] = classfamily[1] + else: + repeatfamily[line9] = classfamily[0] + x +=1 +if is_bed == "TRUE": + repeatclass = {} + repeatfamily = {} + fin = open(annotation_file, 'r') + for line in fin: + line=line.strip('\n') + line=line.split('\t') + theclass =line[4] + thefamily = line[5] + line3 = line[3].replace("(","_").replace(")","_").replace("/","_") + repeatclass[line3] = theclass + repeatfamily[line3] = thefamily +fin.close() + +################################################################################ +# build list of repeats initializing dictionaries for downstream analysis' +fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t') +repeat_key ={} +rev_repeat_key ={} +repeat_list = [] +reptotalcounts = {} +classfractionalcounts = {} +familyfractionalcounts = {} +classtotalcounts = {} +familytotalcounts = {} +reptotalcounts_simple = {} +fractionalcounts = {} +i = 0 +for line in fin: + reptotalcounts[line[0]] = 0 + fractionalcounts[line[0]] = 0 + if line[0] in repeatclass: + classtotalcounts[repeatclass[line[0]]] = 0 + classfractionalcounts[repeatclass[line[0]]] = 0 + if line[0] in repeatfamily: + familytotalcounts[repeatfamily[line[0]]] = 0 + familyfractionalcounts[repeatfamily[line[0]]] = 0 + if line[0] in repeatfamily: + if repeatfamily[line[0]] == simple_repeat: + reptotalcounts_simple[simple_repeat] = 0 + else: + reptotalcounts_simple[line[0]] = 0 + repeat_list.append(line[0]) + repeat_key[line[0]] = int(line[1]) + rev_repeat_key[int(line[1])] = line[0] +fin.close() +################################################################################ +# map the repeats to the psuedogenomes: +if not os.path.exists(outputfolder): + os.mkdir(outputfolder) +################################################################################ +# Conduct the regions sorting +print ('Conducting region sorting on unique mapping reads....') +fileout= outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt' +with open(fileout, 'w') as stdout: + command = shlex.split("coverageBed -abam " +unique_mapper_bam+" -b " +setup_folder + os.path.sep + 'repnames.bed') + p = subprocess.Popen(command, stdout=stdout) +p.communicate() +stdout.close() +filein = open(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt','r') +counts = {} +sumofrepeatreads=0 +for line in filein: + line= line.split('\t') + if not str(repeat_key[line[3]]) in counts: + counts[str(repeat_key[line[3]])]=0 + counts[str(repeat_key[line[3]])]+=int(line[4]) + sumofrepeatreads+=int(line[4]) +print ('Identified ' + str(sumofrepeatreads) + 'unique reads that mapped to repeats.') +################################################################################ +if paired_end == 'TRUE': + if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') + if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie') + folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' + folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie' +################################################################################ + print ("Processing repeat psuedogenomes...") + ps = [] + psb= [] + ticker= 0 + for metagenome in repeat_list: + metagenomepath = setup_folder + os.path.sep + metagenome + file1=folder_pair1 + os.path.sep + metagenome + '.bowtie' + file2 =folder_pair2 + os.path.sep + metagenome + '.bowtie' + with open(file1, 'w') as stdout: + command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_1) + p = subprocess.Popen(command,stdout=stdout) + with open(file2, 'w') as stdout: + command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_2) + pp = subprocess.Popen(command,stdout=stdout) + ps.append(p) + ticker +=1 + psb.append(pp) + ticker +=1 + if ticker == cpus: + for p in ps: + p.communicate() + for p in psb: + p.communicate() + ticker = 0 + psb =[] + ps = [] + if len(ps) > 0: + for p in ps: + p.communicate() + stdout.close() + +################################################################################ +# combine the output from both read pairs: + print ('sorting and combining the output for both read pairs...') + if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') + sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' + for metagenome in repeat_list: + file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' + file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' + fileout= sorted_bowtie + os.path.sep + metagenome + '.bowtie' + with open(fileout, 'w') as stdout: + p1 = subprocess.Popen(['cat',file1,file2], stdout = subprocess.PIPE) + p2 = subprocess.Popen(['cut', '-f1',"-d "], stdin = p1.stdout, stdout = subprocess.PIPE) + p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin = p2.stdout, stdout = subprocess.PIPE) + p4 = subprocess.Popen(['sort'], stdin=p3.stdout, stdout = subprocess.PIPE) + p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout = stdout) + p5.communicate() + stdout.close() + print ('completed ...') +################################################################################ +if paired_end == 'FALSE': + if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') + folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' +################################################################################ + ps = [] + ticker= 0 + print ("Processing repeat psuedogenomes...") + for metagenome in repeat_list: + metagenomepath = setup_folder + os.path.sep + metagenome + file1=folder_pair1 + os.path.sep + metagenome + '.bowtie' + with open(file1, 'w') as stdout: + command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_1) + p = subprocess.Popen(command,stdout=stdout) + ps.append(p) + ticker +=1 + if ticker == cpus: + for p in ps: + p.communicate() + ticker = 0 + ps = [] + if len(ps) > 0: + for p in ps: + p.communicate() + stdout.close() + +################################################################################ +# combine the output from both read pairs: + print ('Sorting and combining the output for both read pairs....') + if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') + sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' + for metagenome in repeat_list: + file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' + fileout= sorted_bowtie + os.path.sep + metagenome + '.bowtie' + with open(fileout, 'w') as stdout: + p1 = subprocess.Popen(['cat',file1], stdout = subprocess.PIPE) + p2 = subprocess.Popen(['cut', '-f1'], stdin = p1.stdout, stdout = subprocess.PIPE) + p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin = p2.stdout, stdout = subprocess.PIPE) + p4 = subprocess.Popen(['sort'], stdin = p3.stdout,stdout = subprocess.PIPE) + p5 = subprocess.Popen(['uniq'], stdin = p4.stdout,stdout = stdout) + p5.communicate() + stdout.close() + print ('completed ...') + +################################################################################ +# build a file of repeat keys for all reads +print ('Writing and processing intermediate files...') +sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' +readid = {} +sumofrepeatreads=0 +for rep in repeat_list: + for data in import_text(sorted_bowtie + os.path.sep + rep + '.bowtie', '\t'): + readid[data[0]] = '' +for rep in repeat_list: + for data in import_text(sorted_bowtie + os.path.sep + rep + '.bowtie', '\t'): + readid[data[0]]+=str(repeat_key[rep]) + str(',') +for subfamilies in readid.values(): + if not subfamilies in counts: + counts[subfamilies]=0 + counts[subfamilies] +=1 + sumofrepeatreads+=1 +del readid +print ('Identified ' + str(sumofrepeatreads) + ' reads that mapped to repeats for unique and multimappers.') + +################################################################################ +print ("Conducting final calculations...") +# build a converter to numeric label for repeat and yield a combined list of repnames seperated by backslash +def convert(x): + x = x.strip(',') + x = x.split(',') + global repname + repname = "" + for i in x: + repname = repname + os.path.sep + rev_repeat_key[int(i)] +# building the total counts for repeat element enrichment... +for x in counts.keys(): + count= counts[x] + x = x.strip(',') + x = x.split(',') + for i in x: + reptotalcounts[rev_repeat_key[int(i)]] += int(count) +# building the fractional counts for repeat element enrichment... +for x in counts.keys(): + count= counts[x] + x = x.strip(',') + x = x.split(',') + splits = len(x) + for i in x: + fractionalcounts[rev_repeat_key[int(i)]] += float(numpy.divide(float(count),float(splits))) +# building categorized table of repeat element enrichment... +repcounts = {} +repcounts['other'] = 0 +for key in counts.keys(): + convert(key) + repcounts[repname] = counts[key] +# building the total counts for class enrichment... +for key in reptotalcounts.keys(): + classtotalcounts[repeatclass[key]] += reptotalcounts[key] +# building total counts for family enrichment... +for key in reptotalcounts.keys(): + familytotalcounts[repeatfamily[key]] += reptotalcounts[key] +# building unique counts table' +repcounts2 = {} +for rep in repeat_list: + if "/" +rep in repcounts: + repcounts2[rep] = repcounts["/" +rep] + else: + repcounts2[rep] = 0 +# building the fractionalcounts counts for class enrichment... +for key in fractionalcounts.keys(): + classfractionalcounts[repeatclass[key]] += fractionalcounts[key] +# building fractional counts for family enrichment... +for key in fractionalcounts.keys(): + familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] + +################################################################################ +print ('Writing final output and removing intermediate files...') +# print output to file of the categorized counts and total overlapping counts: +if allcountmethod == "TRUE": + fout1 = open(outputfolder + os.path.sep + outputfile_prefix + '_total_counts.txt' , 'w') + for key in reptotalcounts.keys(): + fout1.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(reptotalcounts[key]) + '\n') + fout2 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_total_counts.txt' , 'w') + for key in classtotalcounts.keys(): + fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n') + fout3 = open(outputfolder + os.path.sep + outputfile_prefix + '_family_total_counts.txt' , 'w') + for key in familytotalcounts.keys(): + fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n') + fout4 = open(outputfolder + os.path.sep + outputfile_prefix + '_unique_counts.txt' , 'w') + for key in repcounts2.keys(): + fout4.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n') + fout5 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_fraction_counts.txt' , 'w') + for key in classfractionalcounts.keys(): + fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') + fout6 = open(outputfolder + os.path.sep + outputfile_prefix + '_family_fraction_counts.txt' , 'w') + for key in familyfractionalcounts.keys(): + fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') + fout7 = open(outputfolder + os.path.sep + outputfile_prefix + '_fraction_counts.txt' , 'w') + for key in fractionalcounts.keys(): + fout7.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) + '\n') + fout1.close() + fout2.close() + fout3.close() + fout4.close() + fout5.close() + fout6.close() + fout7.close() +else: + fout1 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_fraction_counts.txt' , 'w') + for key in classfractionalcounts.keys(): + fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') + fout2 = open(outputfolder + os.path.sep + outputfile_prefix + '_family_fraction_counts.txt' , 'w') + for key in familyfractionalcounts.keys(): + fout2.write(str(key) + '\t' + str(familyfractionalcounts[key])+ '\n') + fout3 = open(outputfolder + os.path.sep + outputfile_prefix + '_fraction_counts.txt' , 'w') + for key in fractionalcounts.keys(): + fout3.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) + '\n') + fout1.close() + fout2.close() + fout3.close() + +################################################################################ +# Remove Large intermediate files +if os.path.exists(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'): + os.remove(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt') +if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): + shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie') +if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): + shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie') +if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): + shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie') + +print ("... Done")