Mercurial > repos > artbio > repenrich
diff RepEnrich.py @ 0:f6f0f1e5e940 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 61e203df0be5ed877ff92b917c7cde6eeeab8310
author | artbio |
---|---|
date | Wed, 02 Aug 2017 05:17:29 -0400 |
parents | |
children | 15e3e29f310e |
line wrap: on
line diff
#!/usr/bin/env python
"""RepEnrich part II: map multi-mapping reads to repeat pseudogenomes.

This top-of-file section defines the command-line interface, unpacks the
parsed arguments into the module-level names the rest of the pipeline
reads, and verifies that the external tools (bowtie, BEDTools) can be
launched before any real work starts.
"""

import argparse
import csv
import os
import shlex

import shutil
import subprocess
import sys

import numpy


parser = argparse.ArgumentParser(description='''
    Part II: Conducting the alignments to the psuedogenomes. Before\
    doing this step you will require 1) a bamfile of the unique\
    alignments with index 2) a fastq file of the reads mapping to\
    more than one location. These files can be obtained using the\
    following bowtie options [EXAMPLE: bowtie -S -m 1\
    --max multimap.fastq mm9 mate1_reads.fastq] Once you have the\
    unique alignment bamfile and the reads mapping to more than one\
    location in a fastq file you can run this step. EXAMPLE: python\
    master_output.py\
    /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\
    /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/
    HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\
    /users/nneretti/data/annotation/hg19/setup_folder\
    HeLa_InputChIPseq_Rep1_multimap.fastq\
    HeLa_InputChIPseq_Rep1.bam''')
parser.add_argument('--version', action='version', version='%(prog)s 0.1')
parser.add_argument('annotation_file', action='store',
                    metavar='annotation_file',
                    help='List RepeatMasker.org annotation file for your\
                    organism. The file may be downloaded from the\
                    RepeatMasker.org website. Example:\
                    /data/annotation/hg19/hg19_repeatmasker.txt')
parser.add_argument('outputfolder', action='store', metavar='outputfolder',
                    help='List folder to contain results.\
                    Example: /outputfolder')
parser.add_argument('outputprefix', action='store', metavar='outputprefix',
                    help='Enter prefix name for data.\
                    Example: HeLa_InputChIPseq_Rep1')
parser.add_argument('setup_folder', action='store', metavar='setup_folder',
                    help='List folder that contains the repeat element\
                    pseudogenomes.\
                    Example: /data/annotation/hg19/setup_folder')
parser.add_argument('fastqfile', action='store', metavar='fastqfile',
                    help='Enter file for the fastq reads that map to multiple\
                    locations. Example: /data/multimap.fastq')
parser.add_argument('alignment_bam', action='store', metavar='alignment_bam',
                    help='Enter bamfile output for reads that map uniquely.\
                    Example /bamfiles/old.bam')
parser.add_argument('--pairedend', action='store', dest='pairedend',
                    default='FALSE',
                    help='Designate this option for paired-end sequencing.\
                    Default FALSE change to TRUE')
parser.add_argument('--collapserepeat', action='store', dest='collapserepeat',
                    metavar='collapserepeat', default='Simple_repeat',
                    help='Designate this option to generate a collapsed repeat\
                    type. Uncollapsed output is generated in addition to\
                    collapsed repeat type. Simple_repeat is default to\
                    simplify downstream analysis. You can change the\
                    default to another repeat name to collapse a\
                    seperate specific repeat instead or if the name of\
                    Simple_repeat is different for your organism.\
                    Default Simple_repeat')
parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
                    metavar='fastqfile2', default='none',
                    help='Enter fastqfile2 when using paired-end option.\
                    Default none')
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
                    default="1", type=int,
                    help='Enter available cpus per node. The more cpus the\
                    faster RepEnrich performs. RepEnrich is designed to\
                    only work on one node. Default: "1"')
parser.add_argument('--allcountmethod', action='store', dest='allcountmethod',
                    metavar='allcountmethod', default="FALSE",
                    help='By default the pipeline only outputs the fraction\
                    count method. Consdidered to be the best way to\
                    count multimapped reads. Changing this option will\
                    include the unique count method, a conservative\
                    count, and the total count method, a liberal\
                    counting strategy. Our evaluation of simulated data\
                    indicated fraction counting is best.\
                    Default = FALSE, change to TRUE')
parser.add_argument('--is_bed', action='store', dest='is_bed',
                    metavar='is_bed', default='FALSE',
                    help='Is the annotation file a bed file.\
                    This is also a compatible format. The file needs to\
                    be a tab seperated bed with optional fields.\
                    Ex. format: chr\tstart\tend\tName_element\tclass\
                    \tfamily. The class and family should identical to\
                    name_element if not applicable.\
                    Default FALSE change to TRUE')
args = parser.parse_args()

# parameters — unpacked into module-level names used by the rest of the script
annotation_file = args.annotation_file
outputfolder = args.outputfolder
outputfile_prefix = args.outputprefix
setup_folder = args.setup_folder
repeat_bed = setup_folder + os.path.sep + 'repnames.bed'
unique_mapper_bam = args.alignment_bam
fastqfile_1 = args.fastqfile
fastqfile_2 = args.fastqfile2
cpus = args.cpus
# bowtie options: report one alignment (-k1), one thread per process (-p 1);
# parallelism comes from launching `cpus` bowtie processes at once below.
b_opt = "-k1 -p " + str(1) + " --quiet"
simple_repeat = args.collapserepeat
paired_end = args.pairedend
allcountmethod = args.allcountmethod
is_bed = args.is_bed

##############################################################################
# check that the programs we need are available
try:
    # Open os.devnull once and close it deterministically; the previous code
    # opened four separate devnull handles inline and never closed them
    # (a file-descriptor leak).
    with open(os.devnull, 'wb') as devnull:
        subprocess.call(shlex.split("coverageBed -h"),
                        stdout=devnull,
                        stderr=devnull)
        subprocess.call(shlex.split("bowtie --version"),
                        stdout=devnull,
                        stderr=devnull)
except OSError:
    print("Error: Bowtie or BEDTools not loaded")
    raise
##############################################################################
# define a csv reader that reads space delimited files
print('Preparing for analysis using RepEnrich...')
# raise csv's default field-size cap so very long annotation fields still parse
csv.field_size_limit(sys.maxsize)


def import_text(filename, separator):
    """Yield each non-empty row of *filename* split on *separator*.

    skipinitialspace=True makes runs of the separator collapse, which is
    what the space-delimited RepeatMasker .txt format needs.
    NOTE(review): the file handle opened here is never explicitly closed;
    it is only released when the generator is garbage-collected/closed.
    """
    for line in csv.reader(open(filename), delimiter=separator,
                           skipinitialspace=True):
        if line:
            yield line


##############################################################################
# build dictionaries that map a repeat name to its class and family
if is_bed == "FALSE":
    # RepeatMasker .txt input: data starts after a 3-line header (x > 2);
    # column 10 is "class/family", column 9 is the repeat name.
    repeatclass = {}
    repeatfamily = {}
    fin = import_text(annotation_file, ' ')
    x = 0
    for line in fin:
        if x > 2:
            classfamily = []
            # split "class/family" on '/' (os.path.sep — assumes POSIX)
            classfamily = line[10].split(os.path.sep)
            # sanitize the repeat name the same way the setup step does
            line9 = line[9].replace("(", "_").replace(
                ")", "_").replace("/", "_")
            repeatclass[line9] = classfamily[0]
            if len(classfamily) == 2:
                repeatfamily[line9] = classfamily[1]
            else:
                # no family given: reuse the class as the family
                repeatfamily[line9] = classfamily[0]
        x += 1
if is_bed == "TRUE":
    # BED input: chr, start, end, name (col 3), class (col 4), family (col 5)
    repeatclass = {}
    repeatfamily = {}
    fin = open(annotation_file, 'r')
    for line in fin:
        line = line.strip('\n')
        line = line.split('\t')
        theclass = line[4]
        thefamily = line[5]
        line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_")
        repeatclass[line3] = theclass
        repeatfamily[line3] = thefamily
fin.close()

##############################################################################
# build the list of repeats, initializing dictionaries for downstream analysis
fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t')
repeat_key = {}       # repeat name -> integer id
rev_repeat_key = {}   # integer id -> repeat name
repeat_list = []
reptotalcounts = {}
classfractionalcounts = {}
familyfractionalcounts = {}
classtotalcounts = {}
familytotalcounts = {}
reptotalcounts_simple = {}
fractionalcounts = {}
i = 0  # NOTE(review): unused; kept as-is
for line in fin:
    reptotalcounts[line[0]] = 0
    fractionalcounts[line[0]] = 0
    if line[0] in repeatclass:
        classtotalcounts[repeatclass[line[0]]] = 0
        classfractionalcounts[repeatclass[line[0]]] = 0
    if line[0] in repeatfamily:
        familytotalcounts[repeatfamily[line[0]]] = 0
        familyfractionalcounts[repeatfamily[line[0]]] = 0
    if line[0] in repeatfamily:
        # repeats of the collapse family share one aggregate bucket
        if repeatfamily[line[0]] == simple_repeat:
            reptotalcounts_simple[simple_repeat] = 0
        else:
            reptotalcounts_simple[line[0]] = 0
    repeat_list.append(line[0])
    repeat_key[line[0]] = int(line[1])
    rev_repeat_key[int(line[1])] = line[0]
fin.close()
##############################################################################
# map the repeats to the psuedogenomes:
if not os.path.exists(outputfolder):
    os.mkdir(outputfolder)
##############################################################################
# Conduct the regions sorting: count unique-mapper reads per repeat interval
print('Conducting region sorting on unique mapping reads....')
fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'
with open(fileout, 'w') as stdout:
    command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " +
                          setup_folder + os.path.sep + 'repnames.bed')
    p = subprocess.Popen(command, stdout=stdout)
p.communicate()
stdout.close()  # NOTE(review): redundant — the `with` already closed it
filein = open(outputfolder + os.path.sep + outputfile_prefix +
              '_regionsorter.txt', 'r')
# counts maps a comma-joined string of repeat ids -> number of reads
counts = {}
sumofrepeatreads = 0
for line in filein:
    # coverageBed output: col 3 = repeat name, col 4 = overlapping read count
    line = line.split('\t')
    if not str(repeat_key[line[3]]) in counts:
        counts[str(repeat_key[line[3]])] = 0
    counts[str(repeat_key[line[3]])] += int(line[4])
    sumofrepeatreads += int(line[4])
# NOTE(review): missing space before 'unique' in this message; `filein` is
# never closed — left as-is (code must stay byte-identical here)
print('Identified ' + str(sumofrepeatreads) +
      'unique reads that mapped to repeats.')
##############################################################################
if paired_end == 'TRUE':
    # align each mate separately against every repeat pseudogenome
    if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
    if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie')
    folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
    folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie'
##############################################################################
    print("Processing repeat psuedogenomes...")
    ps = []
    psb = []
    ticker = 0  # number of bowtie processes currently in flight
    for metagenome in repeat_list:
        metagenomepath = setup_folder + os.path.sep + metagenome
        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
        with open(file1, 'w') as stdout:
            command = shlex.split("bowtie " + b_opt + " " +
                                  metagenomepath + " " + fastqfile_1)
            p = subprocess.Popen(command, stdout=stdout)
        with open(file2, 'w') as stdout:
            command = shlex.split("bowtie " + b_opt + " " +
                                  metagenomepath + " " + fastqfile_2)
            pp = subprocess.Popen(command, stdout=stdout)
        ps.append(p)
        ticker += 1
        psb.append(pp)
        ticker += 1
        # once `cpus` processes are running, wait for the whole batch
        if ticker == cpus:
            for p in ps:
                p.communicate()
            for p in psb:
                p.communicate()
            ticker = 0
            psb = []
            ps = []
    # drain any leftover partial batch
    # NOTE(review): psb is not drained here — presumably relies on the
    # final stdout.close()/interpreter exit; verify mate-2 completeness
    if len(ps) > 0:
        for p in ps:
            p.communicate()
    stdout.close()

##############################################################################
# combine the output from both read pairs:
    print('sorting and combining the output for both read pairs...')
    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
    for metagenome in repeat_list:
        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
        file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie'
        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
        # shell-equivalent: cat file1 file2 | cut -f1 -d' ' | cut -f1 -d/ |
        # sort | uniq  -> unique read ids hitting this pseudogenome
        with open(fileout, 'w') as stdout:
            p1 = subprocess.Popen(['cat', file1, file2],
                                  stdout=subprocess.PIPE)
            p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout,
                                  stdout=subprocess.PIPE)
            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
                                  stdout=subprocess.PIPE)
            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
                                  stdout=subprocess.PIPE)
            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
            p5.communicate()
        stdout.close()  # NOTE(review): redundant after the `with`
    print('completed ...')
##############################################################################
if paired_end == 'FALSE':
    # single-end: only one bowtie run per pseudogenome
    if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie')
    folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie'
##############################################################################
    ps = []
    ticker = 0  # number of bowtie processes currently in flight
    print("Processing repeat psuedogenomes...")
    for metagenome in repeat_list:
        metagenomepath = setup_folder + os.path.sep + metagenome
        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
        with open(file1, 'w') as stdout:
            command = shlex.split("bowtie " + b_opt + " " +
                                  metagenomepath + " " + fastqfile_1)
            p = subprocess.Popen(command, stdout=stdout)
        ps.append(p)
        ticker += 1
        if ticker == cpus:
            for p in ps:
                p.communicate()
            ticker = 0
            ps = []
    if len(ps) > 0:
        for p in ps:
            p.communicate()
    stdout.close()

##############################################################################
# combine the output from both read pairs:
    print('Sorting and combining the output for both read pairs....')
    if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
        os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie')
    sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
    for metagenome in repeat_list:
        file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie'
        fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie'
        # shell-equivalent: cat file1 | cut -f1 | cut -f1 -d/ | sort | uniq
        with open(fileout, 'w') as stdout:
            p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout,
                                  stdout=subprocess.PIPE)
            p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout,
                                  stdout=subprocess.PIPE)
            p4 = subprocess.Popen(['sort'], stdin=p3.stdout,
                                  stdout=subprocess.PIPE)
            p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout)
            p5.communicate()
        stdout.close()  # NOTE(review): redundant after the `with`
    print('completed ...')

##############################################################################
# build a map from read id -> comma-joined string of repeat ids it hit
print('Writing and processing intermediate files...')
sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie'
readid = {}
sumofrepeatreads = 0
# first pass: register every read id with an empty signature
for rep in repeat_list:
    for data in import_text(sorted_bowtie + os.path.sep +
                            rep + '.bowtie', '\t'):
        readid[data[0]] = ''
# second pass: append each repeat id the read aligned to
for rep in repeat_list:
    for data in import_text(sorted_bowtie + os.path.sep +
                            rep + '.bowtie', '\t'):
        readid[data[0]] += str(repeat_key[rep]) + str(',')
# tally reads per distinct repeat-id signature (merges with the unique counts)
for subfamilies in readid.values():
    if subfamilies not in counts:
        counts[subfamilies] = 0
    counts[subfamilies] += 1
    sumofrepeatreads += 1
del readid  # free the (potentially very large) read-id map
print('Identified ' + str(sumofrepeatreads) +
      ' reads that mapped to repeats for unique and multimappers.')

##############################################################################
print("Conducting final calculations...")


def convert(x):
    '''
    Convert a comma-joined string of numeric repeat ids (e.g. "3,17,")
    into a combined name of the form "/RepA/RepB" via rev_repeat_key.

    The result is communicated through the module-level global `repname`
    rather than returned.
    '''
    x = x.strip(',')
    x = x.split(',')
    global repname
    repname = ""
    for i in x:
        repname = repname + os.path.sep + rev_repeat_key[int(i)]


# total counts: every repeat in a signature gets the full read count
for x in counts.keys():
    count = counts[x]
    x = x.strip(',')
    x = x.split(',')
    for i in x:
        reptotalcounts[rev_repeat_key[int(i)]] += int(count)
# fractional counts: the read count is split evenly across the signature
for x in counts.keys():
    count = counts[x]
    x = x.strip(',')
    x = x.split(',')
    splits = len(x)
    for i in x:
        fractionalcounts[rev_repeat_key[int(i)]] += float(
            numpy.divide(float(count), float(splits)))
# categorized table of repeat element enrichment, keyed by combined name
repcounts = {}
repcounts['other'] = 0
for key in counts.keys():
    convert(key)  # sets global `repname`
    repcounts[repname] = counts[key]
# total counts rolled up per class
for key in reptotalcounts.keys():
    classtotalcounts[repeatclass[key]] += reptotalcounts[key]
# total counts rolled up per family
for key in reptotalcounts.keys():
    familytotalcounts[repeatfamily[key]] += reptotalcounts[key]
# unique counts table: reads whose signature was a single repeat ("/rep")
repcounts2 = {}
for rep in repeat_list:
    if "/" + rep in repcounts:
        repcounts2[rep] = repcounts["/" + rep]
    else:
        repcounts2[rep] = 0
# fractional counts rolled up per class
for key in fractionalcounts.keys():
    classfractionalcounts[repeatclass[key]] += fractionalcounts[key]
# fractional counts rolled up per family
for key in fractionalcounts.keys():
    familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key]

##############################################################################
print('Writing final output and removing intermediate files...')
# print output to file of the categorized counts and total overlapping counts:
if allcountmethod == "TRUE":
    # emit all three strategies: total (liberal), unique (conservative),
    # and fraction counts, plus class/family fraction rollups
    fout1 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_total_counts.txt', 'w')
    for key in reptotalcounts.keys():
        fout1.write(str(key) + '\t' + repeatclass[key] + '\t' +
                    repeatfamily[key] + '\t' + str(reptotalcounts[key]) +
                    '\n')
    fout2 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_class_total_counts.txt', 'w')
    for key in classtotalcounts.keys():
        fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n')
    fout3 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_family_total_counts.txt', 'w')
    for key in familytotalcounts.keys():
        fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n')
    fout4 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_unique_counts.txt', 'w')
    for key in repcounts2.keys():
        fout4.write(str(key) + '\t' + repeatclass[key] + '\t' +
                    repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n')
    fout5 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_class_fraction_counts.txt', 'w')
    for key in classfractionalcounts.keys():
        fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
    fout6 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_family_fraction_counts.txt', 'w')
    for key in familyfractionalcounts.keys():
        fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
    fout7 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_fraction_counts.txt', 'w')
    for key in fractionalcounts.keys():
        # NOTE(review): per-repeat fractions are truncated via int() here,
        # while class/family files keep the float values
        fout7.write(str(key) + '\t' + repeatclass[key] + '\t' +
                    repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) +
                    '\n')
    fout1.close()
    fout2.close()
    fout3.close()
    fout4.close()
    fout5.close()
    fout6.close()
    fout7.close()
else:
    # default: fraction counts only (per repeat, per class, per family)
    fout1 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_class_fraction_counts.txt', 'w')
    for key in classfractionalcounts.keys():
        fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n')
    fout2 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_family_fraction_counts.txt', 'w')
    for key in familyfractionalcounts.keys():
        fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n')
    fout3 = open(outputfolder + os.path.sep + outputfile_prefix +
                 '_fraction_counts.txt', 'w')
    for key in fractionalcounts.keys():
        fout3.write(str(key) + '\t' + repeatclass[key] + '\t' +
                    repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) +
                    '\n')
    fout1.close()
    fout2.close()
    fout3.close()
##############################################################################
# Remove Large intermediate files
if os.path.exists(outputfolder + os.path.sep + outputfile_prefix +
                  '_regionsorter.txt'):
    os.remove(outputfolder + os.path.sep + outputfile_prefix +
              '_regionsorter.txt')
if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'):
    shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie')
if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'):
    shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie')
if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'):
    shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie')
print("... Done")