Mercurial > repos > artbio > repenrich
view RepEnrich.py @ 9:db32bb7bda01 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit ccf4b178d13aeedf7214f2400e7411bcfa3e73b4-dirty
author | artbio |
---|---|
date | Tue, 11 Dec 2018 12:50:46 -0500 |
parents | d1f7ab78f7b5 |
children | 6f4143893463 |
line wrap: on
line source
import argparse import csv import os import shlex import shutil import subprocess import sys import numpy parser = argparse.ArgumentParser(description=''' Part II: Conducting the alignments to the psuedogenomes. Before\ doing this step you will require 1) a bamfile of the unique\ alignments with index 2) a fastq file of the reads mapping to\ more than one location. These files can be obtained using the\ following bowtie options [EXAMPLE: bowtie -S -m 1\ --max multimap.fastq mm9 mate1_reads.fastq] Once you have the\ unique alignment bamfile and the reads mapping to more than one\ location in a fastq file you can run this step. EXAMPLE: python\ master_output.py\ /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt\ /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/ HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1\ /users/nneretti/data/annotation/hg19/setup_folder\ HeLa_InputChIPseq_Rep1_multimap.fastq\ HeLa_InputChIPseq_Rep1.bam''') parser.add_argument('--version', action='version', version='%(prog)s 0.1') parser.add_argument('annotation_file', action='store', metavar='annotation_file', help='List RepeatMasker.org annotation file for your\ organism. The file may be downloaded from the\ RepeatMasker.org website. Example:\ /data/annotation/hg19/hg19_repeatmasker.txt') parser.add_argument('outputfolder', action='store', metavar='outputfolder', help='List folder to contain results.\ Example: /outputfolder') parser.add_argument('outputprefix', action='store', metavar='outputprefix', help='Enter prefix name for data.\ Example: HeLa_InputChIPseq_Rep1') parser.add_argument('setup_folder', action='store', metavar='setup_folder', help='List folder that contains the repeat element\ pseudogenomes.\ Example: /data/annotation/hg19/setup_folder') parser.add_argument('fastqfile', action='store', metavar='fastqfile', help='Enter file for the fastq reads that map to multiple\ locations. Example: /data/multimap.fastq') parser.add_argument('alignment_bam', action='store', metavar='alignment_bam', help='Enter bamfile output for reads that map uniquely.\ Example /bamfiles/old.bam') parser.add_argument('--pairedend', action='store', dest='pairedend', default='FALSE', help='Designate this option for paired-end sequencing.\ Default FALSE change to TRUE') parser.add_argument('--collapserepeat', action='store', dest='collapserepeat', metavar='collapserepeat', default='Simple_repeat', help='Designate this option to generate a collapsed repeat\ type. Uncollapsed output is generated in addition to\ collapsed repeat type. Simple_repeat is default to\ simplify downstream analysis. You can change the\ default to another repeat name to collapse a\ seperate specific repeat instead or if the name of\ Simple_repeat is different for your organism.\ Default Simple_repeat') parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', metavar='fastqfile2', default='none', help='Enter fastqfile2 when using paired-end option.\ Default none') parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', default="1", type=int, help='Enter available cpus per node. The more cpus the\ faster RepEnrich performs. RepEnrich is designed to\ only work on one node. Default: "1"') parser.add_argument('--allcountmethod', action='store', dest='allcountmethod', metavar='allcountmethod', default="FALSE", help='By default the pipeline only outputs the fraction\ count method. Consdidered to be the best way to\ count multimapped reads. Changing this option will\ include the unique count method, a conservative\ count, and the total count method, a liberal\ counting strategy. Our evaluation of simulated data\ indicated fraction counting is best.\ Default = FALSE, change to TRUE') parser.add_argument('--is_bed', action='store', dest='is_bed', metavar='is_bed', default='FALSE', help='Is the annotation file a bed file.\ This is also a compatible format. The file needs to\ be a tab seperated bed with optional fields.\ Ex. format: chr\tstart\tend\tName_element\tclass\ \tfamily. The class and family should identical to\ name_element if not applicable.\ Default FALSE change to TRUE') args = parser.parse_args() # parameters annotation_file = args.annotation_file outputfolder = args.outputfolder outputfile_prefix = args.outputprefix setup_folder = args.setup_folder repeat_bed = setup_folder + os.path.sep + 'repnames.bed' unique_mapper_bam = args.alignment_bam fastqfile_1 = args.fastqfile fastqfile_2 = args.fastqfile2 cpus = args.cpus b_opt = "-k1 -p " + str(1) + " --quiet" simple_repeat = args.collapserepeat paired_end = args.pairedend allcountmethod = args.allcountmethod is_bed = args.is_bed ############################################################################## # check that the programs we need are available try: subprocess.call(shlex.split("coverageBed -h"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError: print("Error: Bowtie or BEDTools not loaded") raise ############################################################################## # define a csv reader that reads space deliminated files print('Preparing for analysis using RepEnrich...') csv.field_size_limit(sys.maxsize) def import_text(filename, separator): for line in csv.reader(open(filename), delimiter=separator, skipinitialspace=True): if line: yield line ############################################################################## # build dictionaries to convert repclass and rep families' if is_bed == "FALSE": repeatclass = {} repeatfamily = {} fin = import_text(annotation_file, ' ') x = 0 for line in fin: if x > 2: classfamily = [] classfamily = line[10].split(os.path.sep) line9 = line[9].replace("(", "_").replace( ")", "_").replace("/", "_") repeatclass[line9] = classfamily[0] if len(classfamily) == 2: repeatfamily[line9] = classfamily[1] else: repeatfamily[line9] = classfamily[0] x += 1 if is_bed == "TRUE": repeatclass = {} repeatfamily = {} fin = open(annotation_file, 'r') for line in fin: line = line.strip('\n') line = line.split('\t') theclass = line[4] thefamily = line[5] line3 = line[3].replace("(", "_").replace(")", "_").replace("/", "_") repeatclass[line3] = theclass repeatfamily[line3] = thefamily fin.close() ############################################################################## # build list of repeats initializing dictionaries for downstream analysis' fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t') repeat_key = {} rev_repeat_key = {} repeat_list = [] reptotalcounts = {} classfractionalcounts = {} familyfractionalcounts = {} classtotalcounts = {} familytotalcounts = {} reptotalcounts_simple = {} fractionalcounts = {} i = 0 for line in fin: reptotalcounts[line[0]] = 0 fractionalcounts[line[0]] = 0 if line[0] in repeatclass: classtotalcounts[repeatclass[line[0]]] = 0 classfractionalcounts[repeatclass[line[0]]] = 0 if line[0] in repeatfamily: familytotalcounts[repeatfamily[line[0]]] = 0 familyfractionalcounts[repeatfamily[line[0]]] = 0 if line[0] in repeatfamily: if repeatfamily[line[0]] == simple_repeat: reptotalcounts_simple[simple_repeat] = 0 else: reptotalcounts_simple[line[0]] = 0 repeat_list.append(line[0]) repeat_key[line[0]] = int(line[1]) rev_repeat_key[int(line[1])] = line[0] fin.close() ############################################################################## # map the repeats to the psuedogenomes: if not os.path.exists(outputfolder): os.mkdir(outputfolder) ############################################################################## # Conduct the regions sorting print('Conducting region sorting on unique mapping reads....') fileout = outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt' with open(fileout, 'w') as stdout: command = shlex.split("coverageBed -abam " + unique_mapper_bam + " -b " + setup_folder + os.path.sep + 'repnames.bed') p = subprocess.Popen(command, stdout=stdout) p.communicate() stdout.close() filein = open(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt', 'r') counts = {} sumofrepeatreads = 0 for line in filein: line = line.split('\t') if not str(repeat_key[line[3]]) in counts: counts[str(repeat_key[line[3]])] = 0 counts[str(repeat_key[line[3]])] += int(line[4]) sumofrepeatreads += int(line[4]) print('Identified ' + str(sumofrepeatreads) + 'unique reads that mapped to repeats.') ############################################################################## if paired_end == 'TRUE': if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie') folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie' ############################################################################## print("Processing repeat psuedogenomes...") ps = [] psb = [] ticker = 0 for metagenome in repeat_list: metagenomepath = setup_folder + os.path.sep + metagenome file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' with open(file1, 'w') as stdout: command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_1) p = subprocess.Popen(command, stdout=stdout) with open(file2, 'w') as stdout: command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_2) pp = subprocess.Popen(command, stdout=stdout) ps.append(p) ticker += 1 psb.append(pp) ticker += 1 if ticker == cpus: for p in ps: p.communicate() for p in psb: p.communicate() ticker = 0 psb = [] ps = [] if len(ps) > 0: for p in ps: p.communicate() stdout.close() ############################################################################## # combine the output from both read pairs: print('sorting and combining the output for both read pairs...') if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' for metagenome in repeat_list: file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie' with open(fileout, 'w') as stdout: p1 = subprocess.Popen(['cat', file1, file2], stdout=subprocess.PIPE) p2 = subprocess.Popen(['cut', '-f1', "-d "], stdin=p1.stdout, stdout=subprocess.PIPE) p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, stdout=subprocess.PIPE) p4 = subprocess.Popen(['sort'], stdin=p3.stdout, stdout=subprocess.PIPE) p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) p5.communicate() stdout.close() print('completed ...') ############################################################################## if paired_end == 'FALSE': if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' ############################################################################## ps = [] ticker = 0 print("Processing repeat psuedogenomes...") for metagenome in repeat_list: metagenomepath = setup_folder + os.path.sep + metagenome file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' with open(file1, 'w') as stdout: command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_1) p = subprocess.Popen(command, stdout=stdout) ps.append(p) ticker += 1 if ticker == cpus: for p in ps: p.communicate() ticker = 0 ps = [] if len(ps) > 0: for p in ps: p.communicate() stdout.close() ############################################################################## # combine the output from both read pairs: print('Sorting and combining the output for both read pairs....') if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' for metagenome in repeat_list: file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' fileout = sorted_bowtie + os.path.sep + metagenome + '.bowtie' with open(fileout, 'w') as stdout: p1 = subprocess.Popen(['cat', file1], stdout=subprocess.PIPE) p2 = subprocess.Popen(['cut', '-f1'], stdin=p1.stdout, stdout=subprocess.PIPE) p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin=p2.stdout, stdout=subprocess.PIPE) p4 = subprocess.Popen(['sort'], stdin=p3.stdout, stdout=subprocess.PIPE) p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout=stdout) p5.communicate() stdout.close() print('completed ...') ############################################################################## # build a file of repeat keys for all reads print('Writing and processing intermediate files...') sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' readid = {} sumofrepeatreads = 0 for rep in repeat_list: for data in import_text(sorted_bowtie + os.path.sep + rep + '.bowtie', '\t'): readid[data[0]] = '' for rep in repeat_list: for data in import_text(sorted_bowtie + os.path.sep + rep + '.bowtie', '\t'): readid[data[0]] += str(repeat_key[rep]) + str(',') for subfamilies in readid.values(): if subfamilies not in counts: counts[subfamilies] = 0 counts[subfamilies] += 1 sumofrepeatreads += 1 del readid print('Identified ' + str(sumofrepeatreads) + ' reads that mapped to repeats for unique and multimappers.') ############################################################################## print("Conducting final calculations...") def convert(x): ''' build a converter to numeric label for repeat and yield a combined list of repnames seperated by backslash ''' x = x.strip(',') x = x.split(',') global repname repname = "" for i in x: repname = repname + os.path.sep + rev_repeat_key[int(i)] # building the total counts for repeat element enrichment... for x in counts.keys(): count = counts[x] x = x.strip(',') x = x.split(',') for i in x: reptotalcounts[rev_repeat_key[int(i)]] += int(count) # building the fractional counts for repeat element enrichment... for x in counts.keys(): count = counts[x] x = x.strip(',') x = x.split(',') splits = len(x) for i in x: fractionalcounts[rev_repeat_key[int(i)]] += float( numpy.divide(float(count), float(splits))) # building categorized table of repeat element enrichment... repcounts = {} repcounts['other'] = 0 for key in counts.keys(): convert(key) repcounts[repname] = counts[key] # building the total counts for class enrichment... for key in reptotalcounts.keys(): classtotalcounts[repeatclass[key]] += reptotalcounts[key] # building total counts for family enrichment... for key in reptotalcounts.keys(): familytotalcounts[repeatfamily[key]] += reptotalcounts[key] # building unique counts table' repcounts2 = {} for rep in repeat_list: if "/" + rep in repcounts: repcounts2[rep] = repcounts["/" + rep] else: repcounts2[rep] = 0 # building the fractionalcounts counts for class enrichment... for key in fractionalcounts.keys(): classfractionalcounts[repeatclass[key]] += fractionalcounts[key] # building fractional counts for family enrichment... for key in fractionalcounts.keys(): familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] ############################################################################## print('Writing final output and removing intermediate files...') # print output to file of the categorized counts and total overlapping counts: if allcountmethod == "TRUE": fout1 = open(outputfolder + os.path.sep + outputfile_prefix + '_total_counts.txt', 'w') for key in sorted(reptotalcounts.keys()): fout1.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(reptotalcounts[key]) + '\n') fout2 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_total_counts.txt', 'w') for key in sorted(classtotalcounts.keys()): fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n') fout3 = open(outputfolder + os.path.sep + outputfile_prefix + '_family_total_counts.txt', 'w') for key in sorted(familytotalcounts.keys()): fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n') fout4 = open(outputfolder + os.path.sep + outputfile_prefix + '_unique_counts.txt', 'w') for key in sorted(repcounts2.keys()): fout4.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n') fout5 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_fraction_counts.txt', 'w') for key in sorted(classfractionalcounts.keys()): fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') fout6 = open(outputfolder + os.path.sep + outputfile_prefix + '_family_fraction_counts.txt', 'w') for key in sorted(familyfractionalcounts.keys()): fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') fout7 = open(outputfolder + os.path.sep + outputfile_prefix + '_fraction_counts.txt', 'w') for key in sorted(fractionalcounts.keys()): fout7.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) + '\n') fout1.close() fout2.close() fout3.close() fout4.close() fout5.close() fout6.close() fout7.close() else: fout1 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_fraction_counts.txt', 'w') for key in sorted(classfractionalcounts.keys()): fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') fout2 = open(outputfolder + os.path.sep + outputfile_prefix + '_family_fraction_counts.txt', 'w') for key in sorted(familyfractionalcounts.keys()): fout2.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') fout3 = open(outputfolder + os.path.sep + outputfile_prefix + '_fraction_counts.txt', 'w') for key in sorted(fractionalcounts.keys()): fout3.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) + '\n') fout1.close() fout2.close() fout3.close() ############################################################################## # Remove Large intermediate files if os.path.exists(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'): os.remove(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt') if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie') if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie') if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie') print("... Done")