Mercurial > repos > artbio > small_rna_signatures
diff overlapping_reads.py @ 3:4d9682bd3a6b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 96ed5824190aff281cc3aa47dc60fc66aac41db3
author | artbio |
---|---|
date | Sat, 02 Sep 2017 06:35:15 -0400 |
parents | 6f1378738798 |
children | 20d28cfdeefe |
line wrap: on
line diff
--- a/overlapping_reads.py Wed Aug 30 05:40:18 2017 -0400 +++ b/overlapping_reads.py Sat Sep 02 06:35:15 2017 -0400 @@ -36,90 +36,106 @@ self.bam_object = pysam.AlignmentFile(bam_file, 'rb') self.chromosomes = dict(zip(self.bam_object.references, self.bam_object.lengths)) - self.map_dict = self.create_map(self.bam_object) + self.all_query_positions = self.query_positions(self.bam_object) + self.readdic = self.make_readdic(self.bam_object) - def create_map(self, bam_object): - ''' - Returns a map_dictionary {(chromosome,read_position,polarity): - [read_length, ...]} - ''' - map_dictionary = defaultdict(list) - # get empty value for start and end of each chromosome - for chrom in self.chromosomes: - map_dictionary[(chrom, 1, 'F')] = [] - map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] + def make_readdic(self, bam_object): + readdic = defaultdict(int) + for read in bam_object.fetch(): + readdic[read.query_sequence] += 1 + return readdic + + def query_positions(self, bam_object): + all_query_positions = defaultdict(list) for chrom in self.chromosomes: for read in bam_object.fetch(chrom): - positions = read.positions # a list of covered positions - if read.is_reverse: - map_dictionary[(chrom, positions[-1]+1, - 'R')].append(read.query_alignment_length) + if not read.is_reverse: + all_query_positions[chrom].append( + read.get_reference_positions(full_length=True)[0]) else: - map_dictionary[(chrom, positions[0]+1, - 'F')].append(read.query_alignment_length) - return map_dictionary + all_query_positions[chrom].append( + read.get_reference_positions(full_length=True)[-1]) + all_query_positions[chrom] = sorted( + list(set(all_query_positions[chrom]))) + return all_query_positions - def signature_tables(self, minquery, maxquery, mintarget, maxtarget): + def direct_pairing(self, minquery, maxquery, mintarget, maxtarget, + file, overlap=10): + F = open(file, 'w') query_range = range(minquery, maxquery + 1) target_range = range(mintarget, maxtarget + 1) - Query_table = defaultdict(dict) - Target_table = defaultdict(dict) - for key in self.map_dict: - for size in self.map_dict[key]: - if size in query_range or size in target_range: - if key[2] == 'F': - coordinate = key[1] - else: - coordinate = -key[1] - if size in query_range: - Query_table[key[0]][coordinate] = Query_table[key[0]].get( - coordinate, 0) + 1 - if size in target_range: - Target_table[key[0]][coordinate] = \ - Target_table[key[0]].get(coordinate, 0) + 1 - return Query_table, Target_table - - def search_overlaps(self, minquery, maxquery, mintarget, maxtarget, - overlap=10): - Query_table, Target_table = self.signature_tables(minquery, maxquery, - mintarget, maxtarget) - overlap_groups = defaultdict(list) - for chrom in Query_table: - for coord in Query_table[chrom]: - if Target_table[chrom].get(-coord - overlap + 1, 0): - overlap_groups[chrom].append(coord) - return overlap_groups - - def feed_overlaps(self, overlap_groups, minquery, output, overlap=10): - F = open(output, 'w') - for chrom in sorted(overlap_groups): - for pos in sorted(overlap_groups[chrom]): - if pos > 0: # read are forward - reads = self.bam_object.fetch(chrom, start=pos-1, - end=pos-1+overlap-1) - for read in reads: - positions = read.positions - if pos-1 == positions[0] and \ - read.query_alignment_length >= minquery: - F.write('>%s|%s|%s|%s\n%s\n' % ( - chrom, pos, 'F', - read.query_alignment_length, - read.query_sequence)) - else: # reads are reverse - reads = self.bam_object.fetch(chrom, - start=-pos-1-overlap+1, - end=-pos-1) - for read in reads: - positions = read.positions - if -pos-1 == positions[-1] and \ - read.query_alignment_length >= minquery: - readseq = self.revcomp(read.query_sequence) - readsize = read.query_alignment_length - F.write('>%s|%s|%s|%s\n%s\n' % (chrom, - positions[0] + 1, - 'R', readsize, readseq)) - F.close() - return + stringresult = [] + for chrom in sorted(self.chromosomes): + for pos in (self.all_query_positions[chrom]): + iterreads_1 = self.bam_object.fetch(chrom, + start=pos, end=pos+overlap-1) + iterreads_2 = self.bam_object.fetch(chrom, + start=pos, end=pos+overlap-1) + iterreads_3 = self.bam_object.fetch(chrom, + start=pos, end=pos+overlap-1) + iterreads_4 = self.bam_object.fetch(chrom, + start=pos, end=pos+overlap-1) + # 1 + for queryread in iterreads_1: + if queryread.get_reference_positions( + full_length=True)[0] == pos and \ + queryread.query_alignment_length in query_range \ + and not queryread.is_reverse: + for targetread in iterreads_2: + if (targetread. + get_reference_positions(full_length=True)[-1] + == queryread.get_reference_positions( + full_length=True)[overlap-1] and + targetread.query_alignment_length in + target_range and targetread.is_reverse): + targetreadseq = self.revcomp( + targetread.query_sequence) + stringresult.append( + '>%s|%s|%s|%s|n=%s\n%s\n' % + (chrom, queryread.get_reference_positions( + full_length=True)[0]+1, + 'F', queryread.query_alignment_length, + self.readdic[queryread.query_sequence], + queryread.query_sequence)) + stringresult.append( + '>%s|%s|%s|%s|n=%s\n%s\n' % + (chrom, targetread.get_reference_positions( + full_length=True)[0]+1, + 'R', targetread.query_alignment_length, + self.readdic[targetread.query_sequence], + targetreadseq)) + # 2 + for queryread in iterreads_3: + if queryread.get_reference_positions( + full_length=True)[-1] == pos+overlap-1 and \ + queryread.query_alignment_length in query_range \ + and queryread.is_reverse: + for targetread in iterreads_4: + if (targetread. + get_reference_positions(full_length=True)[0] + == pos and targetread.query_alignment_length + in target_range and not + targetread.is_reverse): + queryreadseq = self.revcomp( + queryread.query_sequence) + targetreadseq = targetread.query_sequence + stringresult.append( + '>%s|%s|%s|%s|n=%s\n%s\n' % + (chrom, queryread.get_reference_positions( + full_length=True)[0]+1, 'R', + queryread.query_alignment_length, + self.readdic[queryread.query_sequence], + queryreadseq)) + stringresult.append( + '>%s|%s|%s|%s|n=%s\n%s\n' % + (chrom, targetread.get_reference_positions( + full_length=True)[0]+1, + 'F', targetread.query_alignment_length, + self.readdic[targetread.query_sequence], + targetreadseq)) + stringresult = sorted(set(stringresult), + key=lambda x: stringresult.index(x)) + F.write(''.join(stringresult)) def revcomp(self, sequence): antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} @@ -129,12 +145,11 @@ def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10): mapobj = Map(input) - mapobj.feed_overlaps(mapobj.search_overlaps(minquery, maxquery, - mintarget, maxtarget, - overlap), minquery, output) + mapobj.direct_pairing(minquery, maxquery, mintarget, maxtarget, + output, overlap) if __name__ == "__main__": args = Parser() main(args.input, args.minquery, args.maxquery, args.mintarget, - args.maxtarget, args.output) + args.maxtarget, args.output, args.overlap)