0
|
1 """
|
|
2 .. module:: cage_scan_clustering
|
|
3 :platform: Unix
|
|
4 :synopsis: cluster cage scan reads based on arbitrary distance. It assumes that the reads are sorted by chrom, TSS and strand.
|
|
5 :version: 1.0
|
|
6
|
|
7 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com>
|
|
8
|
|
9 .. source: https://github.com/mmendez12/umicount
|
|
10 """
|
|
11
|
|
12 import csv
|
|
13 import argparse
|
|
14
|
|
15
|
|
16 import bed12
|
|
17
|
|
18 #TODO: rewrite tests
|
|
19 def print_read_to_bed12(reads):
|
|
20 """ Merge the reads by blocks and print a single read in the BED12 format on stdout.
|
|
21 It assumes that the reads are on the same TSS and contains
|
|
22 fingerprint information in the read's name.
|
|
23
|
|
24 Args:
|
|
25 reads: A list of reads
|
|
26
|
|
27 """
|
|
28 block_sizes, block_starts = bed12.merge_overlapping_blocks(reads)
|
|
29
|
|
30 #bed12
|
|
31 first_read = sorted(reads, key=bed12.get_start)[0]
|
|
32 chrom = bed12.get_chrom(first_read)
|
|
33 start = bed12.get_start(first_read)
|
|
34 end = start + block_starts[-1] + block_sizes[-1]
|
|
35
|
|
36 score = len(reads)
|
|
37
|
|
38 strand = bed12.get_strand(first_read)
|
|
39
|
|
40 if strand == '+':
|
|
41 thick_start = start
|
|
42 thick_end = start + block_sizes[0]
|
|
43 else:
|
|
44 thick_start = end - block_sizes[-1]
|
|
45 thick_end = end
|
|
46
|
|
47 color = "255,0,0"
|
|
48 block_count = len(block_sizes)
|
|
49 block_sizes = ','.join(map(str, block_sizes))
|
|
50 block_starts = ','.join(map(str, block_starts))
|
|
51
|
|
52 name = map(str, [chrom, start, end, strand])
|
|
53 name = ":".join(name)
|
|
54
|
|
55 output = [chrom, start, end, name, score, strand, thick_start, thick_end,
|
|
56 color, block_count, block_sizes, block_starts]
|
|
57
|
|
58 output_str = map(str, output)
|
|
59 print '\t'.join(output_str)
|
|
60
|
|
61
|
|
62 def overlapping_reads(reads, distance):
|
|
63 """returns all the overlapping reads within a given distance"""
|
|
64
|
|
65 reads_list = []
|
|
66 cur_tss = 0
|
|
67 cur_chrom = ''
|
|
68
|
|
69 for read in reads:
|
|
70
|
|
71 if not cur_tss:
|
|
72 cur_tss = bed12.get_tss(read)
|
|
73 reads_list.append(read)
|
|
74 cur_chrom = bed12.get_chrom(read)
|
|
75 continue
|
|
76
|
|
77
|
|
78 tss = bed12.get_tss(read)
|
|
79 chrom = bed12.get_chrom(read)
|
|
80
|
|
81 #if not overlap
|
|
82 if (tss - cur_tss > distance) or (chrom != cur_chrom):
|
|
83 yield reads_list
|
|
84 reads_list = [read]
|
|
85 cur_tss = tss
|
|
86 cur_chrom = chrom
|
|
87 else:
|
|
88 reads_list.append(read)
|
|
89 cur_tss = tss
|
|
90
|
|
91 yield reads_list
|
|
92
|
|
93
|
|
94 def main():
|
|
95
|
|
96 parser = argparse.ArgumentParser()
|
|
97 parser.add_argument('bed_file', help='input file')
|
|
98 parser.add_argument('-t', '--tag_distance', default=20, type=int, help='cluster all the cage tags at distance d')
|
|
99
|
|
100 args = parser.parse_args()
|
|
101
|
|
102 with open(args.bed_file) as bedfile:
|
|
103
|
|
104 reader = csv.reader(bedfile, delimiter='\t')
|
|
105 reads = (line for line in reader)
|
|
106
|
|
107 #for each reads on the same tss
|
|
108 for read_list in overlapping_reads(reads, args.tag_distance):
|
|
109 print_read_to_bed12(read_list)
|
|
110
|
|
111
|
|
112 if __name__ == '__main__':
|
|
113 main()
|
|
114
|
|
115 #TODO: combine this script with fingerprint.py
|