Mercurial > repos > brenninc > umicount
comparison dedup_fingerprint.py @ 0:d1d0ee366702 draft default tip
Uploaded first version
author | brenninc |
---|---|
date | Wed, 11 May 2016 04:53:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d1d0ee366702 |
---|---|
1 """ | |
2 .. module:: dedup_fingerprint | |
3 :platform: Unix | |
4 :synopsis: Use UMI to count transcripts, assumes there is no barcode | |
5 | |
6 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com> | |
7 | |
8 .. source: https://github.com/mmendez12/umicount | |
9 | |
10 """ | |
11 | |
12 import csv | |
13 import itertools | |
14 import subprocess | |
15 import argparse | |
16 import tempfile | |
17 import os | |
18 import shutil | |
19 from collections import defaultdict | |
20 | |
21 import bed12 | |
22 | |
23 | |
24 def get_fingerprint(read): | |
25 """Get fingerprint id from the read's name. It assumes that the read's name | |
26 contains the following pattern *FP:XXX;* where *XXX* is the fingerprint id. | |
27 | |
28 Args: | |
29 read: A list of twelve elements where each element refers to a field in the BED format. | |
30 | |
31 Returns: | |
32 A string containing the fingerprint id | |
33 | |
34 >>> read = ['chrX', '100', '200', 'FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] | |
35 >>> get_fingerprint(read) | |
36 '0012' | |
37 """ | |
38 return read[3].split('FP:')[1].split(';')[0] | |
39 | |
40 | |
41 def print_read_to_bed12(key, reads): | |
42 """ Merge the reads by blocks and print a single read in the BED12 format on stdout. | |
43 It assumes that the reads are on the same TSS and contains | |
44 fingerprint information in the read's name. | |
45 | |
46 Args: | |
47 key: A tuple that contain the chromosome, barcode and fingerprint information. | |
48 | |
49 reads: A list of reads (in a list) from the same TSS, that have similar barcode and fingerprint. | |
50 | |
51 >>> reads = [] | |
52 >>> reads.append(['chrX', '100', '200', 'FP:0012', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75']) | |
53 >>> reads.append(['chrX', '100', '300', 'FP:0012', '12', '+', '100', '110', '255,0,0', '3', '20,25', '0,175']) | |
54 >>> print_read_to_bed12(('chrX', '0012'), reads) #doctest: +NORMALIZE_WHITESPACE | |
55 chrX 100 300 FP:0012 2 + 100 120 255,0,0 3 20,25,25 0,75,175 | |
56 """ | |
57 block_sizes, block_starts = bed12.merge_overlapping_blocks(reads) | |
58 | |
59 #bed12 | |
60 first_read = sorted(reads, key = bed12.get_start)[0] | |
61 chrom, fingerprint = key | |
62 start = bed12.get_start(first_read) | |
63 end = start + block_starts[-1] + block_sizes[-1] | |
64 name = "FP:{0}".format(fingerprint) | |
65 score = len(reads) | |
66 | |
67 strand = bed12.get_strand(first_read) | |
68 | |
69 if strand == '+': | |
70 thick_start = start | |
71 thick_end = start + block_sizes[0] | |
72 else: | |
73 thick_start = end - block_sizes[-1] | |
74 thick_end = end | |
75 | |
76 color = "255,0,0" | |
77 block_count = len(block_sizes) | |
78 block_sizes = ','.join(map(str, block_sizes)) | |
79 block_starts = ','.join(map(str, block_starts)) | |
80 | |
81 output = [chrom, start, end, name, score, strand, thick_start, thick_end, | |
82 color, block_count, block_sizes, block_starts] | |
83 | |
84 output_str = map(str, output) | |
85 print '\t'.join(output_str) | |
86 | |
87 | |
88 def main(): | |
89 | |
90 #PARSER TODO: move this code somewhere else | |
91 parser = argparse.ArgumentParser() | |
92 group = parser.add_mutually_exclusive_group(required=True) | |
93 group.add_argument("-d", "--directory", help="absolute path of the folder containing the bed files") | |
94 group.add_argument("-f", "--file", help="a bed file") | |
95 parser.add_argument("-o", help='name of the output file. Only works if the script is called with the -f option, \ | |
96 ignored otherwise.') | |
97 | |
98 args = parser.parse_args() | |
99 | |
100 if args.directory: | |
101 path, folder, files = os.walk(args.directory).next() | |
102 elif args.file: | |
103 path = '' | |
104 files = [args.file] | |
105 #ENDPARSER | |
106 | |
107 #create a temporary directory | |
108 tmp_dir = tempfile.mkdtemp() | |
109 | |
110 plus_strand_tmp_file = open(os.path.join(tmp_dir, '+'), 'w') | |
111 minus_strand_tmp_file = open(os.path.join(tmp_dir, '-'), 'w') | |
112 plus_and_minus_sorted_path = os.path.join(tmp_dir, '+-s') | |
113 | |
114 #creates two temporary bed files containing either reads on the plus or minus strand | |
115 for bed_file in files: | |
116 | |
117 with open(os.path.join(path, bed_file)) as bed_file: | |
118 reader = csv.reader(bed_file, delimiter='\t') | |
119 | |
120 for read in reader: | |
121 strand = bed12.get_strand(read) | |
122 if strand == '+': | |
123 plus_strand_tmp_file.write('\t'.join(read) + '\n') | |
124 elif strand == '-': | |
125 minus_strand_tmp_file.write('\t'.join(read) + '\n') | |
126 | |
127 | |
128 #close the files | |
129 plus_strand_tmp_file.close() | |
130 minus_strand_tmp_file.close() | |
131 | |
132 #call unix sort on the file containing reads on the plus strand by tss | |
133 with open(os.path.join(tmp_dir, '+sorted'), "w") as outfile: | |
134 subprocess.call(["sort", '-k1,1', '-k2,2n', os.path.join(tmp_dir, '+')], stdout=outfile) | |
135 | |
136 #call unix sort on the file containing reads on the minus strand by tss | |
137 with open(os.path.join(tmp_dir, '-sorted'), "w") as outfile: | |
138 subprocess.call(["sort", '-k1,1', '-k3,3n', os.path.join(tmp_dir, '-')], stdout=outfile) | |
139 | |
140 #concatenate the files sorted by tss | |
141 with open(plus_and_minus_sorted_path, "w") as outfile: | |
142 subprocess.call(['cat', os.path.join(tmp_dir, '+sorted'), os.path.join(tmp_dir, '-sorted')], stdout=outfile) | |
143 | |
144 with open(plus_and_minus_sorted_path) as bedfile: | |
145 reader = csv.reader(bedfile, delimiter='\t') | |
146 reads = (line for line in reader) | |
147 | |
148 #for each reads on the same tss | |
149 for tss, same_tss_reads in itertools.groupby(reads, bed12.get_tss): | |
150 d = defaultdict(list) | |
151 | |
152 #group the reads by chr and fingerprint | |
153 for read in same_tss_reads: | |
154 key = (bed12.get_chrom(read), get_fingerprint(read)) | |
155 d[key].append(read) | |
156 | |
157 #merge and print the reads that have same tss, and fingerprint | |
158 for key, same_fingerprint_reads in d.iteritems(): | |
159 print_read_to_bed12(key, same_fingerprint_reads) | |
160 | |
161 shutil.rmtree(tmp_dir) | |
162 | |
163 | |
164 if __name__ == '__main__': | |
165 main() | |
166 | |
167 #TODO: combine this with dedup_barcode_fingerprint |