comparison dedup_fingerprint.py @ 0:d1d0ee366702 draft default tip

Uploaded first version
author brenninc
date Wed, 11 May 2016 04:53:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d1d0ee366702
1 """
2 .. module:: dedup_fingerprint
3 :platform: Unix
4 :synopsis: Use UMI to count transcripts, assumes there is no barcode
5
6 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com>
7
8 .. source: https://github.com/mmendez12/umicount
9
10 """
11
12 import csv
13 import itertools
14 import subprocess
15 import argparse
16 import tempfile
17 import os
18 import shutil
19 from collections import defaultdict
20
21 import bed12
22
23
24 def get_fingerprint(read):
25 """Get fingerprint id from the read's name. It assumes that the read's name
26 contains the following pattern *FP:XXX;* where *XXX* is the fingerprint id.
27
28 Args:
29 read: A list of twelve elements where each element refers to a field in the BED format.
30
31 Returns:
32 A string containing the fingerprint id
33
34 >>> read = ['chrX', '100', '200', 'FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
35 >>> get_fingerprint(read)
36 '0012'
37 """
38 return read[3].split('FP:')[1].split(';')[0]
39
40
41 def print_read_to_bed12(key, reads):
42 """ Merge the reads by blocks and print a single read in the BED12 format on stdout.
43 It assumes that the reads are on the same TSS and contains
44 fingerprint information in the read's name.
45
46 Args:
47 key: A tuple that contain the chromosome, barcode and fingerprint information.
48
49 reads: A list of reads (in a list) from the same TSS, that have similar barcode and fingerprint.
50
51 >>> reads = []
52 >>> reads.append(['chrX', '100', '200', 'FP:0012', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75'])
53 >>> reads.append(['chrX', '100', '300', 'FP:0012', '12', '+', '100', '110', '255,0,0', '3', '20,25', '0,175'])
54 >>> print_read_to_bed12(('chrX', '0012'), reads) #doctest: +NORMALIZE_WHITESPACE
55 chrX 100 300 FP:0012 2 + 100 120 255,0,0 3 20,25,25 0,75,175
56 """
57 block_sizes, block_starts = bed12.merge_overlapping_blocks(reads)
58
59 #bed12
60 first_read = sorted(reads, key = bed12.get_start)[0]
61 chrom, fingerprint = key
62 start = bed12.get_start(first_read)
63 end = start + block_starts[-1] + block_sizes[-1]
64 name = "FP:{0}".format(fingerprint)
65 score = len(reads)
66
67 strand = bed12.get_strand(first_read)
68
69 if strand == '+':
70 thick_start = start
71 thick_end = start + block_sizes[0]
72 else:
73 thick_start = end - block_sizes[-1]
74 thick_end = end
75
76 color = "255,0,0"
77 block_count = len(block_sizes)
78 block_sizes = ','.join(map(str, block_sizes))
79 block_starts = ','.join(map(str, block_starts))
80
81 output = [chrom, start, end, name, score, strand, thick_start, thick_end,
82 color, block_count, block_sizes, block_starts]
83
84 output_str = map(str, output)
85 print '\t'.join(output_str)
86
87
88 def main():
89
90 #PARSER TODO: move this code somewhere else
91 parser = argparse.ArgumentParser()
92 group = parser.add_mutually_exclusive_group(required=True)
93 group.add_argument("-d", "--directory", help="absolute path of the folder containing the bed files")
94 group.add_argument("-f", "--file", help="a bed file")
95 parser.add_argument("-o", help='name of the output file. Only works if the script is called with the -f option, \
96 ignored otherwise.')
97
98 args = parser.parse_args()
99
100 if args.directory:
101 path, folder, files = os.walk(args.directory).next()
102 elif args.file:
103 path = ''
104 files = [args.file]
105 #ENDPARSER
106
107 #create a temporary directory
108 tmp_dir = tempfile.mkdtemp()
109
110 plus_strand_tmp_file = open(os.path.join(tmp_dir, '+'), 'w')
111 minus_strand_tmp_file = open(os.path.join(tmp_dir, '-'), 'w')
112 plus_and_minus_sorted_path = os.path.join(tmp_dir, '+-s')
113
114 #creates two temporary bed files containing either reads on the plus or minus strand
115 for bed_file in files:
116
117 with open(os.path.join(path, bed_file)) as bed_file:
118 reader = csv.reader(bed_file, delimiter='\t')
119
120 for read in reader:
121 strand = bed12.get_strand(read)
122 if strand == '+':
123 plus_strand_tmp_file.write('\t'.join(read) + '\n')
124 elif strand == '-':
125 minus_strand_tmp_file.write('\t'.join(read) + '\n')
126
127
128 #close the files
129 plus_strand_tmp_file.close()
130 minus_strand_tmp_file.close()
131
132 #call unix sort on the file containing reads on the plus strand by tss
133 with open(os.path.join(tmp_dir, '+sorted'), "w") as outfile:
134 subprocess.call(["sort", '-k1,1', '-k2,2n', os.path.join(tmp_dir, '+')], stdout=outfile)
135
136 #call unix sort on the file containing reads on the minus strand by tss
137 with open(os.path.join(tmp_dir, '-sorted'), "w") as outfile:
138 subprocess.call(["sort", '-k1,1', '-k3,3n', os.path.join(tmp_dir, '-')], stdout=outfile)
139
140 #concatenate the files sorted by tss
141 with open(plus_and_minus_sorted_path, "w") as outfile:
142 subprocess.call(['cat', os.path.join(tmp_dir, '+sorted'), os.path.join(tmp_dir, '-sorted')], stdout=outfile)
143
144 with open(plus_and_minus_sorted_path) as bedfile:
145 reader = csv.reader(bedfile, delimiter='\t')
146 reads = (line for line in reader)
147
148 #for each reads on the same tss
149 for tss, same_tss_reads in itertools.groupby(reads, bed12.get_tss):
150 d = defaultdict(list)
151
152 #group the reads by chr and fingerprint
153 for read in same_tss_reads:
154 key = (bed12.get_chrom(read), get_fingerprint(read))
155 d[key].append(read)
156
157 #merge and print the reads that have same tss, and fingerprint
158 for key, same_fingerprint_reads in d.iteritems():
159 print_read_to_bed12(key, same_fingerprint_reads)
160
161 shutil.rmtree(tmp_dir)
162
163
164 if __name__ == '__main__':
165 main()
166
167 #TODO: combine this with dedup_barcode_fingerprint