Mercurial > repos > mheinzl > variant_analyzer2
comparison read2mut.py @ 7:ded0dc6a20d3 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author | mheinzl |
---|---|
date | Mon, 25 Jan 2021 13:21:55 +0000 |
parents | 11a2a34f8a2b |
children | ced1a529e7cd |
comparison
equal
deleted
inserted
replaced
6:11a2a34f8a2b | 7:ded0dc6a20d3 |
---|---|
128 for count, variant in enumerate(VCF(file1)): | 128 for count, variant in enumerate(VCF(file1)): |
129 #if count == 2000: | 129 #if count == 2000: |
130 # break | 130 # break |
131 chrom = variant.CHROM | 131 chrom = variant.CHROM |
132 stop_pos = variant.start | 132 stop_pos = variant.start |
133 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) | 133 #chrom_stop_pos = str(chrom) + "#" + str(stop_pos) |
134 ref = variant.REF | 134 ref = variant.REF |
135 alt = variant.ALT[0] | 135 alt = variant.ALT[0] |
136 # nc = variant.format('NC') | 136 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt |
137 ad = variant.format('AD') | 137 |
138 if len(ref) == len(alt): | 138 if len(ref) == len(alt): |
139 mut_array.append([chrom, stop_pos, ref, alt]) | 139 mut_array.append([chrom, stop_pos, ref, alt]) |
140 i += 1 | 140 i += 1 |
141 mut_dict[chrom_stop_pos] = {} | 141 mut_dict[chrom_stop_pos] = {} |
142 mut_read_pos_dict[chrom_stop_pos] = {} | 142 mut_read_pos_dict[chrom_stop_pos] = {} |
214 bam.close() | 214 bam.close() |
215 | 215 |
216 # create pure_tags_dict | 216 # create pure_tags_dict |
217 pure_tags_dict = {} | 217 pure_tags_dict = {} |
218 for key1, value1 in sorted(mut_dict.items()): | 218 for key1, value1 in sorted(mut_dict.items()): |
219 if len(np.where(np.array(['#'.join(str(i) for i in z) | 219 #if len(np.where(np.array(['#'.join(str(i) for i in z) |
220 for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0: | 220 # for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0: |
221 continue | 221 # continue |
222 | 222 |
223 i = np.where(np.array(['#'.join(str(i) for i in z) | 223 i = np.where(np.array(['#'.join(str(i) for i in z) |
224 for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0] | 224 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] |
225 ref = mut_array[i, 2] | 225 ref = mut_array[i, 2] |
226 alt = mut_array[i, 3] | 226 alt = mut_array[i, 3] |
227 pure_tags_dict[key1] = {} | 227 pure_tags_dict[key1] = {} |
228 for key2, value2 in sorted(value1.items()): | 228 for key2, value2 in sorted(value1.items()): |
229 for key3, value3 in value2.items(): | 229 for key3, value3 in value2.items(): |
308 counts_mut = 0 | 308 counts_mut = 0 |
309 chimeric_tag_list = [] | 309 chimeric_tag_list = [] |
310 chimeric_tag = {} | 310 chimeric_tag = {} |
311 if key1 in pure_tags_dict_short.keys(): | 311 if key1 in pure_tags_dict_short.keys(): |
312 i = np.where(np.array(['#'.join(str(i) for i in z) | 312 i = np.where(np.array(['#'.join(str(i) for i in z) |
313 for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0] | 313 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] |
314 ref = mut_array[i, 2] | 314 ref = mut_array[i, 2] |
315 alt = mut_array[i, 3] | 315 alt = mut_array[i, 3] |
316 dcs_median = cvrg_dict[key1][2] | 316 dcs_median = cvrg_dict[key1][2] |
317 whole_array = pure_tags_dict_short[key1].keys() | 317 whole_array = pure_tags_dict_short[key1].keys() |
318 | 318 |
927 else: | 927 else: |
928 tier = "6" | 928 tier = "6" |
929 counter_tier6 += 1 | 929 counter_tier6 += 1 |
930 tier_dict[key1]["tier 6"] += 1 | 930 tier_dict[key1]["tier 6"] += 1 |
931 | 931 |
932 chrom, pos = re.split(r'\#', key1) | 932 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) |
933 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) | 933 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) |
934 sample_tag = key2[:-5] | 934 sample_tag = key2[:-5] |
935 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time | 935 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time |
936 # exclude identical tag from array2, to prevent comparison to itself | 936 # exclude identical tag from array2, to prevent comparison to itself |
937 same_tag = np.where(array2 == sample_tag) | 937 same_tag = np.where(array2 == sample_tag) |
1065 row = 0 | 1065 row = 0 |
1066 | 1066 |
1067 for key1, value1 in sorted(tier_dict.items()): | 1067 for key1, value1 in sorted(tier_dict.items()): |
1068 if key1 in pure_tags_dict_short.keys(): | 1068 if key1 in pure_tags_dict_short.keys(): |
1069 i = np.where(np.array(['#'.join(str(i) for i in z) | 1069 i = np.where(np.array(['#'.join(str(i) for i in z) |
1070 for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0] | 1070 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] |
1071 ref = mut_array[i, 2] | 1071 ref = mut_array[i, 2] |
1072 alt = mut_array[i, 3] | 1072 alt = mut_array[i, 3] |
1073 chrom, pos = re.split(r'\#', key1) | 1073 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) |
1074 ref_count = cvrg_dict[key1][0] | 1074 ref_count = cvrg_dict[key1][0] |
1075 alt_count = cvrg_dict[key1][1] | 1075 alt_count = cvrg_dict[key1][1] |
1076 cvrg = ref_count + alt_count | 1076 cvrg = ref_count + alt_count |
1077 | 1077 |
1078 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) | 1078 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) |
1152 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), | 1152 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), |
1153 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), | 1153 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), |
1154 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), | 1154 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), |
1155 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), | 1155 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), |
1156 ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), | 1156 ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), |
1157 ("Tier 5.1", "variants is close to softclipping in both mates"), | 1157 ("Tier 5.1", "variant is close to softclipping in both mates"), |
1158 ("Tier 5.2", "variants is close to softclipping in one of the mates"), | 1158 ("Tier 5.2", "variant is close to softclipping in one of the mates"), |
1159 ("Tier 5.3", "variants is close to softclipping in one of the SSCS of both mates"), | 1159 ("Tier 5.3", "variant is close to softclipping in one of the SSCS of both mates"), |
1160 ("Tier 5.4", "variants is close to softclipping in one mate (no information of second mate"), | 1160 ("Tier 5.4", "variant is close to softclipping in one mate (no information of second mate"), |
1161 ("Tier 5.5", "variants is close to softclipping in one of the SSCS (no information of the second mate"), | 1161 ("Tier 5.5", "variant is close to softclipping in one of the SSCS (no information of the second mate"), |
1162 ("Tier 6", "remaining variants")] | 1162 ("Tier 6", "remaining variants")] |
1163 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289", | 1163 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289", |
1164 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", | 1164 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", |
1165 "4081", "4098", "5", "10", "", ""), | 1165 "4081", "4098", "5", "10", "", ""), |
1166 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None, | 1166 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None, |