comparison mirgene_functions.py @ 5:810e789ffeab draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:04:54 +0000
parents
children
comparison
equal deleted inserted replaced
4:f44185f616bc 5:810e789ffeab
1 import itertools
2 import urllib.request
3 from collections import OrderedDict
4 import copy
5
6 ########################################################################################################################################################
7
8 """ Read a file and return it as a list """
9
def read(path, flag):
    """Load the text file at *path* as a list of lines.

    flag == 0: lines keep their trailing newline (``readlines``).
    flag == 1: line terminators are removed (``read().splitlines()``).
    Any other flag: the file is not opened and ``None`` is returned.
    """
    if flag not in (0, 1):
        return None

    with open(path) as handle:
        if flag == 0:
            return handle.readlines()
        return handle.read().splitlines()
22
# Write a list to a txt file
def write(path, list):
    """Write every row of *list* to *path*.

    Only the inner fields of a row (``row[1:-1]``, i.e. without its first and
    last element) are written, joined by tabs.  No separator or newline is
    emitted between rows.
    """
    with open(path, 'w') as out:
        out.writelines("\t".join(row[1:-1]) for row in list)
29
30 ########################################################################################################################################################
31
32 """ Detect the longest common substring sequence between two mirnas """
33
def longestSubstring(str1, str2):
    """Return the longest contiguous run of characters shared by both strings.

    The run is reported as a slice of *str1*.  When the two strings have
    nothing in common a message is printed and ``None`` is returned.
    """
    from difflib import SequenceMatcher

    # find_longest_match yields Match(a=<start in str1>, b=<start in str2>, size=<length>)
    start, _, length = SequenceMatcher(None, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2)
    )

    if length == 0:
        print('No longest common sub-string found')
        return None
    return str1[start:start + length]
50
51
52 #################################################################################################################################################################
53
54 """
55
56 Read the sam files from the alignment tool and do the following:
57
58 1) Keep mapped reads
59 2) Keep all sequences with length between 18 and 26 nucleotides
60 3) Detects the ref and templated miRNAs
61 4) Gives names to templated miRNAs based on ref miRNAs
62
63 """
64
def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts):
    """Parse one SAM file, name the templated isoforms and publish the results.

    Lines whose first tab-separated field contains "@" are skipped.  Records
    with field 1 equal to '0' and an 18-26 character sequence (field 9) are
    compared with the reference list: when a record's reference name (field 2)
    equals ``mature_mirnas[i]`` its sequence is compared with
    ``mature_mirnas[i+1]``.  Records that differ from the reference only by
    their start/end offsets get "_t_<5' shift>_<3' shift>" appended to the
    name; records that do not fit that pattern are dropped.

    Args:
        mature_mirnas: flat list in which a reference name is immediately
            followed by its sequence.
        path: SAM file to read.
        file: label appended to ``file_order`` and stored in ``data``.
        case: group tag; results are published only for "c" or "t".
        l: lock (``acquire``/``release``) guarding the shared outputs.
        samples, data, file_order, names_n_seqs, deseq, mirna_names,
        ini_sample: shared lists that receive the results.
        unmap_seq, unmap_counts: shared counters (``.value``) increased by the
            number of records with field 1 == '4' and by their read counts.

    The read count of a record is the integer found after the first "-" of
    its identifier (field 0).
    """

    # read the sam file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '0' and 18 <= len(x[9]) <= 26]
    filter_sam = [[x[0], x[1], x[2], len(x[9])] for x in main_sam]
    sorted_uni_arms = []

    # Detection of differences between the canonical miRNA and the detected miRNA
    for i in range(len(mature_mirnas)):
        tmp_count_reads = 0   # total number of reads for this reference
        tmp_count_seq = 0     # total number of sequences for this reference

        for j in range(len(unique_seq)):

            if mature_mirnas[i] != unique_seq[j][2]:
                continue

            temp_mature = mature_mirnas[i+1]
            off_part = longestSubstring(temp_mature, unique_seq[j][9])

            # lengths of what is left before / after the shared part
            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            unique_diff = unique_seq[j][9].split(off_part)
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            # Handling of some special mirnas like (hsa-miR-8485): both the
            # reference and the read have extra bases on the same side, so the
            # read is not a plain shift of the reference. Mark it for removal.
            if (mat_diff[1] != 0 and unique_diff[1] != 0) or (mat_diff[0] != 0 and unique_diff[0] != 0):
                unique_seq[j] = 1
                continue

            # Keep the findings
            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]
            tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
            tmp_count_seq = tmp_count_seq + 1

            # Store the detected miRNAs with new names according to the findings
            # (a zero shift is written unsigned, a non-zero one with its sign)
            if pre_pos != 0 or post_pos != 0:
                if pre_pos == 0:
                    suffix = str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                elif post_pos == 0:
                    suffix = '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                else:
                    suffix = '{:+d}'.format(pre_pos) + "_" + '{:+d}'.format(post_pos)
                unique_seq[j][2] = unique_seq[j][2] + "_t_" + suffix

        # Remove the "1" markers left by the handling of special mirnas.
        # This has to happen before the next reference is processed because
        # the scan above indexes every entry as a record.
        unique_seq = [row for row in unique_seq if row != 1]

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])

    # Sorting of the metrics for database
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    # Correction of metrics due to the collapsing and removing of duplicates for the production of Database
    for y in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in unique_seq:
            name_parts = x[2].split("_")
            if y[0] == name_parts[0] + "_" + name_parts[1]:
                counts += int(x[0].split("-")[1])
                seqs += 1

        y[1] = seqs
        y[2] = counts

    # Output variables
    temp_mirna_names = []

    # The lock is released in `finally` so that a failure while updating the
    # shared containers cannot leave the other workers blocked.
    l.acquire()
    try:
        # NOTE(review): the indentation of this file was lost in the copy this
        # was recovered from; all updates below are assumed to belong to the
        # case check - confirm against the original source.
        if case == "c" or case == "t":
            temp_mirna_names.extend(z[2] for z in unique_seq)
            names_n_seqs.extend([[y[2], y[9]] for y in unique_seq])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
            mirna_names.extend(temp_mirna_names)
            unmap_seq.value += sum([1 for x in main_sam if x[1] == '4'])
            unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4'])
            file_order.append(file)
            samples.append(unique_seq)
            data.append([case, file, unique_seq, sorted_uni_arms])
            ini_sample.append(filter_sam)
    finally:
        l.release()
157
158
159 ######################################################################################################################################
160 """
161
162 Read a sam file from Bowtie and do the following:
163
164 1) Keep unmapped reads
165 2) Keep all sequences with length between 18 and 26 nucleotides
166 3) Detects the non-template isomirs
167 4) Gives names to isomir's based on ref miRNAs
168
169 """
170
def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs):
    """Parse one SAM file and name the non-templated isoforms found in it.

    Lines whose first tab-separated field contains "@" are skipped.  Records
    with field 1 equal to '4' and an 18-26 character sequence (field 9) are
    tested against every reference sequence (odd positions of
    ``mature_mirnas``, with "U" replaced by "T").  A record that contains the
    whole reference, starts where the reference starts and carries fewer than
    four extra bases after it is copied and renamed to
    "<reference name>_nont_<5' shift>_<signed 3' shift>_<tail>", where the
    tail is the part of the read after the matched reference.

    Args:
        mature_mirnas: flat list, name at even positions, sequence at the
            following odd position.
        path: SAM file to read.
        file: label appended to ``file_order`` and stored in ``data``.
        case: group tag; results are published only for "c" or "t".
        l: lock (``acquire``/``release``) guarding the shared outputs.
        data, file_order, n_deseq, names_n_seqs: shared lists that receive
            the results.

    The read count of a record is the integer found after the first "-" of
    its identifier (field 0).
    """

    # read the sam file
    ini_sam = read(path, 0)
    main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in main_sam if x[1] == '4' and 18 <= len(x[9]) <= 26]
    uni_seq = []

    # Calculate the shifted positions for every isomir and add them to the name of it
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0   # total number of reads for this reference
        tmp_count_seq = 0     # total number of sequences for this reference

        # does not depend on the record, so it is prepared once per reference
        temp_mature = mature_mirnas[i].strip().replace("U", "T")

        for j in range(len(unique_seq)):

            # Detection of differences between the canonical miRNA and the detected non template miRNA
            if temp_mature not in unique_seq[j][9]:
                continue

            off_part = longestSubstring(temp_mature, unique_seq[j][9])

            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            unique_diff = unique_seq[j][9].split(off_part)
            # more than two pieces means the shared part occurs several times
            # in the read; such reads are skipped
            if len(unique_diff) > 2:
                continue
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]

            if pre_pos == 0 and post_pos < 4:
                tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
                tmp_count_seq = tmp_count_seq + 1

                # work on a copy: the same read may match several references
                t_name = copy.deepcopy(unique_seq[j])
                t_name[2] = mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
                uni_seq.append(t_name)

        # metrics for the production of database
        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    # drop exact duplicate records while keeping first-seen order
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    # Output variables

    # The lock is released in `finally` so that a failure while updating the
    # shared containers cannot leave the other workers blocked.
    l.acquire()
    try:
        # NOTE(review): the indentation of this file was lost in the copy this
        # was recovered from; all updates below are assumed to belong to the
        # case check - confirm against the original source.
        if case == "c" or case == "t":
            names_n_seqs.extend([[y[2], y[9]] for y in unique_seq if y[2] != "*"])
            n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2] != "*"])
            file_order.append(file)
            data.append([case, file, unique_seq, sorted_uni_arms])
    finally:
        l.release()
231
232 #################################################################################################################################################################################################################
233
234 """
235
236 This function detects the differences between the two groups (control, treated).
237 It then copies the undetected miRNAs from one group to the other and adds zeros as counts.
238 This way the two groups will have the same number of miRNAs.
239
240 """
241
def black_white(mirna_names_1,mirna_names_2,group,manager):
    """Pad *group* with the entries it lacks and hand the result to *manager*.

    Entries of ``mirna_names_1`` that are absent from ``mirna_names_2`` are
    sorted, de-duplicated and extended in place with one "0" for every column
    of the first row of *group* beyond the first two.  *group* itself is
    sorted in place and de-duplicated; its rows followed by the padded
    entries are appended to *manager*.
    """
    missing = sorted(x for x in mirna_names_1 if x not in mirna_names_2)
    missing = [row for row, _ in itertools.groupby(missing)]

    group.sort()
    unique_rows = [row for row, _ in itertools.groupby(group)]

    padding = ["0"] * (len(unique_rows[0]) - 2)
    for row in missing:
        row.extend(padding)

    manager.extend(unique_rows + missing)
256
257 ########################################################################################################>
258
259 """
260
261 This function collapses the miRNAs with same sequences and different names into one entry
262 by merging all the different names into one and are separated with the character "/"
263
264 """
265
def merging_dupes(group,f_dupes):
    """Collect, for every repeated sequence, the merged name it should carry.

    Each row of *group* starts with ``[name, sequence, ...]``.  A row counts
    as a repeat when its sequence or its name has already been seen in an
    earlier row.  For every repeated sequence the names of all rows carrying
    it are gathered, keeping one name per "<part0>_<part1>" prefix (the
    shortest one wins), and joined with "/".

    Appends ``[sequence, merged_name]`` entries to *f_dupes*, ordered by the
    first time each sequence was detected as a repeat.
    """

    def prefix(name):
        # first two "_"-separated parts of the name
        parts = name.split("_")
        return parts[0] + "_" + parts[1]

    # names and sequences share one pool, exactly like the membership test
    # this replaces; a set makes each lookup constant-time
    seen = set()
    dupes = {}
    for row in group:
        name, seq = row[0], row[1]
        if seq not in seen and name not in seen:
            seen.add(seq)
            seen.add(name)
        else:
            dupes.setdefault(seq, [seq])

    for row in group:
        entry = dupes.get(row[1])
        if entry is None:
            continue

        name = row[0]
        if len(entry) == 1:
            entry.append(name)
            continue

        wanted = prefix(name)
        for i in range(1, len(entry)):
            if prefix(entry[i]) == wanted:
                # same miRNA arm: keep only the shorter of the two names
                if len(name) < len(entry[i]):
                    del entry[i]
                    entry.append(name)
                break
        else:
            entry.append(name)

    for entry in dupes.values():
        if len(entry) > 2:
            # first name, then the remaining ones from last to first
            entry[1:] = ["/".join([entry[1]] + entry[:1:-1])]

    f_dupes.extend(list(dupes.values()))
309
310
311 ########################################################################################################>
312
313 """
314
315 This function removes the duplicated sequences based on the output of the function merging_dupes
316
317 """
318
def apply_merging_dupes(group,dupes,managger):
    """Rename the rows of *group* with merged names and drop repeated rows.

    *dupes* holds ``[sequence, merged_name]`` entries.  Every row of *group*
    whose sequence (index 1) has an entry gets that merged name written into
    index 0.  *group* is then sorted in place and its distinct rows are
    appended to *managger*.
    """
    # one lookup per row instead of scanning all dupes for every row;
    # if a sequence is listed more than once the last entry wins
    by_sequence = {entry[0]: entry for entry in dupes}

    for row in group:
        entry = by_sequence.get(row[1])
        if entry is not None:
            row[0] = entry[1]

    group.sort()
    managger.extend([row for row, _ in itertools.groupby(group)])
329
330 ########################################################################################################>
331
332 """
333
334 This function is optional and performs a filter for low counts miRNAs based on
335 number of counts and the percentage of the samples, according to user preferences
336
337 """
338
def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts):
    """Keep the miRNAs that are sufficiently expressed in at least one group.

    Rows look like ``[name, sequence, count, count, ...]`` and the two groups
    are read in parallel by position.  A row pair is kept when, in the control
    row or in the treated row, the number of samples with a count of at least
    *counts* reaches ``round(per / 100 * number_of_samples)`` for that group.

    Args:
        c_group, t_group: count rows of the two groups, row i of one matching
            row i of the other.
        fil_c_group, fil_t_group: lists extended with the rows that pass.
        per: required percentage of samples (anything ``int()`` accepts).
        counts: minimum count (anything ``int()`` accepts).
    """
    percent = int(per) / 100
    minimum = int(counts)

    # The number of samples is taken from the first row, so groups with a
    # single row (or none) are handled as well.
    c_samples = len(c_group[0]) - 2 if len(c_group) else 0
    t_samples = len(t_group[0]) - 2 if len(t_group) else 0
    c_col_filter = round(percent * c_samples)
    t_col_filter = round(percent * t_samples)

    c_group_new = []
    t_group_new = []

    for i, c_row in enumerate(c_group):
        t_row = t_group[i]

        c_cols = sum(1 for value in c_row[2:] if int(value) >= minimum)
        t_cols = sum(1 for value in t_row[2:] if int(value) >= minimum)

        if c_cols >= c_col_filter or t_cols >= t_col_filter:
            t_group_new.append(t_row)
            c_group_new.append(c_row)

    fil_c_group.extend(c_group_new)
    fil_t_group.extend(t_group_new)
361
362 ##################################################################################################################################################################################################################
363
364 """
365
366 This function exports the count matrices for every group (controls, treated)
367 and condition (ref and templated miRNAs, non-templated miRNAs)
368
369 """
370
def _write_count_matrix(path, sample_names, rows):
    """Write one tab-separated count matrix: a header line, then one line per row."""
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for sample in sample_names:
            fp.write("\t" + sample)

        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per):
    """Export the count matrices of both groups into the ``Counts`` folder.

    Args:
        raw_con, raw_tre: unfiltered rows of the group named *n1* / *n2*.
        fil_con, fil_tre: filtered rows of the group named *n1* / *n2*.
        con_file_order, tre_file_order: sample names used as column headers.
        flag: 1 writes the "Templated" files, 2 the "Non-Templated" files;
            any other value writes nothing.
        n1, n2: group names used in the file names.
        per: filtering percentage; the "Filtered" files are written only when
            ``int(per)`` is not -1.  The "Raw" files are always written.

    Files are named "Counts/<Filtered|Raw> <group> <Templated|Non-Templated>
    Counts"; the folder must already exist.
    """
    if flag == 1:
        kind = 'Templated'
    elif flag == 2:
        kind = 'Non-Templated'
    else:
        return

    if int(per) != -1:
        _write_count_matrix('Counts/Filtered ' + n2 + ' ' + kind + ' Counts', tre_file_order, fil_tre)
        _write_count_matrix('Counts/Filtered ' + n1 + ' ' + kind + ' Counts', con_file_order, fil_con)

    _write_count_matrix('Counts/Raw ' + n2 + ' ' + kind + ' Counts', tre_file_order, raw_tre)
    _write_count_matrix('Counts/Raw ' + n1 + ' ' + kind + ' Counts', con_file_order, raw_con)
460
461 ####################################################################################################################################################################################################################
462
463 """
464
465 This function exports the files of the database with all the info
466 about every type of the detected miRNAs for every sample
467
468 """
469
def _write_arm_section(fp, arm, rows):
    """Write the banner of one miRNA arm followed by its records, most reads first."""
    rows = sorted(rows, key=lambda x: int(x[0].split('-')[1]), reverse=True)
    fp.write("*********************************************************************************************************\n")
    fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(arm[0]),"Sequence count = "+str(arm[1]),"Total reads = "+str(arm[2]),"|"))
    fp.write("*********************************************************************************************************\n\n")
    for x in rows:
        fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2], x[9]))
    fp.write("\n" + "\n")


def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Export the per-sample database file listing every detected miRNA.

    Args:
        con: "c" or "t"; together with *f* it selects the output folder
            (f=1: split1/ or split2/, f=2: split3/ or split4/).
        name: file name inside that folder.
        unique_seq: records; field 0 is "<id>-<reads>", field 2 the name,
            field 9 the sequence.
        sorted_uni_arms: ``[arm name, sequence count, total reads]`` entries,
            written in the given order.
        f: 1 groups records whose "<part0>_<part1>" name prefix contains the
            arm name and writes a section for every arm; 2 groups records
            whose name before "_nont_" equals the arm name and skips arms
            without records.  Any other value writes nothing.

    Raises:
        ValueError: if *f* is 1 or 2 and *con* is neither "c" nor "t".
    """
    if f not in (1, 2):
        return

    folders = {(1, "c"): 'split1/', (1, "t"): 'split2/',
               (2, "c"): 'split3/', (2, "t"): 'split4/'}
    folder = folders.get((f, con))
    if folder is None:
        raise ValueError("con must be 'c' or 't', got %r" % (con,))

    with open(folder + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

        for arm in sorted_uni_arms:
            temp = []
            for record in unique_seq:
                if f == 1:
                    parts = record[2].split("_")
                    # containment test, not equality (kept as in the original)
                    matched = arm[0] in (parts[0] + "_" + parts[1])
                else:
                    matched = arm[0] == record[2].split("_nont_")[0]
                if matched:
                    temp.append(record)

            if f == 1 or temp != []:
                _write_arm_section(fp, arm, temp)
519
520
521 #########################################################################################################################
522
523 """
524
525 This function merges the different names for the same mirna sequence per group (controls, treated) to avoid duplicates
526
527 """
528
def merging_names(ini_mat,new):
    """Merge the different names of one sequence and drop the repeated rows.

    Rows of *ini_mat* start with ``[name, sequence, ...]``.  Rows that share a
    sequence receive one merged name (names joined with "/"), *ini_mat* is
    sorted in place and its distinct rows are appended to *new*.

    This is the combination of ``merging_dupes`` (find the merged names) and
    ``apply_merging_dupes`` (rename, sort, de-duplicate), so it delegates to
    them instead of repeating their code.
    """
    dupes = []
    merging_dupes(ini_mat, dupes)
    apply_merging_dupes(ini_mat, dupes, new)
580
581 ####################################################################################################################################################################################################################
582
583 """
584
585 This function exports the count matrices for differential expression
586 if user chose analysis with non-templated miRNAs detection
587
588 """
589
def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one two-column count file per sample, templated plus non-templated.

    Args:
        tem_names: sample names; entry k labels count column k + 2 of
            *tem_samp*.
        tem_samp: templated rows ``[name, sequence, count, count, ...]``.
        non_names: sample names of the non-templated matrix, same layout.
        non_samp: non-templated rows.
        folder: path prefix of the output files ("<folder><sample>.txt").
        pro: not used.

    Every file starts with "miRNA id<TAB><sample>", followed by the templated
    counts of that sample and then the non-templated counts of every column
    of *non_samp* carrying the same sample name.  Nothing is written when
    *tem_samp* is empty.
    """
    if len(tem_samp) == 0:
        return

    for col in range(2, len(tem_samp[0])):
        sample = tem_names[col - 2]

        with open(folder + sample + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample + "\n")

            for row in tem_samp:
                fp.write("\t".join([row[0], row[col]]) + "\n")

            for j, non_name in enumerate(non_names):
                if non_name == sample:
                    for row in non_samp:
                        fp.write("\t".join([row[0], row[j + 2]]) + "\n")
605
606 #################################################################################################################################################################################################################
607
608 """
609
610 This function exports the count matrices for differential expression
611 if user chose analysis only with templated miRNAs detection
612
613 """
614
def temp_counts_to_diff(names,samp,folder,pro):
    """Write one two-column count file per sample (templated analysis only).

    Args:
        names: sample names; entry k labels count column k + 2 of *samp*.
        samp: rows ``[name, sequence, count, count, ...]``.
        folder: path prefix of the output files ("<folder><sample>.txt").
        pro: not used.

    Every file starts with "miRNA id<TAB><sample>" followed by one
    "<name><TAB><count>" line per row.  Nothing is written when *samp* is
    empty.
    """
    if len(samp) == 0:
        return

    for col in range(2, len(samp[0])):
        sample = names[col - 2]

        with open(folder + sample + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample + "\n")

            for row in samp:
                fp.write("\t".join([row[0], row[col]]) + "\n")
625
626 #################################################################################################################################################################################################################
627
628 """
629
630 This function downloads the fasta files from MirGene site
631 with ref miRNAs and star miRNAs sequences and merges them
632 into one list
633
634 """
635
def _fetch_fasta_lines(url):
    """Download *url* and return its lines with every ">" removed."""
    # the response is closed as soon as the payload has been read
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8')

    lines = [x.replace(">", "") for x in text.split("\n")]
    # A payload ending in "\n" leaves one empty trailing item; drop only that,
    # so a payload without a final newline does not lose its last sequence.
    if lines and lines[-1] == "":
        del lines[-1]
    return lines


def download_matures(matures,org_name):
    """Download the mature and star miRNA FASTA files and merge them.

    The two files are fetched from mirgenedb.org for the organism
    *org_name*, concatenated (mature entries first) and appended to
    *matures* as a flat list.  Items at odd positions have "U" replaced by
    "T"; with well-formed two-line FASTA records these are the sequences,
    each preceded by its name.

    Args:
        matures: list extended with the merged entries.
        org_name: organism identifier placed in the download URL.
    """
    mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1'
    star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1'

    mature_mir = _fetch_fasta_lines(mat_url)
    mature_mir.extend(_fetch_fasta_lines(star_url))

    for i in range(1, len(mature_mir), 2):
        mature_mir[i] = mature_mir[i].replace("U", "T")

    matures.extend(mature_mir)
661
662 ###################################################################################################################
663
664 """
665
666 This function detects the templated isoforms from the 1st part of analysis
667 These isoforms and ref miRNAs will be used for the detection of non-templated miRNAs
668
669 """
670
def non_template_ref(sc,st,all_isoforms):
    """Gather the templated isoforms found in the control and treated samples.

    *sc* and *st* are collections of samples, each sample being a list of
    records.  For every record whose name (field 2) contains "_t_" and is not
    yet present in *all_isoforms*, the name and then the sequence (field 9)
    are appended to *all_isoforms*.  Control samples are processed first.
    """
    # both collections are copied up front, before anything is scanned
    sample_sets = (list(sc), list(st))

    for sample_set in sample_sets:
        for sample in sample_set:
            for record in sample:
                name = record[2]
                if name not in all_isoforms and "_t_" in name:
                    all_isoforms.append(name)
                    all_isoforms.append(record[9])
687
688 ################################################################################################################################################################################################
689
690 """
691
692 This function adds uncommon detected mirnas among the samples with zeros as counts
693
694 """
695
def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order):
    """Complete one sample with the miRNAs it lacks, using "0" as their count.

    Args:
        sample: rows ``[name, count, sequence]``; extended and sorted in place.
        mir_names: ``[name, sequence]`` entries that every sample must contain.
        l: lock (``acquire``/``release``) guarding the shared outputs.
        new_d: shared list receiving the completed, de-duplicated sample.
        sample_name: label appended to *sample_order*.
        sample_order: shared list recording the order in which samples were
            stored in *new_d*.
    """
    # names already in the sample; kept up to date while rows are added so a
    # name listed twice in mir_names is still added only once
    present = {row[0] for row in sample}

    for entry in mir_names:
        if entry[0] not in present:
            # add the name of mirna to the sample with zero counts and its sequence
            sample.append([entry[0], "0", entry[1]])
            present.add(entry[0])

    # sorting and remove duplicates
    sample.sort(key=lambda x: x[0])
    sample = [row for row, _ in itertools.groupby(sample)]

    # Return the updated sample; the lock is released even if an append fails
    l.acquire()
    try:
        new_d.append(sample)
        sample_order.append(sample_name)
    finally:
        l.release()
716
717 ###############################################################################################################################################################################################