comparison hd.py @ 10:69aa17354a6e draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/hd commit f01678e9bfead9f9e1b54dd9ecf7141f057dd9de
author mheinzl
date Tue, 15 May 2018 11:27:27 -0400
parents c81bc96bea1c
children 7adc48c8a03d
comparison
equal deleted inserted replaced
9:c81bc96bea1c 10:69aa17354a6e
22 import matplotlib.pyplot as plt 22 import matplotlib.pyplot as plt
23 import os.path 23 import os.path
24 import cPickle as pickle 24 import cPickle as pickle
25 from multiprocessing.pool import Pool 25 from multiprocessing.pool import Pool
26 from functools import partial 26 from functools import partial
27 #from HDAnalysis_plots.plot_HDwithFSD import plotHDwithFSD
28 #from HDAnalysis_plots.plot_FSDwithHD2 import plotFSDwithHD2
29 #from HDAnalysis_plots.plot_HDwithinSeq_Sum2 import plotHDwithinSeq_Sum2
30 #from HDAnalysis_plots.table_HD import createTableHD, createFileHD, createTableHDwithTags, createFileHDwithinTag
31 #from HDAnalysis_plots.table_FSD import createTableFSD2, createFileFSD2
32 import argparse 27 import argparse
33 import sys 28 import sys
34 import os 29 import os
35 from matplotlib.backends.backend_pdf import PdfPages 30 from matplotlib.backends.backend_pdf import PdfPages
36 from collections import Counter 31 from collections import Counter
166 else: 161 else:
167 range1 = range(minimumX, maximumX + 2) 162 range1 = range(minimumX, maximumX + 2)
168 163
169 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False, 164 counts = plt.hist(ham, align="left", rwidth=0.8, stacked=False,
170 label=[ "HD a", "HD b","HD a+b"], 165 label=[ "HD a", "HD b","HD a+b"],
171 bins=range1, color=["#585858", "#58ACFA", "#FA5858"], edgecolor='black', linewidth=1) 166 bins=range1, color=[ "#58ACFA", "#FA5858","#585858"], edgecolor='black', linewidth=1)
172 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1)) 167 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(1.55, 1))
173 plt.suptitle('Hamming distances within tags', fontsize=14) 168 plt.suptitle('Hamming distances within tags', fontsize=14)
174 #plt.title(title_file1, fontsize=12) 169 #plt.title(title_file1, fontsize=12)
175 plt.xlabel("Hamming Distance", fontsize=14) 170 plt.xlabel("Hamming Distance", fontsize=14)
176 plt.ylabel("Absolute Frequency", fontsize=14) 171 plt.ylabel("Absolute Frequency", fontsize=14)
691 print("subset_tag is smaller or equal zero.") 686 print("subset_tag is smaller or equal zero.")
692 exit(5) 687 exit(5)
693 688
694 ### PLOT ### 689 ### PLOT ###
695 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color 690 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
696 plt.rcParams['xtick.labelsize'] = 12 691 plt.rcParams['xtick.labelsize'] = 14
697 plt.rcParams['ytick.labelsize'] = 12 692 plt.rcParams['ytick.labelsize'] = 14
698 plt.rcParams['patch.edgecolor'] = "#000000" 693 plt.rcParams['patch.edgecolor'] = "#000000"
699 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 694 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
700 695
701 if file2 != str(None): 696 if file2 != str(None):
702 files = [file1, file2] 697 files = [file1, file2]
709 files = [file1] 704 files = [file1]
710 name1 = name1.split(".tabular")[0] 705 name1 = name1.split(".tabular")[0]
711 names = [name1] 706 names = [name1]
712 pdf_files = [title_savedFile_pdf] 707 pdf_files = [title_savedFile_pdf]
713 csv_files = [title_savedFile_csv] 708 csv_files = [title_savedFile_csv]
714
715 print(type(onlyDuplicates))
716 print(onlyDuplicates)
717 709
718 for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files): 710 for f, name_file, pdf_f, csv_f in zip(files, names, pdf_files, csv_files):
719 with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf: 711 with open(csv_f, "w") as output_file, PdfPages(pdf_f) as pdf:
720 print("dataset: ", name_file) 712 print("dataset: ", name_file)
721 integers, data_array = readFileReferenceFree(f) 713 integers, data_array = readFileReferenceFree(f)
803 # for h, tag in zip(ham, result1): 795 # for h, tag in zip(ham, result1):
804 # output_file1.write("{}\t{}\n".format(tag, h)) 796 # output_file1.write("{}\t{}\n".format(tag, h))
805 797
806 # HD analysis for chimeric reads 798 # HD analysis for chimeric reads
807 proc_pool_b = Pool(nproc) 799 proc_pool_b = Pool(nproc)
808 print(chunks_sample)
809 print(result2)
810 print(data_array)
811 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample) 800 diff_list_a = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=False), chunks_sample)
812 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample) 801 diff_list_b = proc_pool_b.map(partial(hamming_difference, array2=result2, mate_b=True), chunks_sample)
813 proc_pool_b.close() 802 proc_pool_b.close()
814 proc_pool_b.join() 803 proc_pool_b.join()
815 diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]), 804 diff = numpy.concatenate((numpy.concatenate([item[0] for item in diff_list_a]),
901 ########################## Plot difference between HD's separated after FSD ########################################## 890 ########################## Plot difference between HD's separated after FSD ##########################################
902 ######################################################################################################################## 891 ########################################################################################################################
903 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf, 892 plotHDwithFSD(listDifference1, maximumXDifference, minimumXDifference, pdf=pdf,
904 subtitle="Delta Hamming distance within tags", 893 subtitle="Delta Hamming distance within tags",
905 title_file1=name_file, lenTags=lenTags, 894 title_file1=name_file, lenTags=lenTags,
906 xlabel="abs delta Hamming distance", relative=False) 895 xlabel="absolute delta Hamming distance", relative=False)
907 896
908 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf, 897 plotHDwithFSD(listRelDifference1, maximumXRelDifference, minimumXRelDifference, pdf=pdf,
909 subtitle="Relative delta Hamming distances within tags", 898 subtitle="Relative delta Hamming distances within tags",
910 title_file1=name_file, lenTags=lenTags, 899 title_file1=name_file, lenTags=lenTags,
911 xlabel="rel delta Hamming distance", relative=True) 900 xlabel="relative delta Hamming distance", relative=True)
912 901
913 #################### Plot FSD separated after difference between HD's ##################################### 902 #################### Plot FSD separated after difference between HD's #####################################
914 ######################################################################################################################## 903 ########################################################################################################################
915 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff, 904 plotFSDwithHD2(familySizeList1_diff, maximumXFS_diff, minimumXFS_diff,
916 subtitle="Family size distribution with delta Hamming distances within the tags", 905 subtitle="Family size distribution separated by delta Hamming distances within the tags",
917 pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant) 906 pdf=pdf,relative=False, diff=True, title_file1=name_file, quant=quant)
918 907
919 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf, 908 plotFSDwithHD2(familySizeList1_reldiff, maximumXFS_reldiff, minimumXFS_reldiff, quant=quant, pdf=pdf,
920 subtitle="Family size distribution with delta Hamming distances within the tags", 909 subtitle="Family size distribution separated by delta Hamming distances within the tags",
921 relative=True, diff=True, title_file1=name_file) 910 relative=True, diff=True, title_file1=name_file)
922 911
923 912
924 # plots for chimeric reads 913 # plots for chimeric reads
925 if len(minHD_tags_zeros) != 0: 914 if len(minHD_tags_zeros) != 0:
929 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False) 918 title_file1=name_file, lenTags=lenTags,xlabel="Hamming distance", relative=False)
930 919
931 ## FSD 920 ## FSD
932 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros, 921 plotFSDwithHD2(familySizeList1_diff_zeros, maximumXFS_diff_zeros, minimumXFS_diff_zeros,
933 quant=quant, pdf=pdf, 922 quant=quant, pdf=pdf,
934 subtitle="Family size distribution with Hamming distance from the non-identical half of chimeras", 923 subtitle="Family size distribution separated by Hamming distance of the non-identical half of chimeras",
935 relative=False, diff=False, title_file1=name_file) 924 relative=False, diff=False, title_file1=name_file)
936 925
937 ### print all data to a CSV file 926 ### print all data to a CSV file
938 #### HD #### 927 #### HD ####
939 summary, sumCol = createTableHD(list1, "HD=") 928 summary, sumCol = createTableHD(list1, "HD=")