Mercurial > repos > mheinzl > fsd_regions
changeset 4:b202c97deabe draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit dfaab79252a858e8df16bbea3607ebf1b6962e5a
author | mheinzl |
---|---|
date | Mon, 08 Oct 2018 05:53:50 -0400 |
parents | 85d870b8ae92 |
children | 52454637bc45 |
files | fsd_regions.py fsd_regions.xml test-data/Test_data.tabular test-data/Test_data_regions.txt test-data/output_file.pdf test-data/output_file.tabular |
diffstat | 6 files changed, 133 insertions(+), 40 deletions(-) [+] |
line wrap: on
line diff
--- a/fsd_regions.py Wed May 23 15:06:27 2018 -0400 +++ b/fsd_regions.py Mon Oct 08 05:53:50 2018 -0400 @@ -8,37 +8,36 @@ # Takes at least one TABULAR file with tags before the alignment to the SSCS # and a TXT with tags of reads that overlap the regions of the reference genome as input. # The program produces a plot which shows the distribution of family sizes of the tags from the input files and -# a CSV file with the data of the plot. +# a tabular file with the data of the plot. -# USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf +# USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf -import numpy -import matplotlib.pyplot as plt import argparse import sys -import os + +import matplotlib.pyplot as plt +import numpy from matplotlib.backends.backend_pdf import PdfPages +plt.switch_backend('agg') + + def readFileReferenceFree(file, delim): with open(file, 'r') as dest_f: data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') return(data_array) + def make_argparser(): parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome') - parser.add_argument('--inputFile', - help='Tabular File with three columns: ab or ba, tag and family size.') + parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.') parser.add_argument('--inputName1') - parser.add_argument('--ref_genome', - help='TXT File with tags of reads that overlap the region.') - parser.add_argument('--output_pdf', default="data.pdf", type=str, - help='Name of the pdf and csv file.') - parser.add_argument('--output_csv', default="data.csv", type=str, - help='Name of the pdf and csv file.') - parser.add_argument('--sep', default=",", - help='Separator in the csv file.') + parser.add_argument('--ref_genome', help='TXT File with tags of reads that overlap the region.') + parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf and tabular file.') + parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the pdf and tabular file.') return parser + def compare_read_families_refGenome(argv): parser = make_argparser() args = parser.parse_args(argv[1:]) @@ -48,12 +47,8 @@ name1 = name1.split(".tabular")[0] refGenome = args.ref_genome title_file = args.output_pdf - title_file2 = args.output_csv - sep = args.sep - - if type(sep) is not str or len(sep) > 1: - print("Error: --sep must be a single character.") - exit(3) + title_file2 = args.output_tabular + sep = "\t" with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: data_array = readFileReferenceFree(firstFile, "\t") @@ -105,7 +100,7 @@ maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) - ### PLOT ### + # PLOT plt.rc('figure', figsize=(11.69, 8.27)) # A4 format plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color plt.rcParams['xtick.labelsize'] = 14 @@ -156,7 +151,7 @@ plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) - #plt.title(name1, fontsize=14) + # plt.title(name1, fontsize=14) plt.xlabel("Family size", fontsize=14) plt.ylabel("Absolute Frequency", fontsize=14) plt.grid(b=True, which="major", color="#424242", linestyle=":") @@ -175,19 +170,19 @@ output_file.write("\n\nValues from family size distribution\n") output_file.write("{}".format(sep)) for i in groupUnique: - output_file.write("{}{}".format(i,sep)) + output_file.write("{}{}".format(i, sep)) output_file.write("\n") - j=0 - for fs in counts[1][0:len(counts[1])-1]: + j = 0 + for fs in counts[1][0:len(counts[1]) - 1]: if fs == 21: fs = ">20" else: fs = "={}".format(fs) - output_file.write("FS{}{}".format(fs,sep)) + output_file.write("FS{}{}".format(fs, sep)) for n in range(len(groupUnique)): output_file.write("{}{}".format(int(counts[0][n][j]), sep)) output_file.write("\n") - j+=1 + j += 1 output_file.write("sum{}".format(sep)) for i in counts[0]: output_file.write("{}{}".format(int(sum(i)), sep)) @@ -195,11 +190,12 @@ output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") output_file.write("Region{}total nr. of tags per region\n".format(sep)) for i, count in zip(groupUnique, quantAfterRegion): - output_file.write("{}{}{}\n".format(i,sep,len(count) / 2)) - output_file.write("sum of tags{}{}\n".format(sep,length_regions)) + output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) + output_file.write("sum of tags{}{}\n".format(sep, length_regions)) print("Files successfully created!") - #print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd())) + # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd())) + if __name__ == '__main__': - sys.exit(compare_read_families_refGenome(sys.argv)) + sys.exit(compare_read_families_refGenome(sys.argv))
--- a/fsd_regions.xml Wed May 23 15:06:27 2018 -0400 +++ b/fsd_regions.xml Mon Oct 08 05:53:50 2018 -0400 @@ -1,22 +1,29 @@ <?xml version="1.0" encoding="UTF-8"?> -<tool id="fsd_regions" name="Duplex Sequencing Analysis: fsd_regions" version="0.0.4"> +<tool id="fsd_regions" name="Duplex Sequencing Analysis: fsd_regions" version="1.0.0"> + <description>Family size distribution (FSD) of user-specified regions in the reference genome</description> <requirements> <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.4">matplotlib</requirement> + <requirement type="package" version="1.4.0">matplotlib</requirement> </requirements> - <description>Family size distribution (FSD) of user-specified regions</description> <command> - python2 $__tool_directory__/fsd_regions.py --inputFile "$file1" --inputName1 "$file1.name" --ref_genome "$file2" --sep $separator --output_pdf $output_pdf --output_csv $output_csv + python2 '$__tool_directory__/fsd_regions.py' --inputFile '$file1' --inputName1 '$file1.name' --ref_genome '$file2' --output_pdf $output_pdf --output_tabular $output_tabular </command> <inputs> <param name="file1" type="data" format="tabular" label="Dataset 1: input tags of whole dataset" optional="false" help="Input in tabular format with the family size, tags and the direction of the strand ('ab' or 'ba') for each family."/> - <param name="file2" type="data" format="txt" label="Dataset 2: input tags aligned to the reference genome" help="Input in txt format with the regions and the tags, which were aligned to the reference genome."/> - <param name="separator" type="text" label="Separator of the CSV file." help="can be a single character" value=","/> + <param name="file2" type="data" format="txt" label="Dataset 2: input tags aligned to the reference genome" help="Input in txt format with the regions in the reference genome and the tags, which were aligned to the reference genome."/> </inputs> <outputs> <data name="output_pdf" format="pdf" /> - <data name="output_csv" format="csv"/> + <data name="output_tabular" format="tabular"/> </outputs> + <tests> + <test> + <param name="file1" value="Test_data.tabular"/> + <param name="file2" value="Test_data_regions.txt"/> + <output name="output_pdf" file="output_file.pdf" lines_diff="136"/> + <output name="output_tabular" file="output_file.tabular"/> + </test> + </tests> <help> <![CDATA[ **What it does** @@ -49,7 +56,7 @@ **Output** - The output is a PDF file with the plot and a CSV with the data of the plot. + The output is a PDF file with the plot and a tabular file with the data of the plot. **About Author**
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Test_data.tabular Mon Oct 08 05:53:50 2018 -0400 @@ -0,0 +1,32 @@ +10 AAAAAACATCCCAATAAGAAATCA ab +9 AAAAAACATCCCAATAAGAAATCA ba +4 AAAAAAGTCCTTCGACTCAAGCGG ab +5 AAAAAAGTCCTTCGACTCAAGCGG ba +5 AAAAAATAGTTAAGCCGACACACT ab +7 AAAAAATAGTTAAGCCGACACACT ba +7 AAAAAATGTGCCGAACCTTGGCGA ab +10 AAAAAATGTGCCGAACCTTGGCGA ba +7 AAAAACAACATAGCTTGAAAATTT ab +4 AAAAACAACATAGCTTGAAAATTT ba +81 ATTCGGATAATTCGACGCAACATT ab +11 ATTCGGATAATTCGACGCAACATT ba +41 ATTCGTCGACAATACAAAGGGGCC ab +226 ATTCGTCGACAATACAAAGGGGCC ba +6 ATTGCCAGTGTGGGCTGGTTAGTA ab +41 ATTGCCAGTGTGGGCTGGTTAGTA ba +50 ATTTCGCGACCATCCGCCACTTTG ab +332 ATTTCGCGACCATCCGCCACTTTG ba +64 CAAACTTTAGCACAGTGTGTGTCC ab +57 CAAACTTTAGCACAGTGTGTGTCC ba +85 ATAAACGGCCTTCGACATTGTGAC ab +15 ATAAACGGCCTTCGACATTGTGAC ba +11 ATAAAGTCACCTGTGAATACGTTG ab +35 ATAAAGTCACCTGTGAATACGTTG ba +83 ATAAATCGAAACCGTGCCCAACAA ab +63 ATAAATCGAAACCGTGCCCAACAA ba +9 ATTTAGATATTTTCTTCTTTTTCT ab +7 ATTTAGATATTTTCTTCTTTTTCT ba +7 ATTTAGTTATCCGTCGGCGACGAA ab +3 ATTTAGTTATCCGTCGGCGACGAA ba +8 ATTTAGTTTGAATTGCCCTGCGTC ab +9 ATTTAGTTTGAATTGCCCTGCGTC ba \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Test_data_regions.txt Mon Oct 08 05:53:50 2018 -0400 @@ -0,0 +1,17 @@ +87_636 AAAAAACATCCCAATAAGAAATCA +87_636 AAAAAAGTCCTTCGACTCAAGCGG +87_636 AAAAAATAGTTAAGCCGACACACT +87_636 AAAAAATGTGCCGAACCTTGGCGA +87_636 AAAAACAACATAGCTTGAAAATTT +656_1143 ATTCGGATAATTCGACGCAACATT +656_1143 ATTCGTCGACAATACAAAGGGGCC +656_1143 ATTGCCAGTGTGGGCTGGTTAGTA +656_1143 ATTTCGCGACCATCCGCCACTTTG +656_1143 CAAACTTTAGCACAGTGTGTGTCC +1141_1564 ATAAACGGCCTTCGACATTGTGAC +1141_1564 ATAAAGTCACCTGTGAATACGTTG +1141_1564 ATAAATCGAAACCGTGCCCAACAA +1892_2398 ATTTAGATATTTTCTTCTTTTTCT +1892_2398 ATTTAGTTATCCGTCGGCGACGAA +1892_2398 ATTTAGTTTGAATTGCCCTGCGTC +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_file.tabular Mon Oct 08 05:53:50 2018 -0400 @@ -0,0 +1,41 @@ +Dataset: Test_data + AB BA +max. family size: 85 332 +absolute frequency: 9 1 +relative frequency: 0.209 0.062 + +total nr. of reads 1312 + + +Values from family size distribution + 87_636 656_1143 1141_1564 1892_2398 +FS=3 0 0 0 1 +FS=4 2 0 0 0 +FS=5 2 0 0 0 +FS=6 0 1 0 0 +FS=7 3 0 0 2 +FS=8 0 0 0 1 +FS=9 1 0 0 2 +FS=10 2 0 0 0 +FS=11 0 1 1 0 +FS=12 0 0 0 0 +FS=13 0 0 0 0 +FS=14 0 0 0 0 +FS=15 0 0 1 0 +FS=16 0 0 0 0 +FS=17 0 0 0 0 +FS=18 0 0 0 0 +FS=19 0 0 0 0 +FS=20 0 0 0 0 +FS>20 0 8 4 0 +sum 10 10 6 6 + + +In the plot, both family sizes of the ab and ba strands were used. +Whereas the total numbers indicate only the single count of the tags per region. +Region total nr. of tags per region +87_636 5 +656_1143 5 +1141_1564 3 +1892_2398 3 +sum of tags 16