Mercurial > repos > jay > pdaug_addclasslabel
comparison PDAUG_Peptide_Ngrams/PDAUG_Peptide_Ngrams.py @ 0:2df11ea23f10 draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
| author | jay |
|---|---|
| date | Wed, 28 Oct 2020 02:36:27 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:2df11ea23f10 |
|---|---|
| 1 import matplotlib | |
| 2 matplotlib.use('Agg') | |
| 3 import os | |
| 4 import sys | |
| 5 sys.path.insert(0, os.path.abspath('..')) | |
| 6 import quantiprot | |
| 7 from quantiprot.utils.io import load_fasta_file | |
| 8 from quantiprot.utils.feature import Feature, FeatureSet | |
| 9 from quantiprot.metrics.aaindex import get_aa2hydropathy | |
| 10 from quantiprot.metrics.basic import identity | |
| 11 from quantiprot.metrics.ngram import pattern_match, pattern_count | |
| 12 from quantiprot.analysis.ngram import ngram_count | |
| 13 from quantiprot.analysis.ngram import zipf_law_fit | |
| 14 from matplotlib import pyplot as plt | |
| 15 | |
| 16 | |
| 17 def Run_ngrams(fasta1, fasta2, OutFile ): | |
| 18 | |
| 19 alphasyn_seq = load_fasta_file(fasta1) | |
| 20 amyload_pos_seq = load_fasta_file(fasta2) | |
| 21 | |
| 22 fs_aa = FeatureSet("aa patterns") | |
| 23 fs_aa.add(identity) | |
| 24 fs_aa.add(pattern_match, pattern='VT', padded=True) | |
| 25 fs_aa.add(pattern_count, pattern='VT') | |
| 26 | |
| 27 result_seq = fs_aa(alphasyn_seq) | |
| 28 | |
| 29 fs_hp = FeatureSet("hydropathy patterns") | |
| 30 fs_hp.add(Feature(get_aa2hydropathy())) | |
| 31 fs_hp.add(Feature(get_aa2hydropathy()).then(pattern_match, pattern=[0.0, 2.0], | |
| 32 metric='taxi', radius=1.0)) | |
| 33 result_seq2 = fs_hp(alphasyn_seq) | |
| 34 result_freq = ngram_count(alphasyn_seq, n=2) | |
| 35 result_fit = zipf_law_fit(amyload_pos_seq, n=3, verbose=True) | |
| 36 | |
| 37 counts = sorted(result_fit["ngram_counts"], reverse=True) | |
| 38 ranks = range(1, len(counts)+1) | |
| 39 | |
| 40 slope = result_fit["slope"] | |
| 41 harmonic_num = sum([rank**-slope for rank in ranks]) | |
| 42 fitted_counts = [(rank**-slope) / harmonic_num * sum(counts) for rank in ranks] | |
| 43 | |
| 44 plt.plot(ranks, counts, 'k', label="empirical") | |
| 45 plt.plot(ranks, fitted_counts, 'k--', | |
| 46 label="Zipf's law\nslope: {:.2f}".format((slope))) | |
| 47 plt.xlabel('rank') | |
| 48 plt.ylabel('count') | |
| 49 plt.xscale('log') | |
| 50 plt.yscale('log') | |
| 51 plt.legend() | |
| 52 | |
| 53 plt.savefig(OutFile) | |
| 54 | |
| 55 if __name__=="__main__": | |
| 56 | |
| 57 | |
| 58 import argparse | |
| 59 | |
| 60 parser = argparse.ArgumentParser() | |
| 61 | |
| 62 parser.add_argument("-f1", "--Fasta1", | |
| 63 required=True, | |
| 64 default=None, | |
| 65 help="First fasta file") | |
| 66 | |
| 67 parser.add_argument("-f2", "--Fasta2", | |
| 68 required=True, | |
| 69 default=None, | |
| 70 help="Second fasta file") | |
| 71 | |
| 72 | |
| 73 parser.add_argument("--OutFile", | |
| 74 required=True, | |
| 75 help="HTML out file", | |
| 76 default="report.html") | |
| 77 | |
| 78 | |
| 79 args = parser.parse_args() | |
| 80 | |
| 81 Run_ngrams(args.Fasta1, args.Fasta2, args.OutFile) | |
| 82 |
