Mercurial > repos > jay > pdaug_peptide_core_descriptors
comparison PDAUG_Word_Vector_Descriptor/PDAUG_Word_Vector_Descriptor.py @ 0:0fc091fb7e8f draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author | jay |
---|---|
date | Wed, 28 Oct 2020 02:12:57 +0000 |
parents | |
children | d8c51a04f5b2 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0fc091fb7e8f |
---|---|
import argparse
import os
import time

import gensim
import numpy as np
import pandas as pd
from Bio import SeqIO
from nltk import bigrams
from nltk import trigrams
9 | |
# Seed handed to NumPy's global RNG in main(). Nothing in this script draws
# random numbers itself; the call is retained from the original script.
SEED = 42

# Length of the overlapping sequence "words" looked up in the embedding model.
KMER_SIZE = 3


def build_parser():
    """Return the command-line parser for the word-vector descriptor script.

    Option names and the ``--OutFile`` default are unchanged; the numeric
    options are converted to ``int`` by argparse.
    """
    parser = argparse.ArgumentParser(
        description="Sum 3-mer word vectors per FASTA sequence and write a "
                    "labelled, tab-separated descriptor table."
    )
    parser.add_argument("-M", "--ModelInput", required=True,
                        help="Path to the word2vec model in text format")
    parser.add_argument("-R", "--row", required=True, type=int,
                        help="Number of rows in the output matrix "
                             "(number of sequences expected in the FASTA file)")
    parser.add_argument("-I", "--InputFasta", required=True,
                        help="Path to the input FASTA file")
    parser.add_argument("-O", "--OutFile", required=False, default='model.txt',
                        help="Path of the tab-separated output file")
    parser.add_argument("-P", "--positive", required=True, type=int,
                        help="Number of leading rows labelled 1 in Class_label")
    parser.add_argument("-N", "--negative", required=True, type=int,
                        help="Number of following rows labelled 0 in Class_label")
    return parser


def sequence_vector(sequence, model, vector_size):
    """Sum the model vectors of every overlapping 3-mer in ``sequence``.

    Args:
        sequence: Any object whose ``str()`` is the residue string.
        model: Object supporting ``word in model`` and ``model[word]``
            (a gensim ``KeyedVectors`` in this script; a plain dict works too).
        vector_size: Length of the vectors stored in ``model``.

    Returns:
        A 1-D array of length ``vector_size``; all zeros when no 3-mer of the
        sequence is present in ``model`` or the sequence is shorter than 3.
    """
    text = str(sequence)
    # float32 zeros never widen the accumulator beyond what the model vectors
    # themselves dictate. NOTE(review): gensim is assumed to load word2vec
    # text vectors as float32 by default - confirm if the dtype matters.
    total = np.zeros(vector_size, dtype=np.float32)
    for start in range(len(text) - KMER_SIZE + 1):
        word = text[start:start + KMER_SIZE]
        # Membership is tested on the model itself rather than on a freshly
        # built list of its whole vocabulary for each word.
        if word in model:
            total = total + model[word]
    return total


def embed_sequences(sequences, model, n_rows, vector_size):
    """Build the ``(n_rows, vector_size)`` descriptor matrix.

    Row ``i`` holds ``sequence_vector`` of the ``i``-th sequence. Rows beyond
    the number of sequences supplied stay zero.

    Raises:
        IndexError: If more than ``n_rows`` sequences are supplied.
    """
    matrix = np.zeros(shape=(n_rows, vector_size))
    for index, sequence in enumerate(sequences):
        if index >= n_rows:
            raise IndexError(
                "input contains more sequences than --row=%d" % n_rows
            )
        matrix[index] = sequence_vector(sequence, model, vector_size)
    return matrix


def class_labels(n_positive, n_negative):
    """Return a 1-D array of ``n_positive`` ones followed by ``n_negative`` zeros."""
    return np.concatenate((np.ones(n_positive), np.zeros(n_negative)))


def build_feature_table(features, n_positive, n_negative):
    """Combine the descriptor matrix with its ``Class_label`` column.

    Feature columns are named ``0 .. features.shape[1] - 1``.

    NOTE(review): the two frames are joined on their row index, so if
    ``n_positive + n_negative`` differs from the number of feature rows the
    shorter side is expected to be padded with NaN rather than rejected -
    callers should pass consistent counts.
    """
    columns = list(range(features.shape[1]))
    table = pd.DataFrame(features, columns=columns)
    labels = pd.DataFrame(class_labels(n_positive, n_negative),
                          columns=["Class_label"])
    return pd.concat([table, labels], axis=1)


def main(argv=None):
    """Run the script: parse arguments, embed the FASTA records, write the TSV.

    Args:
        argv: Optional argument list; ``None`` makes argparse read ``sys.argv``.
    """
    args = build_parser().parse_args(argv)

    np.random.seed(SEED)

    model = gensim.models.KeyedVectors.load_word2vec_format(
        args.ModelInput, binary=False
    )

    # The matrix width follows the loaded model instead of a fixed 200.
    records = SeqIO.parse(args.InputFasta, 'fasta')
    features = embed_sequences(
        (record.seq for record in records), model, args.row, model.vector_size
    )

    table = build_feature_table(features, args.positive, args.negative)
    table.to_csv(args.OutFile, index=False, sep="\t")


if __name__ == "__main__":
    main()