Mercurial > repos > jay > pdaug_peptide_core_descriptors
comparison PDAUG_Word_Vector_Model/PDAUG_Word_Vector_Model.py @ 0:0fc091fb7e8f draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author | jay |
---|---|
date | Wed, 28 Oct 2020 02:12:57 +0000 |
parents | |
children | 9d8691179324 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0fc091fb7e8f |
---|---|
1 import nltk | |
2 from nltk import trigrams | |
3 import pandas as pd | |
4 from Bio import SeqIO | |
5 import gensim, logging | |
6 import argparse | |
7 | |
# Command-line interface: the input FASTA file, the two Word2Vec
# hyper-parameters exposed to the user, and the output location.
parser = argparse.ArgumentParser(
    description="Train a Word2Vec model on 3-residue tokens read from a FASTA file."
)

parser.add_argument(
    "-I", "--Input", required=True,
    help="Path to the input FASTA file",
)
parser.add_argument(
    "-M", "--min_count", required=False, type=int, default=0,
    help="Value passed to Word2Vec as min_count (default: 0)",
)
parser.add_argument(
    "-W", "--window", required=False, type=int, default=5,
    help="Value passed to Word2Vec as window (default: 5)",
)
parser.add_argument(
    "-O", "--OutFile", required=False, default='model.txt',
    help="Path of the output file, written in word2vec text format (default: model.txt)",
)

# Parsed once at import time; the rest of the script reads options from `args`.
args = parser.parse_args()
16 | |
class ProteinSeq(object):
    """Restartable iterable of 3-mer "sentences" built from a FASTA file.

    Every record contributes three sentences, one per starting offset
    (0, 1 and 2).  Each sentence is the list of consecutive, non-overlapping
    3-residue tokens beginning at that offset.  The file is re-opened on
    every ``__iter__`` call, so the object can be iterated more than once.

    Args:
        path: FASTA file to read.  When omitted, the module-level
            ``args.Input`` is looked up at iteration time.
    """

    def __init__(self, path=None):
        self.path = path

    @staticmethod
    def _frame_tokens(sequence, frame, k=3):
        """Return the non-overlapping k-mers of *sequence* starting at *frame*.

        Trailing residues that do not fill a complete k-mer are dropped, so a
        sequence shorter than ``frame + k`` yields an empty list.
        """
        # A k-mer may start no later than len - k; range() excludes its stop.
        stop = len(sequence) - k + 1
        return [sequence[start:start + k] for start in range(frame, stop, k)]

    def __iter__(self):
        """Yield one token list per (record, offset) pair, offsets 0, 1, 2."""
        source = args.Input if self.path is None else self.path
        for record in SeqIO.parse(source, 'fasta'):
            # Convert once so slicing produces plain strings for every offset.
            residues = str(record.seq)
            for frame in range(3):
                yield self._frame_tokens(residues, frame)
# Training hyper-parameters that are fixed rather than read from the CLI.
size = 200  # dimensionality of the learned vectors
sg = 1      # forwarded to Word2Vec as `sg` (1 selects skip-gram per gensim docs)

sentences = ProteinSeq()

word2vec_kwargs = {
    # int() keeps this working whether or not argparse already converted.
    "min_count": int(args.min_count),
    "window": int(args.window),
    "sg": sg,
    "workers": 10,
}
# gensim 4.0 renamed the `size` keyword to `vector_size`; choose the spelling
# the installed release accepts so the script runs on both 3.x and 4.x.
gensim_major = int(gensim.__version__.split(".")[0])
word2vec_kwargs["vector_size" if gensim_major >= 4 else "size"] = size

model = gensim.models.Word2Vec(sentences, **word2vec_kwargs)
# Plain-text word2vec format: one token and its vector per line.
model.wv.save_word2vec_format(args.OutFile, binary=False)
38 |