Mercurial > repos > jay > pdaug_peptide_sequence_analysis
diff PDAUG_Word_Vector_Model/PDAUG_Word_Vector_Model.py @ 0:e59674e3a391 draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit 6f53ad797ec1af02b41510063a86bec7d121abf3"
author | jay |
---|---|
date | Fri, 20 Nov 2020 19:47:44 +0000 |
parents | |
children | d11a54691a2f |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/PDAUG_Word_Vector_Model/PDAUG_Word_Vector_Model.py Fri Nov 20 19:47:44 2020 +0000
"""Train a skip-gram Word2Vec model on tri-peptide "words" from a FASTA file.

Every sequence in the input FASTA file is turned into three "sentences",
one per reading frame (offsets 0, 1 and 2). Each sentence is the list of
non-overlapping three-residue words starting at that offset. The trained
word vectors are written in the textual word2vec format.

Command-line options:
    -I / --Input      FASTA file to read (required).
    -M / --min_count  Minimum total frequency for a word to be kept (default 0).
    -W / --window     Word2Vec context window size (default 5).
    -O / --OutFile    Output path for the word vectors (default 'model.txt').
"""
import argparse
import logging

import gensim
import nltk
import pandas as pd
from Bio import SeqIO
from nltk import trigrams

parser = argparse.ArgumentParser()

parser.add_argument("-I", "--Input", required=True,
                    help="Path to target fasta file")
# type=int lets argparse reject non-numeric values with a usage message
# instead of a ValueError traceback later on.
parser.add_argument("-M", "--min_count", required=False, default=0, type=int,
                    help="Ignore words whose total frequency is lower than this")
parser.add_argument("-W", "--window", required=False, default=5, type=int,
                    help="Maximum distance between the current and predicted word")
parser.add_argument("-O", "--OutFile", required=False, default='model.txt',
                    help="Path to the output word vector file (word2vec text format)")

args = parser.parse_args()


class ProteinSeq(object):
    """Re-iterable corpus of tri-peptide sentences read from a FASTA file.

    Word2Vec iterates over its corpus several times, so this is a class
    with ``__iter__`` (re-parsing the file on every pass) rather than a
    one-shot generator.
    """

    def __init__(self, fasta_path=None):
        """Remember which FASTA file to read.

        Args:
            fasta_path: Path to the FASTA file. Defaults to the ``--Input``
                command-line argument when omitted.
        """
        self.fasta_path = args.Input if fasta_path is None else fasta_path

    def __iter__(self):
        """Yield three lists of tri-peptide strings per FASTA record.

        For each record, the list for frame ``k`` (0, 1, 2) holds the
        trigrams starting at residue ``k, k + 3, k + 6, ...``. Sequences
        shorter than three residues yield empty lists.
        """
        for record in SeqIO.parse(self.fasta_path, 'fasta'):
            # Build the overlapping trigrams once per record; every reading
            # frame is then a stride-3 slice of that list.
            tri_peps = ["".join(tri) for tri in trigrams(record.seq)]
            for frame in range(3):
                yield tri_peps[frame::3]


# Fixed training hyper-parameters.
size = 200      # dimensionality of the word vectors
sg = 1          # 1 selects the skip-gram architecture
workers = 10    # number of worker threads

# gensim 4 renamed the ``size`` keyword of Word2Vec to ``vector_size``;
# choose the spelling that matches the installed version.
size_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

sentences = ProteinSeq()
model = gensim.models.Word2Vec(
    sentences,
    min_count=args.min_count,
    window=args.window,
    sg=sg,
    workers=workers,
    **{size_kwarg: size}
)
model.wv.save_word2vec_format(args.OutFile, binary=False)