Reverse word embeddings in keras - python

I am trying to make a chatbot in Keras. I assigned every word in the vocabulary its own ID. One training sample looks like this:

[0 0 0 0 0 0 32 328 2839 13 192 1 ] -> [23 3289 328 2318 12 0 0 0 0 0 0 0]
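For reference, a minimal sketch of how such padded ID sequences can be produced with Keras' own preprocessing utilities (the sample sentences, the maxlen value and the variable names are only illustrative):

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

questions = ['how are you today', 'what is your name']    # made-up examples
answers = ['i am fine thanks', 'my name is bot']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions + answers)               # word -> integer ID

x = tokenizer.texts_to_sequences(questions)               # lists of IDs
y = tokenizer.texts_to_sequences(answers)

# Left-pad the inputs and right-pad the targets, matching the sample above.
x = pad_sequences(x, maxlen=12, padding='pre', value=0)
y = pad_sequences(y, maxlen=12, padding='post', value=0)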

I then use Keras' Embedding layer to embed these IDs into vectors of size 32, and LSTM layers as the hidden layers. The problem is that my output looks like a list of embedded IDs:

[ 0.16102183 0.1238187 0.1159694 0.13688719 0.12964118 0.12848872 0.13515817 0.13582146 0.16919741 0.15453722 ... ]

How can I convert these embeddings back to the words of my original vocabulary?
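One common approach, assuming the predicted vectors really live in the same 32-dimensional space as the Embedding layer's weight matrix, is a nearest-neighbour lookup by cosine similarity followed by an inverted vocabulary. A rough sketch (nearest_words, embedding_matrix and id_to_word are illustrative names, not part of my code):

import numpy as np

def nearest_words(pred_vectors, embedding_matrix, id_to_word):
    # Normalize the embedding rows once so that a dot product equals cosine similarity.
    norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
    normalized = embedding_matrix / np.clip(norms, 1e-8, None)
    words = []
    for vec in pred_vectors:
        v = vec / max(np.linalg.norm(vec), 1e-8)
        scores = normalized.dot(v)               # cosine similarity against every vocabulary word
        words.append(id_to_word[int(np.argmax(scores))])
    return words

# embedding_matrix = model.layers[0].get_weights()[0]   # shape (vocab_size, 32)
# id_to_word = {i: w for w, i in vocabulary.items()}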

Here is my code:

from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer 
from keras.models import Sequential, load_model 
from keras.layers import LSTM 
from keras.layers.embeddings import Embedding 
from keras.preprocessing import sequence 

import os 

import numpy as np 
import cPickle as pickle 


class Chatbot(object):

    def __init__(self, h_layers=1):
        # self.name = name
        self.h_layers = h_layers
        self.seq2seq = None
        self.max_length = 0
        self.vocabulary = {}

    @staticmethod
    def load(model_name):
        with open('models/{}/chatbot_object.pkl'.format(model_name), 'rb') as pickle_file:
            obj = pickle.load(pickle_file)
        obj.seq2seq = load_model('models/{}/seq2seq.h5'.format(model_name))
        return obj

    def train(self, x_train, y_train):
        count_vect = CountVectorizer()
        count_vect.fit(x_train)
        count_vect.fit(y_train)

        self.vocabulary = count_vect.vocabulary_
        self.vocabulary.update({'<START>': len(self.vocabulary),
                                '<END>': len(self.vocabulary) + 1,
                                '<PAD>': len(self.vocabulary) + 2,
                                '<UNK>': len(self.vocabulary) + 3})

        for i in range(len(x_train)):
            x_train[i] = ['<START>'] + [w.lower() for w in word_tokenize(x_train[i])] + ['<END>']
        for i in range(len(y_train)):
            y_train[i] = ['<START>'] + [w.lower() for w in word_tokenize(y_train[i])] + ['<END>']

        for sample in x_train:
            if len(sample) > self.max_length:
                self.max_length = len(sample)
        for sample in y_train:
            if len(sample) > self.max_length:
                self.max_length = len(sample)

        for i in range(len(x_train)):
            x_train[i] = [self.vocabulary[w] for w in x_train[i] if w in self.vocabulary]
        for i in range(len(y_train)):
            y_train[i] = [self.vocabulary[w] for w in y_train[i] if w in self.vocabulary]

        x_train = sequence.pad_sequences(x_train, maxlen=self.max_length, value=self.vocabulary['<PAD>'])
        y_train = sequence.pad_sequences(y_train, maxlen=self.max_length, padding='post',
                                         value=self.vocabulary['<PAD>'])

        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)

        embedding_vector_length = 32

        self.seq2seq = Sequential()
        self.seq2seq.add(Embedding(len(self.vocabulary), embedding_vector_length, input_length=self.max_length))

        for _ in range(self.h_layers):
            self.seq2seq.add(LSTM(self.max_length, return_sequences=True))

        self.seq2seq.add(LSTM(self.max_length))
        self.seq2seq.compile(loss='cosine_proximity', optimizer='adam', metrics=['accuracy'])
        self.seq2seq.fit(x_train[:100], y_train[:100], epochs=5, batch_size=32)

    def save(self, filename):
        if filename not in os.listdir('models'):
            os.system('mkdir models/{}'.format(filename))
        self.seq2seq.save('models/{}/seq2seq.h5'.format(filename))
        self.seq2seq = None
        with open('models/{}/chatbot_object.pkl'.format(filename), 'wb') as pickle_file:
            pickle.dump(self, pickle_file)

    def respond(self, text):
        tokens = ['<START>'] + [w.lower() for w in word_tokenize(text)] + ['<END>']
        for i in range(len(tokens)):
            if tokens[i] in self.vocabulary:
                tokens[i] = self.vocabulary[tokens[i]]
            else:
                tokens[i] = self.vocabulary['<PAD>']
        x = sequence.pad_sequences([tokens], maxlen=self.max_length, value=self.vocabulary['<PAD>'])
        prediction = self.seq2seq.predict(x, batch_size=1)
        return prediction[0]

Answer

I couldn't find an answer to this either, so I wrote a lookup function.

def lookup(tokenizer, vec, returnIntNotWord=True):
    twordkey = [(k, tokenizer.word_index[k]) for k in sorted(tokenizer.word_index, key=tokenizer.word_index.get, reverse=False)]
    oneHotVec = []  # captures the indices of the words
    engVec = []  # this one returns the indices and the words; make sure returnIntNotWord is False, though
    for eachRow, notUsed in enumerate(vec):
        for index, item in enumerate(vec[0]):
            if vec[eachRow][index] == 1:
                oneHotVec.append(index)
    for index in oneHotVec:
        engVec.append(twordkey[index])
    if returnIntNotWord:
        return oneHotVec
    else:
        return engVec

Tokenizer is the Keras Tokenizer.
Vec is the 2D vector of one-hot encoded labels.
ReturnIntNotWord is explained in the comments.
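A rough usage sketch for the function above, assuming the one-hot columns are ordered the same way as twordkey (column 0 for the word with the lowest Tokenizer index); the sample sentence and variable names are made up:

import numpy as np
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(['hello how are you'])

vocab_size = len(tokenizer.word_index)                    # Tokenizer indices start at 1
ids = tokenizer.texts_to_sequences(['how are you'])[0]

# Build one-hot rows whose column order matches twordkey (column 0 = lowest index).
vec = np.zeros((len(ids), vocab_size))
for row, word_id in enumerate(ids):
    vec[row, word_id - 1] = 1

print(lookup(tokenizer, vec))                             # the column indices
print(lookup(tokenizer, vec, returnIntNotWord=False))     # the (word, index) pairs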