1. skip-gram
skip-gram predicts the context words from the center word.
Each word is mapped to an embedding; the inputs have shape [batch_size] and the labels have shape [batch_size, context_len].
nce_loss implements negative sampling, and the weights are updated with stochastic gradient descent; negative samples are drawn with a probability that grows with word frequency, so frequent words are picked more often.
One positive example is combined with several sampled negative examples, and a logistic regression is trained over them (the objective is sketched right below).
For details, see https://www.cnblogs.com/pinard/p/7249903.html
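For reference (this is the standard word2vec formulation, not taken from the linked post), the per-pair skip-gram negative-sampling objective for a center word w, a true context word c, and k negatives drawn from a noise distribution P_n is:

L(w, c) = -\log \sigma(u_c^{\top} v_w) - \sum_{i=1}^{k} \mathbb{E}_{w_i \sim P_n(w)} \left[ \log \sigma(-u_{w_i}^{\top} v_w) \right]

where v_w is the center-word (input) embedding, u_c is the context (output) embedding, and \sigma is the sigmoid. In the word2vec paper the noise distribution is the unigram frequency raised to the 3/4 power, P_n(w) \propto f(w)^{3/4}, which is what "frequent words are sampled more often" refers to. Note that tf.nn.nce_loss in the code below uses a log-uniform candidate sampler by default rather than this exact distribution.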
2. cbow
CBOW predicts the center word from its context words; a minimal pair-construction sketch follows below.
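The sketch below is not from the original post; it only illustrates, on a small made-up corpus, how CBOW (context, center) pairs could be built. The names (cbow_pairs, center, etc.) are invented for illustration.

# Hedged sketch, not from the original post: building CBOW training pairs
# from a toy corpus; the two surrounding words jointly predict the center word.
sentences = ['i like dog', 'i like cat', 'i like animal']
word_sequence = ' '.join(sentences).split()
word_list = list(set(word_sequence))
word_dict = {w: i for i, w in enumerate(word_list)}

cbow_pairs = []
for i in range(1, len(word_sequence) - 1):
    context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
    center = word_dict[word_sequence[i]]
    cbow_pairs.append([context, center])   # inputs: context ids, label: center id
print('cbow_pairs = ', cbow_pairs)

In a full CBOW model, the context ids are looked up in the embedding table and averaged (or summed) into a single [batch_size, embedding_size] tensor, which then plays the role of select_embedding in the nce_loss call of the skip-gram code below.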
A simple skip-gram tutorial in TensorFlow (TF1-style) code:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

sentences = ['i like dog', 'i like cat', "i like animal", "dog cat animal",
             "apple cat dog like", "dog fish milk like", "dog cat eyes like",
             "i like apple", "apple i hate", "apple i movie book music like",
             "cat dog hate", "cat dog like"]

word_sequence = ' '.join(sentences).split()   # full token sequence, in order
word_list = list(set(word_sequence))          # unique vocabulary
print(word_list)
word_dict = dict(zip(word_list, range(len(word_list))))  # word -> id

# word2vec Parameters
batch_size = 20
embedding_size = 2   # 2-dimensional embeddings so they can be plotted directly
num_sampled = 10     # number of negative samples (should be smaller than voc_size)
voc_size = len(word_list)


def random_batch(data, size):
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(data)), size=size, replace=False)
    for i in random_index:
        random_inputs.append(data[i][0])    # target (center) word id
        random_labels.append([data[i][1]])  # context word id, wrapped for nce_loss
    return random_inputs, random_labels


# make skip-gram pairs with a window size of one
skip_grams = []
for i in range(1, len(word_sequence) - 1):
    target = word_dict[word_sequence[i]]
    context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
    for w in context:
        skip_grams.append([target, w])
print('skip_grams = ', skip_grams)

# Model
inputs = tf.placeholder(dtype=tf.int64,shape=[batch_size],name='inputs')
labels = tf.placeholder(dtype=tf.int64, shape=[batch_size, 1], name='labels')  # tf.nn.nce_loss expects labels of shape [batch_size, 1]

embeddings = tf.get_variable(name='embeddings', shape=[voc_size, embedding_size],
                             initializer=tf.random_uniform_initializer(minval=-1., maxval=1.))
select_embedding = tf.nn.embedding_lookup(params=embeddings, ids=inputs)

nce_weights = tf.get_variable(name='nce_weights', shape=[voc_size, embedding_size],
                              initializer=tf.random_uniform_initializer(minval=-1., maxval=1.))
nce_biases = tf.get_variable(name='nce_biases', shape=[voc_size],
                             initializer=tf.zeros_initializer())

# loss and optimizer
loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, labels=labels,
                                     inputs=select_embedding,
                                     num_sampled=num_sampled, num_classes=voc_size))
optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)

# training
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(5000):
        batch_inputs, batch_labels = random_batch(skip_grams, batch_size)
        _, loss_val = sess.run([optimizer, loss],
                               feed_dict={inputs: batch_inputs, labels: batch_labels})
        if (epoch + 1) % 1000 == 0:
            print('Epoch : ', '%04d' % (epoch + 1), 'cost = ', '{:.6f}'.format(loss_val))
    trained_embeddings = embeddings.eval()

# plot the learned 2-d embeddings
for i, label in enumerate(word_list):
    x, y = trained_embeddings[i]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
plt.show()
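A possible follow-up (not part of the original tutorial): once trained_embeddings has been pulled out of the session, ranking words by cosine similarity gives a quick sanity check of what the 2-d vectors learned. The helper name most_similar is made up here; it reuses trained_embeddings, word_list and word_dict from the script above.

import numpy as np

def most_similar(word, topn=3):
    # normalize rows, then rank all words by cosine similarity to `word`
    vecs = trained_embeddings / np.linalg.norm(trained_embeddings, axis=1, keepdims=True)
    sims = vecs @ vecs[word_dict[word]]
    order = np.argsort(-sims)
    return [(word_list[i], float(sims[i])) for i in order if word_list[i] != word][:topn]

print(most_similar('dog'))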