1.预处理和数据获取
# ---------------------------------------------------------------------------
# Setup: imports, nltk data path, pre-trained embeddings, and the
# English->French dictionaries used for training and evaluation.
# ---------------------------------------------------------------------------
import pdb
import pickle
import string
import time

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import scipy
import sklearn
from gensim.models import KeyedVectors
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

from utils import (cosine_similarity, get_dict, process_tweet)
from os import getcwd

# Add folder tmp2 from our local workspace (containing pre-downloaded
# corpora files) to nltk's data path.
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

# Load the full pre-trained embeddings (large files; only a small subset
# of the vocabulary is kept below).
en_embeddings = KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin', binary=True)
fr_embeddings = KeyedVectors.load_word2vec_format('./wiki.multi.fr.vec')

# Loading the english to french dictionaries.
en_fr_train = get_dict('en-fr.train.txt')
print('The length of the english to french training dictionary is', len(en_fr_train))
en_fr_test = get_dict('en-fr.test.txt')
# BUG FIX: the original printed len(en_fr_train) for the test dictionary.
print('The length of the english to french test dictionary is', len(en_fr_test))

english_set = set(en_embeddings.vocab)
french_set = set(fr_embeddings.vocab)

# Keep only embeddings for word pairs that appear in the train/test
# dictionaries AND have a vector in both embedding models.
en_embeddings_subset = {}
fr_embeddings_subset = {}
french_words = set(en_fr_train.values())

for en_word in en_fr_train.keys():
    fr_word = en_fr_train[en_word]
    if fr_word in french_set and en_word in english_set:
        en_embeddings_subset[en_word] = en_embeddings[en_word]
        fr_embeddings_subset[fr_word] = fr_embeddings[fr_word]

for en_word in en_fr_test.keys():
    fr_word = en_fr_test[en_word]
    if fr_word in french_set and en_word in english_set:
        en_embeddings_subset[en_word] = en_embeddings[en_word]
        fr_embeddings_subset[fr_word] = fr_embeddings[fr_word]

# Cache the subsets so the large embedding files need not be reloaded.
# (Using `with` so the file handles are closed deterministically.)
with open("en_embeddings.p", "wb") as f:
    pickle.dump(en_embeddings_subset, f)
with open("fr_embeddings.p", "wb") as f:
    pickle.dump(fr_embeddings_subset, f)

with open("en_embeddings.p", "rb") as f:
    en_embeddings_subset = pickle.load(f)
with open("fr_embeddings.p", "rb") as f:
    fr_embeddings_subset = pickle.load(f)

# Loading the English to French dictionaries (reloaded after caching).
en_fr_train = get_dict('en-fr.train.txt')
print('The length of the English to French training dictionary is', len(en_fr_train))
en_fr_test = get_dict('en-fr.test.txt')
# BUG FIX: the original printed len(en_fr_train) here as well.
print('The length of the English to French test dictionary is', len(en_fr_test))
def get_matrices(en_fr, french_vecs, english_vecs):
    """Build aligned embedding matrices from a bilingual dictionary.

    Input:
        en_fr: English to French dictionary.
        french_vecs: French words to their corresponding word embeddings.
        english_vecs: English words to their corresponding word embeddings.
    Output:
        X: a matrix where row i is the English embedding of the i-th pair.
        Y: a matrix where row i is the French embedding of the i-th pair.
    """
    # X_l and Y_l collect the English and French word embeddings
    # (original used the inconsistent name `Y_1`).
    X_l = []
    Y_l = []

    # Vocabularies for which an embedding is actually available.
    english_set = english_vecs.keys()
    french_set = french_vecs.keys()

    # Keep only the pairs where BOTH words have an embedding, so that
    # row i of X and row i of Y always describe the same word pair.
    for en_word, fr_word in en_fr.items():
        if fr_word in french_set and en_word in english_set:
            X_l.append(english_vecs[en_word])
            Y_l.append(french_vecs[fr_word])

    # Stack the lists of row vectors into (m, n) matrices.
    X = np.vstack(X_l)
    Y = np.vstack(Y_l)

    return X, Y


X_train, Y_train = get_matrices(en_fr_train, fr_embeddings_subset, en_embeddings_subset)
2. Solve XR ≈ Y — compute the loss, its gradient, and the transformation matrix R
损失函数、梯度计算、转换矩阵R
def compute_loss(X, Y, R):
    '''Mean squared Frobenius loss of the linear map R.

    Inputs:
        X: a matrix of dimension (m,n) where the rows are the English embeddings.
        Y: a matrix of dimension (m,n) where the rows are the French embeddings.
        R: a matrix of dimension (n,n) - transformation matrix from English
           to French vector space embeddings.
    Outputs:
        loss: a scalar - ||XR - Y||_F^2 averaged over the m examples.
              (The original docstring wrongly described this as an (m,n) matrix.)
    '''
    # m is the number of training examples (rows in X).
    m = X.shape[0]
    # diff is XR - Y.
    diff = np.dot(X, R) - Y
    # Element-wise square of the difference.
    diff_squared = np.square(diff)
    # Sum of all the squared elements.
    sum_diff_squared = np.sum(diff_squared)
    # Average over the number of examples.
    loss = sum_diff_squared / m
    return loss


def compute_gradient(X, Y, R):
    '''Gradient of compute_loss with respect to R.

    Inputs:
        X: a matrix of dimension (m,n) where the rows are the English embeddings.
        Y: a matrix of dimension (m,n) where the rows are the French embeddings.
        R: a matrix of dimension (n,n) - transformation matrix from English
           to French vector space embeddings.
    Outputs:
        gradient: a matrix of dimension (n,n) - gradient of the loss
                  function for the given X, Y and R.
    '''
    # m is the number of training examples (rows in X).
    m = X.shape[0]
    # d/dR [ ||XR - Y||_F^2 / m ] = (2/m) * X^T (XR - Y)
    gradient = 2 * np.dot(X.T, np.dot(X, R) - Y) / m
    return gradient
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):
    '''Learn the projection matrix R by gradient descent.

    Inputs:
        X: a matrix of dimension (m,n) where the rows are the English embeddings.
        Y: a matrix of dimension (m,n) where the rows are the French embeddings.
        train_steps: positive int - how many gradient descent steps to run.
        learning_rate: positive float - step size for each update.
    Outputs:
        R: a matrix of dimension (n,n) - the projection matrix that
           minimizes the F norm ||X R - Y||^2.
    '''
    # Fixed seed so the random initialization of R is reproducible.
    np.random.seed(129)

    # R is a square matrix sized to the embedding dimensionality
    # (number of columns of X, e.g. 300).
    R = np.random.rand(X.shape[1], X.shape[1])

    for i in range(train_steps):
        # Log the loss every 25 iterations to monitor convergence.
        if i % 25 == 0:
            print(f"loss at iteration {i} is: {compute_loss(X, Y, R):.4f}")
        # One gradient descent step on R.
        gradient = compute_gradient(X, Y, R)
        R -= learning_rate * gradient

    return R


R_train = align_embeddings(X_train, Y_train, train_steps=400, learning_rate=0.8)
3. test model (nearest neighbors)
def nearest_neighbor(v, candidates, k=1):
    """Find the k candidates most similar to v by cosine similarity.

    Input:
        - v: the vector you are going to find the nearest neighbor for.
        - candidates: a matrix whose rows are the candidate vectors.
        - k: top k nearest neighbors to find.
    Output:
        - k_idx: indices of the top k closest vectors, in ascending order
          of similarity (most similar last).
    """
    # Cosine similarity between v and every candidate row.
    similarity_l = [cosine_similarity(v, row) for row in candidates]

    # Indices that would sort the similarities in ascending order.
    sorted_ids = np.argsort(similarity_l)

    # The last k indices belong to the k most similar candidates.
    k_idx = sorted_ids[-k:]
    return k_idx


def test_vocabulary(X, Y, R):
    '''Accuracy of the translation X @ R against the reference Y.

    Input:
        X: a matrix where the rows are the English embeddings.
        Y: a matrix where the rows are the corresponding French embeddings.
        R: the transform matrix which translates word embeddings from
           English to French word vector space.
    Output:
        accuracy: fraction of rows whose predicted translation's nearest
                  neighbor in Y is the row with the same index.
    '''
    # The predicted French embeddings are X times R.
    pred = np.dot(X, R)

    num_correct = 0
    # For each translated embedding, check whether its nearest neighbor
    # among the rows of Y is the gold row (same index).
    for i in range(len(pred)):
        pred_idx = nearest_neighbor(pred[i], Y, k=1)
        # BUG FIX: pred_idx is a length-1 ndarray; `if pred_idx == i`
        # relied on implicit truthiness of a size-1 array (and breaks
        # outright for k > 1). Index element 0 explicitly.
        if pred_idx[0] == i:
            num_correct += 1

    # Accuracy is number correct divided by the number of rows of X.
    accuracy = num_correct / X.shape[0]
    return accuracy
The accuracy of the model is 0.557.
4. LSH (locality-sensitive hashing) 的应用
在相同hash值内的子空间内完成kNN,通过牺牲一定的精度提高模型的预测速度
a. 计算文档向量
def get_document_embedding(tweet, en_embeddings):
    '''Embed a tweet as the sum of its word embeddings.

    Input:
        - tweet: a string.
        - en_embeddings: a dictionary of word embeddings.
    Output:
        - doc_embedding: sum of all word embeddings in the tweet.
    '''
    # Accumulate word vectors on top of the zero vector.
    doc_embedding = np.zeros(300)
    for token in process_tweet(tweet):
        # Tokens without an embedding contribute 0 (scalar broadcasts).
        doc_embedding = doc_embedding + en_embeddings.get(token, 0)
    return doc_embedding


def get_document_vecs(all_docs, en_embeddings):
    '''Embed every tweet in the dataset.

    Input:
        - all_docs: list of strings - all tweets in our dataset.
        - en_embeddings: dictionary with words as the keys and their
          embeddings as the values.
    Output:
        - document_vec_matrix: matrix of tweet embeddings (one row per tweet).
        - ind2Doc_dict: dictionary mapping each tweet's index to its embedding.
    '''
    # One embedding per document, in dataset order.
    document_vec_l = [get_document_embedding(doc, en_embeddings)
                      for doc in all_docs]

    # Index -> embedding lookup for the same vectors.
    ind2Doc_dict = {idx: vec for idx, vec in enumerate(document_vec_l)}

    # Stack the row vectors into a 2D matrix.
    document_vec_matrix = np.vstack(document_vec_l)

    return document_vec_matrix, ind2Doc_dict
所有向量空间内查找最相似文档
# Embed the query tweet and pick the corpus document with the highest
# cosine similarity (brute-force search over all document vectors).
# NOTE(review): `my_tweet` and `document_vecs` are defined elsewhere in the
# original notebook, not in this chunk.
tweet_embedding = get_document_embedding(my_tweet, en_embeddings_subset)
idx = np.argmax(cosine_similarity(document_vecs, tweet_embedding))
b. apply LSH for doc search
根据创建的超平面计算哈希值
# Number of document vectors to hash (`all_tweets` is defined elsewhere).
N_VECS = len(all_tweets) # This many vectors.
# Dimensionality of a document vector (taken from an existing entry of
# `ind2Tweet`, defined elsewhere — presumably 300; confirm against caller).
N_DIMS = len(ind2Tweet[1]) # Vector dimensionality.
# Number of hyperplanes per hash universe: 10 planes give 2**10 = 1024 buckets.
N_PLANES = 10
# Number of times to repeat the hashing to improve the search.
# Repeat the hashing in several independent "universes" of random planes.
N_UNIVERSES = 25

# One (N_DIMS, N_PLANES) matrix of random hyperplane normals per universe.
planes_l = [np.random.normal(size=(N_DIMS, N_PLANES))
            for _ in range(N_UNIVERSES)]


def hash_value_of_vector(v, planes):
    """Create a locality-sensitive hash for a vector.

    Input:
        - v: vector of a tweet. Its dimension is (1, N_DIMS).
        - planes: matrix of dimension (N_DIMS, N_PLANES) - the set of
          planes that divide up the region.
    Output:
        - hash_value: an integer used as a hash for the vector.
    """
    # Dot product of v with each plane's normal: shape (1, N_PLANES).
    dot_product = np.dot(v, planes)
    # The sign tells which side of each plane v lies on.
    sign_of_dot_product = np.sign(dot_product)
    # h[i] is True (1) when v is on/above plane i, False (0) otherwise.
    h = sign_of_dot_product >= 0
    # Collapse to a 1D int array.
    # BUG FIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `int` is the documented replacement.
    h = np.squeeze(h.astype(int))

    # Interpret h as a binary number: hash = sum_i 2^i * h[i].
    hash_value = 0
    n_planes = planes.shape[1]
    for i in range(n_planes):
        hash_value += 2**i * h[i]

    # Cast hash_value as a plain Python integer.
    hash_value = int(hash_value)
    return hash_value
创建多个哈希表重复hashing提高检索准确性
创建两个字典分别存储相同哈希值对应的文档向量和文档id
def make_hash_table(vecs, planes):
    """Hash every vector into buckets for one universe of planes.

    Input:
        - vecs: list of vectors to be hashed.
        - planes: the matrix of planes in a single "universe", with shape
          (embedding dimensions, number of planes).
    Output:
        - hash_table: dictionary - keys are hashes, values are lists of
          vectors (hash buckets).
        - id_table: dictionary - keys are hashes, values are lists of
          vector ids (used to know which tweet a hashed vector came from).
    """
    # With p planes there are 2^p possible bucket keys.
    n_planes = planes.shape[1]
    n_buckets = 2**n_planes

    # Pre-create an empty bucket for every possible hash value, in both
    # the vector table and the id table.
    hash_table = {bucket: [] for bucket in range(n_buckets)}
    id_table = {bucket: [] for bucket in range(n_buckets)}

    # Drop each vector (and its dataset index) into its bucket.
    for vec_id, vec in enumerate(vecs):
        bucket = hash_value_of_vector(vec, planes)
        hash_table[bucket].append(vec)
        id_table[bucket].append(vec_id)

    return hash_table, id_table
多个哈希表和id表
# Build one (hash_table, id_table) pair per universe of random planes.
hash_tables = []
id_tables = []
for universe_id in range(N_UNIVERSES):  # there are 25 hashes
    print('working on hash universe #:', universe_id)
    hash_table, id_table = make_hash_table(document_vecs, planes_l[universe_id])
    hash_tables.append(hash_table)
    id_tables.append(id_table)
approximate kNN
def approximate_knn(doc_id, v, planes_l, k=1, num_universes_to_use=N_UNIVERSES):
    """Search for k-NN using LSH buckets instead of the full corpus.

    Input:
        - doc_id: index of the query document (excluded from the candidates).
        - v: the query document vector.
        - planes_l: list of plane matrices, one per universe.
        - k: number of nearest neighbors to return.
        - num_universes_to_use: how many hash universes to search.
    Output:
        - nearest_neighbor_ids: document ids of the k nearest neighbors.
    """
    assert num_universes_to_use <= N_UNIVERSES

    # Vectors that will be checked as possible nearest neighbors.
    vecs_to_consider_l = list()
    # Their document ids, in the same order.
    ids_to_consider_l = list()
    # Set mirror of ids_to_consider_l for O(1) duplicate checks.
    ids_to_consider_set = set()

    # Loop through the universes of planes (search in multiple hash tables).
    for universe_id in range(num_universes_to_use):
        # Planes and hash value of v for this universe.
        planes = planes_l[universe_id]
        hash_value = hash_value_of_vector(v, planes)

        # Bucket contents for this universe: vectors and matching ids.
        hash_table = hash_tables[universe_id]
        document_vectors_l = hash_table[hash_value]
        id_table = id_tables[universe_id]
        new_ids_to_consider = id_table[hash_value]

        # BUG FIX: the original called new_ids_to_consider.remove(doc_id),
        # which (a) mutated the shared id_table bucket in place, corrupting
        # the table for every later call, and (b) desynchronized the id list
        # from document_vectors_l, so document_vectors_l[i] no longer matched
        # new_ids_to_consider[i]. Iterating the two lists in lockstep and
        # skipping doc_id fixes both without mutating the tables.
        for candidate_vector, new_id in zip(document_vectors_l,
                                            new_ids_to_consider):
            if new_id == doc_id:
                # Never return the query document as its own neighbor.
                continue
            if new_id not in ids_to_consider_set:
                vecs_to_consider_l.append(candidate_vector)
                ids_to_consider_l.append(new_id)
                ids_to_consider_set.add(new_id)

    # Now run k-NN on the smaller set of vecs-to-consider.
    print("Fast considering %d vecs" % len(vecs_to_consider_l))

    # Convert the candidate list to a numpy array for nearest_neighbor.
    vecs_to_consider_arr = np.array(vecs_to_consider_l)

    # Exact k-NN on the reduced candidate set.
    nearest_neighbor_idx_l = nearest_neighbor(v, vecs_to_consider_arr, k=k)

    # Map candidate-array indices back to document ids.
    nearest_neighbor_ids = [ids_to_consider_l[idx]
                            for idx in nearest_neighbor_idx_l]

    return nearest_neighbor_ids