fasttext
- n-gram实现
- model
fastText 可用于学习词向量和进行文本分类,它使用词袋(bag of words)以及 n-gram 袋来表征语句。
n-gram实现
# Multipliers used to mix the preceding token ids into a bucket index.
_NGRAM_MULTIPLIER_1 = 14918087
_NGRAM_MULTIPLIER_2 = 18408749


def biGramHash(sequence, t, buckets):
    """Return the bigram bucket index for position ``t`` of ``sequence``.

    The hash is derived from the token id immediately before position ``t``;
    when there is no such token (``t == 0``) the id 0 is used instead.

    Args:
        sequence: Indexable sequence of integer token ids.
        t: Position in ``sequence`` for which the feature is computed.
        buckets: Number of hash buckets; the result lies in ``[0, buckets)``
            for non-negative token ids.

    Returns:
        The bucket index as an int.
    """
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    return (t1 * _NGRAM_MULTIPLIER_1) % buckets


def triGramHash(sequence, t, buckets):
    """Return the trigram bucket index for position ``t`` of ``sequence``.

    The hash combines the two token ids before position ``t``; any of them
    that would fall before the start of the sequence is replaced by 0.

    Args:
        sequence: Indexable sequence of integer token ids.
        t: Position in ``sequence`` for which the feature is computed.
        buckets: Number of hash buckets; the result lies in ``[0, buckets)``
            for non-negative token ids.

    Returns:
        The bucket index as an int.
    """
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    t2 = sequence[t - 2] if t - 2 >= 0 else 0
    mixed = t2 * _NGRAM_MULTIPLIER_1 * _NGRAM_MULTIPLIER_2 + t1 * _NGRAM_MULTIPLIER_1
    return mixed % buckets
n-gram 和 CBOW 很相似,都是利用周边的词进行预测;区别在于 n-gram 只使用一侧的词(上面的实现中只取当前位置之前的一到两个词)。
model
class Model(nn.Module):
    """FastText-style classifier over word, bigram and trigram embeddings.

    The three embeddings are concatenated per position, averaged over
    dimension 1, and passed through a two-layer feed-forward head.

    Args:
        config: Object exposing the attributes ``embedding_pretrained``,
            ``n_vocab``, ``embed``, ``n_gram_vocab``, ``dropout``,
            ``hidden_size`` and ``num_classes``.
    """

    def __init__(self, config):
        super().__init__()
        if config.embedding_pretrained is not None:
            # Pretrained vectors stay trainable (freeze=False).
            self.embedding = nn.Embedding.from_pretrained(
                config.embedding_pretrained, freeze=False
            )
        else:
            # The last vocabulary index is used as the padding index.
            self.embedding = nn.Embedding(
                config.n_vocab, config.embed, padding_idx=config.n_vocab - 1
            )
        self.embedding_ngram2 = nn.Embedding(config.n_gram_vocab, config.embed)
        self.embedding_ngram3 = nn.Embedding(config.n_gram_vocab, config.embed)
        self.dropout = nn.Dropout(config.dropout)
        # Input width is 3 * embed because word/bigram/trigram embeddings
        # are concatenated along the last dimension in forward().
        self.fc1 = nn.Linear(config.embed * 3, config.hidden_size)
        self.fc2 = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        """Compute class scores for a batch.

        Args:
            x: Indexable container; ``x[0]`` holds word ids, ``x[2]`` bigram
                bucket ids and ``x[3]`` trigram bucket ids. ``x[1]`` is not
                read here. Presumably each id tensor is (batch, seq_len) --
                TODO confirm against the data loader.

        Returns:
            Tensor of unnormalized class scores produced by ``fc2``.
        """
        out_word = self.embedding(x[0])
        out_bigram = self.embedding_ngram2(x[2])
        out_trigram = self.embedding_ngram3(x[3])
        out = torch.cat((out_word, out_bigram, out_trigram), -1)

        # Average over dimension 1 to get one vector per example.
        out = out.mean(dim=1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = F.relu(out)
        out = self.fc2(out)
        return out