- python >= 3.5
- Dataset: Chinese/English corpus (see the download link in the repository)
- The implemented methods are as follows:
- TF-IDF
- BM25
- LSH
- SIF/uSIF
- RNN Base
- Bert Base
# Example: TF-IDF similarity scoring against a pre-tokenized corpus.
from sim.tf_idf import TFIdf
# Corpus documents; tokens inside each string are separated by the `split` char.
tokens_list = ["这是 一个 什么 样 的 工具", "..."]
# Query, tokenized the same way as the corpus.
query = ["非常 好用 的 工具"]
tf_idf = TFIdf(tokens_list, split=" ")
print(tf_idf.get_score(query, 0))  # similarity score of query vs. the document at index 0
print(tf_idf.get_score_list(query, 10))  # [(index, score), ...] — presumably the 10 best matches; confirm in sim.tf_idf
print(tf_idf.weight())  # TF-IDF weight matrix (list or numpy array)
# Example: BM25 ranking — same call pattern as the TFIdf example.
from sim.bm25 import BM25
# Corpus documents; tokens inside each string are separated by the `split` char.
tokens_list = ["这是 一个 什么 样 的 工具", "..."]
# Query, tokenized the same way as the corpus.
query = ["非常 好用 的 工具"]
bm25 = BM25(tokens_list, split=" ")
print(bm25.get_score(query, 0))  # BM25 score of query vs. the document at index 0
print(bm25.get_score_list(query, 10))  # [(index, score), ...] — presumably the 10 best matches; confirm in sim.bm25
print(bm25.weight())  # weight matrix (list or numpy array)
# Example: locality-sensitive hashing for nearest-neighbor search.
from sim.lsh import E2LSH
from sim.lsh import MinHash
e2lsh = E2LSH()  # LSH variant for dense numeric vectors (E2 = Euclidean, per the class name)
min_hash = MinHash()  # LSH variant based on MinHash signatures
# Candidate vectors to search over; `...` is a placeholder for more rows.
candidates = [[3.6216, 8.6661, -2.8073, -0.44699, 0], ...]
query = [-2.7769, -5.6967, 5.9179, 0.37671, 1]
print(e2lsh.search(candidates, query))  # index of the match within candidates
print(min_hash.search(candidates, query))  # index of the match within candidates
- Related papers
# Example: SIF / uSIF sentence-embedding fitting.
# Pre-tokenized sentences; `...` is a placeholder for more data.
sentences = [["token1", "token2", "..."], ...]
# Per-token embedding vectors, aligned one-to-one with `sentences`.
vector = [[[1, 1, 1], [2, 2, 2], [...]], ...]
from sim.sif_usif import SIF
from sim.sif_usif import uSIF
# n_components: number of principal components removed; component_type: decomposition used ("svd" here).
sif = SIF(n_components=5, component_type="svd")
sif.fit(tokens_list=sentences, vector_list=vector)
# uSIF additionally takes `n` — presumably the n-gram/window parameter from the uSIF paper; confirm in sim.sif_usif.
usif = uSIF(n_components=5, n=1, component_type="svd")
usif.fit(tokens_list=sentences, vector_list=vector)
- Related papers
# Example: RNN-based similarity model entry point.
from sim.rnn_base import actuator
# Runs the model; backend and mode are chosen via command-line flags, e.g.:
actuator()
# TensorFlow version: python3 debug.py --execute_type train --type tf
# PyTorch version:    python3 debug.py --execute_type train --type torch
- Related papers