
上一篇的代码已经很好地完成了英文句子的相似度匹配。如果你还想对多个模块(例如“人工智能知识模块”)做同样的处理,可以把这段逻辑封装成函数或模块类,这样每个模块只需要传入不同的参数:
- remembered_sentences(记忆内容)
- corpus_dir(对应的语料目录)
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Point both proxy environment variables at the local proxy endpoint.
os.environ.update(
    HTTP_PROXY='http://127.0.0.1:7890',
    HTTPS_PROXY='http://127.0.0.1:7890',
)

# Instantiate the sentence-embedding model a single time at module level so
# every later call shares the same instance.
model = SentenceTransformer('all-MiniLM-L6-v2')
print("模型加载成功")
def find_most_similar(remembered_sentences, corpus_dir, module_name=""):
# Step 1: 读取语料
big_sentences = []
for filename in os.listdir(corpus_dir):
if filename.endswith(".txt"):
filepath = os.path.join(corpus_dir, filename)
with open(filepath, "r", encoding="utf-8") as f:
lines = [line.strip() for line in f if line.strip()]
big_sentences.extend(lines)
# Step 2: 去除重复
remembered_set = set(s.strip().lower() for s in remembered_sentences)
filtered_big_sentences = [s for s in big_sentences if s.strip().lower() not in remembered_set]
if not filtered_big_sentences:
print(f"❌ [{module_name}] 没有可匹配的句子(文件内容可能为空或完全重复)。")
return
# Step 3: 编码 & 相似度计算
remembered_vectors = model.encode(remembered_sentences)
big_vectors = model.encode(filtered_big_sentences)
similarities = cosine_similarity(big_vectors, remembered_vectors)
# Step 4: 找到最佳匹配
max_sim = -1
best_target = ""
best_memory = ""
for i, sim_row in enumerate(similarities):
best_idx = np.argmax(sim_row)
sim = sim_row[best_idx]
if sim > max_sim:
max_sim = sim
best_target = filtered_big_sentences[i]
best_memory = remembered_sentences[best_idx]
# Step 5: 输出结果
print(f"\n✅ [{module_name}] 相似度最高的一对句子:")
print(f"目标句子: {best_target}")
print(f"记忆句子: {best_memory}")
print(f"相似度: {max_sim:.2f}")
# Module 1: basic English sentence matching
english_remembered = [
    "what is the price of tofu?",
    "how many potatoes do we need?",
    "we got married twenty years ago",
]
find_most_similar(english_remembered, "file/english", module_name="英语模块")

# Module 2: artificial-intelligence knowledge matching
ai_remembered = [
    "what is artificial intelligence?",
    "how does a neural network work?",
    "machine learning is a subfield of AI",
]
find_most_similar(ai_remembered, "file/ai", module_name="人工智能模块")

对应的目录结构如下:

project/
├── your_script.py
└── file/
    ├── english/
    │   ├── text1.txt
    │   └── text2.txt
    └── ai/
        ├── ai1.txt
        └── ai2.txt

你还可以定义一个模块配置列表:
# One entry per module: display name, corpus directory, and the remembered
# sentences to match against that corpus.
modules = [
    {
        "name": "英语模块",
        "dir": "file/english",
        "memory": [
            "what is the price of tofu?",
            "how many potatoes do we need?",
            "we got married twenty years ago",
        ],
    },
    {
        "name": "人工智能模块",
        "dir": "file/ai",
        "memory": [
            "what is artificial intelligence?",
            "how does a neural network work?",
            "machine learning is a subfield of AI",
        ],
    },
]
for m in modules:
    find_most_similar(m["memory"], m["dir"], module_name=m["name"])

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。