微信扫码
添加专属顾问
我要投稿
LangChain与Embeddings技术强强联合,打造高效智能的RAG系统,让非结构化数据处理更简单高效。 核心内容: 1. LangChain框架在RAG系统中的核心作用与工作流程 2. Embeddings技术的原理、优势及主流模型比较 3. 实战案例展示如何构建本地知识问答系统
# Example: three interchangeable embedding backends exposed by langchain_community.
# (Formatting restored — the scraped source had all whitespace stripped.)
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    OpenAIEmbeddings,
    CohereEmbeddings,
)

# HuggingFace embeddings: runs locally on CPU; vectors are left unnormalized.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False},
)
# OpenAI embeddings (hosted API).
openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
# Cohere embeddings (hosted API).
cohere_embeddings = CohereEmbeddings(model="embed-english-v2.0")
# -*- coding: utf-8 -*-
"""
Complete LangChain RAG implementation.

Builds a question-answering system over a local document using the
sentence-transformers/all-MiniLM-L6-v2 embedding model and a Chroma
vector store for persistence.
"""
import os
from dotenv import load_dotenv
from typing import List, Dict, Any
# Load environment variables from a .env file (OPENAI_API_KEY, OPENAI_API_BASE).
load_dotenv()
# Use the domestic (China) HuggingFace mirror to speed up model downloads.
os.environ['HF_ENDPOINT']='https://hf-mirror.com'
# 1. Import required libraries
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOpenAI
from langchain.schema import Document
import chromadb
class RAGSystem:
    """Retrieval-augmented generation over a single local text document.

    Loads the document, splits it into overlapping chunks, embeds the chunks
    into a persistent Chroma store, and answers questions through an
    MMR-retrieval + prompt + chat-LLM chain.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the RAG system.

        Args:
            config: configuration dict containing:
                - document_path: path to the source document
                - embedding_model: embedding model name
                - persist_directory: vector-store persistence directory
                - chunk_size: text chunk size
                - chunk_overlap: overlap between adjacent chunks
                - llm_config: LLM configuration (e.g. {"model": ...})
        """
        self.config = config
        self.llm = self._initialize_llm()
        self.embeddings = self._initialize_embeddings()
        self.vectorstore = self._initialize_vectorstore()
        # MMR retrieval trades off relevance vs. diversity of the top-k chunks.
        self.retriever = self.vectorstore.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 3, "lambda_mult": 0.5},
        )
        self.chain = self._create_chain()

    def _initialize_llm(self) -> ChatOpenAI:
        """Initialize the chat language model from environment credentials."""
        return ChatOpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),
            base_url=os.getenv("OPENAI_API_BASE"),
            model=self.config.get("llm_config", {}).get("model", "gpt-3.5-turbo"),
            temperature=0.7,
            streaming=True,
        )

    def _initialize_embeddings(self) -> HuggingFaceEmbeddings:
        """Initialize the sentence-transformer embedding model (CPU, unnormalized)."""
        return HuggingFaceEmbeddings(
            model_name=self.config.get(
                "embedding_model", "sentence-transformers/all-MiniLM-L6-v2"
            ),
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': False},
        )

    def _initialize_vectorstore(self) -> Chroma:
        """Load the document, chunk it, and build a persistent Chroma store."""
        # Load the source document.
        loader = TextLoader(self.config["document_path"], encoding="utf-8")
        documents = loader.load()
        # Split into overlapping chunks for retrieval granularity.
        text_splitter = CharacterTextSplitter(
            chunk_size=self.config.get("chunk_size", 1000),
            chunk_overlap=self.config.get("chunk_overlap", 200),
        )
        chunks = text_splitter.split_documents(documents)
        # Embed and persist the chunks.
        return Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=self.config["persist_directory"],
            collection_name="knowledge_base",
        )

    def _create_chain(self):
        """Build the retrieval -> prompt -> LLM -> string-output chain."""
        # Prompt template (kept in Chinese: it is runtime model input).
        template = """你是一个专业的知识助手,请基于以下上下文回答问题。
如果不知道答案,就说你不知道,不要编造答案。
上下文:
{context}
问题: {question}
回答:"""
        prompt = ChatPromptTemplate.from_template(template)

        # Join retrieved chunks into a single context string.
        def format_docs(docs: List[Document]) -> str:
            return "\n\n".join(doc.page_content for doc in docs)

        # LCEL pipeline: the question fans out to the retriever (context)
        # and passes through unchanged (question).
        return (
            {"context": self.retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | self.llm
            | StrOutputParser()
        )

    def query(self, question: str) -> str:
        """Run one question through the chain and return the answer text."""
        return self.chain.invoke(question)

    def save_vectorstore(self):
        """Persist the Chroma vector store to disk."""
        self.vectorstore.persist()
if __name__ == "__main__":
    # Configuration parameters
    config = {
        "document_path": "knowledge.txt",  # replace with your document path
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "persist_directory": "db",  # vector-store persistence directory
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "llm_config": {
            "model": "gpt-3.5-turbo"
        },
    }
    # Initialize the system (loads, chunks, and embeds the document).
    rag = RAGSystem(config)
    # Interactive question-answering loop.
    print("知识问答系统已启动,输入'exit'退出")
    while True:
        try:
            question = input("\n提问: ")
            if question.lower() in ["exit", "quit"]:
                rag.save_vectorstore()
                print("系统已退出,向量数据库已保存")
                break
            print("\n思考中...", end="")
            response = rag.query(question)
            print(f"\n回答: {response}")
        except KeyboardInterrupt:
            # Ctrl-C: persist the store before exiting.
            rag.save_vectorstore()
            print("\n系统已退出,向量数据库已保存")
            break
        except Exception as e:
            # Top-level boundary: report the error and keep the loop alive.
            print(f"\n发生错误: {str(e)}")
            continue
# --- Standalone quick-start variant of the same pipeline ---
# NOTE(review): the original snippet used RetrievalQA without importing it
# (NameError at runtime); the import is added here.
from langchain.chains import RetrievalQA

# Initialize components
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Document processing: load and split into overlapping chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(
    TextLoader("knowledge.txt").load()
)
# Vector store: embed chunks and persist to disk.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory="db",
)
# Retrieval-augmented generation chain.
retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=retriever,
)
# Rebind the retriever with MMR search (maximal marginal relevance).
# NOTE(review): this reassignment does not affect qa_chain, which already
# holds the plain retriever — presumably illustrative; verify intent.
retriever = vectorstore.as_retriever(
    search_type="mmr",  # maximal marginal relevance
    search_kwargs={"k": 5, "lambda_mult": 0.5},
)
# Cache LLM responses in a local SQLite database to avoid repeat calls.
from langchain.cache import SQLiteCache
import langchain
langchain.llm_cache = SQLiteCache(database_path=".langchain.db")
53AI,企业落地大模型首选服务商
产品:场景落地咨询+大模型应用平台+行业解决方案
承诺:免费场景POC验证,效果验证后签署服务协议。零风险落地应用大模型,已交付160+中大型企业
2025-07-18
LangGraph:让你的RAG像“大脑”一样思考的秘密武器!
2025-07-18
用LangGraph打造高可用生产级AI Agent
2025-07-16
使用 LangGraph 打造 Multi-Agent 系统
2025-07-15
构建AI Agent的完整实战指南:从邮件助手案例看6步落地方法
2025-07-14
LangChain创始人:决定AI产品成败的隐藏指标
2025-07-14
Langchain官方终极指南,如何构建AI Agent:告别空谈,六步从想法到生产级应用!
2025-07-14
LangChain创始人:如何让AI智能体(Agent)跑得更快?
2025-07-13
“你问我答”,LangChain 是怎么帮 AI 变聪明的?
2025-05-06
2025-06-05
2025-05-08
2025-05-28
2025-05-19
2025-05-28
2025-06-26
2025-04-25
2025-04-23
2025-04-26
2025-07-14
2025-07-13
2025-07-05
2025-06-26
2025-06-13
2025-05-21
2025-05-19
2025-05-08