微信扫码
添加专属顾问
我要投稿
手把手教你用DeepEval自定义模型评估RAG实例,代码级实战指南。核心内容: 1. 快速导入DeepEval评估框架的关键依赖包 2. 自定义Qwen模型对接DeepEval的完整实现 3. 同步/异步API调用的工程化处理技巧
#导入依赖包
import asyncio
import json
import time

import requests
from deepeval.metrics import (ContextualPrecisionMetric,
                              ContextualRecallMetric,
                              ContextualRelevancyMetric,
                              FaithfulnessMetric)
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase

from services.ChatService import ChatService
2、自定义模型
# Custom evaluation model
class QwenModel(DeepEvalBaseLLM):
    """DeepEval-compatible wrapper around a Qwen chat-completions endpoint.

    DeepEval metrics call ``generate`` / ``a_generate`` with judge prompts;
    this class forwards them to a FastGPT-hosted ``qwen-max`` model.
    """

    # Seconds to wait for the HTTP call. The original code had no timeout,
    # so a hung endpoint would block the whole evaluation forever.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        # NOTE(review): credentials are hard-coded; consider reading them
        # from environment variables or configuration instead.
        self.api_key = "fastgpt-*******"
        self.base_url = "https://jz-fastgpt-stable.djtest.cn/api/v1"
        self.model_name = "qwen-max"

    def load_model(self):
        # DeepEval hook: this wrapper acts as its own "model" object.
        return self

    def generate(self, prompt: str) -> str:
        """Send *prompt* to the Qwen API and return the reply text.

        Raises:
            RuntimeError: on any non-200 HTTP response.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}],
            # Deterministic output is preferable for a judge model.
            "temperature": 0,
        }
        # json= serializes the payload for us (replaces data=json.dumps(...));
        # timeout= prevents an indefinite hang.
        response = requests.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        raise RuntimeError(f"API 调用失败: {response.status_code}, {response.text}")

    async def a_generate(self, prompt: str) -> str:
        """Async variant: run the blocking HTTP call in a worker thread.

        The original implementation called ``generate`` directly, which
        blocks the event loop during the network round-trip.
        """
        return await asyncio.to_thread(self.generate, prompt)

    def get_model_name(self):
        # DeepEval hook: human-readable model identifier.
        return self.model_name
# Evaluation wrapper
class EvalService:
    """Runs DeepEval RAG metrics (faithfulness, contextual precision /
    recall / relevancy) against a FastGPT chat response, using QwenModel
    as the judge.

    Each ``get_*`` method returns ``{"score": float, "reason": str}``.
    The four public methods previously duplicated the same ten lines;
    they now share the private ``_measure`` driver (signatures and the
    printed output are unchanged).
    """

    def _measure(self, metric_cls, label: str, ques: str, response) -> dict:
        """Build a test case, run one metric class, return score + reason."""
        metric = metric_cls(model=QwenModel())
        test_case = self.get_test_case(ques, response)
        metric.measure(test_case)
        result = {"score": metric.score, "reason": metric.reason}
        print(f"{label}:{result}")
        return result

    def get_faithfulness(self, ques: str, response):
        """Score how faithful the answer is to the retrieved context."""
        return self._measure(FaithfulnessMetric, "faithfulness", ques, response)

    def get_contextprecision(self, ques: str, response):
        """Score whether relevant retrieved nodes are ranked first."""
        return self._measure(ContextualPrecisionMetric, "contextprecision",
                             ques, response)

    def get_contextrecall(self, ques: str, response):
        """Score how much of the expected answer the retrieval covers."""
        return self._measure(ContextualRecallMetric, "contextrecall",
                             ques, response)

    def get_contextrelevant(self, ques: str, response):
        """Score how relevant the retrieved context is to the question."""
        return self._measure(ContextualRelevancyMetric, "contextrelevant",
                             ques, response)

    def get_test_case(self, ques: str, result):
        """Convert a FastGPT chat response into a DeepEval LLMTestCase.

        Assumes the FastGPT payload shape — TODO confirm against ChatService:
          - responseData[1].quoteList: retrieved KB entries with 'q'/'a' fields
          - responseData[2].historyPreview: conversation history entries
          - choices[0].message.content: the model's final answer
        """
        quote_list = result["responseData"][1]["quoteList"]
        retrival_context = [f"{quote['q']}:{quote['a']}" for quote in quote_list]

        historypreview = result["responseData"][2]["historyPreview"]
        context = [history['value'] for history in historypreview]

        answer = result["choices"][0]["message"]["content"]
        # NOTE(review): expected_output is set to the actual answer, which
        # makes recall/precision judge against the model's own output rather
        # than a gold reference — confirm this is intentional.
        return LLMTestCase(
            input=ques,
            actual_output=answer,
            expected_output=answer,
            context=context,
            retrieval_context=retrival_context,
        )
if __name__ == "__main__":
    # Query the AI application once, then score its answer with each
    # of the four RAG metrics.
    url = 'https://XXXXXX/api/v1/chat/completions'
    key = 'fastgpt-XXXXXX'
    question = "XXX怎么收费?"
    chat_service = ChatService(url, key)
    # Call the AI application to obtain the raw chat response.
    result = chat_service.question_response(question)
    evaluator = EvalService()
    for evaluate in (
        evaluator.get_faithfulness,
        evaluator.get_contextprecision,
        evaluator.get_contextrecall,
        evaluator.get_contextrelevant,
    ):
        evaluate(question, result)
faithfulness:{'score': 1.0, 'reason': '实际输出与检索上下文完全一致,没有任何矛盾之处,所以得到了满分1.00的忠实度评分。'}
contextprecision:{'score': 1.0, 'reason': '得分为1.00,因为相关的节点(即第一个节点)被正确地排在了最前面。'}
contextrecall:{'score': 0.5, 'reason': '分数为0.50,因为虽然节点在检索上下文中提到了'}
contextrelevant:{'score': 0.16666666666666666, 'reason': "分数为0.17,因为大部分检索内容并未涉及XXX问题,例如……"}
53AI,企业落地大模型首选服务商
产品:场景落地咨询+大模型应用平台+行业解决方案
承诺:免费场景POC验证,效果验证后签署服务协议。零风险落地应用大模型,已交付160+中大型企业
2025-07-18
用 LangGraph 打造了一个迷你 RAG:150 行代码跑通知识库问答
2025-07-18
RAG文档处理的一种优化方案——问答对的转换技巧
2025-07-18
【精读】构建和扩展 RAG 系统的实践经验总结
2025-07-17
聊聊在Dify上如何做高效RAG&集成Milvus向量库存储检索的原理
2025-07-17
基于Dify 知识库的实验demo:从0到1构建智能商品分类系统
2025-07-16
Dify智能体开发:RAG 技术深度解析与知识库实战指南
2025-07-16
爆改RAG!Relevant Segment Extraction(RSE)让你的AI检索“有头有尾”,不再碎片化
2025-07-15
从知识检索到自主决策:传统RAG与Agent搜索的深度对比
2025-05-08
2025-06-06
2025-04-23
2025-05-30
2025-05-19
2025-06-05
2025-05-10
2025-04-28
2025-06-05
2025-04-21
2025-07-09
2025-07-04
2025-07-01
2025-07-01
2025-07-01
2025-07-01
2025-06-30
2025-06-29