4. gRPC Server - Python Code
\syntax = "proto3"; import "google/protobuf/wrappers.proto"; option csharp_namespace = "AIHub.RPC"; package aihub; service ChatHub { rpc Chat (ChatRequest) returns (ChatReply); rpc StreamingChat (ChatRequest) returns (stream ChatReply); } message ChatRequest { string prompt = 1; repeated Message history = 2; int32 max_length = 3; float top_p = 4; float temperature = 5; } message Message { string role = 1; string content = 2; } message ChatReply { string response = 1; }增加了 Message 类型,在 ChatRequest 聊天请求中增加了 history 字段作为对话历史。
On the C# client side, the request builder now fills in the history:

```csharp
private ChatRequest GetRequest(string prompt, List<Message>? history = null)
{
    var request = new ChatRequest
    {
        Prompt = prompt,
        MaxLength = 2048,
        TopP = 0.75f,
        Temperature = 0.95f
    };
    if (history != null)
    {
        request.History.AddRange(history);
    }
    return request;
}
```

Next, rewrite the two chat methods to take a history parameter:
```csharp
public async Task<string> Chat(string prompt, List<Message>? history = null)
{
    var resp = await _client.ChatAsync(GetRequest(prompt, history));
    return RenderText(resp.Response);
}

public async IAsyncEnumerable<string> StreamingChat(string prompt, List<Message>? history = null)
{
    using var call = _client.StreamingChat(GetRequest(prompt, history));
    await foreach (var resp in call.ResponseStream.ReadAllAsync())
    {
        yield return RenderText(resp.Response);
    }
}
```

Done.
ChatGLM expects the conversation history as a list of (question, answer) tuples:

```python
history = [('question 1', 'answer 1'), ('question 2', 'answer 2')]
```

But AIHub's conversations follow OpenAI's design, which looks like this:
```python
history = [
    {'role': 'user', 'content': 'question 1'},
    {'role': 'assistant', 'content': 'answer 1'},
    {'role': 'user', 'content': 'question 2'},
    {'role': 'assistant', 'content': 'answer 2'},
]
```

So the OpenAI conversation format needs converting into ChatGLM's format.
```python
from typing import List

import chat_pb2


def messages_to_tuple_history(messages: List[chat_pb2.Message]):
    """Convert a list of chat messages into the nested list of tuples ChatGLM expects."""
    history = []
    current_completion = ['', '']
    is_enter_completion = False
    for item in messages:
        if not is_enter_completion and item.role == 'user':
            is_enter_completion = True
        if is_enter_completion:
            if item.role == 'user':
                # Consecutive user messages are merged into one question.
                if len(current_completion[0]) > 0:
                    current_completion[0] = f"{current_completion[0]}\n\n{item.content}"
                else:
                    current_completion[0] = item.content
            if item.role == 'assistant':
                if len(current_completion[1]) > 0:
                    current_completion[1] = f"{current_completion[1]}\n\n{item.content}"
                else:
                    current_completion[1] = item.content
                # An assistant message closes the current (question, answer) round.
                is_enter_completion = False
                history.append((current_completion[0], current_completion[1]))
                current_completion = ['', '']
    return history
```

For now this only handles the user and assistant roles. OpenAI also has system and function roles. system is easy enough to handle; it could be represented like this:
```python
[('system prompt1', ''), ('system prompt2', '')]
```

I haven't tested that yet, though, and I have no use for it at the moment, so it's not in the code.
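If it were ever needed, a minimal, untested sketch of that idea could pull the system messages out first, emit each as a tuple with an empty answer slot, and convert the rest as before (messages_to_tuple_history_with_system is a hypothetical name):

```python
def messages_to_tuple_history_with_system(messages: List[chat_pb2.Message]):
    """Untested sketch: prepend system prompts as ('prompt', '') tuples."""
    history = []
    chat_messages = []
    for item in messages:
        if item.role == 'system':
            # Each system prompt becomes a round with an empty answer slot.
            history.append((item.content, ''))
        else:
            chat_messages.append(item)
    # Convert the remaining user/assistant messages as before.
    history.extend(messages_to_tuple_history(chat_messages))
    return history
```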
Now wire the converter into the service implementation:

```python
# model and tokenizer are the ChatGLM model and tokenizer loaded at startup.
class ChatService(chat_pb2_grpc.ChatHubServicer):
    def Chat(self, request: chat_pb2.ChatRequest, context):
        response, history = model.chat(
            tokenizer,
            request.prompt,
            history=messages_to_tuple_history(request.history),
            max_length=request.max_length,
            top_p=request.top_p,
            temperature=request.temperature)
        torch_gc()
        return chat_pb2.ChatReply(response=response)

    def StreamingChat(self, request: chat_pb2.ChatRequest, context):
        current_length = 0
        for response, history in model.stream_chat(
                tokenizer,
                request.prompt,
                history=messages_to_tuple_history(request.history),
                max_length=request.max_length,
                top_p=request.top_p,
                temperature=request.temperature,
                return_past_key_values=False):
            # Echo only the newly generated text to the console.
            print(response[current_length:], end="", flush=True)
            yield chat_pb2.ChatReply(response=response)
            current_length = len(response)
        torch_gc()
```

Oh, and remember to reclaim GPU memory after each chat completes:
```python
import torch


# CUDA_DEVICE is the CUDA device string defined elsewhere, e.g. "cuda:0".
def torch_gc():
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
```

That's it.
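For completeness, a minimal sketch of hosting ChatService in a gRPC server; the worker count and port here are placeholder choices, not values from AIHub:

```python
from concurrent import futures

import grpc

import chat_pb2_grpc


def serve():
    # Worker count and port are placeholders; tune them for your deployment.
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    chat_pb2_grpc.add_ChatHubServicer_to_server(ChatService(), server)
    server.add_insecure_port('[::]:50051')
    server.start()
    server.wait_for_termination()


if __name__ == '__main__':
    serve()
```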