#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude?hl=zh-cn
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude/use-claude?hl=zh-cn


Llama

https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/use-llama?hl=zh-cn
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/use-llama?hl=zh-cn#regions-quotas

Model Name
llama-4-maverick-17b-128e-instruct-maas
llama-4-scout-17b-16e-instruct-maas

区域选择 us-east5



Model Name

gemini-2.5-pro
The model does not support setting thinking_budget to 0.
Unable to submit request because thinking_budget is out of range; supported values are integers from 128 to 32768.


"""
import argparse
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
import tempfile
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no extra install needed

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))

from google import genai
from google.genai import types

from project_settings import environment, project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # default="gemini-2.5-pro",   # The model does not support setting thinking_budget to 0.
        default="gemini-2.5-flash",
        # default="gemini-2.5-flash-lite-preview-06-17",
        # default="llama-4-maverick-17b-128e-instruct-maas",
        # default="llama-4-scout-17b-16e-instruct-maas",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_name",
        default="agent-nxcloud-zh-375-choice.jsonl",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str
    )
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str
    )
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str
    )
    parser.add_argument(
        "--service",
        # default="google_potent_veld_462405_t3",
        default="google_nxcloud_312303",
        type=str
    )
    parser.add_argument(
        "--create_time_str",
        default="null",
        # default="20250731_162116",
        type=str
    )
    parser.add_argument(
        "--interval",
        default=1,
        type=int
    )
    args = parser.parse_args()
    return args
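
# Example invocation (values shown are the argparse defaults above; the
# script filename is hypothetical):
#
#     python3 run_eval.py \
#         --model_name gemini-2.5-flash \
#         --eval_dataset_name agent-nxcloud-zh-375-choice.jsonl \
#         --interval 1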


def conversation_to_str(conversation: list) -> str:
    """Flatten a list of {"role": ..., "content": ...} turns into "role: content" lines."""
    conversation_str = ""
    for turn in conversation:
        role = turn["role"]
        content = turn["content"]
        conversation_str += f"{role}: {content}\n"

    return conversation_str
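
# e.g. conversation_to_str([{"role": "user", "content": "你好"}]) == "user: 你好\n"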


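# The prompt below is intentionally kept in Chinese to match the zh eval
# dataset. Summary: classify the caller's intent from a service call into
# exactly one capital letter:
#   A = explicitly agrees to the trial lesson (vague "okay"/"mm-hm" doesn't count)
#   B = complains, is abusive, or demands the calls stop (merely declining is not B)
#   C = says now is a bad time to talk (e.g. driving)
#   D = voicemail/auto-answer (keywords, or digit strings read out verbatim)
#   E = gives two clear refusals after two separate pitches (one refusal is not E)
#   F = no clear intent / none of the above (default)
# The model must output the single letter only, with no other characters.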
system_prompt = """
你是一位专业的电话对话分析专家,负责根据客服与客户之间的通话内容判断客户意图类别。

请仔细分析用户提供的完整对话,并严格按照以下规则进行分类:

- **A**:客户**明确同意参加试听课**(如“好啊,安排一下”)。仅询问细节、模糊回应(如“嗯嗯”“好的”)不算。
- **B**:客户**投诉、辱骂、或明确要求停止拨打此类电话**(如“别再打了!”)。仅拒绝试听(如“不用了”)不属于 B。
- **C**:客户表示**当前时刻不方便通话,例如提到“在开车”、“不方便”等**。
- **D**:对话为**语音留言/自动应答**,或包含“留言”“voicemail”“message”“已录音”等关键词,或出现**逐字念出的数字串**(如“九零九五……”)。
- **E**:客服**完成两次独立推销后**,客户**两次都表达了明确拒绝,仅一次不算做E分类**。
- **F**:客户未表达明确意愿,或以上情况均不符合(默认类别)。

**输出要求:**
- 仅输出一个大写字母(A、B、C、D、E 或 F);
- 不要任何解释、标点、空格、换行、JSON、引号或其他字符;
- 输出必须且只能是单个字母。
"""


def main():
    args = get_args()

    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

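    # Materialize the service-account JSON into a temp file so Google's auth
    # libraries can discover it via GOOGLE_APPLICATION_CREDENTIALS (set below).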
    google_application_credentials = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    google_application_credentials.parent.mkdir(parents=True, exist_ok=True)

    with open(google_application_credentials.as_posix(), "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        # "null" means a fresh run: stamp it with the current Asia/Shanghai time.
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250729-interval-5"
    else:
        # An explicit timestamp points at an existing run directory to resume.
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

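    # The output path encodes model / client / service / timestamp, so separate
    # runs don't collide and each run can be resumed individually.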
    output_file = eval_data_dir / f"gemini_google_nxcloud_choice/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = genai.Client(
        vertexai=True,
        project=project_id,
        location="global",
        # location="us-east5",
    )
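    # location="global" serves the Gemini models here; the commented-out
    # us-east5 line matches the region note for the Llama MaaS models in the
    # module docstring.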
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        max_output_tokens=1,  # the task expects a single letter A-F
        response_modalities=["TEXT"],
        thinking_config=types.ThinkingConfig(
            thinking_budget=0  # disables thinking on gemini-2.5-flash; gemini-2.5-pro requires 128..32768
        )
    )

    total = 0
    total_correct = 0

    # Resume support: collect idx values already present in the output file and
    # carry the running totals forward from its last line.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            # system_prompt = row["system_prompt"]
            conversation = row["conversation"]
            examples = row["examples"]
            choices = row["choices"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            # conversation
            conversation_str = conversation_to_str(conversation)

            examples_str = ""
            for example in examples:
                conversation_ = example["conversation"]
                outputs = example["outputs"]
                output = outputs["output"]
                explanation = outputs["explanation"]

                examples_str += conversation_to_str(conversation_)
                # output_json = {"Explanation": explanation, "output": output}
                # output_json_str = json.dumps(output_json, ensure_ascii=False)
                # examples_str += f"\nOutput: {output_json_str}\n"
                examples_str += f"\nOutput: {output}\n\n"

            # print(examples_str)

            # choices_str is currently unused; it feeds the commented-out
            # prompt variants below.
            choices_str = ""
            for choice in choices:
                condition = choice["condition"]
                choice_letter = choice["choice_letter"]

                row_ = f"{condition}, output: {choice_letter}\n"
                choices_str += row_
            # choices_str += "\nRemember to output ONLY the corresponding letter.\nYour output is:"
            # choices_str += "\nPlease use only 10-15 words to explain.\nOutput:"

            # prompt = f"{system_prompt}\n\n**Output**\n{choices_}\n**Examples**\n{examples_}"
            prompt1 = f"{system_prompt}\n\n**Examples**\n{examples_str}"
            prompt2 = f"**Conversation**\n{conversation_str}\n\nOutput:"
            # print(prompt1)
            # print(prompt2)

            # prompt1 (system prompt + few-shot examples) goes in as the first
            # turn; prompt2 carries the conversation to classify.
            messages = [
                {"role": "system", "content": prompt1},
                {"role": "user", "content": prompt2},
            ]
            # print(f"messages: {json.dumps(messages, ensure_ascii=False, indent=4)}")
            # Vertex genai contents only accept the "user" and "model" roles, so
            # the "system" turn is mapped to "model" here; an alternative would
            # be passing it via GenerateContentConfig(system_instruction=...).
            contents = [
                types.Content(
                    role="user" if messages[0]["role"] == "user" else "model",
                    parts=[
                        types.Part.from_text(text=messages[0]["content"])
                    ]
                ),
                types.Content(
                    role=messages[1]["role"],
                    parts=[
                        types.Part.from_text(text=messages[1]["content"])
                    ]
                )
            ]

            time.sleep(args.interval)  # simple client-side rate limiting
            print(f"sleep: {args.interval}")
            time_begin = time.time()
            llm_response: types.GenerateContentResponse = client.models.generate_content(
                model=args.model_name,
                contents=contents,
                config=generate_content_config,
            )
            time_cost = time.time() - time_begin
            print(f"time_cost: {time_cost}")
            try:
                prediction = llm_response.candidates[0].content.parts[0].text.strip()
            except (TypeError, AttributeError, IndexError) as e:
                # candidates/content/parts can be None or empty, e.g. when the
                # response is blocked or truncated before any text is produced.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            # running accuracy, including totals carried over from a resumed run
            score = total_correct / total

            row_ = {
                "idx": idx,
                "messages": messages,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return


if __name__ == "__main__":
    main()