生成选项时采用上采样的方式，采样 6 次并让模型进行回答；将出现早停的题目视为困难，将全部采样都回答正确的题目视为简单。基于此构造新的 stepy。

2025-06-02 16:19:18 +08:00
parent d219b9b0c0
commit abeacaac3e
8 changed files with 169413 additions and 11331 deletions
--- a/eval_framework/config/config.yaml
+++ b/eval_framework/config/config.yaml
@@ -2,7 +2,7 @@
 api:
  key: "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d"
  base_url: "https://vip.apiyi.com/v1"
-  temperature: -1 # 默认使用模型的温度设置
+  temperature: 0 # 显式固定温度为 0；设为 -1 时默认使用模型的温度设置
  max_retries: 10
  # 支持多个模型
  models:
@@ -10,7 +10,6 @@ api:
    - "gpt-4o"
    - "deepseek-chat"
    - "claude-sonnet-4-20250514"
-    - "deepseek-r1"
  # 或者使用单个模型（向后兼容）
  # model: "qwen-max-2025-01-25"

@@ -20,7 +19,8 @@ system_prompt: None
 evaluation:
  max_workers: 20
  # input_file: "/home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json"
-  input_file: "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions.json"
+  # input_file: "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions.json"
+  input_file: "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json"
  # 输出配置
  output:
    base_dir: "results"