生成sft数据,设置OQMD的代理,测试mars-t1

2025-04-22 16:44:26 +08:00
parent 6b92e54a41
commit a7964add00
38 changed files with 888 additions and 191 deletions
--- a/generate_data/calculate_tokens.py
+++ b/generate_data/calculate_tokens.py
@@ -0,0 +1,79 @@
+import json
+import tiktoken
+import numpy as np
+import statistics
+from pathlib import Path
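+# Reference statistics; these appear to be the output of an earlier run of this
+# script (presumably on the dataset configured under __main__ -- not verified):
+# 均值 (mean): 13716.062458398048
+# 最大值 (max): 106876
+# 最小值 (min): 5108
+# 中值 (median): 13285.5
+# 样本数 (sample count): 9014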
+def count_tokens_in_string(text):
+    """使用tiktoken库计算字符串中的token数量"""
+    # 使用cl100k_base编码器，这是GPT-4使用的编码器
+    encoding = tiktoken.get_encoding("cl100k_base")
+    # 计算tokens
+    tokens = encoding.encode(text)
+    return len(tokens)
+
+def process_jsonl_file(file_path):
+    """处理JSONL文件并计算token统计信息"""
+    token_counts = []
+    count=0
+    # 读取JSONL文件
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            try:
+                # 解析JSON行
+                data = json.loads(line)
+                if len(data['messages'])==4:
+                # 将数据转换为字符串
+                    count+=1
+                    data_str = json.dumps(data)
+                    # 计算tokens
+                    token_count = count_tokens_in_string(data_str)
+                    token_counts.append(token_count)
+                else:
+                    pass
+            except Exception as e:
+                print(f"处理行时出错: {e}")
+    print("countnumber",count)
+    # 计算统计信息
+    if token_counts:
+        mean_value = statistics.mean(token_counts)
+        max_value = max(token_counts)
+        min_value = min(token_counts)
+        median_value = statistics.median(token_counts)
+
+        # 计算token数小于32k的样本数量
+        count_less_than_32k = sum(1 for count in token_counts if count < 32000)
+        count_less_than_24k = sum(1 for count in token_counts if count < 24000)
+        count_less_than_16k = sum(1 for count in token_counts if count < 16000)
+        return {
+            "均值": mean_value,
+            "最大值": max_value,
+            "最小值": min_value,
+            "中值": median_value,
+            "样本数": len(token_counts),
+            "token数小于32k的样本数": count_less_than_32k,
+            "token数小于32k的样本百分比": (count_less_than_32k / len(token_counts)) * 100 if token_counts else 0,
+            "token数小于24k的样本数": count_less_than_24k,
+            "token数小于24k的样本百分比": (count_less_than_24k / len(token_counts)) * 100 if token_counts else 0,
+            "token数小于16k的样本数": count_less_than_16k,
+            "token数小于16k的样本百分比": (count_less_than_16k / len(token_counts)) * 100 if token_counts else 0
+        }
+    else:
+        return {"错误": "没有找到有效数据"}
+
+if __name__ == "__main__":
+    file_path = "/home/ubuntu/sas0/lzy/mars-mcp/generate_data/agent_questions_solution_turn5_ans_no_none.jsonl"
+
+    # 确认文件存在
+    if not Path(file_path).exists():
+        print(f"错误: 文件不存在 - {file_path}")
+    else:
+        # 处理文件并打印结果
+        results = process_jsonl_file(file_path)
+        print("\n统计结果:")
+        for key, value in results.items():
+            print(f"{key}: {value}")