生成sft数据,设置OQMD的代理,测试mars-t1
This commit is contained in:
79
generate_data/calculate_tokens.py
Normal file
79
generate_data/calculate_tokens.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import json
|
||||
import tiktoken
|
||||
import numpy as np
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
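# Token statistics recorded from an earlier run of this script
# (presumably over the dataset configured in the __main__ block; confirm):
#   mean:    13716.062458398048
#   max:     106876
#   min:     5108
#   median:  13285.5
#   samples: 9014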
|
||||
def count_tokens_in_string(text, encoding_name="cl100k_base"):
    """Return the number of tiktoken tokens in ``text``.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to use. Defaults to
            ``"cl100k_base"`` (the encoding the original author chose as
            the GPT-4 encoding).

    Returns:
        The length of the token list produced by encoding ``text``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # By default ``encode`` raises ValueError when the text contains the
    # literal spelling of a special token (e.g. "<|endoftext|>").  We are only
    # counting tokens in arbitrary data, so treat such substrings as ordinary
    # text instead of failing and losing the sample.
    return len(encoding.encode(text, disallowed_special=()))
|
||||
|
||||
def process_jsonl_file(file_path):
    """Compute token-count statistics for the samples in a JSONL file.

    Each non-blank line is parsed as JSON.  Only records whose ``messages``
    entry has exactly 4 items are counted; every such record is serialized
    back to a JSON string and tokenized with ``count_tokens_in_string``.
    Lines that cannot be parsed or lack a usable ``messages`` entry are
    reported on stdout and skipped.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A dict with the mean, max, min, median and number of counted samples,
        plus, for each of the 32k / 24k / 16k thresholds, how many samples
        fall strictly below it and the matching percentage.  When no sample
        was counted, a single-entry dict describing the error is returned.
    """
    # Upper bounds (exclusive) reported in the result, in output order.
    thresholds = (32000, 24000, 16000)
    token_counts = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not records; skip them
            # silently instead of reporting a JSON parse error.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                if len(data['messages']) != 4:
                    continue
                # ensure_ascii=False keeps non-ASCII text as-is; the default
                # would expand every such character to a \uXXXX escape and
                # inflate the token count.
                data_str = json.dumps(data, ensure_ascii=False)
                token_counts.append(count_tokens_in_string(data_str))
            except (ValueError, KeyError, TypeError) as e:
                # ValueError covers json.JSONDecodeError and tokenizer input
                # errors; KeyError/TypeError cover records without a usable
                # 'messages' entry.  Anything else is a real failure and is
                # allowed to propagate.
                print(f"处理行时出错: {e}")

    total = len(token_counts)
    print("countnumber", total)

    if not token_counts:
        return {"错误": "没有找到有效数据"}

    results = {
        "均值": statistics.mean(token_counts),
        "最大值": max(token_counts),
        "最小值": min(token_counts),
        "中值": statistics.median(token_counts),
        "样本数": total,
    }
    for limit in thresholds:
        below = sum(1 for n in token_counts if n < limit)
        label = f"token数小于{limit // 1000}k"
        results[f"{label}的样本数"] = below
        results[f"{label}的样本百分比"] = (below / total) * 100
    return results
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: print token statistics for a JSONL dataset.
    # Usage: python calculate_tokens.py [path/to/file.jsonl]
    import sys

    default_path = "/home/ubuntu/sas0/lzy/mars-mcp/generate_data/agent_questions_solution_turn5_ans_no_none.jsonl"
    # An optional first command-line argument overrides the built-in path.
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # is_file() also rejects directories, which open() could not read anyway.
    if not Path(file_path).is_file():
        print(f"错误: 文件不存在 - {file_path}")
        # Signal the failure to the calling shell instead of exiting with 0.
        sys.exit(1)

    results = process_jsonl_file(file_path)
    print("\n统计结果:")
    for key, value in results.items():
        print(f"{key}: {value}")
|
||||
Reference in New Issue
Block a user