Files
mars-mcp/generate_data/calculate_tokens.py

80 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import statistics
import sys
from pathlib import Path

import numpy as np
import tiktoken
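# Recorded statistics, presumably the output of an earlier run of this script
# (the input dataset is not stated here):
# mean (均值): 13716.062458398048
# max (最大值): 106876
# min (最小值): 5108
# median (中值): 13285.5
# sample count (样本数): 9014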
def count_tokens_in_string(text):
    """Return the number of tokens in ``text`` under the ``cl100k_base`` encoding.

    Args:
        text: The string to tokenize.

    Returns:
        The length of the token list tiktoken produces for ``text``.
    """
    # cl100k_base is the encoding used by GPT-4.
    encoding = tiktoken.get_encoding("cl100k_base")
    # By default ``encode`` raises ValueError when the text contains a
    # special-token literal such as "<|endoftext|>". For pure counting such
    # text should be tokenized as ordinary characters instead of failing,
    # so the special-token check is disabled.
    tokens = encoding.encode(text, disallowed_special=())
    return len(tokens)
def process_jsonl_file(file_path, *, required_messages=4):
    """Compute token-count statistics for the records of a JSONL file.

    Each non-blank line is parsed as JSON. Records whose ``messages`` entry
    has exactly ``required_messages`` items are re-serialized with
    ``json.dumps`` and measured with ``count_tokens_in_string``; all other
    records are ignored. A line that fails to parse or to be measured is
    reported on stdout and skipped.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.
        required_messages: Length that ``messages`` must have for a record
            to be counted. Defaults to 4.

    Returns:
        A dict holding the mean, maximum, minimum, median and number of
        counted samples, plus, for each of the 32k / 24k / 16k limits, the
        number and percentage of samples strictly below that limit. When no
        record was counted, a dict with a single error entry is returned.
    """
    token_counts = []
    matched_records = 0

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines are not records; skip them instead of reporting a
            # JSON parse error for each one.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                if len(data['messages']) != required_messages:
                    continue
                # Counted before tokenization, so this is the number of
                # matching records seen, even if measuring one later fails.
                matched_records += 1
                # NOTE(review): json.dumps defaults to ensure_ascii=True, so
                # non-ASCII characters are measured in their \uXXXX escaped
                # form - confirm this matches how samples are serialized
                # downstream.
                data_str = json.dumps(data)
                token_counts.append(count_tokens_in_string(data_str))
            except Exception as e:
                # Best effort: one bad line must not abort the whole scan.
                print(f"处理行时出错: {e}")

    print("countnumber", matched_records)

    if not token_counts:
        return {"错误": "没有找到有效数据"}

    total = len(token_counts)
    results = {
        "均值": statistics.mean(token_counts),
        "最大值": max(token_counts),
        "最小值": min(token_counts),
        "中值": statistics.median(token_counts),
        "样本数": total,
    }
    # One count/percentage pair per limit, inserted in 32k, 24k, 16k order.
    for limit, label in ((32000, "32k"), (24000, "24k"), (16000, "16k")):
        below = sum(1 for n in token_counts if n < limit)
        results[f"token数小于{label}的样本数"] = below
        results[f"token数小于{label}的样本百分比"] = (below / total) * 100
    return results
if __name__ == "__main__":
    # Dataset analysed by default; a different JSONL file can be supplied as
    # the first command-line argument.
    default_path = "/home/ubuntu/sas0/lzy/mars-mcp/generate_data/agent_questions_solution_turn5_ans_no_none.jsonl"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Make sure the input exists before trying to read it.
    if not Path(file_path).exists():
        print(f"错误: 文件不存在 - {file_path}")
        # Signal the failure to the calling shell instead of exiting with 0.
        sys.exit(1)

    # Process the file and print every statistic on its own line.
    results = process_jsonl_file(file_path)
    print("\n统计结果:")
    for key, value in results.items():
        print(f"{key}: {value}")