"""Token-length statistics for a JSONL chat dataset, measured with tiktoken."""

import functools
import json
import statistics
import sys
from pathlib import Path

import numpy as np  # noqa: F401  (unused in this file; kept from the original imports)
import tiktoken

# Reference figures left by the original author (presumably the output of an
# earlier run of this script over the default dataset):
#   mean:    13716.062458398048
#   max:     106876
#   min:     5108
#   median:  13285.5
#   samples: 9014

# Dataset analysed when no path is given on the command line.
DEFAULT_FILE_PATH = "/home/ubuntu/sas0/lzy/mars-mcp/generate_data/agent_questions_solution_turn5_ans_no_none.jsonl"


@functools.lru_cache(maxsize=None)
def _get_encoding(name="cl100k_base"):
    """Return the tiktoken encoding called *name*, looking it up only once."""
    return tiktoken.get_encoding(name)


def count_tokens_in_string(text):
    """Return the number of tokens in *text* under the ``cl100k_base`` encoding.

    ``cl100k_base`` is the encoding the original author chose as the one used
    by GPT-4.

    Args:
        text: The string to tokenize.

    Returns:
        The length of the token sequence produced for *text*.
    """
    # By default tiktoken raises ValueError when the input contains the text
    # of a special token (e.g. "<|endoftext|>").  For pure length measurement
    # such text should simply be counted as ordinary text, so disable the check.
    return len(_get_encoding().encode(text, disallowed_special=()))


def _threshold_label(threshold):
    """Spell a token threshold the way the report keys do (32000 -> '32k')."""
    if threshold % 1000 == 0:
        return f"{threshold // 1000}k"
    return str(threshold)


def process_jsonl_file(file_path, expected_messages=4,
                       thresholds=(32000, 24000, 16000)):
    """Compute token-count statistics over the records of a JSONL file.

    Each non-blank line is parsed as JSON.  Records whose ``messages`` entry
    has exactly *expected_messages* items are re-serialized with
    ``json.dumps`` and tokenized; all other records are ignored.  Lines that
    cannot be parsed, or that lack a sized ``messages`` entry, are reported on
    stdout and skipped.  The number of matching records is printed as
    ``countnumber``.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.
        expected_messages: Required length of ``messages`` for a record to be
            measured.
        thresholds: Integer token limits; for each one the report contains the
            number and percentage of samples strictly below it.

    Returns:
        A dict with mean, max, min, median and sample count plus two entries
        per threshold, or a single-entry error dict when no record was
        measured.
    """
    token_counts = []
    matched = 0

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines are not records; do not report them as errors.
                continue
            try:
                data = json.loads(line)
                if len(data["messages"]) != expected_messages:
                    continue
                matched += 1
                # NOTE(review): json.dumps escapes non-ASCII characters as
                # \uXXXX by default, so the measured string differs from the
                # raw text and the token count for non-ASCII content likely
                # differs too.  Left unchanged so results stay comparable
                # with the figures recorded at the top of this file.
                token_counts.append(count_tokens_in_string(json.dumps(data)))
            except (ValueError, KeyError, TypeError) as e:
                # ValueError covers json.JSONDecodeError; KeyError/TypeError
                # cover records without a usable "messages" entry.
                print(f"处理行时出错: {e}")

    print("countnumber", matched)

    if not token_counts:
        return {"错误": "没有找到有效数据"}

    total = len(token_counts)
    stats = {
        "均值": statistics.mean(token_counts),
        "最大值": max(token_counts),
        "最小值": min(token_counts),
        "中值": statistics.median(token_counts),
        "样本数": total,
    }
    for threshold in thresholds:
        label = _threshold_label(threshold)
        below = sum(1 for n in token_counts if n < threshold)
        stats[f"token数小于{label}的样本数"] = below
        stats[f"token数小于{label}的样本百分比"] = (below / total) * 100
    return stats


def main():
    """Analyse the file named on the command line (or the default) and print the report."""
    file_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_FILE_PATH

    # Make sure the input exists before trying to read it.
    if not Path(file_path).exists():
        print(f"错误: 文件不存在 - {file_path}")
        return

    results = process_jsonl_file(file_path)
    print("\n统计结果:")
    for key, value in results.items():
        print(f"{key}: {value}")


if __name__ == "__main__":
    main()