From 716cd73977df13a8d5f6380e10ea7b39a7fceded Mon Sep 17 00:00:00 2001
From: PeterGriffinJin
Date: Mon, 31 Mar 2025 12:58:04 +0000
Subject: [PATCH] add more data processing codes

---
 scripts/data_process/qa_search_test_merge.py  | 115 ++++++++++++++++++
 scripts/data_process/qa_search_train_merge.py | 105 ++++++++++++++++
 scripts/nq_hotpotqa/data_process.sh           |  10 ++
 3 files changed, 230 insertions(+)
 create mode 100644 scripts/data_process/qa_search_test_merge.py
 create mode 100644 scripts/data_process/qa_search_train_merge.py
 create mode 100644 scripts/nq_hotpotqa/data_process.sh

diff --git a/scripts/data_process/qa_search_test_merge.py b/scripts/data_process/qa_search_test_merge.py
new file mode 100644
index 0000000..6bc98b8
--- /dev/null
+++ b/scripts/data_process/qa_search_test_merge.py
@@ -0,0 +1,115 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the QA dataset to parquet format
+"""
+
+import re
+import os
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+
+def make_prefix(dp, template_type):
+    question = dp['question']
+
+    # NOTE: also need to change reward_score/countdown.py
+    if template_type == 'base':
+        """This works for any base model"""
+        prefix = f"""Answer the given question. \
+You must conduct reasoning inside <think> and </think> first every time you get new information. \
+After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
+You can search as many times as you want. \
+If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
+    else:
+        raise NotImplementedError
+    return prefix
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--local_dir', default='./data/nq_search')
+    parser.add_argument('--hdfs_dir', default=None)
+    parser.add_argument('--template_type', type=str, default='base')
+    parser.add_argument('--data_sources', default='nq')
+
+    args = parser.parse_args()
+
+    data_sources = args.data_sources.split(',')
+    all_dataset = []
+
+    for data_source in data_sources:
+
+        if data_source != 'strategyqa':
+            dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', data_source)
+        else:
+            dataset = datasets.load_dataset('json', data_files="/home/peterjin/mnt/data/strategyqa/test_correct.jsonl")
+
+        if 'test' in dataset:
+            print(f'Using the {data_source} test dataset...')
+            test_dataset = dataset['test']
+        elif 'dev' in dataset:
+            print(f'Using the {data_source} dev dataset...')
+            test_dataset = dataset['dev']
+        else:
+            print(f'Using the {data_source} train dataset...')
+            test_dataset = dataset['train']
+
+        # add a row to each data item that represents a unique id
+        def make_map_fn(split):
+
+            def process_fn(example, idx):
+                example['question'] = example['question'].strip()
+                if example['question'][-1] != '?':
+                    example['question'] += '?'
+                question = make_prefix(example, template_type=args.template_type)
+                solution = {
+                    "target": example['golden_answers'],
+                }
+
+                data = {
+                    "data_source": data_source,
+                    "prompt": [{
+                        "role": "user",
+                        "content": question,
+                    }],
+                    "ability": "fact-reasoning",
+                    "reward_model": {
+                        "style": "rule",
+                        "ground_truth": solution
+                    },
+                    "extra_info": {
+                        'split': split,
+                        'index': idx,
+                    }
+                }
+                return data
+
+            return process_fn
+
+        test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
+        all_dataset.append(test_dataset)
+
+    local_dir = args.local_dir
+    hdfs_dir = args.hdfs_dir
+
+    all_test_dataset = datasets.concatenate_datasets(all_dataset)
+    all_test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
+
+    if hdfs_dir is not None:
+        makedirs(hdfs_dir)
+
+        copy(src=local_dir, dst=hdfs_dir)
diff --git a/scripts/data_process/qa_search_train_merge.py b/scripts/data_process/qa_search_train_merge.py
new file mode 100644
index 0000000..ac8de65
--- /dev/null
+++ b/scripts/data_process/qa_search_train_merge.py
@@ -0,0 +1,105 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the QA dataset to parquet format
+"""
+
+import re
+import os
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+
+def make_prefix(dp, template_type):
+    question = dp['question']
+
+    # NOTE: also need to change reward_score/countdown.py
+    if template_type == 'base':
+        """This works for any base model"""
+        prefix = f"""Answer the given question. \
+You must conduct reasoning inside <think> and </think> first every time you get new information. \
+After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
+You can search as many times as you want. \
+If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
+    else:
+        raise NotImplementedError
+    return prefix
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--local_dir', default='./data/nq_search')
+    parser.add_argument('--hdfs_dir', default=None)
+    parser.add_argument('--template_type', type=str, default='base')
+    parser.add_argument('--data_sources', default='nq')
+
+    args = parser.parse_args()
+
+    # data_source = 'nq'
+    data_sources = args.data_sources.split(',')
+    all_dataset = []
+
+    for data_source in data_sources:
+
+        dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', data_source)
+
+        train_dataset = dataset['train']
+
+        # add a row to each data item that represents a unique id
+        def make_map_fn(split):
+
+            def process_fn(example, idx):
+                example['question'] = example['question'].strip()
+                if example['question'][-1] != '?':
+                    example['question'] += '?'
+                question = make_prefix(example, template_type=args.template_type)
+                solution = {
+                    "target": example['golden_answers'],
+                }
+
+                data = {
+                    "data_source": data_source,
+                    "prompt": [{
+                        "role": "user",
+                        "content": question,
+                    }],
+                    "ability": "fact-reasoning",
+                    "reward_model": {
+                        "style": "rule",
+                        "ground_truth": solution
+                    },
+                    "extra_info": {
+                        'split': split,
+                        'index': idx,
+                    }
+                }
+                return data
+
+            return process_fn
+
+        train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
+        all_dataset.append(train_dataset)
+
+    local_dir = args.local_dir
+    hdfs_dir = args.hdfs_dir
+
+    all_train_dataset = datasets.concatenate_datasets(all_dataset)
+    all_train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
+
+    if hdfs_dir is not None:
+        makedirs(hdfs_dir)
+
+        copy(src=local_dir, dst=hdfs_dir)
diff --git a/scripts/nq_hotpotqa/data_process.sh b/scripts/nq_hotpotqa/data_process.sh
new file mode 100644
index 0000000..ae1b45b
--- /dev/null
+++ b/scripts/nq_hotpotqa/data_process.sh
@@ -0,0 +1,10 @@
+WORK_DIR=your/work/dir
+LOCAL_DIR=$WORK_DIR/data/nq_hotpotqa_train
+
+## build the merged search-format train file from multiple datasets
+DATA=nq,hotpotqa
+python $WORK_DIR/scripts/data_process/qa_search_train_merge.py --local_dir $LOCAL_DIR --data_sources $DATA
+
+## build the merged search-format test file from multiple datasets
+DATA=nq,triviaqa,popqa,hotpotqa,2wikimultihopqa,musique,bamboogle
+python $WORK_DIR/scripts/data_process/qa_search_test_merge.py --local_dir $LOCAL_DIR --data_sources $DATA
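
For reference, each row emitted by process_fn round-trips through parquet with the shape below (a sketch only; the question and answers are illustrative stand-ins in the style of nq, not values taken from the actual split):

    {
        "data_source": "nq",
        "prompt": [{
            "role": "user",
            "content": "Answer the given question. ... For example, <answer> Beijing </answer>. Question: who wrote the declaration of independence?\n",
        }],
        "ability": "fact-reasoning",
        "reward_model": {
            "style": "rule",
            "ground_truth": {"target": ["Thomas Jefferson"]},
        },
        "extra_info": {"split": "test", "index": 0},
    }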
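
A minimal sketch for sanity-checking the merged output (a hypothetical helper, not part of this patch; the path assumes the LOCAL_DIR from data_process.sh with WORK_DIR set to the repo root):

    # check_data.py -- hypothetical helper, not included in this commit
    import datasets

    local_dir = './data/nq_hotpotqa_train'  # LOCAL_DIR from data_process.sh, assuming WORK_DIR=.
    test = datasets.load_dataset('parquet', data_files=f'{local_dir}/test.parquet')['train']

    print(test)                                 # row count and column names
    print(sorted(set(test['data_source'])))     # expect the seven merged test sources
    row = test[0]
    print(row['prompt'][0]['content'])          # templated prompt ending in the question
    print(row['reward_model']['ground_truth'])  # {'target': [...golden answers...]}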