diff --git a/README.md b/README.md index 0ff41b0..91008cb 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ You can refer to ```scripts/data_process/nq_search.py``` for a concrete data pro It is recommended to make your corpus a jsonl file, where each line (a dictionary with "id" key and "contents" key) corresponds to one passage. You can refer to ```example/corpus.jsonl``` for an example. -The "id" key corresponds to the passage id, while the "contents" key corresponds to the passage content. +The "id" key corresponds to the passage id, while the "contents" key holds the passage content: the title wrapped in double quotes, followed by a newline and then the passage text (i.e. '"' + title + '"\n' + text). For example: ``` {"id": "0", "contents": "Evan Morris Evan L. Morris (January 26, 1977 \u2013 July 9, 2015) was a lobbyist for Genentech and its parent corporation Roche in Washington."} diff --git a/search_r1/search/index_builder.py b/search_r1/search/index_builder.py index 3734cc8..2cba65a 100644 --- a/search_r1/search/index_builder.py +++ b/search_r1/search/index_builder.py @@ -195,9 +195,10 @@ class Index_Builder: for start_idx in tqdm(range(0, len(self.corpus), self.batch_size), desc='Inference Embeddings:'): - batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title'] - batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text'] - batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)] + # batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title'] + # batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text'] + # batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)] + batch_data = self.corpus[start_idx:start_idx+self.batch_size]['contents'] if self.retrieval_method == "e5": batch_data = [f"passage: {doc}" for doc in batch_data]