Fix index builder key bug: read passages from the "contents" key

2025-05-16 20:37:44 +00:00
parent 8ecaa29f43
commit 59cc844c17
2 changed files with 5 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -161,7 +161,7 @@ You can refer to ```scripts/data_process/nq_search.py``` for a concrete data pro

 It is recommended to make your corpus a jsonl file, where each line (a dictionary with "id" key and "contents" key) corresponds to one passage. You can refer to ```example/corpus.jsonl``` for an example.

-The "id" key corresponds to the passage id, while the "contents" key corresponds to the passage content.
+The "id" key corresponds to the passage id, while the "contents" key corresponds to the passage content, formatted as the title in double quotes followed by a newline and the passage text (i.e., '"' + title + '"\n' + text).
 For example:
 ```
 {"id": "0", "contents": "Evan Morris Evan L. Morris (January 26, 1977 \u2013 July 9, 2015) was a lobbyist for Genentech and its parent corporation Roche in Washington."}
--- a/search_r1/search/index_builder.py
+++ b/search_r1/search/index_builder.py
@@ -195,9 +195,10 @@ class Index_Builder:

        for start_idx in tqdm(range(0, len(self.corpus), self.batch_size), desc='Inference Embeddings:'):

-            batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title']
-            batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text']
-            batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)]
+            # batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title']
+            # batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text']
+            # batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)]
+            batch_data = self.corpus[start_idx:start_idx+self.batch_size]['contents']

            if self.retrieval_method == "e5":
                batch_data = [f"passage: {doc}" for doc in batch_data]