fix index builder key bug
This commit is contained in:
@@ -161,7 +161,7 @@ You can refer to ```scripts/data_process/nq_search.py``` for a concrete data pro
|
||||
|
||||
It is recommended to store your corpus as a JSONL file, where each line is a dictionary with an "id" key and a "contents" key and corresponds to one passage. You can refer to ```example/corpus.jsonl``` for an example.
|
||||
|
||||
The "id" key corresponds to the passage id, while the "contents" key corresponds to the passage content.
|
||||
The "id" key corresponds to the passage id, while the "contents" key corresponds to the passage content, built as '"' + title + '"\n' + text (the title wrapped in double quotes, followed by a newline and then the passage text).
|
||||
For example:
|
||||
```
|
||||
{"id": "0", "contents": "Evan Morris Evan L. Morris (January 26, 1977 \u2013 July 9, 2015) was a lobbyist for Genentech and its parent corporation Roche in Washington."}
|
||||
|
||||
@@ -195,9 +195,10 @@ class Index_Builder:
|
||||
|
||||
for start_idx in tqdm(range(0, len(self.corpus), self.batch_size), desc='Inference Embeddings:'):
|
||||
|
||||
batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title']
|
||||
batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text']
|
||||
batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)]
|
||||
# batch_data_title = self.corpus[start_idx:start_idx+self.batch_size]['title']
|
||||
# batch_data_text = self.corpus[start_idx:start_idx+self.batch_size]['text']
|
||||
# batch_data = ['"' + title + '"\n' + text for title, text in zip(batch_data_title, batch_data_text)]
|
||||
batch_data = self.corpus[start_idx:start_idx+self.batch_size]['contents']
|
||||
|
||||
if self.retrieval_method == "e5":
|
||||
batch_data = [f"passage: {doc}" for doc in batch_data]
|
||||
|
||||
Reference in New Issue
Block a user