CoACT initialize (#292)
@@ -0,0 +1,7 @@
# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
#
# SPDX-License-Identifier: Apache-2.0

from .wikipedia import WikipediaPageLoadTool, WikipediaQueryRunTool

__all__ = ["WikipediaPageLoadTool", "WikipediaQueryRunTool"]
@@ -0,0 +1,287 @@
# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional, Union

import requests
from pydantic import BaseModel

from autogen.import_utils import optional_import_block, require_optional_import
from autogen.tools import Tool

with optional_import_block():
    import wikipediaapi

# Maximum allowed length for a query string.
MAX_QUERY_LENGTH = 300
# Maximum number of pages to retrieve from a search.
MAX_PAGE_RETRIEVE = 100
# Maximum number of characters to return from a Wikipedia page.
MAX_ARTICLE_LENGTH = 10000


class Document(BaseModel):
    """Pydantic model representing a Wikipedia document.

    Attributes:
        page_content (str): Textual content of the Wikipedia page
            (possibly truncated).
        metadata (dict[str, str]): Additional info, including:
            - source URL
            - title
            - pageid
            - timestamp
            - word count
            - size
    """

    page_content: str
    metadata: dict[str, str]


class WikipediaClient:
    """Client for interacting with the Wikipedia API.

    Supports searching and page retrieval on a specified language edition.

    Public methods:
        search(query: str, limit: int) -> list[dict[str, Any]]
        get_page(title: str) -> Optional[wikipediaapi.WikipediaPage]

    Attributes:
        base_url (str): URL of the MediaWiki API endpoint.
        headers (dict[str, str]): HTTP headers, including User-Agent.
        wiki (wikipediaapi.Wikipedia): Low-level Wikipedia API client.
    """

    def __init__(self, language: str = "en", tool_name: str = "wikipedia-client") -> None:
        """Initialize the WikipediaClient.

        Args:
            language (str): ISO code of the Wikipedia edition (e.g., 'en', 'es').
            tool_name (str): Identifier for the User-Agent header.
        """
        self.base_url = f"https://{language}.wikipedia.org/w/api.php"
        self.headers = {"User-Agent": f"autogen.Agent ({tool_name})"}
        self.wiki = wikipediaapi.Wikipedia(
            language=language,
            extract_format=wikipediaapi.ExtractFormat.WIKI,
            user_agent=f"autogen.Agent ({tool_name})",
        )

    def search(self, query: str, limit: int = 3) -> Any:
        """Search Wikipedia for pages matching a query string.

        Args:
            query (str): The search keywords.
            limit (int): Max number of results to return.

        Returns:
            list[dict[str, Any]]: Each dict has keys:
                - 'title' (str)
                - 'pageid' (int)
                - 'size' (int)
                - 'wordcount' (int)
                - 'timestamp' (str)

        Raises:
            requests.HTTPError: If the HTTP request to the API fails.
        """
        params = {
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": query,
            "srlimit": str(limit),
            "srprop": "size|wordcount|timestamp",
        }

        response = requests.get(url=self.base_url, params=params, headers=self.headers)
        response.raise_for_status()
        data = response.json()
        search_data = data.get("query", {}).get("search", [])
        return search_data

    def get_page(self, title: str) -> Optional[Any]:
        """Retrieve a WikipediaPage object by title.

        Args:
            title (str): Title of the Wikipedia page.

        Returns:
            wikipediaapi.WikipediaPage | None: The page object if it exists,
                otherwise None.

        Raises:
            wikipediaapi.WikipediaException: On lower-level API errors.
        """
        page = self.wiki.page(title)
        if not page.exists():
            return None
        return page


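# Usage sketch (illustrative, assuming the optional `wikipedia-api` dependency
# is installed): exercising WikipediaClient directly. The dict keys follow the
# `search` docstring; `summary` is a wikipediaapi.WikipediaPage attribute.
#
#     client = WikipediaClient(language="en", tool_name="wikipedia-client")
#     for hit in client.search("Alan Turing", limit=2):
#         print(hit["title"], hit["wordcount"], hit["timestamp"])
#     page = client.get_page("Alan Turing")
#     if page is not None:
#         print(page.summary)

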
@require_optional_import(["wikipediaapi"], "wikipedia")
class WikipediaQueryRunTool(Tool):
    """Tool for querying Wikipedia and returning summarized page results.

    This tool uses the `wikipediaapi` package to perform searches
    against a specified language edition of Wikipedia and returns
    up to `top_k` page summaries.

    Public methods:
        query_run(query: str) -> list[str] | str

    Attributes:
        language (str): Language code for the Wikipedia edition (e.g., 'en', 'es').
        top_k (int): Max number of page summaries returned (≤ MAX_PAGE_RETRIEVE).
        verbose (bool): If True, enables debug logging to stdout.
        wiki_cli (WikipediaClient): Internal client for Wikipedia API calls.
    """

    def __init__(self, language: str = "en", top_k: int = 3, verbose: bool = False) -> None:
        """Initialize the WikipediaQueryRunTool.

        Args:
            language (str): ISO code of the Wikipedia edition to query.
            top_k (int): Desired number of summaries (capped at MAX_PAGE_RETRIEVE).
            verbose (bool): If True, print debug information during searches.
        """
        self.language = language
        self.tool_name = "wikipedia-query-run"
        self.wiki_cli = WikipediaClient(language, self.tool_name)
        self.top_k = min(top_k, MAX_PAGE_RETRIEVE)
        self.verbose = verbose
        super().__init__(
            name=self.tool_name,
            description="Run a Wikipedia query and return page summaries.",
            func_or_tool=self.query_run,
        )

    def query_run(self, query: str) -> Union[list[str], str]:
        """Search Wikipedia and return formatted page summaries.

        Truncates `query` to MAX_QUERY_LENGTH before searching.

        Args:
            query (str): Search term(s) to look up in Wikipedia.

        Returns:
            list[str]: Each element is "Page: <title>\nSummary: <text>".
            str: Error message if no results are found or on exception.

        Note:
            API exceptions are caught internally and returned as error
            strings, so this method never raises during normal operation.
        """
        try:
            if self.verbose:
                print(f"INFO\t [{self.tool_name}] search query='{query[:MAX_QUERY_LENGTH]}' top_k={self.top_k}")
            search_results = self.wiki_cli.search(query[:MAX_QUERY_LENGTH], limit=self.top_k)
            summaries: list[str] = []
            for item in search_results:
                title = item["title"]
                page = self.wiki_cli.get_page(title)
                # Only format the summary if the page exists and has a summary.
                if page is not None and page.summary:
                    summary = f"Page: {title}\nSummary: {page.summary}"
                    summaries.append(summary)
            if not summaries:
                return "No good Wikipedia Search Result was found"
            return summaries
        except Exception as e:
            return f"wikipedia search failed: {str(e)}"


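# Usage sketch (illustrative): `query_run` returns either a list of
# "Page: <title>\nSummary: <text>" strings or an error message string,
# so callers should branch on the type.
#
#     tool = WikipediaQueryRunTool(language="en", top_k=2)
#     results = tool.query_run("Turing machine")
#     if isinstance(results, str):
#         print(results)  # error or "no result" message
#     else:
#         for entry in results:
#             print(entry)

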
@require_optional_import(["wikipediaapi"], "wikipedia")
class WikipediaPageLoadTool(Tool):
    """Tool to load up to N characters of Wikipedia page content along with metadata.

    This tool uses a language-specific Wikipedia client to search for relevant articles
    and returns a list of Document objects containing truncated page content and metadata
    (source URL, title, page ID, timestamp, word count, and size). Ideal for agents
    requiring structured Wikipedia data for research, summarization, or contextual enrichment.

    Attributes:
        language (str): Wikipedia language code (default: "en").
        top_k (int): Maximum number of pages to retrieve per query (default: 3).
        truncate (int): Maximum number of characters of content per page (default: 4000).
        verbose (bool): If True, prints debug information (default: False).
        tool_name (str): Identifier used in the User-Agent header.
        wiki_cli (WikipediaClient): Client for interacting with the Wikipedia API.
    """

    def __init__(self, language: str = "en", top_k: int = 3, truncate: int = 4000, verbose: bool = False) -> None:
        """Initialize the WikipediaPageLoadTool with configurable language, result count, and content length.

        Args:
            language (str): The language code for the Wikipedia edition (default is "en").
            top_k (int): The maximum number of pages to retrieve per query (default is 3;
                capped at MAX_PAGE_RETRIEVE).
            truncate (int): The maximum number of characters to extract from each page (default is 4000;
                capped at MAX_ARTICLE_LENGTH).
            verbose (bool): If True, enables verbose/debug logging (default is False).
        """
        self.language = language
        self.top_k = min(top_k, MAX_PAGE_RETRIEVE)
        self.truncate = min(truncate, MAX_ARTICLE_LENGTH)
        self.verbose = verbose
        self.tool_name = "wikipedia-page-load"
        self.wiki_cli = WikipediaClient(language, self.tool_name)
        super().__init__(
            name=self.tool_name,
            description=(
                "Search Wikipedia for relevant pages using a language-specific client. "
                "Returns a list of documents with truncated content and metadata including title, URL, "
                "page ID, timestamp, word count, and page size. Configure the number of results with the "
                "'top_k' parameter and content length with 'truncate'. Useful for research, summarization, "
                "or contextual enrichment."
            ),
            func_or_tool=self.content_search,
        )

    def content_search(self, query: str) -> Union[list[Document], str]:
        """Execute a Wikipedia search and return page content plus metadata.

        Args:
            query (str): The search term to query Wikipedia.

        Returns:
            Union[list[Document], str]:
                - list[Document]: Documents with up to `truncate` characters of page text
                  and metadata if pages are found.
                - str: Error message if the search fails or no pages are found.

        Notes:
            - Errors are caught internally and returned as strings.
            - If no matching pages have text content, returns
              "No good Wikipedia Search Result was found".
        """
        try:
            if self.verbose:
                print(f"INFO\t [{self.tool_name}] search query='{query[:MAX_QUERY_LENGTH]}' top_k={self.top_k}")
            search_results = self.wiki_cli.search(query[:MAX_QUERY_LENGTH], limit=self.top_k)
            docs: list[Document] = []
            for item in search_results:
                page = self.wiki_cli.get_page(item["title"])
                # Only process pages that exist and have text content.
                if page is not None and page.text:
                    document = Document(
                        page_content=page.text[: self.truncate],
                        metadata={
                            "source": f"https://{self.language}.wikipedia.org/?curid={item['pageid']}",
                            "title": item["title"],
                            "pageid": str(item["pageid"]),
                            "timestamp": str(item["timestamp"]),
                            "wordcount": str(item["wordcount"]),
                            "size": str(item["size"]),
                        },
                    )
                    docs.append(document)
            if not docs:
                return "No good Wikipedia Search Result was found"
            return docs
        except Exception as e:
            return f"wikipedia search failed: {str(e)}"
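
# Usage sketch (illustrative): `content_search` mirrors `query_run` but returns
# structured Document objects, each carrying truncated page text plus metadata.
#
#     tool = WikipediaPageLoadTool(language="en", top_k=2, truncate=1000)
#     docs = tool.content_search("Turing machine")
#     if isinstance(docs, str):
#         print(docs)  # error or "no result" message
#     else:
#         for doc in docs:
#             print(doc.metadata["title"], doc.metadata["source"])
#             print(doc.page_content)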