# sci-gui-agent-benchmark/mm_agents/coact/autogen/tools/experimental/wikipedia/wikipedia.py

# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional, Union

import requests
from pydantic import BaseModel

from autogen.import_utils import optional_import_block, require_optional_import
from autogen.tools import Tool

with optional_import_block():
    import wikipediaapi

# Maximum allowed length for a query string. The tools in this module slice
# the incoming query to this many characters before searching.
MAX_QUERY_LENGTH = 300
# Maximum number of pages to retrieve from a search. The tools in this module
# cap their `top_k` constructor argument at this value.
MAX_PAGE_RETRIEVE = 100
# Maximum number of characters to return from a Wikipedia page.
# WikipediaPageLoadTool caps its `truncate` constructor argument at this value.
MAX_ARTICLE_LENGTH = 10000


class Document(BaseModel):
    """Pydantic model representing a Wikipedia document.

    Attributes:
        page_content (str): Textual content of the Wikipedia page
            (possibly truncated).
        metadata (dict[str, str]): Additional info about the page. As built by
            WikipediaPageLoadTool.content_search, every value is a string and
            the keys are:
            - "source": URL of the page, built from the page id
            - "title"
            - "pageid"
            - "timestamp"
            - "wordcount"
            - "size"
    """

    page_content: str
    metadata: dict[str, str]


class WikipediaClient:
    """Client for interacting with the Wikipedia API.

    Supports searching and page retrieval on a specified language edition.
    Searching goes directly to the MediaWiki HTTP endpoint via `requests`;
    page retrieval is delegated to the `wikipediaapi` package.

    Public methods:
        search(query: str, limit: int) -> list[dict[str, Any]]
        get_page(title: str) -> Optional[wikipediaapi.WikipediaPage]

    Attributes:
        base_url (str): URL of the MediaWiki API endpoint.
        headers (dict[str, str]): HTTP headers, including User-Agent.
        timeout (float): Seconds to wait for the search HTTP request.
        wiki (wikipediaapi.Wikipedia): Low-level Wikipedia API client.
    """

    def __init__(self, language: str = "en", tool_name: str = "wikipedia-client", timeout: float = 10.0) -> None:
        """Initialize the WikipediaClient.

        Args:
            language (str): ISO code of the Wikipedia edition (e.g., 'en', 'es').
            tool_name (str): Identifier for User-Agent header.
            timeout (float): Timeout in seconds applied to the HTTP request made
                by `search`. It is not forwarded to the `wikipediaapi` client,
                which keeps its own settings.
        """
        # Build the User-Agent once so both HTTP paths identify themselves identically.
        user_agent = f"autogen.Agent ({tool_name})"
        self.base_url = f"https://{language}.wikipedia.org/w/api.php"
        self.headers = {"User-Agent": user_agent}
        self.timeout = timeout
        self.wiki = wikipediaapi.Wikipedia(
            language=language,
            extract_format=wikipediaapi.ExtractFormat.WIKI,
            user_agent=user_agent,
        )

    def search(self, query: str, limit: int = 3) -> Any:
        """Search Wikipedia for pages matching a query string.

        Args:
            query (str): The search keywords.
            limit (int): Max number of results to return.

        Returns:
            list[dict[str, Any]]: The entries found under ``query.search`` in
            the JSON response, or an empty list when that key is absent.
            The request asks for these properties per result:
                - 'size'
                - 'wordcount'
                - 'timestamp'

        Raises:
            requests.HTTPError: If the API answers with an error status code.
            requests.Timeout: If the server does not respond within
                `self.timeout` seconds.
        """
        params = {
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": query,
            "srlimit": str(limit),
            "srprop": "size|wordcount|timestamp",
        }

        # Without an explicit timeout `requests` waits indefinitely, so a stalled
        # connection would hang the calling agent forever.
        response = requests.get(
            url=self.base_url,
            params=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        payload = response.json()
        # A response lacking "query"/"search" is treated as "no results".
        return payload.get("query", {}).get("search", [])

    def get_page(self, title: str) -> Optional[Any]:
        """Retrieve a WikipediaPage object by title.

        Args:
            title (str): Title of the Wikipedia page.

        Returns:
            wikipediaapi.WikipediaPage | None: The page object if
            `page.exists()` is true, otherwise None.

        Raises:
            Nothing is caught here; any error raised by the underlying
            `wikipediaapi` client propagates to the caller.
        """
        page = self.wiki.page(title)
        return page if page.exists() else None


@require_optional_import(["wikipediaapi"], "wikipedia")
class WikipediaQueryRunTool(Tool):
    """Tool that searches Wikipedia and returns page summaries.

    A search is issued against the configured language edition and, for every
    hit whose page exists and has a non-empty summary, one formatted summary
    string is produced.

    Public methods:
        query_run(query: str) -> list[str] | str

    Attributes:
        language (str): Language code for the Wikipedia edition (e.g., 'en', 'es').
        top_k (int): Max number of search hits requested (<= MAX_PAGE_RETRIEVE).
        verbose (bool): If True, prints a debug line to stdout per search.
        tool_name (str): Name the tool registers under; also used in the User-Agent.
        wiki_cli (WikipediaClient): Internal client for Wikipedia API calls.
    """

    def __init__(self, language: str = "en", top_k: int = 3, verbose: bool = False) -> None:
        """Set up the client and register `query_run` as the tool function.

        Args:
            language (str): ISO code of the Wikipedia edition to query.
            top_k (int): Desired number of summaries (capped by MAX_PAGE_RETRIEVE).
            verbose (bool): If True, print debug information during searches.
        """
        self.tool_name = "wikipedia-query-run"
        self.language = language
        self.verbose = verbose
        self.top_k = min(top_k, MAX_PAGE_RETRIEVE)
        self.wiki_cli = WikipediaClient(language, self.tool_name)
        super().__init__(
            name=self.tool_name,
            description="Run a Wikipedia query and return page summaries.",
            func_or_tool=self.query_run,
        )

    def query_run(self, query: str) -> Union[list[str], str]:
        """Look up `query` on Wikipedia and return formatted summaries.

        Only the first MAX_QUERY_LENGTH characters of `query` are searched.

        Args:
            query (str): Search term(s) to look up in Wikipedia.

        Returns:
            list[str]: One entry per usable hit, made of a "Page: <title>" line
                followed by a "Summary: <text>" line.
            str: A message when no hit yields a summary, or
                "wikipedia search failed: ..." when any exception occurs.

        Note:
            Every exception is caught and reported as a string, so this
            method does not raise.
        """
        try:
            trimmed = query[:MAX_QUERY_LENGTH]
            if self.verbose:
                print(f"INFO\t [{self.tool_name}] search query='{trimmed}' top_k={self.top_k}")

            formatted: list[str] = []
            for hit in self.wiki_cli.search(trimmed, limit=self.top_k):
                page_title = hit["title"]
                page = self.wiki_cli.get_page(page_title)
                # Skip hits whose page is missing.
                if page is None:
                    continue
                abstract = page.summary
                # Skip pages without any summary text.
                if not abstract:
                    continue
                formatted.append(f"Page: {page_title}\nSummary: {abstract}")

            return formatted if formatted else "No good Wikipedia Search Result was found"
        except Exception as err:
            return f"wikipedia search failed: {str(err)}"


@require_optional_import(["wikipediaapi"], "wikipedia")
class WikipediaPageLoadTool(Tool):
    """
    Tool that loads truncated Wikipedia page text together with search metadata.

    A search is run against the configured language edition; each hit whose page
    exists and has text becomes a Document holding at most `truncate` characters
    of content plus metadata (source URL, title, page ID, timestamp, word count,
    and size) taken from the search result.

    Attributes:
        language (str): Wikipedia language code (default: "en").
        top_k (int): Maximum number of pages to retrieve per query (default: 3).
        truncate (int): Maximum number of characters of content per page (default: 4000).
        verbose (bool): If True, prints debug information (default: False).
        tool_name (str): Name the tool registers under; also used in the User-Agent.
        wiki_cli (WikipediaClient): Client for interacting with the Wikipedia API.
    """

    def __init__(self, language: str = "en", top_k: int = 3, truncate: int = 4000, verbose: bool = False) -> None:
        """
        Configure the tool and register `content_search` as the tool function.

        Args:
            language (str): The language code for the Wikipedia edition (default is "en").
            top_k (int): The maximum number of pages to retrieve per query (default is 3;
                         capped at MAX_PAGE_RETRIEVE).
            truncate (int): The maximum number of characters to extract from each page (default is 4000;
                            capped at MAX_ARTICLE_LENGTH).
            verbose (bool): If True, enables verbose/debug logging (default is False).
        """
        self.tool_name = "wikipedia-page-load"
        self.language = language
        self.verbose = verbose
        self.top_k = min(top_k, MAX_PAGE_RETRIEVE)
        self.truncate = min(truncate, MAX_ARTICLE_LENGTH)
        self.wiki_cli = WikipediaClient(language, self.tool_name)
        tool_description = (
            "Search Wikipedia for relevant pages using a language-specific client. "
            "Returns a list of documents with truncated content and metadata including title, URL, "
            "page ID, timestamp, word count, and page size. Configure number of results with the 'top_k' parameter "
            "and content length with 'truncate'. Useful for research, summarization, or contextual enrichment."
        )
        super().__init__(
            name=self.tool_name,
            description=tool_description,
            func_or_tool=self.content_search,
        )

    def content_search(self, query: str) -> Union[list[Document], str]:
        """
        Search Wikipedia for `query` and return page content with metadata.

        Only the first MAX_QUERY_LENGTH characters of `query` are searched.

        Args:
            query (str): The search term to query Wikipedia.

        Returns:
            Union[list[Document], str]:
                - list[Document]: One Document per hit whose page exists and has
                  text, holding up to `truncate` characters of that text.
                - str: "No good Wikipedia Search Result was found" when no hit
                  qualifies, or "wikipedia search failed: ..." on any exception.

        Notes:
            - Every exception is caught and reported as a string, so this
              method does not raise.
        """
        try:
            trimmed = query[:MAX_QUERY_LENGTH]
            if self.verbose:
                print(f"INFO\t [{self.tool_name}] search query='{trimmed}' top_k={self.top_k}")

            hits = self.wiki_cli.search(trimmed, limit=self.top_k)
            documents: list[Document] = []
            for hit in hits:
                page = self.wiki_cli.get_page(hit["title"])
                # Skip hits whose page is missing or has no text.
                if page is None or not page.text:
                    continue

                snippet = page.text[: self.truncate]
                page_id = hit["pageid"]
                # Metadata comes from the search hit, stringified for the Document schema.
                meta = {
                    "source": f"https://{self.language}.wikipedia.org/?curid={page_id}",
                    "title": hit["title"],
                    "pageid": str(page_id),
                    "timestamp": str(hit["timestamp"]),
                    "wordcount": str(hit["wordcount"]),
                    "size": str(hit["size"]),
                }
                documents.append(Document(page_content=snippet, metadata=meta))

            return documents if documents else "No good Wikipedia Search Result was found"

        except Exception as err:
            return f"wikipedia search failed: {str(err)}"