Source code for agent.browsing.manual.sources.google_scholar

"""Google Scholar manual browsing via DuckDuckGo site-restricted search.

This module intentionally avoids scraping Google Scholar directly. It leverages
the ``ddgs`` (or legacy ``duckduckgo_search`` as a fallback) package to retrieve
public result snippets limited to the Scholar domain.
"""

from typing import override
from typing import Iterator, List, Optional

# Prefer the new `ddgs` package to avoid warnings; fall back to the legacy name if the import fails.
try:  # pragma: no cover - import resolution depends on environment
    from ddgs import DDGS  # type: ignore
except Exception:  # pragma: no cover - fallback path in older envs
    from duckduckgo_search import DDGS  # type: ignore

from .base import ManualSource, SearchItem



[docs]
class GoogleScholarBrowser(ManualSource):
    """Manual source for Google Scholar using site-restricted web search.

    Every query is sent to DuckDuckGo with a ``site:scholar.google.com``
    prefix; this class does not request Scholar pages itself.

    Note: result metadata is limited to title, URL, and snippet.
    """

    @override
    def search(
        self,
        query: str,
        max_results: int = 25,
        start: int = 0,
        *,
        region: str = "wt-wt",
        **kwargs: object,
    ) -> List[SearchItem]:
        """Search Scholar results using DuckDuckGo site restriction.

        :param query: Free-text query string.
        :param max_results: Maximum number of results to return. Values below
            one yield an empty list without issuing a request.
        :param start: Zero-based start index; applied client-side by skipping
            that many raw results. Negative values are treated as ``0``.
        :param region: Region code for DuckDuckGo.
        :param kwargs: Extra keyword arguments are accepted but not used by
            this implementation.
        :returns: List of normalized search items, at most ``max_results``
            entries long.
        """
        from typing import Any  # local import to avoid global dependency

        # Nothing can be returned, so skip the network round trip entirely.
        # (Without this guard the loop below would append one item before
        # noticing that the quota was already met.)
        if max_results <= 0:
            return []

        # A negative offset has no meaning for client-side skipping and would
        # otherwise shrink the number of results requested from the backend.
        offset = max(start, 0)

        # The backend returns up to ``max_results`` results from the beginning,
        # so request enough to cover the skipped prefix and post-filter.
        fetch_count = max_results + offset
        ddg_query = f"site:scholar.google.com {query}".strip()

        items: List[SearchItem] = []
        with DDGS() as ddgs:
            # ddgs and duckduckgo_search have different parameter names: query vs keywords
            ddgs_any: Any = ddgs
            text_fn: Any = ddgs_any.text
            try:
                results = text_fn(
                    query=ddg_query, region=region, max_results=fetch_count
                )
            except TypeError:
                results = text_fn(
                    keywords=ddg_query, region=region, max_results=fetch_count
                )

            # Consume inside the ``with`` block: the result object may be lazy.
            for index, res in enumerate(results):
                if index < offset:
                    continue
                # Accept the alternative key names a result mapping may use.
                title = str(res.get("title") or "")
                url = str(res.get("href") or res.get("link") or res.get("url") or "")
                snippet = str(
                    res.get("body") or res.get("snippet") or res.get("desc") or ""
                )
                # An entry with neither a title nor a URL carries no usable data.
                if not title and not url:
                    continue
                items.append(
                    SearchItem(
                        title=title, url=url, snippet=snippet, item_id=None, extra=None
                    )
                )
                if len(items) >= max_results:
                    break
        return items

    @override
    def iter_all(
        self,
        query: str,
        chunk_size: int = 100,
        limit: Optional[int] = None,
        *,
        region: str = "wt-wt",
        **kwargs: object,
    ) -> Iterator[SearchItem]:
        """Iterate through Scholar results by fetching in chunks.

        Each chunk is obtained through :meth:`search`. No request is issued
        once ``limit`` items have been yielded, and the final request asks
        only for the number of items still needed.

        NOTE: iteration stops at the first page that comes back shorter than
        requested, which also happens when :meth:`search` drops entries that
        have neither a title nor a URL.

        :param query: Free-text query string.
        :param chunk_size: Number of results fetched per request; must be at
            least 1.
        :param limit: Optional maximum number of items to yield. ``None``
            means no limit; values below one yield nothing.
        :param region: Region code for DuckDuckGo.
        :param kwargs: Extra keyword arguments are accepted but not used by
            this implementation.
        :returns: Iterator over normalized search items.
        :raises ValueError: If ``chunk_size`` is smaller than 1 (raised when
            the iterator is first advanced).
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")

        yielded = 0
        start = 0
        # Check the limit *before* fetching so that no request is wasted once
        # the quota is met (including ``limit <= 0``).
        while limit is None or yielded < limit:
            if limit is None:
                wanted = chunk_size
            else:
                wanted = min(chunk_size, limit - yielded)
            page = self.search(
                query=query, max_results=wanted, start=start, region=region
            )
            for item in page:
                # Defensive: never exceed the limit even if a page is longer
                # than requested.
                if limit is not None and yielded >= limit:
                    return
                yielded += 1
                yield item
            start += len(page)
            # A short (or empty) page means the backend has nothing more.
            if len(page) < wanted:
                return

    @override
    def search_all(
        self,
        query: str,
        chunk_size: int = 100,
        limit: Optional[int] = None,
        *,
        region: str = "wt-wt",
        **kwargs: object,
    ) -> List[SearchItem]:
        """Collect Scholar results for a query into a list.

        This is :meth:`iter_all` materialized into a list.

        :param query: Free-text query string.
        :param chunk_size: Number of results fetched per request.
        :param limit: Optional maximum number of items to collect.
        :param region: Region code for DuckDuckGo.
        :param kwargs: Extra keyword arguments are accepted but not used by
            this implementation.
        :returns: List of normalized search items.
        """
        return list(
            self.iter_all(
                query=query, chunk_size=chunk_size, limit=limit, region=region
            )
        )