Source code for agent.browsing.manual.sources.google_scholar
"""Google Scholar manual browsing via DuckDuckGo site-restricted search.

This module intentionally avoids scraping Google Scholar directly. Instead it
uses the `ddgs` package (or the legacy ``duckduckgo_search`` package as a
fallback) to retrieve public result snippets limited to the Scholar domain.
"""

from typing import Iterator, List, Optional, override

# The search client is published under two names. Try the current `ddgs`
# distribution first and only fall back to the legacy `duckduckgo_search`
# name when that import fails, which avoids warnings from the old package.
try:  # pragma: no cover - import resolution depends on environment
    from ddgs import DDGS  # type: ignore
except Exception:  # pragma: no cover - fallback path in older envs
    from duckduckgo_search import DDGS  # type: ignore

from .base import ManualSource, SearchItem
class GoogleScholarBrowser(ManualSource):
    """Manual source for Google Scholar using site-restricted web search.

    Queries are sent to DuckDuckGo with a ``site:scholar.google.com``
    restriction instead of being issued against Google Scholar itself.

    Note: result metadata is limited to title, URL, and snippet.
    """

    @override
    def search(
        self,
        query: str,
        max_results: int = 25,
        start: int = 0,
        *,
        region: str = "wt-wt",
        **kwargs: object,
    ) -> List[SearchItem]:
        """Search Scholar results using DuckDuckGo site restriction.

        :param query: Free-text query string.
        :param max_results: Maximum number of results to return. Values less
            than or equal to zero yield an empty list without any request.
        :param start: Zero-based start index; applied client-side by skipping
            the first ``start`` raw results. Negative values are treated as 0.
        :param region: Region code for DuckDuckGo.
        :param kwargs: Accepted for signature compatibility; not used here.
        :returns: List of normalized search items (at most ``max_results``).
        """
        # Nothing was asked for: return early. Without this guard the loop
        # below would append one item before its size check could stop it.
        if max_results <= 0:
            return []
        # A negative offset would otherwise shrink the requested window.
        start = max(start, 0)

        ddg_query = f"site:scholar.google.com {query}".strip()
        # The backend has no offset parameter in this call, so request enough
        # results to cover the skipped prefix plus the wanted page.
        fetch_count = max_results + start

        items: List[SearchItem] = []
        with DDGS() as ddgs:
            from typing import Any  # local import to avoid global dependency

            # ddgs and duckduckgo_search have different parameter names:
            # ``query`` vs ``keywords``. Try the newer name first.
            ddgs_any: Any = ddgs
            text_fn: Any = ddgs_any.text
            try:
                results = text_fn(
                    query=ddg_query, region=region, max_results=fetch_count
                )
            except TypeError:
                results = text_fn(
                    keywords=ddg_query, region=region, max_results=fetch_count
                )

            for index, res in enumerate(results):
                # Pagination is emulated client-side: drop the first `start`.
                if index < start:
                    continue
                # Field names vary between backends/versions; take the first
                # non-empty candidate and normalise everything to ``str``.
                title = str(res.get("title") or "")
                url = str(res.get("href") or res.get("link") or res.get("url") or "")
                snippet = str(
                    res.get("body") or res.get("snippet") or res.get("desc") or ""
                )
                # A result with neither title nor URL carries no usable data.
                if not title and not url:
                    continue
                items.append(
                    SearchItem(
                        title=title,
                        url=url,
                        snippet=snippet,
                        item_id=None,
                        extra=None,
                    )
                )
                if len(items) >= max_results:
                    break
        return items

    @override
    def iter_all(
        self,
        query: str,
        chunk_size: int = 100,
        limit: Optional[int] = None,
        *,
        region: str = "wt-wt",
        **kwargs: object,
    ) -> Iterator[SearchItem]:
        """Iterate through Scholar results by fetching in chunks.

        :param query: Free-text query string.
        :param chunk_size: Number of results fetched per request. Values less
            than or equal to zero produce no items.
        :param limit: Optional maximum number of items to yield.
        :param region: Region code for DuckDuckGo.
        :param kwargs: Accepted for signature compatibility; not used here.
        :returns: Iterator over normalized search items.
        """
        # A non-positive chunk can never make progress; stop instead of
        # looping on empty or degenerate pages.
        if chunk_size <= 0:
            return

        yielded = 0
        start = 0
        # Checking the limit *before* each fetch avoids a wasted request when
        # the limit is 0 or is reached exactly at a page boundary.
        while limit is None or yielded < limit:
            page = self.search(
                query=query, max_results=chunk_size, start=start, region=region
            )
            if not page:
                return
            for item in page:
                if limit is not None and yielded >= limit:
                    return
                yielded += 1
                yield item
            start += len(page)
            # A short page means the backend has no further results.
            if len(page) < chunk_size:
                return

    @override
    def search_all(
        self,
        query: str,
        chunk_size: int = 100,
        limit: Optional[int] = None,
        *,
        region: str = "wt-wt",
        **kwargs: object,
    ) -> List[SearchItem]:
        """Collect Scholar results for a query into a list.

        :param query: Free-text query string.
        :param chunk_size: Number of results fetched per request.
        :param limit: Optional maximum number of items to collect.
        :param region: Region code for DuckDuckGo.
        :param kwargs: Accepted for signature compatibility; not used here.
        :returns: List of normalized search items.
        """
        return list(
            self.iter_all(
                query=query, chunk_size=chunk_size, limit=limit, region=region
            )
        )