Source code for shared.arxiv_parser

"""arXiv client with convenient typed wrappers.

This module provides :class:`ArxivParser` and helper functions to search for
papers, fetch metadata, download PDFs, and extract text. It is intentionally
lightweight and dependency-minimal.

Example::

    from shared.arxiv_parser import ArxivParser

    parser = ArxivParser()
    results = parser.search_papers("RAG small datasets", max_results=5)
    for p in results:
        print(p.id, p.title)
"""

# TODO: move this to agent/browsing/manual/sources/arxiv.py

import os
import re
import logging
from datetime import datetime, timedelta
from typing import List, Optional
from dataclasses import dataclass
from pathlib import Path

import arxiv
import requests
from bs4 import BeautifulSoup
import PyPDF2

logger = logging.getLogger(__name__)


@dataclass
class ArxivPaper:
    """Class for representing a scientific article from arXiv.

    :ivar id: arXiv identifier (e.g., ``"2301.07041"``).
    :ivar title: Paper title.
    :ivar authors: Author names.
    :ivar summary: Abstract text.
    :ivar categories: arXiv categories.
    :ivar published: Submission date.
    :ivar updated: Last updated date.
    :ivar pdf_url: Link to the PDF.
    :ivar abs_url: Link to the abstract page.
    :ivar journal_ref: Optional journal reference.
    :ivar doi: Optional DOI.
    :ivar comment: Optional author comment.
    :ivar primary_category: Optional primary category.
    """

    id: str
    title: str
    authors: List[str]
    summary: str
    categories: List[str]
    published: datetime
    updated: datetime
    pdf_url: str
    abs_url: str
    journal_ref: Optional[str] = None
    doi: Optional[str] = None
    comment: Optional[str] = None
    primary_category: Optional[str] = None
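
# Illustrative sketch (comments only, nothing runs at import time): how a
# result record is typically consumed. The ID and category values below are
# hypothetical examples, not guaranteed outputs.
#
#     parser = ArxivParser()
#     paper = parser.search_papers("retrieval augmented generation", max_results=1)[0]
#     print(paper.id)                # e.g. "2301.07041"
#     print(paper.primary_category)  # e.g. "cs.CL"
#     print(paper.pdf_url)           # link later used by ArxivParser.download_pdf()
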
class ArxivParser:
    """Main class for working with the arXiv API.

    :param downloads_dir: Directory used to store temporary files when
        downloading PDFs.
    """

    def __init__(self, downloads_dir: str = "downloads"):
        self.client = arxiv.Client()
        self.downloads_dir = Path(downloads_dir)
        self.downloads_dir.mkdir(exist_ok=True)
    def search_papers(
        self,
        query: str,
        max_results: int = 10,
        sort_by: arxiv.SortCriterion = arxiv.SortCriterion.Relevance,
        sort_order: arxiv.SortOrder = arxiv.SortOrder.Descending,
        categories: Optional[List[str]] = None,
        date_from: Optional[datetime] = None,
        date_to: Optional[datetime] = None,
        start: int = 0,
    ) -> List[ArxivPaper]:
        """Search articles by query with optional filters.

        :param query: Search query string.
        :param max_results: Maximum number of results to return.
        :param sort_by: Sort criterion, e.g., :data:`arxiv.SortCriterion.Relevance`.
        :param sort_order: Sort order, e.g., :data:`arxiv.SortOrder.Descending`.
        :param categories: Category filter like ``["cs.AI", "cs.LG"]``.
        :param date_from: Start date for results (inclusive).
        :param date_to: End date for results (inclusive).
        :param start: Starting index for pagination (default 0).
        :returns: Found papers as typed records.
        """
        try:
            # Build search query
            search_query = self._build_search_query(
                query, categories, date_from, date_to
            )

            # Create search object. Request enough results from the API to
            # cover both the skipped offset and the page that is actually
            # returned; otherwise a non-zero ``start`` would exhaust the
            # result stream before any papers are collected.
            search = arxiv.Search(
                query=search_query,
                max_results=start + max_results,
                sort_by=sort_by,
                sort_order=sort_order,
            )

            # Execute search with pagination support
            results = []
            processed_count = 0
            skipped_count = 0

            for result in self.client.results(search):
                # Skip results until we reach the start position
                if skipped_count < start:
                    skipped_count += 1
                    continue

                # Stop when we have enough results
                if processed_count >= max_results:
                    break

                paper = self._convert_to_arxiv_paper(result)
                results.append(paper)
                processed_count += 1

            logger.info(
                f"Found {len(results)} articles for query: {query} "
                f"(start={start}, skipped={skipped_count})"
            )
            return results

        except Exception as e:
            logger.error(f"Error searching articles: {e}")
            return []
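
    # Illustrative usage (comments only, nothing runs at import time): a
    # category-filtered search plus a second call that fetches the next page
    # via the ``start`` offset. Query text and categories are example values.
    #
    #     parser = ArxivParser()
    #     first_page = parser.search_papers(
    #         "graph neural networks", max_results=10, categories=["cs.LG"]
    #     )
    #     next_page = parser.search_papers(
    #         "graph neural networks", max_results=10, categories=["cs.LG"], start=10
    #     )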
    def get_paper_by_id(self, arxiv_id: str) -> Optional[ArxivPaper]:
        """Get article data by ID.

        :param arxiv_id: Article ID on arXiv (e.g., ``"2301.07041"``).
        :returns: The paper if found, otherwise ``None``.
        """
        try:
            # Normalize ID
            clean_id = self._clean_arxiv_id(arxiv_id)

            # Create search query by ID
            search = arxiv.Search(id_list=[clean_id])

            # Get result
            results = list(self.client.results(search))
            if results:
                paper = self._convert_to_arxiv_paper(results[0])
                logger.info(f"Found article: {paper.title}")
                return paper
            else:
                logger.warning(f"Article with ID {arxiv_id} not found")
                return None

        except Exception as e:
            logger.error(f"Error getting article {arxiv_id}: {e}")
            return None
    def download_pdf(
        self, paper: ArxivPaper, filename: Optional[str] = None
    ) -> Optional[str]:
        """Download a paper's PDF file.

        :param paper: The paper to download.
        :param filename: Optional filename override; defaults to a safe name
            from id/title.
        :returns: Path to the downloaded file or ``None`` on error.
        """
        try:
            if not filename:
                # Generate filename from ID and title
                safe_title = re.sub(r"[^\w\s-]", "", paper.title)[:50]
                safe_title = re.sub(r"[-\s]+", "-", safe_title)
                filename = f"{paper.id}_{safe_title}.pdf"

            filepath = self.downloads_dir / filename

            # Download PDF
            response = requests.get(paper.pdf_url, stream=True)
            response.raise_for_status()

            with open(filepath, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            logger.info(f"PDF downloaded: {filepath}")
            return str(filepath)

        except Exception as e:
            logger.error(f"Error downloading PDF {paper.id}: {e}")
            return None
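
    # Illustrative usage (comments only; assumes network access). The ID is
    # the same example used in the docstrings above, and the filename is an
    # arbitrary choice:
    #
    #     paper = parser.get_paper_by_id("2301.07041")
    #     if paper:
    #         pdf_path = parser.download_pdf(paper, filename="example.pdf")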
    def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
        """Extract text from a PDF file.

        :param pdf_path: Path to the local PDF file.
        :returns: Extracted text or ``None`` on error.
        """
        try:
            with open(pdf_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n\n"

            logger.info(f"Text extracted from PDF: {len(text)} characters")
            return text.strip()

        except Exception as e:
            logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
            return None
    def get_paper_text_online(self, paper: ArxivPaper) -> Optional[str]:
        """Get article text online, falling back to a temporary PDF download
        if no HTML version is available.

        :param paper: The paper descriptor.
        :returns: The article text or ``None`` on error.
        """
        try:
            # First try the HTML version
            html_url = paper.abs_url.replace("/abs/", "/html/")
            response = requests.get(html_url)

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")

                # Find the main text
                content_div = soup.find("div", class_="ltx_page_content")
                if content_div:
                    text = content_div.get_text(strip=True)
                    logger.info(f"Text obtained online (HTML): {len(text)} characters")
                    return text

            # If HTML is not available, download and parse the PDF
            logger.info("HTML version not available, downloading PDF...")
            pdf_path = self.download_pdf(paper)
            if pdf_path:
                text = self.extract_text_from_pdf(pdf_path)
                # Remove the temporary file
                os.remove(pdf_path)
                return text

            return None

        except Exception as e:
            logger.error(f"Error getting text online {paper.id}: {e}")
            return None
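
    # Illustrative usage (comments only): prefer the online text when no PDF
    # should be kept on disk. The 500-character preview is an arbitrary choice.
    #
    #     text = parser.get_paper_text_online(paper)
    #     if text:
    #         print(text[:500])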
    def search_by_author(
        self, author_name: str, max_results: int = 10
    ) -> List[ArxivPaper]:
        """Search articles by author.

        :param author_name: Author name.
        :param max_results: Maximum number of results.
        :returns: List of the author's articles.
        """
        query = f"au:{author_name}"
        return self.search_papers(query, max_results=max_results)
    def search_by_category(
        self, category: str, max_results: int = 10
    ) -> List[ArxivPaper]:
        """Search articles by category.

        :param category: Category (e.g., ``"cs.AI"``, ``"cs.LG"``).
        :param max_results: Maximum number of results.
        :returns: List of articles in the category.
        """
        query = f"cat:{category}"
        return self.search_papers(query, max_results=max_results)
    def get_recent_papers(
        self, category: Optional[str] = None, days: int = 7, max_results: int = 10
    ) -> List[ArxivPaper]:
        """Get recent articles.

        :param category: Category filter.
        :param days: Number of days back from now.
        :param max_results: Maximum number of results.
        :returns: List of recent articles.
        """
        date_from = datetime.now() - timedelta(days=days)

        if category:
            query = f"cat:{category}"
        else:
            query = "*"

        return self.search_papers(
            query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            date_from=date_from,
        )
    def _build_search_query(
        self,
        query: str,
        categories: Optional[List[str]] = None,
        date_from: Optional[datetime] = None,
        date_to: Optional[datetime] = None,
    ) -> str:
        """Build a search query with filters.

        :param query: Base query string.
        :param categories: Optional category filters.
        :param date_from: Optional start date.
        :param date_to: Optional end date (currently not applied; only
            ``date_from`` is used).
        :returns: Composed query string for the arXiv API.
        """
        search_parts = [query]

        # Add category filter
        if categories:
            cat_filter = " OR ".join([f"cat:{cat}" for cat in categories])
            search_parts.append(f"({cat_filter})")

        # Add date filter (basic support)
        if date_from:
            date_str = date_from.strftime("%Y%m%d")
            search_parts.append(f"submittedDate:[{date_str}* TO *]")

        return " AND ".join(search_parts)

    def _convert_to_arxiv_paper(self, result: arxiv.Result) -> ArxivPaper:
        """Convert a search result to :class:`ArxivPaper`."""
        return ArxivPaper(
            id=result.entry_id.split("/")[-1],
            title=result.title,
            authors=[author.name for author in result.authors],
            summary=result.summary,
            categories=result.categories,
            published=result.published,
            updated=result.updated,
            pdf_url=result.pdf_url or "",
            abs_url=result.entry_id,
            journal_ref=result.journal_ref,
            doi=result.doi,
            comment=result.comment,
            primary_category=result.primary_category,
        )

    def _clean_arxiv_id(self, arxiv_id: str) -> str:
        """Clean and normalize an arXiv ID.

        :param arxiv_id: Raw arXiv id, possibly with prefix and/or version.
        :returns: Cleaned id without prefix and version.
        """
        # Remove "arXiv:" prefix if present
        clean_id = arxiv_id.replace("arXiv:", "")

        # Remove version suffix if present (e.g., v1, v2)
        clean_id = re.sub(r"v\d+$", "", clean_id)

        return clean_id
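
# For reference (values chosen for illustration), the internal query builder
# composes the base query, category filter, and date filter with "AND"/"OR":
#
#     ArxivParser()._build_search_query("RAG", categories=["cs.AI", "cs.LG"])
#     # -> 'RAG AND (cat:cs.AI OR cat:cs.LG)'
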
# Helper functions for convenience
def search_papers(query: str, max_results: int = 10) -> List[ArxivPaper]:
    """Quick article search.

    :param query: Free-text search query.
    :param max_results: Maximum number of results to return.
    :returns: List of :class:`ArxivPaper` instances.
    """
    parser = ArxivParser()
    return parser.search_papers(query, max_results)
def get_paper(arxiv_id: str) -> Optional[ArxivPaper]:
    """Quick article retrieval by ID.

    :param arxiv_id: arXiv identifier.
    :returns: :class:`ArxivPaper` instance or ``None``.
    """
    parser = ArxivParser()
    return parser.get_paper_by_id(arxiv_id)
def download_paper(arxiv_id: str, downloads_dir: str = "downloads") -> Optional[str]:
    """Quick article download.

    :param arxiv_id: arXiv identifier.
    :param downloads_dir: Directory to store the PDF file.
    :returns: Path to the downloaded PDF or ``None``.
    """
    parser = ArxivParser(downloads_dir)
    paper = parser.get_paper_by_id(arxiv_id)
    if paper:
        return parser.download_pdf(paper)
    return None
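
# Quick-start sketch using the module-level helpers (comments only; network
# access is assumed and the ID is an example value):
#
#     from shared.arxiv_parser import get_paper, download_paper
#
#     paper = get_paper("2301.07041")
#     pdf_path = download_paper("2301.07041", downloads_dir="downloads")
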
if __name__ == "__main__":
    # Usage example
    logging.basicConfig(level=logging.INFO)

    parser = ArxivParser()

    # Search by keywords
    papers = parser.search_papers("machine learning transformers", max_results=5)

    for paper in papers:
        print(f"ID: {paper.id}")
        print(f"Title: {paper.title}")
        print(f"Authors: {', '.join(paper.authors)}")
        print(f"Published: {paper.published}")
        print(f"Categories: {', '.join(paper.categories)}")
        print("-" * 80)
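
    # Illustrative extension of the demo (assumes network access; the ID below
    # is only an example): fetch a single paper by ID and pull its text without
    # keeping the PDF on disk.
    paper = parser.get_paper_by_id("2301.07041")
    if paper:
        text = parser.get_paper_text_online(paper)
        if text:
            print(f"Extracted {len(text)} characters from {paper.id}")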