Source code for docviz

import asyncio
import threading

from .constants import get_docviz_directory
from .environment import check_dependencies
from .lib.document.class_ import Document
from .lib.functions import (
    batch_extract,
    extract_content,
    extract_content_streaming,
    extract_content_streaming_sync,
    extract_content_sync,
)
from .types import (
    DetectionConfig,
    ExtractionChunk,
    ExtractionConfig,
    ExtractionEntry,
    ExtractionResult,
    ExtractionType,
    LLMConfig,
    OCRConfig,
    SaveFormat,
)

# Module-level flag; _check_dependencies_once() sets it to True after the
# dependency check has completed without raising.
__DEPENDENCIES_CHECKED = False
# Held by _check_dependencies_once() while it runs the check, so that threads
# of this process do not run it concurrently.
__DEPENDENCIES_LOCK = threading.Lock()


def _check_dependencies_once():
    """
    Run the dependency check unless it is already known to have succeeded.

    Two pieces of state record a previous success:

    - the module-level flag ``__DEPENDENCIES_CHECKED``, which is set after a
      successful check in the current process;
    - a marker file ``dependencies_checked.lock`` inside the directory
      returned by ``get_docviz_directory()``, which is created after a
      successful check and therefore also short-circuits later processes.

    Within one process, ``__DEPENDENCIES_LOCK`` serializes the check and the
    state is re-tested after the lock is acquired, so threads that arrive
    while a check is in progress do not repeat it once it has succeeded.

    NOTE: the marker file is only tested with ``exists()`` and created with
    ``touch()``; it records a past success but does not provide mutual
    exclusion between processes. Two processes that both start before the
    marker exists may each run the check.

    On failure neither the flag nor the marker is written, so the next call
    runs the check again.

    Raises:
        Exception: Whatever ``_run_async_dependency_check()`` raises is
            propagated unchanged.
        OSError: If the docviz directory or the marker file cannot be
            created.
    """
    global __DEPENDENCIES_CHECKED

    # Cheapest test first: once this process has verified its dependencies,
    # return without touching the filesystem.
    if __DEPENDENCIES_CHECKED:
        return

    marker = get_docviz_directory() / "dependencies_checked.lock"
    # parents=True so a missing ancestor directory does not abort the check.
    marker.parent.mkdir(parents=True, exist_ok=True)

    if marker.exists():
        return

    with __DEPENDENCIES_LOCK:
        # Re-test under the lock: another thread may have finished the check
        # while this one was waiting.
        if __DEPENDENCIES_CHECKED or marker.exists():
            return

        # Any exception propagates from here; the two lines below are then
        # skipped, which is what leaves the check eligible for a retry.
        _run_async_dependency_check()
        __DEPENDENCIES_CHECKED = True
        marker.touch()
def _run_async_dependency_check():
    """
    Run the ``check_dependencies()`` coroutine to completion from sync code.

    Two situations are handled:

    1. No event loop is running in the calling thread: the coroutine is
       executed directly with ``asyncio.run()``.
    2. An event loop is already running in the calling thread (for example
       inside a Jupyter notebook), where ``asyncio.run()`` would refuse to
       start: the coroutine is executed with ``asyncio.run()`` in a
       short-lived worker thread, and this function blocks until it
       finishes.

    The running loop is detected up front with ``asyncio.get_running_loop()``
    rather than by inspecting an error message, so the coroutine is created
    only in the thread that will actually run it.

    Raises:
        Exception: Any exception raised by ``check_dependencies()`` is
            propagated to the caller, including from the worker thread.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # get_running_loop() raises RuntimeError when no loop is running.
        loop_is_running = False
    else:
        loop_is_running = True

    if not loop_is_running:
        asyncio.run(check_dependencies())
        return

    # Imported lazily: only needed on the running-loop path.
    import concurrent.futures

    def _run_in_worker():
        # The worker thread has no running loop, so asyncio.run() can create,
        # drive, and close a private loop there.
        return asyncio.run(check_dependencies())

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        # result() blocks until the check finishes and re-raises its exception.
        executor.submit(_run_in_worker).result()
# Importing the package triggers the one-time dependency verification.
_check_dependencies_once()

# Names exported by ``from docviz import *``.
__all__ = [
    "DetectionConfig",
    "Document",
    "ExtractionChunk",
    "ExtractionConfig",
    "ExtractionEntry",
    "ExtractionResult",
    "ExtractionType",
    "LLMConfig",
    "OCRConfig",
    "SaveFormat",
    "batch_extract",
    "extract_content",
    "extract_content_streaming",
    "extract_content_streaming_sync",
    "extract_content_sync",
]