url_loader

langroid/parsing/url_loader.py

BaseCrawlerConfig

Bases: BaseSettings

Base configuration for web crawlers.

TrafilaturaConfig

Bases: BaseCrawlerConfig

Configuration for Trafilatura crawler.

FirecrawlConfig

Bases: BaseCrawlerConfig

Configuration for Firecrawl crawler.

Crawl4aiConfig(**kwargs)

Bases: BaseCrawlerConfig

Configuration for the Crawl4aiCrawler.

Source code in langroid/parsing/url_loader.py
def __init__(self, **kwargs: Any) -> None:
    """Initialize and ensure forward refs are resolved."""
    self._resolve_forward_refs()
    super().__init__(**kwargs)

BaseCrawler(config)

Bases: ABC

Abstract base class for web crawlers.

Parameters:

    config (BaseCrawlerConfig): Configuration for the crawler (required)
Source code in langroid/parsing/url_loader.py
def __init__(self, config: BaseCrawlerConfig):
    """Initialize the base crawler.

    Args:
        config: Configuration for the crawler
    """
    self.parser = config.parser if self.needs_parser else None
    self.config: BaseCrawlerConfig = config

needs_parser abstractmethod property

Indicates whether the crawler requires a parser.
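
Concrete crawlers implement the crawl method and the needs_parser property. The following is a minimal sketch of a custom subclass, inferred from the interface shown on this page (it is not part of langroid; the Document/DocMetaData import path is the one langroid commonly uses, and the URL is illustrative):

from typing import List

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.url_loader import BaseCrawler, BaseCrawlerConfig


class EchoCrawler(BaseCrawler):
    """Toy crawler that returns stub content for each URL (sketch only)."""

    @property
    def needs_parser(self) -> bool:
        # No PDF/DOCX handling here, so no DocumentParser is needed
        return False

    def crawl(self, urls: List[str]) -> List[Document]:
        return [
            Document(
                content=f"stub content for {url}",
                metadata=DocMetaData(source=url),
            )
            for url in urls
        ]


docs = EchoCrawler(BaseCrawlerConfig()).crawl(["https://example.com"])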

CrawlerFactory

Factory for creating web crawlers.

create_crawler(config) staticmethod

Create a crawler instance based on configuration type.

Parameters:

    config (BaseCrawlerConfig): Configuration for the crawler (required)

Returns:

    BaseCrawler: A BaseCrawler instance

Raises:

    ValueError: If config type is not supported

Source code in langroid/parsing/url_loader.py
@staticmethod
def create_crawler(config: BaseCrawlerConfig) -> BaseCrawler:
    """Create a crawler instance based on configuration type.

    Args:
        config: Configuration for the crawler

    Returns:
        A BaseCrawler instance

    Raises:
        ValueError: If config type is not supported
    """
    if isinstance(config, TrafilaturaConfig):
        return TrafilaturaCrawler(config)
    elif isinstance(config, FirecrawlConfig):
        return FirecrawlCrawler(config)
    elif isinstance(config, ExaCrawlerConfig):
        return ExaCrawler(config)
    elif isinstance(config, Crawl4aiConfig):
        return Crawl4aiCrawler(config)
    else:
        raise ValueError(f"Unsupported crawler configuration type: {type(config)}")

TrafilaturaCrawler(config)

Bases: BaseCrawler

Crawler implementation using Trafilatura.

Parameters:

    config (TrafilaturaConfig): Configuration for the crawler (required)
Source code in langroid/parsing/url_loader.py
def __init__(self, config: TrafilaturaConfig):
    """Initialize the Trafilatura crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: TrafilaturaConfig = config
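
A usage sketch for driving this crawler directly: since its output is parsed and chunked downstream, a Parser is attached to the config, mirroring what URLLoader does by default (requires the trafilatura package; the URL is illustrative):

from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.url_loader import TrafilaturaConfig, TrafilaturaCrawler

config = TrafilaturaConfig(parser=Parser(ParsingConfig()))
crawler = TrafilaturaCrawler(config)
docs = crawler.crawl(["https://example.com"])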

FirecrawlCrawler(config)

Bases: BaseCrawler

Crawler implementation using Firecrawl.

Parameters:

    config (FirecrawlConfig): Configuration for the crawler (required)
Source code in langroid/parsing/url_loader.py
def __init__(self, config: FirecrawlConfig) -> None:
    """Initialize the Firecrawl crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: FirecrawlConfig = config
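
A usage sketch, assuming a Firecrawl API key is available: since FirecrawlConfig is a pydantic settings model, credentials are typically picked up from the environment or a .env file rather than passed explicitly (requires the Firecrawl SDK; the URL is illustrative):

from langroid.parsing.url_loader import FirecrawlConfig, URLLoader

# Assumes the Firecrawl API key is set in the environment / .env
loader = URLLoader(
    urls=["https://example.com"],
    crawler_config=FirecrawlConfig(),
)
docs = loader.load()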

ExaCrawler(config)

Bases: BaseCrawler

Crawler implementation using Exa API.

Parameters:

    config (ExaCrawlerConfig): Configuration for the crawler (required)
Source code in langroid/parsing/url_loader.py
def __init__(self, config: ExaCrawlerConfig) -> None:
    """Initialize the Exa crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: ExaCrawlerConfig = config

crawl(urls)

Crawl the given URLs using Exa SDK.

Parameters:

    urls (List[str]): List of URLs to crawl (required)

Returns:

    List[Document]: List of Documents with content extracted from the URLs

Raises:

    LangroidImportError: If the exa package is not installed
    ValueError: If the Exa API key is not set

Source code in langroid/parsing/url_loader.py
def crawl(self, urls: List[str]) -> List[Document]:
    """Crawl the given URLs using Exa SDK.

    Args:
        urls: List of URLs to crawl

    Returns:
        List of Documents with content extracted from the URLs

    Raises:
        LangroidImportError: If the exa package is not installed
        ValueError: If the Exa API key is not set
    """
    try:
        from exa_py import Exa
    except ImportError:
        raise LangroidImportError("exa", "exa")

    if not self.config.api_key:
        raise ValueError("EXA_API_KEY key is required in your env or .env")

    exa = Exa(self.config.api_key)
    docs = []

    try:
        for url in urls:
            parsed_doc_chunks = self._process_document(url)
            if parsed_doc_chunks:
                docs.extend(parsed_doc_chunks)
                continue
            else:
                results = exa.get_contents(
                    [url],
                    livecrawl="always",
                    text={
                        "include_html_tags": True,
                    },
                )
                result = results.results[0]
                if result.text:
                    md_text = md.markdownify(result.text, heading_style="ATX")
                    # append a NON-chunked document
                    # (metadata.is_chunk = False, so will be chunked downstream)
                    docs.append(
                        Document(
                            content=md_text,
                            metadata=DocMetaData(
                                source=url,
                                title=getattr(result, "title", "Unknown Title"),
                                published_date=getattr(
                                    result, "published_date", "Unknown Date"
                                ),
                            ),
                        )
                    )

    except Exception as e:
        logging.error(f"Error retrieving content from Exa API: {e}")

    return docs
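
A usage sketch for the Exa crawler: crawl() raises ValueError when the API key is empty, so EXA_API_KEY should be set (as a settings model, ExaCrawlerConfig will typically read it from the environment or .env; it is passed explicitly below only for illustration). Requires the exa package (exa_py); URLs are illustrative.

import os

from langroid.parsing.url_loader import ExaCrawlerConfig, URLLoader

config = ExaCrawlerConfig(api_key=os.getenv("EXA_API_KEY", ""))

loader = URLLoader(
    urls=["https://example.com", "https://example.com/report.pdf"],
    crawler_config=config,
)
docs = loader.load()
for doc in docs:
    print(doc.metadata.source, len(doc.content))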

Crawl4aiCrawler(config)

Bases: BaseCrawler

Crawler implementation using the crawl4ai library.

This crawler intelligently dispatches URLs. Standard web pages are rendered and scraped using the crawl4ai browser engine. Direct links to documents (PDF, DOCX, etc.) are delegated to the framework's internal DocumentParser.

Source code in langroid/parsing/url_loader.py
def __init__(self, config: Crawl4aiConfig) -> None:
    """Initialize the Crawl4ai crawler."""
    super().__init__(config)
    self.config: Crawl4aiConfig = config

needs_parser property

Indicates that this crawler relies on the framework's DocumentParser for handling specific file types like PDF, DOCX, etc., which the browser engine cannot parse directly.

crawl(urls)

Executes the crawl by separating document URLs from web page URLs.

  • Document URLs (.pdf, .docx, etc.) are processed using _process_document.
  • Web page URLs are handled using the async crawl4ai engine.

Source code in langroid/parsing/url_loader.py
def crawl(self, urls: List[str]) -> List[Document]:
    """
    Executes the crawl by separating document URLs from web page URLs.

    - Document URLs (.pdf, .docx, etc.) are processed using `_process_document`.
    - Web page URLs are handled using the async crawl4ai engine.
    """
    all_documents: List[Document] = []
    webpage_urls: List[str] = []

    # Step 1: Separate URLs into documents and web pages
    for url in urls:
        parsed_doc_chunks = self._process_document(url)
        if parsed_doc_chunks:
            all_documents.extend(parsed_doc_chunks)
        else:
            webpage_urls.append(url)

    # Step 2: Process web page URLs asynchronously
    if webpage_urls:
        try:
            loop = asyncio.get_running_loop()
            if loop.is_running():
                import nest_asyncio

                nest_asyncio.apply()
            web_docs = asyncio.run(self._async_crawl(webpage_urls))
        except RuntimeError:
            web_docs = asyncio.run(self._async_crawl(webpage_urls))

        all_documents.extend(web_docs)

    return all_documents
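
A sketch of the dispatch described above: direct document links are routed to the framework's DocumentParser, while ordinary pages go through the async crawl4ai engine (requires the crawl4ai package, plus nest_asyncio when called from inside a running event loop; URLs are illustrative):

from langroid.parsing.url_loader import Crawl4aiConfig, URLLoader

loader = URLLoader(
    urls=[
        "https://example.com/report.pdf",  # routed to _process_document
        "https://example.com/blog-post",   # routed to the crawl4ai engine
    ],
    crawler_config=Crawl4aiConfig(),
)
docs = loader.load()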

URLLoader(urls, parsing_config=ParsingConfig(), crawler_config=None)

Loads URLs and extracts text using a specified crawler.

Parameters:

    urls (List[Any]): List of URLs to load (required)
    parsing_config (ParsingConfig): Configuration for parsing (default: ParsingConfig())
    crawler_config (Optional[BaseCrawlerConfig]): Configuration for the crawler (default: None)
Source code in langroid/parsing/url_loader.py
def __init__(
    self,
    urls: List[Any],
    parsing_config: ParsingConfig = ParsingConfig(),
    crawler_config: Optional[BaseCrawlerConfig] = None,
):
    """Initialize the URL loader.

    Args:
        urls: List of URLs to load
        parsing_config: Configuration for parsing
        crawler_config: Configuration for the crawler
    """
    self.urls = urls
    self.parsing_config = parsing_config

    if crawler_config is None:
        crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))

    self.crawler = CrawlerFactory.create_crawler(crawler_config)
    if self.crawler.needs_parser:
        self.crawler.parser = Parser(parsing_config)

load()

Load the URLs using the specified crawler.

Source code in langroid/parsing/url_loader.py
def load(self) -> List[Document]:
    """Load the URLs using the specified crawler."""
    return self.crawler.crawl(self.urls)
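
For completeness, a minimal end-to-end sketch: when no crawler_config is given, URLLoader falls back to a TrafilaturaConfig with a Parser built from the parsing_config (the URL is illustrative):

from langroid.parsing.parser import ParsingConfig
from langroid.parsing.url_loader import URLLoader

loader = URLLoader(
    urls=["https://example.com"],
    parsing_config=ParsingConfig(),  # defaults; controls downstream chunking
)
docs = loader.load()  # uses the default Trafilatura crawler
for doc in docs:
    print(doc.metadata.source, doc.content[:80])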