url_loader

langroid/parsing/url_loader.py

BaseCrawlerConfig

Bases: BaseSettings

Base configuration for web crawlers.

TrafilaturaConfig

Bases: BaseCrawlerConfig

Configuration for Trafilatura crawler.

FirecrawlConfig

Bases: BaseCrawlerConfig

Configuration for Firecrawl crawler.

Crawl4aiConfig(**kwargs)

Bases: BaseCrawlerConfig

Configuration for the Crawl4aiCrawler.

Source code in langroid/parsing/url_loader.py
def __init__(self, **kwargs: Any) -> None:
    """Initialize and ensure forward refs are resolved."""
    self._resolve_forward_refs()
    super().__init__(**kwargs)

BaseCrawler(config)

Bases: ABC

Abstract base class for web crawlers.

Parameters:

    config (BaseCrawlerConfig): Configuration for the crawler (required)
Source code in langroid/parsing/url_loader.py
def __init__(self, config: BaseCrawlerConfig):
    """Initialize the base crawler.

    Args:
        config: Configuration for the crawler
    """
    self.parser = config.parser if self.needs_parser else None
    self.config: BaseCrawlerConfig = config

needs_parser abstractmethod property

Indicates whether the crawler requires a parser.
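
Concrete crawlers implement the crawl method and the needs_parser property. The following is a minimal sketch of a custom subclass, inferred from the interface shown on this page (it is not part of langroid; the Document/DocMetaData import path is the one langroid commonly uses, and the URL is illustrative):

from typing import List

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.url_loader import BaseCrawler, BaseCrawlerConfig


class EchoCrawler(BaseCrawler):
    """Toy crawler that returns stub content for each URL (sketch only)."""

    @property
    def needs_parser(self) -> bool:
        # No PDF/DOCX handling here, so no DocumentParser is needed
        return False

    def crawl(self, urls: List[str]) -> List[Document]:
        return [
            Document(
                content=f"stub content for {url}",
                metadata=DocMetaData(source=url),
            )
            for url in urls
        ]


docs = EchoCrawler(BaseCrawlerConfig()).crawl(["https://example.com"])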

CrawlerFactory

Factory for creating web crawlers.

create_crawler(config) staticmethod

Create a crawler instance based on configuration type.

Parameters:

    config (BaseCrawlerConfig): Configuration for the crawler (required)

Returns:

    BaseCrawler: A BaseCrawler instance

Raises:

    ValueError: If config type is not supported

Source code in langroid/parsing/url_loader.py
@staticmethod
def create_crawler(config: BaseCrawlerConfig) -> BaseCrawler:
    """Create a crawler instance based on configuration type.

    Args:
        config: Configuration for the crawler

    Returns:
        A BaseCrawler instance

    Raises:
        ValueError: If config type is not supported
    """
    if isinstance(config, TrafilaturaConfig):
        return TrafilaturaCrawler(config)
    elif isinstance(config, FirecrawlConfig):
        return FirecrawlCrawler(config)
    elif isinstance(config, ExaCrawlerConfig):
        return ExaCrawler(config)
    elif isinstance(config, Crawl4aiConfig):
        return Crawl4aiCrawler(config)
    else:
        raise ValueError(f"Unsupported crawler configuration type: {type(config)}")

TrafilaturaCrawler(config)

Bases: BaseCrawler

Crawler implementation using Trafilatura.

Parameters:

    config (TrafilaturaConfig): Configuration for the crawler (required)
Source code in langroid/parsing/url_loader.py
def __init__(self, config: TrafilaturaConfig):
    """Initialize the Trafilatura crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: TrafilaturaConfig = config
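
A usage sketch for driving this crawler directly: since its output is parsed and chunked downstream, a Parser is attached to the config, mirroring what URLLoader does by default (requires the trafilatura package; the URL is illustrative):

from langroid.parsing.parser import Parser, ParsingConfig
from langroid.parsing.url_loader import TrafilaturaConfig, TrafilaturaCrawler

config = TrafilaturaConfig(parser=Parser(ParsingConfig()))
crawler = TrafilaturaCrawler(config)
docs = crawler.crawl(["https://example.com"])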

FirecrawlCrawler(config)

Bases: BaseCrawler

Crawler implementation using Firecrawl.

Parameters:

    config (FirecrawlConfig): Configuration for the crawler (required)
Source code in langroid/parsing/url_loader.py
def __init__(self, config: FirecrawlConfig) -> None:
    """Initialize the Firecrawl crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: FirecrawlConfig = config
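
A usage sketch, assuming a Firecrawl API key is available: since FirecrawlConfig is a pydantic settings model, credentials are typically picked up from the environment or a .env file rather than passed explicitly (requires the Firecrawl SDK; the URL is illustrative):

from langroid.parsing.url_loader import FirecrawlConfig, URLLoader

# Assumes the Firecrawl API key is set in the environment / .env
loader = URLLoader(
    urls=["https://example.com"],
    crawler_config=FirecrawlConfig(),
)
docs = loader.load()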

ExaCrawler(config)

Bases: BaseCrawler

Crawler implementation using Exa API.

Parameters:

    config (ExaCrawlerConfig): Configuration for the crawler (required)
Source code in langroid/parsing/url_loader.py
def __init__(self, config: ExaCrawlerConfig) -> None:
    """Initialize the Exa crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: ExaCrawlerConfig = config

crawl(urls)

Crawl the given URLs using Exa SDK.

Parameters:

    urls (List[str]): List of URLs to crawl (required)

Returns:

    List[Document]: List of Documents with content extracted from the URLs

Raises:

    LangroidImportError: If the exa package is not installed
    ValueError: If the Exa API key is not set

Source code in langroid/parsing/url_loader.py
def crawl(self, urls: List[str]) -> List[Document]:
    """Crawl the given URLs using Exa SDK.

    Args:
        urls: List of URLs to crawl

    Returns:
        List of Documents with content extracted from the URLs

    Raises:
        LangroidImportError: If the exa package is not installed
        ValueError: If the Exa API key is not set
    """
    try:
        from exa_py import Exa
    except ImportError:
        raise LangroidImportError("exa", "exa")

    if not self.config.api_key:
        raise ValueError("EXA_API_KEY key is required in your env or .env")

    exa = Exa(self.config.api_key)
    docs = []

    try:
        for url in urls:
            parsed_doc_chunks = self._process_document(url)
            if parsed_doc_chunks:
                docs.extend(parsed_doc_chunks)
                continue
            else:
                results = exa.get_contents(
                    [url],
                    livecrawl="always",
                    text={
                        "include_html_tags": True,
                    },
                )
                result = results.results[0]
                if result.text:
                    md_text = md.markdownify(result.text, heading_style="ATX")
                    # append a NON-chunked document
                    # (metadata.is_chunk = False, so will be chunked downstream)
                    docs.append(
                        Document(
                            content=md_text,
                            metadata=DocMetaData(
                                source=url,
                                title=getattr(result, "title", "Unknown Title"),
                                published_date=getattr(
                                    result, "published_date", "Unknown Date"
                                ),
                            ),
                        )
                    )

    except Exception as e:
        logging.error(f"Error retrieving content from Exa API: {e}")

    return docs
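
A usage sketch for the Exa crawler: crawl() raises ValueError when the API key is empty, so EXA_API_KEY should be set (as a settings model, ExaCrawlerConfig will typically read it from the environment or .env; it is passed explicitly below only for illustration). Requires the exa package (exa_py); URLs are illustrative.

import os

from langroid.parsing.url_loader import ExaCrawlerConfig, URLLoader

config = ExaCrawlerConfig(api_key=os.getenv("EXA_API_KEY", ""))

loader = URLLoader(
    urls=["https://example.com", "https://example.com/report.pdf"],
    crawler_config=config,
)
docs = loader.load()
for doc in docs:
    print(doc.metadata.source, len(doc.content))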

Crawl4aiCrawler(config)

Bases: BaseCrawler

Crawler implementation using the crawl4ai library.

This crawler intelligently dispatches URLs. Standard web pages are rendered and scraped using the crawl4ai browser engine. Direct links to documents (PDF, DOCX, etc.) are delegated to the framework's internal DocumentParser.

Source code in langroid/parsing/url_loader.py
def __init__(self, config: Crawl4aiConfig) -> None:
    """Initialize the Crawl4ai crawler."""
    super().__init__(config)
    self.config: Crawl4aiConfig = config

needs_parser property

Indicates that this crawler relies on the framework's DocumentParser for handling specific file types like PDF, DOCX, etc., which the browser engine cannot parse directly.

crawl(urls)

Executes the crawl by separating document URLs from web page URLs.

  • Document URLs (.pdf, .docx, etc.) are processed using _process_document.
  • Web page URLs are handled using the async crawl4ai engine.

Source code in langroid/parsing/url_loader.py
def crawl(self, urls: List[str]) -> List[Document]:
    """
    Executes the crawl by separating document URLs from web page URLs.

    - Document URLs (.pdf, .docx, etc.) are processed using `_process_document`.
    - Web page URLs are handled using the async crawl4ai engine.
    """
    all_documents: List[Document] = []
    webpage_urls: List[str] = []

    # Step 1: Separate URLs into documents and web pages
    for url in urls:
        parsed_doc_chunks = self._process_document(url)
        if parsed_doc_chunks:
            all_documents.extend(parsed_doc_chunks)
        else:
            webpage_urls.append(url)

    # Step 2: Process web page URLs asynchronously
    if webpage_urls:
        try:
            loop = asyncio.get_running_loop()
            if loop.is_running():
                import nest_asyncio

                nest_asyncio.apply()
            web_docs = asyncio.run(self._async_crawl(webpage_urls))
        except RuntimeError:
            web_docs = asyncio.run(self._async_crawl(webpage_urls))

        all_documents.extend(web_docs)

    return all_documents
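
A sketch of the dispatch described above: direct document links are routed to the framework's DocumentParser, while ordinary pages go through the async crawl4ai engine (requires the crawl4ai package, plus nest_asyncio when called from inside a running event loop; URLs are illustrative):

from langroid.parsing.url_loader import Crawl4aiConfig, URLLoader

loader = URLLoader(
    urls=[
        "https://example.com/report.pdf",  # routed to _process_document
        "https://example.com/blog-post",   # routed to the crawl4ai engine
    ],
    crawler_config=Crawl4aiConfig(),
)
docs = loader.load()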

URLLoader(urls, parsing_config=ParsingConfig(), crawler_config=None)

Loads URLs and extracts text using a specified crawler.

Parameters:

    urls (List[Any]): List of URLs to load (required)
    parsing_config (ParsingConfig): Configuration for parsing (default: ParsingConfig())
    crawler_config (Optional[BaseCrawlerConfig]): Configuration for the crawler (default: None)
Source code in langroid/parsing/url_loader.py
def __init__(
    self,
    urls: List[Any],
    parsing_config: ParsingConfig = ParsingConfig(),
    crawler_config: Optional[BaseCrawlerConfig] = None,
):
    """Initialize the URL loader.

    Args:
        urls: List of URLs to load
        parsing_config: Configuration for parsing
        crawler_config: Configuration for the crawler
    """
    self.urls = urls
    self.parsing_config = parsing_config

    if crawler_config is None:
        crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))

    self.crawler = CrawlerFactory.create_crawler(crawler_config)
    if self.crawler.needs_parser:
        self.crawler.parser = Parser(parsing_config)

load()

Load the URLs using the specified crawler.

Source code in langroid/parsing/url_loader.py
def load(self) -> List[Document]:
    """Load the URLs using the specified crawler."""
    return self.crawler.crawl(self.urls)
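
For completeness, a minimal end-to-end sketch: when no crawler_config is given, URLLoader falls back to a TrafilaturaConfig with a Parser built from the parsing_config (the URL is illustrative):

from langroid.parsing.parser import ParsingConfig
from langroid.parsing.url_loader import URLLoader

loader = URLLoader(
    urls=["https://example.com"],
    parsing_config=ParsingConfig(),  # defaults; controls downstream chunking
)
docs = loader.load()  # uses the default Trafilatura crawler
for doc in docs:
    print(doc.metadata.source, doc.content[:80])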