url_loader

langroid/parsing/url_loader.py

BaseCrawlerConfig

Bases: BaseSettings

Base configuration for web crawlers.

TrafilaturaConfig

Bases: BaseCrawlerConfig

Configuration for Trafilatura crawler.

FirecrawlConfig

Bases: BaseCrawlerConfig

Configuration for Firecrawl crawler.
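
Since these configs subclass pydantic's BaseSettings, values can be passed explicitly or picked up from the environment / a .env file. A minimal sketch (any fields beyond those shown on this page are assumptions):

```python
from langroid.parsing.url_loader import FirecrawlConfig, TrafilaturaConfig

# Trafilatura runs locally and needs no API key.
trafilatura_cfg = TrafilaturaConfig()

# As a BaseSettings subclass, FirecrawlConfig can read its settings
# (e.g. an API key) from the environment or a .env file.
firecrawl_cfg = FirecrawlConfig()
```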

BaseCrawler(config)

Bases: ABC

Abstract base class for web crawlers.

Parameters:

| Name   | Type              | Description                   | Default  |
|--------|-------------------|-------------------------------|----------|
| config | BaseCrawlerConfig | Configuration for the crawler | required |

Source code in langroid/parsing/url_loader.py
def __init__(self, config: BaseCrawlerConfig):
    """Initialize the base crawler.

    Args:
        config: Configuration for the crawler
    """
    self.parser = config.parser if self.needs_parser else None
    self.config: BaseCrawlerConfig = config

needs_parser abstractmethod property

Indicates whether the crawler requires a parser.
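
Concrete crawlers override this property along with a crawl() method, as the implementations below do. A sketch of a minimal custom crawler; the Document/DocMetaData import path is an assumption:

```python
from typing import List

from langroid.mytypes import DocMetaData, Document  # import path assumed
from langroid.parsing.url_loader import BaseCrawler, BaseCrawlerConfig


class EchoCrawler(BaseCrawler):
    """Toy crawler that returns each URL string as its own document."""

    @property
    def needs_parser(self) -> bool:
        # Content is produced directly, not parsed from raw HTML.
        return False

    def crawl(self, urls: List[str]) -> List[Document]:
        # Mirrors the signature of ExaCrawler.crawl shown below.
        return [
            Document(content=url, metadata=DocMetaData(source=url))
            for url in urls
        ]


crawler = EchoCrawler(BaseCrawlerConfig())
```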

CrawlerFactory

Factory for creating web crawlers.

create_crawler(config) staticmethod

Create a crawler instance based on configuration type.

Parameters:

| Name   | Type              | Description                   | Default  |
|--------|-------------------|-------------------------------|----------|
| config | BaseCrawlerConfig | Configuration for the crawler | required |

Returns:

| Type        | Description            |
|-------------|------------------------|
| BaseCrawler | A BaseCrawler instance |

Raises:

| Type       | Description                      |
|------------|----------------------------------|
| ValueError | If config type is not supported  |

Source code in langroid/parsing/url_loader.py
@staticmethod
def create_crawler(config: BaseCrawlerConfig) -> BaseCrawler:
    """Create a crawler instance based on configuration type.

    Args:
        config: Configuration for the crawler

    Returns:
        A BaseCrawler instance

    Raises:
        ValueError: If config type is not supported
    """
    if isinstance(config, TrafilaturaConfig):
        return TrafilaturaCrawler(config)
    elif isinstance(config, FirecrawlConfig):
        return FirecrawlCrawler(config)
    elif isinstance(config, ExaCrawlerConfig):
        return ExaCrawler(config)
    else:
        raise ValueError(f"Unsupported crawler configuration type: {type(config)}")

TrafilaturaCrawler(config)

Bases: BaseCrawler

Crawler implementation using Trafilatura.

Parameters:

| Name   | Type              | Description                   | Default  |
|--------|-------------------|-------------------------------|----------|
| config | TrafilaturaConfig | Configuration for the crawler | required |

Source code in langroid/parsing/url_loader.py
def __init__(self, config: TrafilaturaConfig):
    """Initialize the Trafilatura crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: TrafilaturaConfig = config

FirecrawlCrawler(config)

Bases: BaseCrawler

Crawler implementation using Firecrawl.

Parameters:

| Name   | Type            | Description                   | Default  |
|--------|-----------------|-------------------------------|----------|
| config | FirecrawlConfig | Configuration for the crawler | required |

Source code in langroid/parsing/url_loader.py
def __init__(self, config: FirecrawlConfig) -> None:
    """Initialize the Firecrawl crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: FirecrawlConfig = config

ExaCrawler(config)

Bases: BaseCrawler

Crawler implementation using Exa API.

Parameters:

| Name   | Type             | Description                   | Default  |
|--------|------------------|-------------------------------|----------|
| config | ExaCrawlerConfig | Configuration for the crawler | required |

Source code in langroid/parsing/url_loader.py
def __init__(self, config: ExaCrawlerConfig) -> None:
    """Initialize the Exa crawler.

    Args:
        config: Configuration for the crawler
    """
    super().__init__(config)
    self.config: ExaCrawlerConfig = config

crawl(urls)

Crawl the given URLs using Exa SDK.

Parameters:

| Name | Type      | Description           | Default  |
|------|-----------|-----------------------|----------|
| urls | List[str] | List of URLs to crawl | required |

Returns:

| Type           | Description                                             |
|----------------|---------------------------------------------------------|
| List[Document] | List of Documents with content extracted from the URLs |

Raises:

| Type                | Description                          |
|---------------------|--------------------------------------|
| LangroidImportError | If the exa package is not installed  |
| ValueError          | If the Exa API key is not set        |

Source code in langroid/parsing/url_loader.py
def crawl(self, urls: List[str]) -> List[Document]:
    """Crawl the given URLs using Exa SDK.

    Args:
        urls: List of URLs to crawl

    Returns:
        List of Documents with content extracted from the URLs

    Raises:
        LangroidImportError: If the exa package is not installed
        ValueError: If the Exa API key is not set
    """
    try:
        from exa_py import Exa
    except ImportError:
        raise LangroidImportError("exa", "exa")

    if not self.config.api_key:
        raise ValueError("EXA_API_KEY key is required in your env or .env")

    exa = Exa(self.config.api_key)
    docs = []

    try:
        for url in urls:
            parsed_doc_chunks = self._process_document(url)
            if parsed_doc_chunks:
                docs.extend(parsed_doc_chunks)
            else:
                results = exa.get_contents([url], livecrawl="always", text=True)
                result = results.results[0]
                if result.text:
                    # append a NON-chunked document
                    # (metadata.is_chunk = False, so will be chunked downstream)
                    docs.append(
                        Document(
                            content=result.text,
                            metadata=DocMetaData(
                                source=url,
                                title=getattr(result, "title", "Unknown Title"),
                                published_date=getattr(
                                    result, "published_date", "Unknown Date"
                                ),
                            ),
                        )
                    )

    except Exception as e:
        logging.error(f"Error retrieving content from Exa API: {e}")

    return docs
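
Note that retrieval errors are logged rather than raised, so a shorter-than-expected result list signals partial failure. A usage sketch, with the EXA_API_KEY variable name taken from the error message above:

```python
import os

from langroid.parsing.url_loader import ExaCrawler, ExaCrawlerConfig

crawler = ExaCrawler(ExaCrawlerConfig(api_key=os.environ["EXA_API_KEY"]))
docs = crawler.crawl(["https://example.com"])
for doc in docs:
    print(doc.metadata.source, len(doc.content))
```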

URLLoader(urls, parsing_config=ParsingConfig(), crawler_config=None)

Loads URLs and extracts text using a specified crawler.

Parameters:

| Name           | Type                        | Description                   | Default         |
|----------------|-----------------------------|-------------------------------|-----------------|
| urls           | List[Any]                   | List of URLs to load          | required        |
| parsing_config | ParsingConfig               | Configuration for parsing     | ParsingConfig() |
| crawler_config | Optional[BaseCrawlerConfig] | Configuration for the crawler | None            |

Source code in langroid/parsing/url_loader.py
def __init__(
    self,
    urls: List[Any],
    parsing_config: ParsingConfig = ParsingConfig(),
    crawler_config: Optional[BaseCrawlerConfig] = None,
):
    """Initialize the URL loader.

    Args:
        urls: List of URLs to load
        parsing_config: Configuration for parsing
        crawler_config: Configuration for the crawler
    """
    self.urls = urls
    self.parsing_config = parsing_config

    if crawler_config is None:
        crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))

    self.crawler = CrawlerFactory.create_crawler(crawler_config)
    if self.crawler.needs_parser:
        self.crawler.parser = Parser(parsing_config)
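
When crawler_config is omitted, the loader defaults to Trafilatura and wires in a Parser built from parsing_config; to use another backend, pass the matching config. A sketch, assuming ParsingConfig is importable from langroid.parsing.parser:

```python
from langroid.parsing.parser import ParsingConfig  # import path assumed
from langroid.parsing.url_loader import ExaCrawlerConfig, URLLoader

loader = URLLoader(
    urls=["https://example.com/a", "https://example.com/b"],
    parsing_config=ParsingConfig(),
    crawler_config=ExaCrawlerConfig(api_key="..."),  # non-default backend
)
```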

load()

Load the URLs using the specified crawler.

Source code in langroid/parsing/url_loader.py
def load(self) -> List[Document]:
    """Load the URLs using the specified crawler."""
    return self.crawler.crawl(self.urls)
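
End to end, the default path (Trafilatura, no API key) reduces to a few lines. A minimal sketch:

```python
from langroid.parsing.url_loader import URLLoader

docs = URLLoader(urls=["https://example.com"]).load()  # List[Document]
for doc in docs:
    print(doc.metadata.source, doc.content[:80])
```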