spider

langroid/parsing/spider.py

DomainSpecificSpider(start_url, k=20, *args, **kwargs)

Bases: CrawlSpider

Parameters:

Name       Type  Description                                   Default
start_url  str   The starting URL.                             required
k          int   The maximum number of final URLs to collect.  20
Source code in langroid/parsing/spider.py
def __init__(self, start_url: str, k: int = 20, *args, **kwargs):  # type: ignore
    """Initialize the spider with start_url and k.

    Args:
        start_url (str): The starting URL.
        k (int, optional): The max desired final URLs. Defaults to 20.
    """
    super(DomainSpecificSpider, self).__init__(*args, **kwargs)
    self.start_urls = [start_url]
    self.allowed_domains = [urlparse(start_url).netloc]
    self.k = k
    self.visited_urls: Set[str] = set()
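
The allowed domain is derived from the start URL's netloc, so the crawl never leaves the starting host. A quick illustration of what that restriction means:

from urllib.parse import urlparse

# netloc is the host part of the URL; links to any other host are ignored
print(urlparse("https://example.com/docs/index.html").netloc)  # example.com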

parse_item(response)

Extracts URLs that are within the same domain.

Parameters:

Name      Type      Description                  Default
response  Response  The scrapy response object.  required
Source code in langroid/parsing/spider.py
def parse_item(self, response: Response):  # type: ignore
    """Extracts URLs that are within the same domain.

    Args:
        response: The scrapy response object.
    """
    for link in LinkExtractor(allow_domains=self.allowed_domains).extract_links(
        response
    ):
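        # Collect at most k URLs; links found after the cap is reached are skipped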
        if len(self.visited_urls) < self.k:
            self.visited_urls.add(link.url)
            yield {"url": link.url}
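
Note that a CrawlSpider only routes responses to parse_item when a crawling rule names it as the callback. A minimal sketch of such a rule (illustrative, not copied from spider.py):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# Illustrative class attribute: follow every extracted link and hand each
# response to parse_item
rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),)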

scrapy_fetch_urls(url, k=20)

Fetches up to k URLs reachable from the input URL using Scrapy.

Parameters:

Name  Type  Description                                   Default
url   str   The starting URL.                             required
k     int   The maximum number of final URLs to collect.  20

Returns:

Type       Description
List[str]  List of URLs within the same domain as the input URL.

Source code in langroid/parsing/spider.py
@no_type_check
def scrapy_fetch_urls(url: str, k: int = 20) -> List[str]:
    """Fetches up to k URLs reachable from the input URL using Scrapy.

    Args:
        url (str): The starting URL.
        k (int, optional): The max desired final URLs. Defaults to 20.

    Returns:
        List[str]: List of URLs within the same domain as the input URL.
    """
    urls = []

    def _collect_urls(spider):
        """Handler for the spider_closed signal. Collects the visited URLs."""
        nonlocal urls
        urls.extend(list(spider.visited_urls))

    # Connect the spider_closed signal with our handler
    dispatcher.connect(_collect_urls, signal=signals.spider_closed)

    runner = CrawlerRunner(
        {
            "USER_AGENT": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        }
    )

    d = runner.crawl(DomainSpecificSpider, start_url=url, k=k)

    # Block until crawling is done and then stop the reactor
    crawl_deferred = defer.Deferred()

    def _crawl_done(_):
        reactor.stop()
        crawl_deferred.callback(urls)

    d.addBoth(_crawl_done)

    # Start the reactor, it will stop once the crawl is done
    reactor.run(installSignalHandlers=0)

    # By now the reactor has stopped and _crawl_done has fired, so the
    # deferred already holds the collected URLs
    return crawl_deferred.result
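
A minimal usage sketch, assuming the function is called once per process (Twisted's reactor cannot be restarted, so a second call in the same process will typically fail with ReactorNotRestartable):

from langroid.parsing.spider import scrapy_fetch_urls

# Collect up to 10 same-domain URLs reachable from the start page
urls = scrapy_fetch_urls("https://example.com", k=10)
for u in urls:
    print(u)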