repo_loader

langroid/parsing/repo_loader.py

RepoLoaderConfig

Bases: BaseSettings

Configuration for RepoLoader.

RepoLoader(url, config=RepoLoaderConfig())

Class for recursively getting all file content in a repo.

url: full GitHub URL of the repo, or just "owner/repo"

config: configuration for RepoLoader
Source code in langroid/parsing/repo_loader.py
def __init__(
    self,
    url: str,
    config: RepoLoaderConfig = RepoLoaderConfig(),
):
    """
    Args:
        url: full github url of repo, or just "owner/repo"
        config: configuration for RepoLoader
    """
    self.url = url
    self.config = config
    self.clone_path: Optional[str] = None
    self.log_file = ".logs/repo_loader/download_log.json"
    os.makedirs(os.path.dirname(self.log_file), exist_ok=True)
    if not os.path.exists(self.log_file):
        with open(self.log_file, "w") as f:
            json.dump({"junk": "ignore"}, f)
    with open(self.log_file, "r") as f:
        log = json.load(f)
    if self.url in log and os.path.exists(log[self.url]):
        logger.info(f"Repo Already downloaded in {log[self.url]}")
        self.clone_path = log[self.url]

    if "github.com" in self.url:
        repo_name = self.url.split("github.com/")[1]
    else:
        repo_name = self.url
    load_dotenv()
    # authenticated calls to github api have higher rate limit
    token = os.getenv("GITHUB_ACCESS_TOKEN")
    g = Github(token)
    self.repo = self._get_repo_with_retry(g, repo_name)
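
A minimal usage sketch (the repo name is illustrative, and the import path assumes the module location shown above; a GITHUB_ACCESS_TOKEN in the environment raises the GitHub API rate limit, per the constructor):

from langroid.parsing.repo_loader import RepoLoader, RepoLoaderConfig

# Either a full GitHub URL or the "owner/repo" shorthand works
loader = RepoLoader("langroid/langroid", config=RepoLoaderConfig())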

get_issues(k=100)

Get up to k issues from the GitHub repo.

Source code in langroid/parsing/repo_loader.py
def get_issues(self, k: int | None = 100) -> List[IssueData]:
    """Get up to k issues from the GitHub repo."""
    if k is None:
        issues = self.repo.get_issues(state="all")
    else:
        issues = self.repo.get_issues(state="all")[:k]
    issue_data_list = []
    for issue in issues:
        issue_data = IssueData(
            state=issue.state,
            year=issue.created_at.year,
            month=issue.created_at.month,
            day=issue.created_at.day,
            assignee=issue.assignee.login if issue.assignee else None,
            size=get_issue_size(issue.labels),
            text=issue.body or "No issue description body.",
        )
        issue_data_list.append(issue_data)

    return issue_data_list
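
For example, a quick tally of open issues (a sketch, assuming `loader` was constructed as above):

issues = loader.get_issues(k=50)
n_open = sum(1 for iss in issues if iss.state == "open")
print(f"{n_open} of {len(issues)} issues are open")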

clone(path=None)

Clone a GitHub repository to a local directory specified by path, if it has not already been cloned.

Parameters:

path (str, optional): The local directory where the repository should be cloned. If not specified, a temporary directory will be created. Default: None.

Returns:

Optional[str]: The path to the local directory where the repository was cloned.

Source code in langroid/parsing/repo_loader.py
def clone(self, path: Optional[str] = None) -> Optional[str]:
    """
    Clone a GitHub repository to a local directory specified by `path`,
    if it has not already been cloned.

    Args:
        path (str): The local directory where the repository should be cloned.
            If not specified, a temporary directory will be created.

    Returns:
        str: The path to the local directory where the repository was cloned.
    """
    with open(self.log_file, "r") as f:
        log: Dict[str, str] = json.load(f)

    if (
        self.url in log
        and os.path.exists(log[self.url])
        and _has_files(log[self.url])
    ):
        logger.warning(f"Repo Already downloaded in {log[self.url]}")
        self.clone_path = log[self.url]
        return self.clone_path

    self.clone_path = path
    if path is None:
        path = self.default_clone_path()
        self.clone_path = path

    try:
        subprocess.run(["git", "clone", self.url, path], check=True)
        log[self.url] = path
        with open(self.log_file, "w") as f:
            json.dump(log, f)
        return self.clone_path
    except subprocess.CalledProcessError as e:
        logger.error(f"Git clone failed: {e}")
    except Exception as e:
        logger.error(f"An error occurred while trying to clone the repository:{e}")

    return self.clone_path
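
A sketch of cloning into an auto-chosen directory; a second call reuses the logged clone path:

local_path = loader.clone()   # path=None => default clone path
assert local_path == loader.clone_path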

load_tree_from_github(depth, lines=0)

Get a nested dictionary of GitHub repository file and directory names up to a certain depth, with file contents.

Parameters:

depth (int): The depth level. Required.

lines (int, optional): The number of lines of file contents to include. Default: 0.

Returns:

Dict[str, Union[str, List[Dict[str, Any]]]]: A dictionary containing file and directory names, with file contents.

Source code in langroid/parsing/repo_loader.py
def load_tree_from_github(
    self, depth: int, lines: int = 0
) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
    """
    Get a nested dictionary of GitHub repository file and directory names
    up to a certain depth, with file contents.

    Args:
        depth (int): The depth level.
        lines (int): The number of lines of file contents to include.

    Returns:
        Dict[str, Union[str, List[Dict]]]:
        A dictionary containing file and directory names, with file contents.
    """
    root_contents = self.repo.get_contents("")
    if not isinstance(root_contents, list):
        root_contents = [root_contents]
    repo_structure = {
        "type": "dir",
        "name": "",
        "dirs": [],
        "files": [],
        "path": "",
    }

    # A queue of tuples (current_node, current_depth, parent_structure)
    queue = deque([(root_contents, 0, repo_structure)])

    while queue:
        current_node, current_depth, parent_structure = queue.popleft()

        for content in current_node:
            if not self._is_allowed(content):
                continue
            if content.type == "dir" and current_depth < depth:
                # Create a new sub-dictionary for this directory
                new_dir = {
                    "type": "dir",
                    "name": content.name,
                    "dirs": [],
                    "files": [],
                    "path": content.path,
                }
                parent_structure["dirs"].append(new_dir)
                contents = self.repo.get_contents(content.path)
                if not isinstance(contents, list):
                    contents = [contents]
                queue.append(
                    (
                        contents,
                        current_depth + 1,
                        new_dir,
                    )
                )
            elif content.type == "file":
                file_content = "\n".join(
                    _get_decoded_content(content).splitlines()[:lines]
                )
                file_dict = {
                    "type": "file",
                    "name": content.name,
                    "content": file_content,
                    "path": content.path,
                }
                parent_structure["files"].append(file_dict)

    return repo_structure
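
For instance, a shallow tree with the first 5 lines of each file (a sketch):

tree = loader.load_tree_from_github(depth=1, lines=5)
top_dirs = [d["name"] for d in tree["dirs"]]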

load(path=None, depth=3, lines=0)

From a local folder path (if None, the repo clone path), get a nested dictionary (tree) of dirs, files and contents, and a list of Document objects for each file.

Parameters:

path (str, optional): The local folder path; if None, use self.clone_path. Default: None.

depth (int, optional): The depth level. Default: 3.

lines (int, optional): The number of lines of file contents to include. Default: 0.

Returns:

Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]: A dictionary containing file and directory names, with file contents, and a list of Document objects for each file.

Source code in langroid/parsing/repo_loader.py
def load(
    self,
    path: Optional[str] = None,
    depth: int = 3,
    lines: int = 0,
) -> Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]:
    """
    From a local folder `path` (if None, the repo clone path), get:
      a nested dictionary (tree) of dicts, files and contents
      a list of Document objects for each file.

    Args:
        path (str): The local folder path; if none, use self.clone_path()
        depth (int): The depth level.
        lines (int): The number of lines of file contents to include.

    Returns:
        Tuple of (dict, List_of_Documents):
            A dictionary containing file and directory names, with file
            contents, and a list of Document objects for each file.
    """
    if path is None:
        if self.clone_path is None or not _has_files(self.clone_path):
            self.clone()
        path = self.clone_path
    if path is None:
        raise ValueError("Unable to clone repo")
    return self.load_from_folder(
        path=path,
        depth=depth,
        lines=lines,
        file_types=self.config.file_types,
        exclude_dirs=self.config.exclude_dirs,
        url=self.url,
    )
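
A sketch combining clone-and-load in one call:

tree, docs = loader.load(depth=2, lines=20)
print(f"loaded {len(docs)} documents from {loader.clone_path}")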

load_from_folder(path, depth=3, lines=0, file_types=None, exclude_dirs=None, url='') staticmethod

From a local folder path (required), get a nested dictionary (tree) of dirs, files and contents, restricted to the desired file_types and excluding undesired directories, and a list of Document objects for each file.

Parameters:

path (str): The local folder path. Required.

depth (int, optional): The depth level. Default: 3.

lines (int, optional): The number of lines of file contents to include. Default: 0 (no lines => empty string).

file_types (List[str], optional): The file types to include. Default: None (all).

exclude_dirs (List[str], optional): The directories to exclude. Default: None (no exclusions).

url (str, optional): URL to be stored in docs as metadata. Default: "".

Returns:

Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]: A dictionary containing file and directory names, with file contents, and a list of Document objects for each file.

Source code in langroid/parsing/repo_loader.py
@staticmethod
def load_from_folder(
    path: str,
    depth: int = 3,
    lines: int = 0,
    file_types: Optional[List[str]] = None,
    exclude_dirs: Optional[List[str]] = None,
    url: str = "",
) -> Tuple[Dict[str, Union[str, List[Dict[str, Any]]]], List[Document]]:
    """
    From a local folder `path` (required), get:
      a nested dictionary (tree) of dicts, files and contents, restricting to
        desired file_types and excluding undesired directories.
      a list of Document objects for each file.

    Args:
        path (str): The local folder path, required.
        depth (int): The depth level. Optional, default 3.
        lines (int): The number of lines of file contents to include.
                Optional, default 0 (no lines => empty string).
        file_types (List[str]): The file types to include.
                Optional, default None (all).
        exclude_dirs (List[str]): The directories to exclude.
                Optional, default None (no exclusions).
        url (str): Optional url, to be stored in docs as metadata. Default "".

    Returns:
        Tuple of (dict, List_of_Documents):
            A dictionary containing file and directory names, with file contents.
            A list of Document objects for each file.
    """

    folder_structure = {
        "type": "dir",
        "name": "",
        "dirs": [],
        "files": [],
        "path": "",
    }
    # A queue of tuples (current_path, current_depth, parent_structure)
    queue = deque([(path, 0, folder_structure)])
    docs = []
    exclude_dirs = exclude_dirs or []
    while queue:
        current_path, current_depth, parent_structure = queue.popleft()

        for item in os.listdir(current_path):
            item_path = os.path.join(current_path, item)
            relative_path = os.path.relpath(item_path, path)
            if (os.path.isdir(item_path) and item in exclude_dirs) or (
                os.path.isfile(item_path)
                and file_types is not None
                and RepoLoader._file_type(item) not in file_types
            ):
                continue

            if os.path.isdir(item_path) and current_depth < depth:
                # Create a new sub-dictionary for this directory
                new_dir = {
                    "type": "dir",
                    "name": item,
                    "dirs": [],
                    "files": [],
                    "path": relative_path,
                }
                parent_structure["dirs"].append(new_dir)
                queue.append((item_path, current_depth + 1, new_dir))
            elif os.path.isfile(item_path):
                # Add the file to the current dictionary
                with open(item_path, "r") as f:
                    file_lines = list(itertools.islice(f, lines))
                file_content = "\n".join(line.strip() for line in file_lines)
                if file_content == "":
                    continue

                file_dict = {
                    "type": "file",
                    "name": item,
                    "content": file_content,
                    "path": relative_path,
                }
                parent_structure["files"].append(file_dict)
                docs.append(
                    Document(
                        content=file_content,
                        metadata=DocMetaData(
                            repo=url,
                            source=relative_path,
                            url=url,
                            filename=item,
                            extension=RepoLoader._file_type(item),
                            language=RepoLoader._file_type(item),
                        ),
                    )
                )
    return folder_structure, docs
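
Being a staticmethod, this also works on any local folder, without a RepoLoader instance (a sketch; the path is hypothetical):

tree, docs = RepoLoader.load_from_folder(
    "/tmp/myrepo",          # hypothetical local folder
    depth=2,
    lines=10,
    file_types=["py", "md"],
    exclude_dirs=[".git"],
)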

get_documents(path, parser=Parser(ParsingConfig()), file_types=None, exclude_dirs=None, depth=-1, lines=None, doc_type=None) staticmethod

Recursively get all files under a path as Document objects.

Parameters:

path (str | bytes): The path to the directory or file, or bytes content. The bytes option supports the case where the content has already been read from a file in an upstream process (e.g. from an API or a database), so we avoid writing it to a temporary file just to read it again, which can be very slow for large files, especially in a Docker container. Required.

parser (Parser, optional): Parser to use to parse files. Default: Parser(ParsingConfig()).

file_types (List[str], optional): List of file extensions OR filenames OR file-path names to include. Default: None (all files).

exclude_dirs (List[str], optional): List of directories to exclude. Default: None (all directories included).

depth (int, optional): Max depth of recursion. Default: -1 (all depths).

lines (int, optional): Number of lines to read from each file. Default: None (all lines).

doc_type (str | DocumentType, optional): The type of document to parse. Default: None.

Returns:

List[Document]: List of Document objects representing files.

Source code in langroid/parsing/repo_loader.py
@staticmethod
def get_documents(
    path: str | bytes,
    parser: Parser = Parser(ParsingConfig()),
    file_types: Optional[List[str]] = None,
    exclude_dirs: Optional[List[str]] = None,
    depth: int = -1,
    lines: Optional[int] = None,
    doc_type: str | DocumentType | None = None,
) -> List[Document]:
    """
    Recursively get all files under a path as Document objects.

    Args:
        path (str|bytes): The path to the directory or file, or bytes content.
            The bytes option is meant to support the case where the content
            has already been read from a file in an upstream process
            (e.g. from an API or a database), and we want to avoid having to
            write it to a temporary file just to read it again.
            (which can be very slow for large files,
            especially in a docker container)
        parser (Parser): Parser to use to parse files.
        file_types (List[str], optional): List of file extensions OR
            filenames OR file_path_names to  include.
            Defaults to None, which includes all files.
        exclude_dirs (List[str], optional): List of directories to exclude.
            Defaults to None, which includes all directories.
        depth (int, optional): Max depth of recursion. Defaults to -1,
            which includes all depths.
        lines (int, optional): Number of lines to read from each file.
            Defaults to None, which reads all lines.
        doc_type (str|DocumentType, optional): The type of document to parse.

    Returns:
        List[Document]: List of Document objects representing files.

    """
    docs = []
    file_paths = []
    if isinstance(path, bytes):
        file_paths.append(path)
    else:
        path_obj = Path(path).resolve()

        if path_obj.is_file():
            file_paths.append(str(path_obj))
        else:
            path_depth = len(path_obj.parts)
            for root, dirs, files in os.walk(path):
                # Exclude directories if needed
                if exclude_dirs:
                    dirs[:] = [d for d in dirs if d not in exclude_dirs]

                current_depth = len(Path(root).resolve().parts) - path_depth
                if depth == -1 or current_depth <= depth:
                    for file in files:
                        file_path = str(Path(root) / file)
                        if (
                            file_types is None
                            or RepoLoader._file_type(file_path) in file_types
                            or os.path.basename(file_path) in file_types
                            or file_path in file_types
                        ):
                            file_paths.append(file_path)

    for file_path in file_paths:
        docs.extend(
            DocumentParser.chunks_from_path_or_bytes(
                file_path,
                parser,
                doc_type=doc_type,
                lines=lines,
            )
        )
    return docs
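
A sketch with the default parser (the path is hypothetical, and the import assumes Parser and ParsingConfig live in langroid.parsing.parser):

from langroid.parsing.parser import Parser, ParsingConfig

docs = RepoLoader.get_documents(
    "/tmp/myrepo",          # hypothetical local folder
    parser=Parser(ParsingConfig()),
    file_types=["py"],
    exclude_dirs=[".git"],
    lines=100,
)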

load_docs_from_github(k=None, depth=None, lines=None)

Directly from GitHub, recursively get all files in a repo that have one of the extensions, possibly up to a max number of files, max depth, and max number of lines per file (if any of these are specified).

Parameters:

k (int, optional): Max number of files to load; None for all files. Default: None.

depth (int, optional): Max depth to recurse; None for infinite depth. Default: None.

lines (int, optional): Max number of lines to get from a file; None for all lines. Default: None.

Returns:

List[Document]: List of Document objects, each with fields `content` and `metadata`, where `metadata` has fields `url`, `filename`, `extension`, `language`.

Source code in langroid/parsing/repo_loader.py
def load_docs_from_github(
    self,
    k: Optional[int] = None,
    depth: Optional[int] = None,
    lines: Optional[int] = None,
) -> List[Document]:
    """
    Directly from GitHub, recursively get all files in a repo that have one of the
    extensions, possibly up to a max number of files, max depth, and max number
    of lines per file (if any of these are specified).

    Args:
        k (int): max number of files to load, or None for all files
        depth (int): max depth to recurse, or None for infinite depth
        lines (int): max number of lines to get, from a file, or None for all lines

    Returns:
        list of Document objects, each has fields `content` and `metadata`,
        and `metadata` has fields `url`, `filename`, `extension`, `language`
    """
    contents = self.repo.get_contents("")
    if not isinstance(contents, list):
        contents = [contents]
    stack = list(zip(contents, [0] * len(contents)))  # stack of (content, depth)
    # recursively get all files in repo that have one of the extensions
    docs = []
    i = 0

    while stack:
        if k is not None and i == k:
            break
        file_content, d = stack.pop()
        if not self._is_allowed(file_content):
            continue
        if file_content.type == "dir":
            if depth is None or d <= depth:
                items = self.repo.get_contents(file_content.path)
                if not isinstance(items, list):
                    items = [items]
                stack.extend(list(zip(items, [d + 1] * len(items))))
        else:
            if depth is None or d <= depth:
                # need to decode the file content, which is in bytes
                contents = self.repo.get_contents(file_content.path)
                if isinstance(contents, list):
                    contents = contents[0]
                text = _get_decoded_content(contents)
                if lines is not None:
                    text = "\n".join(text.split("\n")[:lines])
                i += 1

                # Note `source` is important, it may be used to cite
                # evidence for an answer.
                # See  URLLoader
                # TODO we should use Pydantic to enforce/standardize this

                docs.append(
                    Document(
                        content=text,
                        metadata=DocMetaData(
                            repo=self.url,
                            source=file_content.html_url,
                            url=file_content.html_url,
                            filename=file_content.name,
                            extension=self._file_type(file_content.name),
                            language=self._file_type(file_content.name),
                        ),
                    )
                )
    return docs
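
For example, to pull at most 100 files, two levels deep, 100 lines each (a sketch):

docs = loader.load_docs_from_github(k=100, depth=2, lines=100)
for doc in docs[:3]:
    print(doc.metadata.url)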

select(structure, includes, excludes=[]) staticmethod

Filter a structure dictionary for certain directories and files.

Parameters:

structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary. Required.

includes (List[str]): A list of desired directories and files. For files, either full file names or a "file type" can be specified; e.g. "toml" will include all files with the ".toml" extension, and "Makefile" will include all files named "Makefile". Required.

excludes (List[str], optional): A list of directories and files to exclude. As with includes, full file/dir names or a "file type" can be specified. Default: [].

Returns:

Dict[str, Union[str, List[Dict[str, Any]]]]: The filtered structure dictionary.

Source code in langroid/parsing/repo_loader.py
@staticmethod
def select(
    structure: Dict[str, Union[str, List[Dict[str, Any]]]],
    includes: List[str],
    excludes: List[str] = [],
) -> Dict[str, Union[str, List[Dict[str, Any]]]]:
    """
    Filter a structure dictionary for certain directories and files.

    Args:
        structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
        includes (List[str]): A list of desired directories and files.
            For files, either full file names or "file type" can be specified.
            E.g.  "toml" will include all files with the ".toml" extension,
            or "Makefile" will include all files named "Makefile".
        excludes (List[str]): A list of directories and files to exclude.
            Similar to `includes`, full file/dir names or "file type" can be
            specified. Optional, defaults to empty list.


    Returns:
        Dict[str, Union[str, List[Dict]]]: The filtered structure dictionary.
    """
    filtered_structure = {
        "type": structure["type"],
        "name": structure["name"],
        "dirs": [],
        "files": [],
        "path": structure["path"],
    }

    for dir in structure["dirs"]:
        if (
            dir["name"] in includes
            or RepoLoader._file_type(dir["name"]) in includes
        ) and (
            dir["name"] not in excludes
            and RepoLoader._file_type(dir["name"]) not in excludes
        ):
            # If the directory is in the select list, include the whole subtree
            filtered_structure["dirs"].append(dir)
        else:
            # Otherwise, filter the directory's contents
            # (note: `excludes` is not passed down to the recursive call)
            filtered_dir = RepoLoader.select(dir, includes)
            if (
                filtered_dir["dirs"] or filtered_dir["files"]
            ):  # only add if not empty
                filtered_structure["dirs"].append(filtered_dir)

    for file in structure["files"]:
        if (
            file["name"] in includes
            or RepoLoader._file_type(file["name"]) in includes
        ) and (
            file["name"] not in excludes
            and RepoLoader._file_type(file["name"]) not in excludes
        ):
            filtered_structure["files"].append(file)

    return filtered_structure
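
A sketch that keeps only Python files and Makefiles from a tree built by load (the "tests" exclusion is illustrative):

tree, _ = loader.load(depth=2, lines=5)
filtered = RepoLoader.select(tree, includes=["py", "Makefile"], excludes=["tests"])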

ls(structure, depth=0) staticmethod

Get a list of names of files or directories up to a certain depth from a structure dictionary.

Parameters:

structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary. Required.

depth (int, optional): The depth level. Default: 0.

Returns:

List[str]: A list of names of files or directories.

Source code in langroid/parsing/repo_loader.py
@staticmethod
def ls(structure: Dict[str, Union[str, List[Dict]]], depth: int = 0) -> List[str]:
    """
    Get a list of names of files or directories up to a certain depth from a
    structure dictionary.

    Args:
        structure (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
        depth (int, optional): The depth level. Defaults to 0.

    Returns:
        List[str]: A list of names of files or directories.
    """
    names = []

    # A queue of tuples (current_structure, current_depth)
    queue = deque([(structure, 0)])

    while queue:
        current_structure, current_depth = queue.popleft()

        if current_depth <= depth:
            names.append(current_structure["name"])

            for dir in current_structure["dirs"]:
                queue.append((dir, current_depth + 1))

            for file in current_structure["files"]:
                # add file names only if depth is less than the limit
                if current_depth < depth:
                    names.append(file["name"])
    names = [n for n in names if n not in ["", None]]
    return names
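
A sketch, assuming tree is a structure dictionary produced by load or load_tree_from_github:

names = RepoLoader.ls(tree, depth=2)
print(names)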

list_files(dir, depth=1, include_types=[], exclude_types=[]) staticmethod

Recursively list all files in a directory, up to a certain depth.

Parameters:

dir (str): The directory path, relative to root. Required.

depth (int, optional): The depth level. Default: 1.

include_types (List[str], optional): A list of file types to include. Default: [].

exclude_types (List[str], optional): A list of file types to exclude. Default: [].

Returns:

List[str]: A list of file names.

Source code in langroid/parsing/repo_loader.py
@staticmethod
def list_files(
    dir: str,
    depth: int = 1,
    include_types: List[str] = [],
    exclude_types: List[str] = [],
) -> List[str]:
    """
    Recursively list all files in a directory, up to a certain depth.

    Args:
        dir (str): The directory path, relative to root.
        depth (int, optional): The depth level. Defaults to 1.
        include_types (List[str], optional): A list of file types to include.
            Defaults to empty list.
        exclude_types (List[str], optional): A list of file types to exclude.
            Defaults to empty list.
    Returns:
        List[str]: A list of file names.
    """
    depth = depth if depth >= 0 else 200
    output = []

    for root, dirs, files in os.walk(dir):
        if root.count(os.sep) - dir.count(os.sep) < depth:
            level = root.count(os.sep) - dir.count(os.sep)
            sub_indent = " " * 4 * (level + 1)
            for d in dirs:
                output.append("{}{}/".format(sub_indent, d))
            for f in files:
                if include_types and RepoLoader._file_type(f) not in include_types:
                    continue
                if exclude_types and RepoLoader._file_type(f) in exclude_types:
                    continue
                output.append("{}{}".format(sub_indent, f))
    return output
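
A sketch on a hypothetical local folder:

listing = RepoLoader.list_files("/tmp/myrepo", depth=2, exclude_types=["pyc"])
print("\n".join(listing))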

show_file_contents(tree) staticmethod

Return the contents of all files from a structure dictionary, concatenated into a single string.

Parameters:

tree (Dict[str, Union[str, List[Dict]]]): The structure dictionary. Required.

Returns:

str: The concatenated contents of all files.
Source code in langroid/parsing/repo_loader.py
@staticmethod
def show_file_contents(tree: Dict[str, Union[str, List[Dict[str, Any]]]]) -> str:
    """
    Print the contents of all files from a structure dictionary.

    Args:
        tree (Dict[str, Union[str, List[Dict]]]): The structure dictionary.
    """
    contents = ""
    for dir in tree["dirs"]:
        contents += RepoLoader.show_file_contents(dir)
    for file in tree["files"]:
        path = file["path"]
        contents += f"""
        {path}:
        --------------------
        {file["content"]}

        """

    return contents
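
A sketch, assuming tree was built with lines > 0 so file contents are non-empty:

print(RepoLoader.show_file_contents(tree))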