document_parser

langroid/parsing/document_parser.py

DocumentParser(source, config)

Bases: Parser

Abstract base class for extracting text from special types of docs such as PDFs or Docx.

Attributes:

- source (str): The source, either a URL or a file path.
- doc_bytes (BytesIO): BytesIO object containing the doc data.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

create(source, config, doc_type=None) classmethod

Create a DocumentParser instance based on the source type and the config.<source_type>.library specified.

Parameters:

- source (str | bytes): The source, could be a URL, file path, or bytes object. (required)
- config (ParsingConfig): The parser configuration. (required)
- doc_type (str | None): The type of document, if known. (default: None)

Returns:

- DocumentParser: An instance of a DocumentParser subclass.

Source code in langroid/parsing/document_parser.py
@classmethod
def create(
    cls,
    source: str | bytes,
    config: ParsingConfig,
    doc_type: str | DocumentType | None = None,
) -> "DocumentParser":
    """
    Create a DocumentParser instance based on source type
        and config.<source_type>.library specified.

    Args:
        source (str|bytes): The source, could be a URL, file path,
            or bytes object.
        config (ParsingConfig): The parser configuration.
        doc_type (str|None): The type of document, if known

    Returns:
        DocumentParser: An instance of a DocumentParser subclass.
    """
    inferred_doc_type = DocumentParser._document_type(source, doc_type)
    if inferred_doc_type == DocumentType.PDF:
        if config.pdf.library == "fitz":
            return FitzPDFParser(source, config)
        elif config.pdf.library == "pymupdf4llm":
            return PyMuPDF4LLMParser(source, config)
        elif config.pdf.library == "docling":
            return DoclingParser(source, config)
        elif config.pdf.library == "pypdf":
            return PyPDFParser(source, config)
        elif config.pdf.library == "unstructured":
            return UnstructuredPDFParser(source, config)
        elif config.pdf.library == "pdf2image":
            return ImagePdfParser(source, config)
        elif config.pdf.library == "gemini":
            return GeminiPdfParser(source, config)
        elif config.pdf.library == "marker":
            return MarkerPdfParser(source, config)
        else:
            raise ValueError(
                f"Unsupported PDF library specified: {config.pdf.library}"
            )
    elif inferred_doc_type == DocumentType.DOCX:
        if config.docx.library == "unstructured":
            return UnstructuredDocxParser(source, config)
        elif config.docx.library == "python-docx":
            return PythonDocxParser(source, config)
        elif config.docx.library == "markitdown-docx":
            return MarkitdownDocxParser(source, config)
        else:
            raise ValueError(
                f"Unsupported DOCX library specified: {config.docx.library}"
            )
    elif inferred_doc_type == DocumentType.DOC:
        return UnstructuredDocParser(source, config)
    elif inferred_doc_type == DocumentType.XLS:
        return MarkitdownXLSXParser(source, config)
    elif inferred_doc_type == DocumentType.XLSX:
        return MarkitdownXLSXParser(source, config)
    elif inferred_doc_type == DocumentType.PPTX:
        return MarkitdownPPTXParser(source, config)
    else:
        source_name = source if isinstance(source, str) else "bytes"
        raise ValueError(f"Unsupported document type: {source_name}")

chunks_from_path_or_bytes(source, parser, doc_type=None, lines=None) staticmethod

Get document chunks from a file path or bytes object.

Parameters:

- source (str | bytes): The source, which could be a URL, path or bytes object.
- parser (Parser): The parser instance (for splitting the document).
- doc_type (str | DocumentType | None): The type of document, if known.
- lines (int | None): The number of lines to read from a plain text file.

Returns:

- List[Document]: A list of Document objects, each containing a chunk of text, determined by the chunking and splitting settings in the parser config.

Source code in langroid/parsing/document_parser.py
@staticmethod
def chunks_from_path_or_bytes(
    source: str | bytes,
    parser: Parser,
    doc_type: str | DocumentType | None = None,
    lines: int | None = None,
) -> List[Document]:
    """
    Get document chunks from a file path or bytes object.
    Args:
        source (str|bytes): The source, which could be a URL, path or bytes object.
        parser (Parser): The parser instance (for splitting the document).
        doc_type (str|DocumentType|None): The type of document, if known.
        lines (int|None): The number of lines to read from a plain text file.
    Returns:
        List[Document]: A list of `Document` objects,
            each containing a chunk of text, determined by the
            chunking and splitting settings in the parser config.
    """
    dtype: DocumentType = DocumentParser._document_type(source, doc_type)
    if dtype in [
        DocumentType.PDF,
        DocumentType.DOC,
        DocumentType.DOCX,
        DocumentType.PPTX,
        DocumentType.XLS,
        DocumentType.XLSX,
    ]:
        doc_parser = DocumentParser.create(
            source,
            parser.config,
            doc_type=doc_type,
        )
        chunks = doc_parser.get_doc_chunks()
        if len(chunks) == 0 and dtype == DocumentType.PDF:
            doc_parser = ImagePdfParser(source, parser.config)
            chunks = doc_parser.get_doc_chunks()
        return chunks
    else:
        # try getting as plain text; these will be chunked downstream
        # -- could be a bytes object or a path
        if isinstance(source, bytes):
            content = source.decode()
            if lines is not None:
                file_lines = content.splitlines()[:lines]
                content = "\n".join(line.strip() for line in file_lines)
        else:
            with open(source, "r") as f:
                if lines is not None:
                    file_lines = list(itertools.islice(f, lines))
                    content = "\n".join(line.strip() for line in file_lines)
                else:
                    content = f.read()
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text()
        source_name = source if isinstance(source, str) else "bytes"
        doc = Document(
            content=text,
            metadata=DocMetaData(source=str(source_name)),
        )
        return parser.split([doc])
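
A minimal usage sketch (the file name and defaults are assumptions, as are the Parser and ParsingConfig import paths):

from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import Parser, ParsingConfig

parser = Parser(ParsingConfig())
# Dispatches to a DocumentParser subclass for PDF/DOCX/XLSX/PPTX sources,
# and falls back to plain-text reading (with an optional `lines` cap) otherwise:
chunks = DocumentParser.chunks_from_path_or_bytes("notes.txt", parser, lines=100)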

iterate_pages()

Yield each page in the PDF.

Source code in langroid/parsing/document_parser.py
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
    """Yield each page in the PDF."""
    raise NotImplementedError

get_document_from_page(page)

Get Langroid Document object (with possible metadata) corresponding to a given page.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: Any) -> Document:
    """
    Get Langroid Document object (with possible metadata)
    corresponding to a given page.
    """
    raise NotImplementedError

fix_text(text)

Fix text extracted from a PDF.

Parameters:

- text (str): The extracted text. (required)

Returns:

- str: The fixed text.

Source code in langroid/parsing/document_parser.py
def fix_text(self, text: str) -> str:
    """
    Fix text extracted from a PDF.

    Args:
        text (str): The extracted text.

    Returns:
        str: The fixed text.
    """
    # Some pdf parsers introduce extra space before hyphen,
    # so use regular expression to replace 'space-hyphen' with just 'hyphen'
    return re.sub(r" +\-", "-", text)
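
For instance, the space-before-hyphen cleanup behaves like this (a purely illustrative input):

import re
re.sub(r" +\-", "-", "cross -validation")  # -> "cross-validation"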

get_doc()

Get entire text from source as a single document.

Returns:

- Document: a Document object containing the content of the PDF file, and metadata containing the source name (URL or path).

Source code in langroid/parsing/document_parser.py
def get_doc(self) -> Document:
    """
    Get entire text from source as a single document.

    Returns:
        a `Document` object containing the content of the pdf file,
            and metadata containing source name (URL or path)
    """

    text = "".join(
        [
            self.get_document_from_page(page).content
            for _, page in self.iterate_pages()
        ]
    )
    return Document(content=text, metadata=DocMetaData(source=self.source))

get_doc_chunks()

Get document chunks from a pdf source, with page references in the document metadata.

Adapted from https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py

Returns:

- List[Document]: a list of Document objects, each containing a chunk of text.

Source code in langroid/parsing/document_parser.py
def get_doc_chunks(self) -> List[Document]:
    """
    Get document chunks from a pdf source,
    with page references in the document metadata.

    Adapted from
    https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py

    Returns:
        List[Document]: a list of `Document` objects,
            each containing a chunk of text
    """

    split = []  # tokens in curr split
    pages: List[str] = []
    docs: List[Document] = []
    # metadata.id to be shared by ALL chunks of this document
    common_id = ObjectRegistry.new_id()
    n_chunks = 0  # how many chunks so far
    for i, page in self.iterate_pages():
        # not used but could be useful, esp to blend the
        # metadata from the pages into the chunks
        page_doc = self.get_document_from_page(page)
        page_text = page_doc.content
        split += self.tokenizer.encode(page_text)
        pages.append(str(i + 1))
        # split could be so long it needs to be split
        # into multiple chunks. Or it could be so short
        # that it needs to be combined with the next chunk.
        while len(split) > self.config.chunk_size:
            # pretty formatting of pages (e.g. 1-3, 4, 5-7)
            p_0 = int(pages[0]) - self.config.page_number_offset
            p_n = int(pages[-1]) - self.config.page_number_offset
            page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
            text = self.tokenizer.decode(split[: self.config.chunk_size])
            docs.append(
                Document(
                    content=text,
                    metadata=DocMetaData(
                        source=f"{self.source} {page_str}",
                        is_chunk=True,
                        id=common_id,
                    ),
                )
            )
            n_chunks += 1
            split = split[self.config.chunk_size - self.config.overlap :]
            pages = [str(i + 1)]
    # there may be a last split remaining:
    # if it's shorter than the overlap, we shouldn't make a chunk for it
    # since it's already included in the prior chunk;
    # the only exception is if there have been no chunks so far.
    if len(split) > self.config.overlap or n_chunks == 0:
        p_0 = int(pages[0]) - self.config.page_number_offset
        p_n = int(pages[-1]) - self.config.page_number_offset
        page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
        text = self.tokenizer.decode(split[: self.config.chunk_size])
        docs.append(
            Document(
                content=text,
                metadata=DocMetaData(
                    source=f"{self.source} {page_str}",
                    is_chunk=True,
                    id=common_id,
                ),
            )
        )
    self.add_window_ids(docs)
    return docs
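
The loop above is a sliding token window: each chunk takes chunk_size tokens, and the next window starts overlap tokens before the previous one ended. A minimal standalone sketch of that logic (plain lists standing in for tokenizer output; not the library's API):

def sliding_chunks(tokens: list, chunk_size: int, overlap: int) -> list:
    """Split tokens into windows of `chunk_size`, overlapping by `overlap`."""
    chunks = []
    while len(tokens) > chunk_size:
        chunks.append(tokens[:chunk_size])
        tokens = tokens[chunk_size - overlap :]
    # keep the tail only if it adds tokens beyond the overlap,
    # or if nothing has been emitted yet
    if len(tokens) > overlap or not chunks:
        chunks.append(tokens)
    return chunks

assert sliding_chunks(list(range(10)), chunk_size=4, overlap=1) == [
    [0, 1, 2, 3],
    [3, 4, 5, 6],
    [6, 7, 8, 9],
]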

FitzPDFParser(source, config)

Bases: DocumentParser

Parser for processing PDFs using the fitz library.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

iterate_pages()

Yield each page in the PDF using fitz.

Returns:

- Generator[Tuple[int, fitz.Page], None, None]: Generator yielding each page, paired with its index.

Source code in langroid/parsing/document_parser.py
def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
    """
    Yield each page in the PDF using `fitz`.

    Returns:
        Generator[fitz.Page]: Generator yielding each page.
    """
    try:
        import fitz
    except ImportError:
        raise LangroidImportError("fitz", "doc-chat")
    doc = fitz.open(stream=self.doc_bytes, filetype="pdf")
    for i, page in enumerate(doc):
        yield i, page
    doc.close()

get_document_from_page(page)

Get Document object from a given fitz page.

Parameters:

- page (fitz.Page): The fitz page object. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: "fitz.Page") -> Document:
    """
    Get Document object from a given `fitz` page.

    Args:
        page (fitz.Page): The `fitz` page object.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    return Document(
        content=self.fix_text(page.get_text()),
        metadata=DocMetaData(source=self.source),
    )

PyMuPDF4LLMParser(source, config)

Bases: DocumentParser

Parser for processing PDFs using the pymupdf4llm library.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

iterate_pages()

Yield each page in the PDF using pymupdf4llm.

Returns:

- Generator[Tuple[int, 'fitz.Page'], None, None]: Generator yielding each page (as a pymupdf4llm "page-chunk" dictionary), paired with its index.

Source code in langroid/parsing/document_parser.py
def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
    """
    Yield each page in the PDF using `pymupdf4llm`.

    Returns:
        Generator[fitz.Page]: Generator yielding each page.
    """
    try:
        import pymupdf4llm  # noqa
        import fitz
    except ImportError:
        raise LangroidImportError(
            "pymupdf4llm", ["pymupdf4llm", "all", "pdf-parsers", "doc-chat"]
        )
    doc: fitz.Document = fitz.open(stream=self.doc_bytes, filetype="pdf")
    pages: List[Dict[str, Any]] = pymupdf4llm.to_markdown(doc, page_chunks=True)
    for i, page in enumerate(pages):
        yield i, page
    doc.close()

get_document_from_page(page)

Get Document object corresponding to a given "page-chunk" dictionary, see: https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html

Parameters:

- page (Dict[str, Any]): The "page-chunk" dictionary. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: Dict[str, Any]) -> Document:
    """
    Get Document object corresponding to a given "page-chunk"
    dictionary, see:
     https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html


    Args:
        page (Dict[str,Any]): The "page-chunk" dictionary.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    return Document(
        content=self.fix_text(page.get("text", "")),
        # TODO could possibly use other metadata from page, see above link.
        metadata=DocMetaData(source=self.source),
    )

DoclingParser(source, config)

Bases: DocumentParser

Parser for processing PDFs using the docling library.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

iterate_pages()

Yield each page in the PDF using docling. Code largely from this example: https://github.com/DS4SD/docling/blob/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b/docs/examples/export_figures.py

Returns:

- Generator[Tuple[int, Any], None, None]: Generator yielding each page as the path to a one-page markdown file.

Source code in langroid/parsing/document_parser.py
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
    """
    Yield each page in the PDF using `docling`.
    Code largely from this example:
    https://github.com/DS4SD/docling/blob/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b/docs/examples/export_figures.py

    Returns:
        Generator[docling.Page]: Generator yielding each page.
    """
    try:
        import docling  # noqa
    except ImportError:
        raise LangroidImportError(
            "docling", ["docling", "pdf-parsers", "all", "doc-chat"]
        )

    from docling.datamodel.base_models import InputFormat  # type: ignore
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import (  # type: ignore
        ConversionResult,
        DocumentConverter,
        PdfFormatOption,
    )
    from docling_core.types.doc import ImageRefMode  # type: ignore

    IMAGE_RESOLUTION_SCALE = 2.0
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    doc_path = self.source
    if doc_path == "bytes":
        # write to tmp file, then use that path
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(self.doc_bytes.getvalue())
            doc_path = temp_file.name

    output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
    os.makedirs(output_dir, exist_ok=True)

    result: ConversionResult = converter.convert(doc_path)

    def n_page_elements(page) -> int:  # type: ignore
        if page.assembled is None:
            return 0
        return 1 + len(page.assembled.elements)

    page_element_count = [n_page_elements(i) for i in result.pages]
    element_page_cutoff = list(accumulate([1] + page_element_count))
    for i, page in enumerate(result.pages):
        page_start = element_page_cutoff[i]
        page_end = element_page_cutoff[i + 1]
        md_file = output_dir / f"page_{i}.md"
        # we could have just directly exported to a markdown string,
        # but we need to save to a file to force generation of image-files.
        result.document.save_as_markdown(
            md_file,
            image_mode=ImageRefMode.REFERENCED,
            from_element=page_start,
            to_element=page_end,
        )
        yield i, md_file

get_document_from_page(md_file)

Get Document object from a given 1-page markdown file, possibly containing image refs.

Parameters:

- md_file (str): The markdown file path for the page. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, md_file: str) -> Document:
    """
    Get Document object from a given 1-page markdown file,
    possibly containing image refs.

    Args:
        md_file (str): The markdown file path for the page.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    with open(md_file, "r") as f:
        text = f.read()
    return Document(
        content=self.fix_text(text),
        metadata=DocMetaData(source=self.source),
    )

PyPDFParser(source, config)

Bases: DocumentParser

Parser for processing PDFs using the pypdf library.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

iterate_pages()

Yield each page in the PDF using pypdf.

Returns:

- Generator[Tuple[int, pypdf.PageObject], None, None]: Generator yielding each page, paired with its index.

Source code in langroid/parsing/document_parser.py
def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
    """
    Yield each page in the PDF using `pypdf`.

    Returns:
        Generator[pypdf.pdf.PageObject]: Generator yielding each page.
    """
    try:
        import pypdf
    except ImportError:
        raise LangroidImportError("pypdf", "pdf-parsers")
    reader = pypdf.PdfReader(self.doc_bytes)
    for i, page in enumerate(reader.pages):
        yield i, page

get_document_from_page(page)

Get Document object from a given pypdf page.

Parameters:

- page (pypdf.PageObject): The pypdf page object. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: pypdf.PageObject) -> Document:
    """
    Get Document object from a given `pypdf` page.

    Args:
        page (pypdf.pdf.PageObject): The `pypdf` page object.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    return Document(
        content=self.fix_text(page.extract_text()),
        metadata=DocMetaData(source=self.source),
    )

ImagePdfParser(source, config)

Bases: DocumentParser

Parser for processing PDFs that are images, i.e. not "true" PDFs.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

get_document_from_page(page)

Get Document object corresponding to a given pdf2image page.

Parameters:

- page (Image): The PIL Image object. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: "Image") -> Document:  # type: ignore
    """
    Get Document object corresponding to a given `pdf2image` page.

    Args:
        page (Image): The PIL Image object.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    try:
        import pytesseract
    except ImportError:
        raise LangroidImportError("pytesseract", "pdf-parsers")

    text = pytesseract.image_to_string(page)
    return Document(
        content=self.fix_text(text),
        metadata=DocMetaData(source=self.source),
    )

UnstructuredPDFParser(source, config)

Bases: DocumentParser

Parser for processing PDF files using the unstructured library.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

get_document_from_page(page)

Get Document object from a given unstructured element.

Parameters:

- page (unstructured element): The unstructured element object. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: Any) -> Document:
    """
    Get Document object from a given `unstructured` element.

    Args:
        page (unstructured element): The `unstructured` element object.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    text = " ".join(el.text for el in page)
    return Document(
        content=self.fix_text(text),
        metadata=DocMetaData(source=self.source),
    )

UnstructuredDocxParser(source, config)

Bases: DocumentParser

Parser for processing DOCX files using the unstructured library.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

get_document_from_page(page)

Get Document object from a given unstructured element.

Note

The concept of "pages" doesn't actually exist in the .docx file format in the same way it does in formats like .pdf. A .docx file is made up of a series of elements like paragraphs and tables, but the division into pages is done dynamically based on the rendering settings (like the page size, margin size, font size, etc.).

Parameters:

- page (unstructured element): The unstructured element object. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: Any) -> Document:
    """
    Get Document object from a given `unstructured` element.

    Note:
        The concept of "pages" doesn't actually exist in the .docx file format in
        the same way it does in formats like .pdf. A .docx file is made up of a
        series of elements like paragraphs and tables, but the division into
        pages is done dynamically based on the rendering settings (like the page
        size, margin size, font size, etc.).

    Args:
        page (unstructured element): The `unstructured` element object.

    Returns:
        Document object, with content and possible metadata.
    """
    text = " ".join(el.text for el in page)
    return Document(
        content=self.fix_text(text),
        metadata=DocMetaData(source=self.source),
    )

PythonDocxParser(source, config)

Bases: DocumentParser

Parser for processing DOCX files using the python-docx library.

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

iterate_pages()

Simulate iterating through pages. In a DOCX file, pages are not explicitly defined, so we consider each paragraph as a separate 'page' for simplicity.

Source code in langroid/parsing/document_parser.py
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
    """
    Simulate iterating through pages.
    In a DOCX file, pages are not explicitly defined,
    so we consider each paragraph as a separate 'page' for simplicity.
    """
    try:
        import docx
    except ImportError:
        raise LangroidImportError("python-docx", "docx")

    doc = docx.Document(self.doc_bytes)
    for i, para in enumerate(doc.paragraphs, start=1):
        yield i, [para]

get_document_from_page(page)

Get Document object from a given 'page', which in this case is a single paragraph.

Parameters:

- page (list): A list containing a single Paragraph object. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: Any) -> Document:
    """
    Get Document object from a given 'page', which in this case is a single
    paragraph.

    Args:
        page (list): A list containing a single Paragraph object.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    paragraph = page[0]
    return Document(
        content=self.fix_text(paragraph.text),
        metadata=DocMetaData(source=self.source),
    )

MarkitdownDocxParser(source, config)

Bases: DocumentParser

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

get_document_from_page(md_content)

Get Document object from a given markdown section.

Parameters:

- md_content (str): The markdown content for the section. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, md_content: str) -> Document:
    """
    Get Document object from a given markdown section.

    Args:
        md_content (str): The markdown content for the section.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    return Document(
        content=self.fix_text(md_content),
        metadata=DocMetaData(source=self.source),
    )

MarkitdownXLSXParser(source, config)

Bases: DocumentParser

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

get_document_from_page(md_content)

Get Document object from a given 1-page markdown string.

Parameters:

- md_content (str): The markdown content for the page. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, md_content: str) -> Document:
    """
    Get Document object from a given 1-page markdown string.

    Args:
        md_content (str): The markdown content for the page.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    return Document(
        content=self.fix_text(md_content),
        metadata=DocMetaData(source=self.source),
    )

MarkitdownPPTXParser(source, config)

Bases: DocumentParser

Source code in langroid/parsing/document_parser.py
def __init__(self, source: str | bytes, config: ParsingConfig):
    """
    Args:
        source (str|bytes): The source, which could be
        a path, a URL or a bytes object.
    """
    super().__init__(config)
    self.config = config
    if isinstance(source, bytes):
        self.source = "bytes"
        self.doc_bytes = BytesIO(source)
    else:
        self.source = source
        self.doc_bytes = self._load_doc_as_bytesio()

get_document_from_page(md_content)

Get Document object from a given 1-page markdown string.

Parameters:

- md_content (str): The markdown content for the page. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, md_content: str) -> Document:
    """
    Get Document object from a given 1-page markdown string.

    Args:
        md_content (str): The markdown content for the page.

    Returns:
        Document: Document object, with content and possible metadata.
    """
    return Document(
        content=self.fix_text(md_content),
        metadata=DocMetaData(source=self.source),
    )

GeminiPdfParser(source, config)

Bases: DocumentParser

This class converts PDFs to Markdown using Gemini multimodal LLMs.

It extracts pages, converts them with the LLM (replacing images with detailed descriptions), and outputs Markdown page by page. The conversion follows GEMINI_SYSTEM_INSTRUCTION. It uses multiprocessing for speed, rate-limited asynchronous requests, and error handling with retries.

It supports page-by-page splitting or chunking multiple pages into one, respecting page boundaries and a max_token_limit.
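
The concurrency-limiting pattern described above, sketched in isolation (function names and the limit of 5 are illustrative; the real class additionally retries with exponential backoff):

import asyncio

async def send_with_limit(semaphore: asyncio.Semaphore, chunk: bytes) -> str:
    # Acquire a semaphore slot before calling the LLM; release it when done.
    async with semaphore:
        await asyncio.sleep(0.1)  # placeholder for the actual LLM request
        return "markdown for chunk"

async def process_all(chunks: list) -> list:
    semaphore = asyncio.Semaphore(5)  # at most 5 requests in flight at once
    return await asyncio.gather(*(send_with_limit(semaphore, c) for c in chunks))

results = asyncio.run(process_all([b"chunk-1", b"chunk-2"]))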

Source code in langroid/parsing/document_parser.py
def __init__(self, source: Union[str, bytes], config: ParsingConfig):
    super().__init__(source, config)
    if not config.pdf.gemini_config:
        raise ValueError(
            "GeminiPdfParser requires a Gemini-based config in pdf parsing config"
        )
    self.model_name = config.pdf.gemini_config.model_name

    # Ensure output directory exists
    self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    prefix = (
        Path(source).stem + "_"
        if isinstance(source, str) and Path(source).exists()
        else "output_"
    )
    temp_file = tempfile.NamedTemporaryFile(
        suffix=".md",
        prefix=prefix,
        dir=str(self.OUTPUT_DIR),
        delete=False,
    )
    temp_file.close()
    self.output_filename = Path(temp_file.name)

    self.max_tokens = config.pdf.gemini_config.max_tokens or self.DEFAULT_MAX_TOKENS

    """
    If True, each PDF page is processed as a separate chunk,
    resulting in one LLM request per page. If False, pages are
    grouped into chunks based on `max_token_limit` before being sent
    to the LLM.
    """
    self.split_on_page = config.pdf.gemini_config.split_on_page or False

    # Rate limiting parameters
    import asyncio

    self.requests_per_minute = config.pdf.gemini_config.requests_per_minute or 5

    """
    A semaphore to control the number of concurrent requests to the LLM,
    preventing rate limit errors.  A semaphore slot is acquired before
    making an LLM request and released after the request is complete.
    """
    self.semaphore = asyncio.Semaphore(self.requests_per_minute)
    self.retry_delay = 5  # seconds, for exponential backoff
    self.max_retries = 3

max_tokens = config.pdf.gemini_config.max_tokens or self.DEFAULT_MAX_TOKENS instance-attribute

split_on_page = config.pdf.gemini_config.split_on_page or False instance-attribute

If True, each PDF page is processed as a separate chunk, resulting in one LLM request per page. If False, pages are grouped into chunks based on max_token_limit before being sent to the LLM.

requests_per_minute = config.pdf.gemini_config.requests_per_minute or 5 instance-attribute

semaphore = asyncio.Semaphore(self.requests_per_minute) instance-attribute

A semaphore to control the number of concurrent requests to the LLM, preventing rate limit errors. A semaphore slot is acquired before making an LLM request and released after the request is complete.

process_chunks(chunks, api_key) async

Processes PDF chunks by sending them to the Gemini API and collecting the results.

Parameters:

- chunks (List[Dict[str, Any]]): A list of dictionaries, where each dictionary represents a PDF chunk and contains the PDF data and page numbers. (required)
- api_key (str): The Gemini API key. (required)

Source code in langroid/parsing/document_parser.py
async def process_chunks(
    self, chunks: List[Dict[str, Any]], api_key: str
) -> List[str]:
    """
    Processes PDF chunks by sending them to the Gemini API and
    collecting the results.

    Args:
        chunks: A list of dictionaries, where each dictionary represents
            a PDF chunk and contains the PDF data and page numbers.
        api_key: The Gemini API key.
    """
    # To show nice progress bar
    from tqdm.asyncio import tqdm_asyncio

    # Create a list of asynchronous tasks to send each chunk to Gemini.
    # Chunk in this case might be single page or group of pages returned
    # by prepare_pdf_chunks function
    tasks = [self._send_chunk_to_gemini(chunk, api_key) for chunk in chunks]

    # Gather the results from all tasks, allowing exceptions to be returned.
    # tqdm_asyncio is wrapper around asyncio.gather
    gathered_results = await tqdm_asyncio.gather(
        *tasks, desc="Processing chunks(pages)", unit="chunk"
    )
    results = []
    for i, result in enumerate(gathered_results):
        chunk = chunks[i]  # Get the corresponding chunk.

        if isinstance(result, Exception):
            # Handle exceptions that occurred during chunk processing.
            logging.error(
                "Failed to process chunk %s: %s",
                chunk.get("page_numbers", "Unknown"),
                result,
            )
            results.append(
                "<!----Error: Could not process chunk %s---->"
                % chunk.get("page_numbers", "Unknown")
            )
        else:
            # Process successful results and append page/chunk markers.
            markdown = str(result)
            if self.split_on_page:
                results.append(
                    markdown + f"<!----Page-{chunk['page_numbers']}---->"
                )
            else:
                results.append(
                    markdown + f"<!----Chunk-{chunk['page_numbers']}---->"
                )

    return results  # Return the list of results.

iterate_pages()

Iterates over the document pages, extracts their content using the Gemini API, saves it to a markdown file, and yields page numbers along with their corresponding content.

Yields:

- Tuple[int, Any]: Tuples of the page number (int) and the page content (Any).

Source code in langroid/parsing/document_parser.py
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
    """
    Iterates over the document pages, extracting content using the
    Gemini API, saves them to a markdown file, and yields page numbers
    along with their corresponding content.

    Yields:
        A generator of tuples, where each tuple contains the page number
        (int) and the page content (Any).
    """
    import asyncio
    import os

    # Load environment variables (e.g., GEMINI_API_KEY) from a .env file.
    load_dotenv()
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("GEMINI_API_KEY not found in environment variables.")

    try:
        # This involves extracting pages, grouping them according to the
        # `max_tokens` limit (if `split_on_page` is False), and
        # merging pages into larger PDF chunks. The result
        # is a list of dictionaries, where each dictionary contains the
        # PDF bytes and the associated page numbers or single page if
        # `split_on_page` is true

        pdf_chunks = self._prepare_pdf_chunks_for_gemini(
            num_workers=8,
            max_tokens=self.max_tokens,
            split_on_page=self.split_on_page,
        )

        # We asynchronously process each chunk, sending it
        # to Gemini and retrieving the Markdown output. It handles rate
        # limiting and retries.
        markdown_results = asyncio.run(
            self.process_chunks(pdf_chunks, gemini_api_key)
        )

        # This file serves as an intermediate storage location for the
        # complete Markdown output.
        with open(self.output_filename, "w", encoding="utf-8") as outfile:
            outfile.write("\n\n".join(markdown_results))

        # Read the full Markdown content from the temporary file.
        with open(self.output_filename, "r", encoding="utf-8") as infile:
            full_markdown = infile.read()

        # The splitting is based on the `split_on_page` setting. If True,
        # the Markdown is split using the "Page-" marker. Otherwise, it's
        # split using the "Chunk-" marker.
        if self.split_on_page:
            pages = full_markdown.split("<!----Page-")
        else:
            pages = full_markdown.split("<!----Chunk-")

        # Remove the first element if it's empty (due to the split).
        if pages and pages[0] == "":
            pages = pages[1:]

        # Iterate over the pages or chunks and yield their content.
        for i, page in enumerate(pages):
            # Check for errors during processing.
            if "<!----Error:" in page:
                page_content = page
                logging.warning(f"Page {i}: Error processing chunk.")
            else:
                # Extract the actual page content by removing the marker.
                page_content = (
                    page.split("---->", 1)[1]
                    if len(page.split("---->", 1)) > 1
                    else page
                )

            # Yield the page number and content.
            yield i, page_content

    except Exception as e:
        raise ValueError(f"Error processing document: {e}") from e

get_document_from_page(page)

Get a Document object from a given markdown page.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: str) -> Document:
    """
    Get a Document object from a given markdown page.
    """
    return Document(
        content=page,
        metadata=DocMetaData(source=self.source),
    )

MarkerPdfParser(source, config)

Bases: DocumentParser

Parse PDF files using the marker library: https://github.com/VikParuchuri/marker

Source code in langroid/parsing/document_parser.py
def __init__(self, source: Union[str, bytes], config: ParsingConfig):
    super().__init__(source, config)
    user_config = (
        config.pdf.marker_config.config_dict if config.pdf.marker_config else {}
    )

    self.config_dict = {**MarkerPdfParser.DEFAULT_CONFIG, **user_config}

iterate_pages()

Yield each page in the PDF using marker.

Source code in langroid/parsing/document_parser.py
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
    """
    Yield each page in the PDF using `marker`.
    """
    try:
        import marker  # noqa
    except ImportError:
        raise LangroidImportError(
            "marker-pdf", ["marker-pdf", "pdf-parsers", "all", "doc-chat"]
        )

    import re

    from marker.config.parser import ConfigParser
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import save_output

    config_parser = ConfigParser(self.config_dict)
    converter = PdfConverter(
        config=config_parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service(),
    )
    doc_path = self.source
    if doc_path == "bytes":
        # write to tmp file, then use that path
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(self.doc_bytes.getvalue())
            doc_path = temp_file.name

    output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
    os.makedirs(output_dir, exist_ok=True)
    filename = Path(doc_path).stem + "_converted"

    rendered = converter(doc_path)
    save_output(rendered, output_dir=output_dir, fname_base=filename)
    file_path = output_dir / f"{filename}.md"

    with open(file_path, "r", encoding="utf-8") as f:
        full_markdown = f.read()

    # Regex for splitting pages
    pages = re.split(r"\{\d+\}----+", full_markdown)

    page_no = 0
    for page in pages:
        if page.strip():
            yield page_no, page
        page_no += 1
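
The page-splitting regex above expects marker's pagination markers of the form {N} followed by a run of hyphens; a small illustration (the marker output format is an assumption inferred from the regex, not confirmed here):

import re

md = "{0}------------\nPage one text\n{1}------------\nPage two text\n"
re.split(r"\{\d+\}----+", md)
# -> ['', '\nPage one text\n', '\nPage two text\n']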

get_document_from_page(page)

Get Document object from a given 1-page markdown string, possibly containing image refs.

Parameters:

- page (str): The page obtained by splitting the large markdown file produced by marker. (required)

Returns:

- Document: Document object, with content and possible metadata.

Source code in langroid/parsing/document_parser.py
def get_document_from_page(self, page: str) -> Document:
    """
    Get Document object from a given 1-page markdown file,
    possibly containing image refs.

    Args:
        page (str): The page we get by splitting large md file from
        marker

    Returns:
        Document: Document object, with content and possible metadata.
    """
    return Document(
        content=self.fix_text(page),
        metadata=DocMetaData(source=self.source),
    )

find_last_full_char(possible_unicode)

Find the index of the last full character in a byte string.

Parameters:

- possible_unicode (bytes): The bytes to check.

Returns:

- int: The index of the last full unicode character.

Source code in langroid/parsing/document_parser.py
def find_last_full_char(possible_unicode: bytes) -> int:
    """
    Find the index of the last full character in a byte string.
    Args:
        possible_unicode (bytes): The bytes to check.
    Returns:
        int: The index of the last full unicode character.
    """

    for i in range(len(possible_unicode) - 1, 0, -1):
        if (possible_unicode[i] & 0xC0) != 0x80:
            return i
    return 0
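
An illustrative check of the continuation-byte logic (bytes matching 0b10xxxxxx are UTF-8 continuation bytes, so scanning backwards for a byte with a different top-two-bit pattern finds where the last, possibly incomplete, character starts):

data = "héllo".encode("utf-8")[:2]  # b'h\xc3' -- the 2-byte 'é' is cut in half
idx = find_last_full_char(data)     # 1: byte 0xc3 starts the incomplete char
data[:idx].decode("utf-8")          # 'h' -- decodes cleanly after trimming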

is_plain_text(path_or_bytes)

Check if a file is plain text by attempting to decode it as UTF-8.

Parameters:

- path_or_bytes (str | bytes): The file path or bytes object.

Returns:

- bool: True if the file is plain text, False otherwise.

Source code in langroid/parsing/document_parser.py
def is_plain_text(path_or_bytes: str | bytes) -> bool:
    """
    Check if a file is plain text by attempting to decode it as UTF-8.
    Args:
        path_or_bytes (str|bytes): The file path or bytes object.
    Returns:
        bool: True if the file is plain text, False otherwise.
    """
    if isinstance(path_or_bytes, str):
        if path_or_bytes.startswith(("http://", "https://")):
            response = requests.get(path_or_bytes)
            response.raise_for_status()
            content = response.content[:1024]
        else:
            with open(path_or_bytes, "rb") as f:
                content = f.read(1024)
    else:
        content = path_or_bytes[:1024]
    try:
        # Use magic to detect the MIME type
        import magic

        mime_type = magic.from_buffer(content, mime=True)

        # Check if the MIME type is not a text type
        if not mime_type.startswith("text/"):
            return False

        # Attempt to decode the content as UTF-8
        content = content[: find_last_full_char(content)]

        try:
            _ = content.decode("utf-8")
            # Additional checks can go here, e.g., to verify that the content
            # doesn't contain too many unusual characters for it to be considered text
            return True
        except UnicodeDecodeError:
            return False
    except UnicodeDecodeError:
        # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
        return False
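
A quick usage sketch (inputs are illustrative; the MIME check requires the python-magic package to be installed):

is_plain_text(b"hello world")  # True: detected as text/* and valid UTF-8
is_plain_text(b"%PDF-1.4")     # False: detected as application/pdf, not text/*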