lance_doc_chat_agent

langroid/agent/special/lance_doc_chat_agent.py

LanceDocChatAgent is a subclass of DocChatAgent that uses LanceDB as a vector store:

  • Uses the DocChatAgentConfig.filter variable (a SQL string) in the where clause to do filtered vector search.
  • Overrides get_similar_chunks_bm25() to use LanceDB FTS (Full Text Search).

For usage, see:
  • tests/main/test_lance_doc_chat_agent.py.
  • example script examples/docqa/lance_rag.py.
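
A minimal construction sketch (the LanceDBConfig import path, collection name, storage path, and filter fields below are illustrative assumptions; see the test and example script above for canonical configurations):

from langroid.agent.special.doc_chat_agent import DocChatAgentConfig
from langroid.agent.special.lance_doc_chat_agent import LanceDocChatAgent
from langroid.vector_store.lancedb import LanceDBConfig

# Hypothetical local LanceDB collection; adjust names/paths to your setup.
cfg = DocChatAgentConfig(
    vecdb=LanceDBConfig(
        collection_name="lance-demo",
        storage_path=".lancedb/data/",
    ),
    filter_fields=["year", "genre"],  # fields allowed in query-plan filters
)
agent = LanceDocChatAgent(cfg)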

LanceDocChatAgent(cfg)

Bases: DocChatAgent

Source code in langroid/agent/special/lance_doc_chat_agent.py
def __init__(self, cfg: DocChatAgentConfig):
    super().__init__(cfg)
    self.config: DocChatAgentConfig = cfg
    self.enable_message(QueryPlanTool, use=False, handle=True)

query_plan(msg)

Handle the LLM's use of the QueryPlanTool. Temporarily set the config filter and either return the final answer (when there is a dataframe_calc), or return the rephrased query so the LLM can handle it.

Source code in langroid/agent/special/lance_doc_chat_agent.py
def query_plan(self, msg: QueryPlanTool) -> str:
    """
    Handle the LLM's use of the QueryPlanTool.
    Temporarily set the config filter and either return the final answer
    in case there's a dataframe_calc, or return the rephrased query
    so the LLM can handle it.
    """
    # create document-subset based on this filter
    plan = msg.plan
    try:
        self.setup_documents(filter=plan.filter or None)
    except Exception as e:
        logger.error(f"Error setting up documents: {e}")
        # say DONE with err msg so it goes back to LanceFilterAgent
        return f"""
        {DONE} Possible Filter Error:\n {e}

        Note that only the following fields are allowed in the filter
        of a query plan: 
        {", ".join(self.config.filter_fields)}
        """

    # update the filter so it is used in the DocChatAgent
    self.config.filter = plan.filter or None
    if plan.dataframe_calc:
        # we just get relevant docs then do the calculation
        # TODO if calc causes err, it is captured in result,
        # and LLM can correct the calc based on the err,
        # and this will cause retrieval all over again,
        # which may be wasteful if only the calc part is wrong.
        # The calc step can later be done with a separate Agent/Tool.
        if plan.query is None or plan.query.strip() == "":
            if plan.filter is None or plan.filter.strip() == "":
                return """DONE
                Cannot execute Query Plan since filter as well as 
                rephrased query are empty.
                """
            else:
                # no query to match, so just get all docs matching filter
                docs = self.vecdb.get_all_documents(plan.filter)
        else:
            _, docs = self.get_relevant_extracts(plan.query)
        if len(docs) == 0:
            return DONE + " " + NO_ANSWER
        result = self.vecdb.compute_from_docs(docs, plan.dataframe_calc)
        return DONE + " " + result
    else:
        # pass on the query so LLM can handle it
        return plan.query
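
A sketch of how query_plan is typically exercised (the lance_tools import path and the original_query field are assumptions; the other fields appear in the code above):

from langroid.agent.special.lance_tools import QueryPlan, QueryPlanTool

plan = QueryPlan(
    original_query="Average rating of sci-fi movies after 2010?",  # assumed field
    query="average rating",                      # rephrased query used for retrieval
    filter="genre = 'sci-fi' AND year > 2010",   # SQL-like where clause
    dataframe_calc="df['rating'].mean()",        # optional pandas calculation
)
response = agent.query_plan(QueryPlanTool(plan=plan))
# When dataframe_calc is set, the returned string starts with DONE followed by the
# computed result; otherwise the rephrased query is returned for the LLM to answer.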

ingest_dataframe(df, content='content', metadata=[])

Ingest from a dataframe. Assume we are doing this once, not incrementally

Source code in langroid/agent/special/lance_doc_chat_agent.py
def ingest_dataframe(
    self,
    df: pd.DataFrame,
    content: str = "content",
    metadata: List[str] = [],
) -> int:
    """Ingest from a dataframe. Assume we are doing this once, not incrementally"""

    self.from_dataframe = True
    if df.shape[0] == 0:
        raise ValueError(
            """
            LanceDocChatAgent.ingest_dataframe() received an empty dataframe.
            """
        )
    n = df.shape[0]

    # If any additional fields need to be added to content,
    # add them as key=value pairs, into the `content` field for all rows.
    # This helps retrieval for table-like data.
    # Note we need to do this at this stage so that
    # are computed on the full content with these additional fields.
    fields = [f for f in self.config.add_fields_to_content if f in df.columns]
    if len(fields) > 0:
        df[content] = df.apply(
            lambda row: (",".join(f"{f}={row[f]}" for f in fields))
            + ", content="
            + row[content],
            axis=1,
        )

    df, metadata = DocChatAgent.document_compatible_dataframe(df, content, metadata)
    self.df_description = describe_dataframe(
        df,
        filter_fields=self.config.filter_fields,
        n_vals=10,
    )
    self.vecdb.add_dataframe(df, content="content", metadata=metadata)

    tbl = self.vecdb.client.open_table(self.vecdb.config.collection_name)
    # We assume "content" is available as a top-level field
    if "content" in tbl.schema.names:
        tbl.create_fts_index("content", replace=True)
    # We still need to do the below so that
    # other types of searches in DocChatAgent
    # can work, as they require Document objects
    docs = dataframe_to_documents(df, content="content", metadata=metadata)
    self.setup_documents(docs)
    # mark each doc as already-chunked so we don't try to split them further
    # TODO later we may want to split large text-columns
    for d in docs:
        d.metadata.is_chunk = True
    return n  # type: ignore
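
For example, ingesting a small pandas DataFrame (the column names here are illustrative; "year" and "genre" stand in for whatever filter fields your data has):

import pandas as pd

df = pd.DataFrame(
    {
        "content": ["A space opera set in 2150.", "A courtroom drama."],
        "year": [2012, 2008],
        "genre": ["sci-fi", "drama"],
    }
)
# Returns the number of rows ingested; metadata columns become filterable fields.
n = agent.ingest_dataframe(df, content="content", metadata=["year", "genre"])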

get_similar_chunks_bm25(query, multiple)

Override the DocChatAgent.get_similar_chunks_bm25() to use LanceDB FTS (Full Text Search).

Source code in langroid/agent/special/lance_doc_chat_agent.py
def get_similar_chunks_bm25(
    self, query: str, multiple: int
) -> List[Tuple[Document, float]]:
    """
    Override the DocChatAgent.get_similar_chunks_bm25()
    to use LanceDB FTS (Full Text Search).
    """
    # Clean up query: replace all newlines with spaces in query,
    # force special search keywords to lower case, remove quotes,
    # so it's not interpreted as search syntax
    query_clean = (
        query.replace("\n", " ")
        .replace("AND", "and")
        .replace("OR", "or")
        .replace("NOT", "not")
        .replace("'", "")
        .replace('"', "")
    )

    tbl = self.vecdb.client.open_table(self.vecdb.config.collection_name)
    result = (
        tbl.search(query_clean)
        .where(self.config.filter or None)
        .limit(self.config.parsing.n_similar_docs * multiple)
    )
    docs = self.vecdb._lance_result_to_docs(result)
    scores = [r["score"] for r in result.to_list()]
    return list(zip(docs, scores))
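
A brief usage sketch (assumes documents were already ingested as above; the filter string is illustrative):

agent.config.filter = "year > 2010"  # SQL-like filter applied in the FTS where clause
chunks = agent.get_similar_chunks_bm25("space opera", multiple=2)
for doc, score in chunks:
    print(score, doc.content[:60])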