Source code for langchain_community.retrievers.arxiv

from typing import List

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever

from langchain_community.utilities.arxiv import ArxivAPIWrapper


[docs]class ArxivRetriever(BaseRetriever, ArxivAPIWrapper): """`Arxiv` retriever. Setup: Install ``arxiv``: .. code-block:: bash pip install -U arxiv Key init args: load_max_docs: int maximum number of documents to load get_ful_documents: bool whether to return full document text or snippets Instantiate: .. code-block:: python from langchain_community.retrievers import ArxivRetriever retriever = ArxivRetriever( load_max_docs=2, get_ful_documents=True, ) Usage: .. code-block:: python docs = retriever.invoke("What is the ImageBind model?") docs[0].metadata .. code-block:: none {'Entry ID': 'http://arxiv.org/abs/2305.05665v2', 'Published': datetime.date(2023, 5, 31), 'Title': 'ImageBind: One Embedding Space To Bind Them All', 'Authors': 'Rohit Girdhar, Alaaeldin El-Nouby, Zhuang Liu, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, Ishan Misra'} Use within a chain: .. code-block:: python from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import ChatPromptTemplate from langchain_core.runnables import RunnablePassthrough from langchain_openai import ChatOpenAI prompt = ChatPromptTemplate.from_template( \"\"\"Answer the question based only on the context provided. Context: {context} Question: {question}\"\"\" ) llm = ChatOpenAI(model="gpt-3.5-turbo-0125") def format_docs(docs): return "\\n\\n".join(doc.page_content for doc in docs) chain = ( {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser() ) chain.invoke("What is the ImageBind model?") .. code-block:: none 'The ImageBind model is an approach to learn a joint embedding across six different modalities - images, text, audio, depth, thermal, and IMU data...' """ # noqa: E501 get_full_documents: bool = False def _get_relevant_documents( self, query: str, *, run_manager: CallbackManagerForRetrieverRun ) -> List[Document]: if self.get_full_documents: return self.load(query=query) else: return self.get_summaries_as_docs(query)