# Source code for langchain_community.document_loaders.yuque
import re
from typing import Dict , Iterator , List
import requests
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
[docs]
class YuqueLoader(BaseLoader):
    """Load documents from `Yuque`.

    The loader authenticates with a personal access token, looks up the
    token owner's user id, lists that user's repos ("books"), lists the
    documents of every book, and converts each document into a
    ``Document`` whose ``page_content`` is the cleaned document body.
    """

    def __init__(self, access_token: str, api_url: str = "https://www.yuque.com"):
        """Initialize with Yuque access_token and api_url.

        Args:
            access_token: Personal access token - see
                https://www.yuque.com/settings/tokens.
            api_url: Yuque API url.
        """
        self.access_token = access_token
        self.api_url = api_url

    @property
    def headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every API request.

        The access token is passed in the ``X-Auth-Token`` header.
        """
        return {
            "Content-Type": "application/json",
            "X-Auth-Token": self.access_token,
        }

    def get_user_id(self) -> int:
        """Return the ``id`` field of the user that owns the access token."""
        url = f"{self.api_url}/api/v2/user"
        response = self.http_get(url=url)

        return response["data"]["id"]

    def get_books(self, user_id: int) -> List[Dict]:
        """Return the ``data`` list of the repos endpoint for ``user_id``.

        Args:
            user_id: Id of the user whose repos ("books") are listed.
        """
        # NOTE(review): a single request is made; if the endpoint paginates,
        # only the first page is returned - confirm against the Yuque API.
        url = f"{self.api_url}/api/v2/users/{user_id}/repos"
        response = self.http_get(url=url)

        return response["data"]

    def get_document_ids(self, book_id: int) -> List[int]:
        """Return the ``id`` of every entry in the docs listing of a book.

        Args:
            book_id: Id of the repo ("book") whose documents are listed.
        """
        # NOTE(review): a single request is made; if the endpoint paginates,
        # only the first page is returned - confirm against the Yuque API.
        url = f"{self.api_url}/api/v2/repos/{book_id}/docs"
        response = self.http_get(url=url)

        return [document["id"] for document in response["data"]]

    def get_document(self, book_id: int, document_id: int) -> Dict:
        """Return the ``data`` payload of a single document.

        Args:
            book_id: Id of the repo ("book") that contains the document.
            document_id: Id of the document to fetch.
        """
        url = f"{self.api_url}/api/v2/repos/{book_id}/docs/{document_id}"
        response = self.http_get(url=url)

        return response["data"]

    def parse_document(self, document: Dict) -> Document:
        """Convert a raw document payload into a ``Document``.

        Args:
            document: Mapping that must contain the keys ``body``, ``title``,
                ``description``, ``created_at`` and ``updated_at``.

        Returns:
            A ``Document`` whose content is the cleaned ``body`` and whose
            metadata carries the title, description and timestamps.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        content = self.parse_document_body(document["body"])
        metadata = {
            "title": document["title"],
            "description": document["description"],
            "created_at": document["created_at"],
            "updated_at": document["updated_at"],
        }

        return Document(page_content=content, metadata=metadata)

    @staticmethod
    def parse_document_body(body: str) -> str:
        """Strip empty named anchors and ``<br>`` tags from a document body.

        Args:
            body: Raw document body text.

        Returns:
            ``body`` with every ``<a name="..."></a>`` tag and every
            ``<br>`` / ``<br/>`` / ``<br />`` tag removed.
        """
        # The anchor name is matched with [^"]* rather than a greedy ".*":
        # a greedy match would run from the first anchor on a line to the
        # last one and silently delete the real text between them.
        result = re.sub(r'<a name="[^"]*"></a>', "", body)
        result = re.sub(r"<br\s*/?>", "", result)

        return result

    def http_get(self, url: str) -> Dict:
        """Send an authenticated GET request and return the decoded JSON.

        Args:
            url: Absolute URL to request.

        Raises:
            requests.HTTPError: If the response status is 4xx or 5xx.
        """
        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # caller indefinitely; consider supplying one.
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()

        return response.json()

    def get_documents(self) -> Iterator[Document]:
        """Yield every document of every book owned by the token's user.

        Documents are fetched one at a time, so iteration issues one HTTP
        request per document in addition to the listing requests.
        """
        user_id = self.get_user_id()
        books = self.get_books(user_id)

        for book in books:
            book_id = book["id"]
            document_ids = self.get_document_ids(book_id)
            for document_id in document_ids:
                document = self.get_document(book_id, document_id)
                parsed_document = self.parse_document(document)
                yield parsed_document

    def load(self) -> List[Document]:
        """Load documents from `Yuque`."""
        return list(self.get_documents())
Copy to clipboard