Source code for langchain_experimental.graph_transformers.relik

import logging
from typing import Any, Dict, List, Optional, Sequence

from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain_core.documents import Document

# Node type used in place of the Relik "--NME--" placeholder label.
DEFAULT_NODE_TYPE = "Node"


class RelikGraphTransformer:
    """Transform documents into graph documents using Relik models.

    This class runs a Relik model over the text of each document, turns the
    extracted spans into nodes and the extracted triplets into relationships.
    Relationships are filtered based on a specified confidence threshold.

    For more details on the Relik library, visit their GitHub repository:
    https://github.com/SapienzaNLP/relik

    Args:
        model (str): The name of the pretrained Relik model to use.
            Default is "relik-ie/relik-relation-extraction-small".
        relationship_confidence_threshold (float): Triplets whose confidence
            is below this value are dropped. Default is 0.1.
        model_config (Optional[Dict[str, Any]]): Additional keyword arguments
            forwarded to ``relik.Relik.from_pretrained``. Default is None,
            which is treated as an empty configuration.
        ignore_self_loops (bool): Whether to ignore relationships where the
            source and target nodes have the same text. Default is True.
    """

    def __init__(
        self,
        model: str = "relik-ie/relik-relation-extraction-small",
        relationship_confidence_threshold: float = 0.1,
        model_config: Optional[Dict[str, Any]] = None,
        ignore_self_loops: bool = True,
    ) -> None:
        """Load the Relik model and store the filtering options.

        Raises:
            ImportError: If the ``relik`` package is not installed.
        """
        # Keep the try body limited to the import so that only a missing
        # package is reported as "not installed".
        try:
            import relik  # type: ignore
        except ImportError as err:
            raise ImportError(
                "Could not import relik python package. "
                "Please install it with `pip install relik`."
            ) from err

        # Remove default INFO logging
        logging.getLogger("relik").setLevel(logging.WARNING)

        # A None config means "no extra options"; a fresh dict is used on each
        # call instead of a shared mutable default.
        self.relik_model = relik.Relik.from_pretrained(model, **(model_config or {}))
        self.relationship_confidence_threshold = relationship_confidence_threshold
        self.ignore_self_loops = ignore_self_loops

    @staticmethod
    def _span_to_node(span: Any) -> Node:
        """Build a Node from a Relik span (an object with ``text`` and ``label``)."""
        label = span.label.strip()
        # The "--NME--" placeholder label is replaced by the default node type.
        node_type = DEFAULT_NODE_TYPE if label == "--NME--" else label
        return Node(id=span.text, type=node_type)

    def process_document(self, document: Document) -> GraphDocument:
        """Extract nodes and relationships from a single document.

        Args:
            document (Document): The document whose ``page_content`` is passed
                to the Relik model.

        Returns:
            GraphDocument: The extracted nodes and filtered relationships,
            with ``document`` attached as the source.
        """
        relik_out = self.relik_model(document.page_content)

        # Every extracted span becomes a node.
        nodes = [self._span_to_node(span) for span in relik_out.spans]

        relationships = []
        for triple in relik_out.triplets:
            # Ignore relationship if below confidence threshold
            if triple.confidence < self.relationship_confidence_threshold:
                continue
            # Ignore self loops
            if self.ignore_self_loops and triple.subject.text == triple.object.text:
                continue
            relationships.append(
                Relationship(
                    source=self._span_to_node(triple.subject),
                    target=self._span_to_node(triple.object),
                    # Normalise the label, e.g. "place of birth" -> "PLACE_OF_BIRTH".
                    type=triple.label.replace(" ", "_").upper(),
                )
            )

        return GraphDocument(nodes=nodes, relationships=relationships, source=document)

    def convert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """Convert a sequence of documents into graph documents.

        Args:
            documents (Sequence[Document]): The original documents.

        Returns:
            List[GraphDocument]: One graph document per input document, in
            the same order.
        """
        return [self.process_document(document) for document in documents]