Source code for langchain_community.graphs.neo4j_graph
from hashlib import md5
from typing import Any, Dict, List, Optional

from langchain_core._api.deprecation import deprecated
from langchain_core.utils import get_from_dict_or_env

from langchain_community.graphs.graph_document import GraphDocument
from langchain_community.graphs.graph_store import GraphStore

# Label merged onto imported nodes when ``baseEntityLabel=True``; it is also
# excluded from the node schema queries in ``refresh_schema``.
BASE_ENTITY_LABEL = "__Entity__"
# Labels / relationship types passed to the schema queries as
# ``$EXCLUDED_LABELS`` so that they never show up in the reported schema.
EXCLUDED_LABELS = ["_Bloom_Perspective_", "_Bloom_Scene_"]
EXCLUDED_RELS = ["_Bloom_HAS_SCENE_"]
# Labels/types with fewer elements than this are scanned exhaustively when the
# enhanced schema is built; larger ones are sampled.
EXHAUSTIVE_SEARCH_LIMIT = 10000
# Lists with at least this many elements are dropped by ``value_sanitize``;
# LIST properties whose minimum size exceeds it are skipped in the schema.
LIST_LIMIT = 128
# Threshold for returning all available prop values in graph schema
DISTINCT_VALUE_LIMIT = 10

# Node properties per label, excluding the labels in $EXCLUDED_LABELS.
node_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE NOT type = "RELATIONSHIP" AND elementType = "node"
    AND NOT label IN $EXCLUDED_LABELS
WITH label AS nodeLabels, collect({property:property, type:type}) AS properties
RETURN {labels: nodeLabels, properties: properties} AS output
"""

# Relationship properties per relationship type, excluding $EXCLUDED_LABELS.
rel_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE NOT type = "RELATIONSHIP" AND elementType = "relationship"
    AND NOT label in $EXCLUDED_LABELS
WITH label AS nodeLabels, collect({property:property, type:type}) AS properties
RETURN {type: nodeLabels, properties: properties} AS output
"""

# (start label)-[type]->(end label) triples between non-excluded labels.
rel_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE type = "RELATIONSHIP" AND elementType = "node"
UNWIND other AS other_node
WITH * WHERE NOT label IN $EXCLUDED_LABELS
    AND NOT other_node IN $EXCLUDED_LABELS
RETURN {start: label, type: property, end: toString(other_node)} AS output
"""

# Query prefix that merges the source ``Document`` node (bound to ``d``) so
# that the node import query can attach MENTIONS relationships to it.
include_docs_query = (
    "MERGE (d:Document {id:$document.metadata.id}) "
    "SET d.text = $document.page_content "
    "SET d += $document.metadata "
    "WITH d "
)
@deprecated(
    since="0.3.8",
    removal="1.0",
    alternative_import="langchain_neo4j.graphs.neo4j_graph.clean_string_values",
)
def clean_string_values(text: str) -> str:
    """Clean string values for schema.

    Every newline and carriage-return character in the input is replaced by
    a single space so the value fits on one line of the schema description.

    Args:
        text (str): The input text to clean.

    Returns:
        str: The cleaned text.
    """
    # One pass over the string; each mapped code point becomes a space.
    line_breaks_to_space = {ord("\n"): " ", ord("\r"): " "}
    return text.translate(line_breaks_to_space)
@deprecated(
    since="0.3.8",
    removal="1.0",
    alternative_import="langchain_neo4j.graphs.neo4j_graph.value_sanitize",
)
def value_sanitize(d: Any) -> Any:
    """Sanitize the input dictionary or list.

    Removes embedding-like values, i.e. lists with ``LIST_LIMIT`` or more
    elements, which are mostly irrelevant for generating answers in an LLM
    context. Left in the results, such properties occupy significant context
    space and detract from the LLM's performance by introducing unnecessary
    noise and cost.

    Args:
        d (Any): The input dictionary or list to sanitize.

    Returns:
        Any: For a dict, a new dict without the keys whose value was an
        oversized list. For a list, a new list of sanitized items (items that
        sanitize to ``None`` are dropped), or ``None`` when the list itself is
        oversized. Any other value is returned unchanged.
    """
    if isinstance(d, dict):
        new_dict = {}
        for key, value in d.items():
            if isinstance(value, (dict, list)):
                sanitized_value = value_sanitize(value)
                # ``None`` marks an oversized list: leave the key out.
                if sanitized_value is not None:
                    new_dict[key] = sanitized_value
            else:
                new_dict[key] = value
        return new_dict

    if isinstance(d, list):
        if len(d) >= LIST_LIMIT:
            return None
        sanitized_items = []
        for item in d:
            # Sanitize each item exactly once; calling the function both in
            # the filter and in the result doubles the work per nesting level.
            sanitized_item = value_sanitize(item)
            if sanitized_item is not None:
                sanitized_items.append(sanitized_item)
        return sanitized_items

    return d
@deprecated(
    since="0.3.8",
    removal="1.0",
    alternative_import="langchain_neo4j.graphs.neo4j_graph._get_node_import_query",
)
def _get_node_import_query(baseEntityLabel: bool, include_source: bool) -> str:
    """Build the Cypher statement that imports the nodes of one document.

    The statement expects a ``$data`` parameter (one row per node with ``id``,
    ``type`` and ``properties``) and, when ``include_source`` is set, a
    ``$document`` parameter.

    Args:
        baseEntityLabel: If True, nodes are merged on ``BASE_ENTITY_LABEL`` and
            ``id`` and the row's type is added as an extra label; otherwise
            nodes are merged on the row's type via ``apoc.merge.node``.
        include_source: If True, the source document is merged first and a
            ``MENTIONS`` relationship is created from it to every node.

    Returns:
        The Cypher query string.
    """
    docs_prefix = include_docs_query if include_source else ""
    if baseEntityLabel:
        mentions = "MERGE (d)-[:MENTIONS]->(source) " if include_source else ""
        return (
            f"{docs_prefix}"
            "UNWIND $data AS row "
            f"MERGE (source:`{BASE_ENTITY_LABEL}` {{id: row.id}}) "
            "SET source += row.properties "
            f"{mentions}"
            "WITH source, row "
            "CALL apoc.create.addLabels( source, [row.type] ) YIELD node "
            "RETURN distinct 'done' AS result"
        )
    mentions = "MERGE (d)-[:MENTIONS]->(node) " if include_source else ""
    return (
        f"{docs_prefix}"
        "UNWIND $data AS row "
        "CALL apoc.merge.node([row.type], {id: row.id}, "
        "row.properties, {}) YIELD node "
        f"{mentions}"
        "RETURN distinct 'done' AS result"
    )


@deprecated(
    since="0.3.8",
    removal="1.0",
    alternative_import="langchain_neo4j.graphs.neo4j_graph._get_rel_import_query",
)
def _get_rel_import_query(baseEntityLabel: bool) -> str:
    """Build the Cypher statement that imports the relationships of a document.

    The statement expects a ``$data`` parameter with one row per relationship
    (``source``, ``target``, ``type``, ``properties`` and, when
    ``baseEntityLabel`` is False, ``source_label`` / ``target_label``).

    Args:
        baseEntityLabel: If True, both endpoints are merged on
            ``BASE_ENTITY_LABEL`` and ``id``; otherwise they are merged on
            their own label via ``apoc.merge.node``.

    Returns:
        The Cypher query string.
    """
    if baseEntityLabel:
        return (
            "UNWIND $data AS row "
            f"MERGE (source:`{BASE_ENTITY_LABEL}` {{id: row.source}}) "
            f"MERGE (target:`{BASE_ENTITY_LABEL}` {{id: row.target}}) "
            "WITH source, target, row "
            "CALL apoc.merge.relationship(source, row.type, "
            "{}, row.properties, target) YIELD rel "
            "RETURN distinct 'done'"
        )
    return (
        "UNWIND $data AS row "
        "CALL apoc.merge.node([row.source_label], {id: row.source},"
        "{}, {}) YIELD node as source "
        "CALL apoc.merge.node([row.target_label], {id: row.target},"
        "{}, {}) YIELD node as target "
        "CALL apoc.merge.relationship(source, row.type, "
        "{}, row.properties, target) YIELD rel "
        "RETURN distinct 'done'"
    )


def _enhanced_prop_example(prop: Dict[str, Any]) -> Optional[str]:
    """Return the example/range text for one enhanced-schema property.

    Returns ``None`` when the property must be left out of the schema (LIST
    properties without a size or whose minimum size exceeds ``LIST_LIMIT``),
    and an empty string when there is nothing to show.
    """
    prop_type = prop["type"]
    # Enrichment may have been skipped for this property, so every enhanced
    # key is read defensively.
    values = prop.get("values")

    if prop_type == "STRING":
        if not values:
            return ""
        if prop.get("distinct_count", DISTINCT_VALUE_LIMIT + 1) > DISTINCT_VALUE_LIMIT:
            return f'Example: "{clean_string_values(values[0])}"'
        # Few enough distinct values: list them all.
        return f"Available options: {[clean_string_values(el) for el in values]}"

    if prop_type in ("INTEGER", "FLOAT", "DATE", "DATE_TIME", "LOCAL_DATE_TIME"):
        # ``is not None`` rather than truthiness so a minimum of 0 still counts.
        if prop.get("min") is not None:
            return f"Min: {prop['min']}, Max: {prop['max']}"
        return f'Example: "{values[0]}"' if values else ""

    if prop_type == "LIST":
        # Skip embeddings
        if not prop.get("min_size") or prop["min_size"] > LIST_LIMIT:
            return None
        return f"Min Size: {prop['min_size']}, Max Size: {prop['max_size']}"

    return ""


@deprecated(
    since="0.3.8",
    removal="1.0",
    alternative_import="langchain_neo4j.graphs.neo4j_graph._format_schema",
)
def _format_schema(schema: Dict, is_enhanced: bool) -> str:
    """Render the structured schema as the text description given to the LLM.

    Args:
        schema: Structured schema with the keys ``node_props``, ``rel_props``
            and ``relationships``.
        is_enhanced: If True, every property is listed on its own line together
            with example values or min/max information; otherwise a compact
            ``Label {prop: TYPE, ...}`` form is used.

    Returns:
        The schema description: node properties, relationship properties and
        relationship patterns, separated by newlines.
    """
    formatted_node_props = []
    formatted_rel_props = []
    if is_enhanced:
        # Enhanced formatting for nodes
        for node_type, properties in schema["node_props"].items():
            formatted_node_props.append(f"- **{node_type}**")
            for prop in properties:
                example = _enhanced_prop_example(prop)
                if example is None:
                    continue
                formatted_node_props.append(
                    f"  - `{prop['property']}`: {prop['type']} {example}"
                )

        # Enhanced formatting for relationships
        for rel_type, properties in schema["rel_props"].items():
            formatted_rel_props.append(f"- **{rel_type}**")
            for prop in properties:
                example = _enhanced_prop_example(prop)
                if example is None:
                    continue
                formatted_rel_props.append(
                    f"  - `{prop['property']}: {prop['type']}` {example}"
                )
    else:
        # Format node properties
        for label, props in schema["node_props"].items():
            props_str = ", ".join(
                [f"{prop['property']}: {prop['type']}" for prop in props]
            )
            formatted_node_props.append(f"{label} {{{props_str}}}")

        # Format relationship properties using structured_schema
        for rel_type, props in schema["rel_props"].items():
            props_str = ", ".join(
                [f"{prop['property']}: {prop['type']}" for prop in props]
            )
            formatted_rel_props.append(f"{rel_type} {{{props_str}}}")

    # Format relationships
    formatted_rels = [
        f"(:{el['start']})-[:{el['type']}]->(:{el['end']})"
        for el in schema["relationships"]
    ]

    return "\n".join(
        [
            "Node properties:",
            "\n".join(formatted_node_props),
            "Relationship properties:",
            "\n".join(formatted_rel_props),
            "The relationships:",
            "\n".join(formatted_rels),
        ]
    )


@deprecated(
    since="0.3.8",
    removal="1.0",
    alternative_import="langchain_neo4j.graphs.neo4j_graph._remove_backticks",
)
def _remove_backticks(text: str) -> str:
    """Return ``text`` with every backtick removed.

    Labels and relationship types are passed through this before being sent
    to the import queries.
    """
    return text.replace("`", "")
@deprecated(
    since="0.3.8",
    removal="1.0",
    alternative_import="langchain_neo4j.Neo4jGraph",
)
class Neo4jGraph(GraphStore):
    """Neo4j database wrapper for various graph operations.

    Parameters:
    url (Optional[str]): The URL of the Neo4j database server.
    username (Optional[str]): The username for database authentication.
    password (Optional[str]): The password for database authentication.
    database (str): The name of the database to connect to. Default is 'neo4j'.
    timeout (Optional[float]): The timeout for transactions in seconds.
            Useful for terminating long-running queries.
            By default, there is no timeout set.
    sanitize (bool): A flag to indicate whether to remove lists with
            128 or more elements from results. Useful for removing
            embedding-like properties from database responses. Default is False.
    refresh_schema (bool): A flag whether to refresh schema information
            at initialization. Default is True.
    enhanced_schema (bool): A flag whether to scan the database for
            example values and use them in the graph schema. Default is False.
    driver_config (Dict): Configuration passed to Neo4j Driver.

    *Security note*: Make sure that the database connection uses credentials
        that are narrowly-scoped to only include necessary permissions.
        Failure to do so may result in data corruption or loss, since the calling
        code may attempt commands that would result in deletion, mutation
        of data if appropriately prompted or reading sensitive data if such
        data is present in the database.
        The best way to guard against such negative outcomes is to (as appropriate)
        limit the permissions granted to the credentials used with this tool.

        See https://python.langchain.com/docs/security for more information.
    """

    def __init__(
        self,
        url: Optional[str] = None,
        username: Optional[str] = None,
        password: Optional[str] = None,
        database: Optional[str] = None,
        timeout: Optional[float] = None,
        sanitize: bool = False,
        refresh_schema: bool = True,
        *,
        driver_config: Optional[Dict] = None,
        enhanced_schema: bool = False,
    ) -> None:
        """Create a new Neo4j graph wrapper instance.

        Connection settings that are not passed explicitly are read from the
        ``NEO4J_URI``, ``NEO4J_USERNAME``, ``NEO4J_PASSWORD`` and
        ``NEO4J_DATABASE`` environment variables.

        Raises:
            ImportError: If the ``neo4j`` package is not installed.
            ValueError: If the database cannot be reached, the credentials are
                rejected, or the APOC procedures needed for the schema are not
                available.
        """
        try:
            import neo4j
        except ImportError as e:
            raise ImportError(
                "Could not import neo4j python package. "
                "Please install it with `pip install neo4j`."
            ) from e

        url = get_from_dict_or_env({"url": url}, "url", "NEO4J_URI")
        # if username and password are "", assume Neo4j auth is disabled
        if username == "" and password == "":
            auth = None
        else:
            username = get_from_dict_or_env(
                {"username": username},
                "username",
                "NEO4J_USERNAME",
            )
            password = get_from_dict_or_env(
                {"password": password},
                "password",
                "NEO4J_PASSWORD",
            )
            auth = (username, password)
        database = get_from_dict_or_env(
            {"database": database}, "database", "NEO4J_DATABASE", "neo4j"
        )

        self._driver = neo4j.GraphDatabase.driver(
            url, auth=auth, **(driver_config or {})
        )
        self._database = database
        self.timeout = timeout
        self.sanitize = sanitize
        self._enhanced_schema = enhanced_schema
        self.schema: str = ""
        self.structured_schema: Dict[str, Any] = {}

        # Verify connection. The driver is closed on failure so that a failed
        # construction does not leave its resources open.
        try:
            self._driver.verify_connectivity()
        except neo4j.exceptions.ServiceUnavailable as e:
            self._driver.close()
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the url is correct"
            ) from e
        except neo4j.exceptions.AuthError as e:
            self._driver.close()
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the username and password are correct"
            ) from e

        # Set schema
        if refresh_schema:
            try:
                self.refresh_schema()
            except neo4j.exceptions.ClientError as e:
                if e.code == "Neo.ClientError.Procedure.ProcedureNotFound":
                    raise ValueError(
                        "Could not use APOC procedures. "
                        "Please ensure the APOC plugin is installed in Neo4j and that "
                        "'apoc.meta.data()' is allowed in Neo4j configuration "
                    ) from e
                raise

    @property
    def get_schema(self) -> str:
        """Returns the schema of the Graph"""
        return self.schema

    @property
    def get_structured_schema(self) -> Dict[str, Any]:
        """Returns the structured schema of the Graph"""
        return self.structured_schema

    def query(
        self,
        query: str,
        params: Optional[dict] = None,
    ) -> List[Dict[str, Any]]:
        """Query Neo4j database.

        The query is first run through ``driver.execute_query``. If the server
        rejects it because it has to run in an implicit transaction, it is
        retried through a plain session.

        Args:
            query (str): The Cypher query to execute.
            params (dict): The parameters to pass to the query.
                Defaults to no parameters.

        Returns:
            List[Dict[str, Any]]: The list of dictionaries containing the query
            results, sanitized with ``value_sanitize`` when ``self.sanitize``
            is set.
        """
        from neo4j import Query
        from neo4j.exceptions import Neo4jError

        # A fresh dict per call instead of a shared mutable default argument.
        if params is None:
            params = {}

        try:
            data, _, _ = self._driver.execute_query(
                Query(text=query, timeout=self.timeout),
                database_=self._database,
                parameters_=params,
            )
            return self._records_to_json(data)
        except Neo4jError as e:
            if not self._needs_implicit_transaction(e):
                raise

        # fallback to allow implicit transactions
        with self._driver.session(database=self._database) as session:
            result = session.run(Query(text=query, timeout=self.timeout), params)
            return self._records_to_json(result)

    def _records_to_json(self, records: Any) -> List[Dict[str, Any]]:
        """Convert driver records to dicts, sanitizing them if configured."""
        json_data = [r.data() for r in records]
        if self.sanitize:
            json_data = [value_sanitize(el) for el in json_data]
        return json_data

    @staticmethod
    def _needs_implicit_transaction(error: Any) -> bool:
        """Tell whether ``error`` means the query needs an implicit transaction."""
        # The message can be missing; treat that as "no match" instead of
        # failing on ``in None``.
        message = error.message or ""
        is_call_in_transaction_error = (
            error.code
            in (
                "Neo.DatabaseError.Statement.ExecutionFailed",
                "Neo.DatabaseError.Transaction.TransactionStartFailed",
            )
            and "in an implicit transaction" in message
        )
        is_periodic_commit_error = (
            error.code == "Neo.ClientError.Statement.SemanticError"
            and (
                "in an open transaction is not possible" in message
                or "tried to execute in an explicit transaction" in message
            )
        )
        return is_call_in_transaction_error or is_periodic_commit_error

    def refresh_schema(self) -> None:
        """
        Refreshes the Neo4j graph schema information.

        Rebuilds ``self.structured_schema`` (node properties, relationship
        properties, relationship patterns, constraints and indexes) and the
        text rendering in ``self.schema``. With ``enhanced_schema`` enabled the
        properties are additionally enriched with example values and ranges.
        """
        from neo4j.exceptions import ClientError

        excluded_node_labels = EXCLUDED_LABELS + [BASE_ENTITY_LABEL]
        node_properties = [
            el["output"]
            for el in self.query(
                node_properties_query,
                params={"EXCLUDED_LABELS": excluded_node_labels},
            )
        ]
        rel_properties = [
            el["output"]
            for el in self.query(
                rel_properties_query, params={"EXCLUDED_LABELS": EXCLUDED_RELS}
            )
        ]
        relationships = [
            el["output"]
            for el in self.query(
                rel_query,
                params={"EXCLUDED_LABELS": excluded_node_labels},
            )
        ]

        # Get constraints & indexes
        try:
            constraint = self.query("SHOW CONSTRAINTS")
            index = self.query(
                "CALL apoc.schema.nodes() YIELD label, properties, type, size, "
                "valuesSelectivity WHERE type = 'RANGE' RETURN *, "
                "size * valuesSelectivity as distinctValues"
            )
        except ClientError:
            # Read-only user might not have access to schema information
            constraint = []
            index = []

        self.structured_schema = {
            "node_props": {el["labels"]: el["properties"] for el in node_properties},
            "rel_props": {el["type"]: el["properties"] for el in rel_properties},
            "relationships": relationships,
            "metadata": {"constraint": constraint, "index": index},
        }

        if self._enhanced_schema:
            schema_counts = self.query(
                "CALL apoc.meta.graphSample() YIELD nodes, relationships "
                "RETURN nodes, [rel in relationships | {name:apoc.any.property"
                "(rel, 'type'), count: apoc.any.property(rel, 'count')}]"
                " AS relationships"
            )
            # Update node info
            for node in schema_counts[0]["nodes"]:
                # Skip bloom labels
                if node["name"] in EXCLUDED_LABELS:
                    continue
                self._enrich_properties(
                    node["name"],
                    self.structured_schema["node_props"].get(node["name"]),
                    node["count"],
                    is_relationship=False,
                )
            # Update rel info
            for rel in schema_counts[0]["relationships"]:
                # Skip bloom labels
                if rel["name"] in EXCLUDED_RELS:
                    continue
                self._enrich_properties(
                    rel["name"],
                    self.structured_schema["rel_props"].get(rel["name"]),
                    rel["count"],
                    is_relationship=True,
                )

        self.schema = _format_schema(self.structured_schema, self._enhanced_schema)

    def _enrich_properties(
        self,
        name: str,
        props: Optional[List[Dict[str, Any]]],
        count: int,
        is_relationship: bool,
    ) -> None:
        """Add example values / ranges in place to the properties of one type."""
        from neo4j.exceptions import CypherTypeError

        if not props:
            # The label / relationship type has no properties
            return
        enhanced_cypher = self._enhanced_schema_cypher(
            name,
            props,
            count < EXHAUSTIVE_SEARCH_LIMIT,
            is_relationship=is_relationship,
        )
        # Due to schema-flexible nature of neo4j errors can happen
        try:
            enhanced_info = self.query(enhanced_cypher)[0]["output"]
        except CypherTypeError:
            return
        for prop in props:
            if prop["property"] in enhanced_info:
                prop.update(enhanced_info[prop["property"]])

    def add_graph_documents(
        self,
        graph_documents: List[GraphDocument],
        include_source: bool = False,
        baseEntityLabel: bool = False,
    ) -> None:
        """
        This method constructs nodes and relationships in the graph based on the
        provided GraphDocument objects.

        Parameters:
        - graph_documents (List[GraphDocument]): A list of GraphDocument objects
        that contain the nodes and relationships to be added to the graph. Each
        GraphDocument should encapsulate the structure of part of the graph,
        including nodes, relationships, and the source document information.
        - include_source (bool, optional): If True, stores the source document
        and links it to nodes in the graph using the MENTIONS relationship.
        This is useful for tracing back the origin of data. Merges source
        documents based on the `id` property from the source document metadata
        if available; otherwise it calculates the MD5 hash of `page_content`
        for merging process. Defaults to False.
        - baseEntityLabel (bool, optional): If True, each newly created node
        gets a secondary __Entity__ label, which is indexed and improves import
        speed and performance. Defaults to False.
        """
        if baseEntityLabel:  # Check if constraint already exists
            known_constraints = self.structured_schema.get("metadata", {}).get(
                "constraint", []
            )
            constraint_exists = any(
                el["labelsOrTypes"] == [BASE_ENTITY_LABEL]
                and el["properties"] == ["id"]
                for el in known_constraints
            )
            if not constraint_exists:
                # Create constraint
                self.query(
                    f"CREATE CONSTRAINT IF NOT EXISTS FOR (b:{BASE_ENTITY_LABEL}) "
                    "REQUIRE b.id IS UNIQUE;"
                )
                self.refresh_schema()  # Refresh constraint information

        node_import_query = _get_node_import_query(baseEntityLabel, include_source)
        rel_import_query = _get_rel_import_query(baseEntityLabel)
        for document in graph_documents:
            # The document node is merged on its id; derive one from the
            # content when the metadata does not provide it.
            if not document.source.metadata.get("id"):
                document.source.metadata["id"] = md5(
                    document.source.page_content.encode("utf-8")
                ).hexdigest()

            # Remove backticks from node types
            for node in document.nodes:
                node.type = _remove_backticks(node.type)
            # Import nodes
            self.query(
                node_import_query,
                {
                    "data": [el.__dict__ for el in document.nodes],
                    "document": document.source.__dict__,
                },
            )
            # Import relationships
            self.query(
                rel_import_query,
                {
                    "data": [
                        {
                            "source": el.source.id,
                            "source_label": _remove_backticks(el.source.type),
                            "target": el.target.id,
                            "target_label": _remove_backticks(el.target.type),
                            "type": _remove_backticks(
                                el.type.replace(" ", "_").upper()
                            ),
                            "properties": el.properties,
                        }
                        for el in document.relationships
                    ]
                },
            )

    def _range_index_for(
        self, label_or_type: str, prop_name: str
    ) -> List[Dict[str, Any]]:
        """Return the RANGE index entries covering exactly this one property."""
        indexes = self.structured_schema.get("metadata", {}).get("index", [])
        return [
            el
            for el in indexes
            if el["label"] == label_or_type
            and el["properties"] == [prop_name]
            and el["type"] == "RANGE"
        ]

    def _enhanced_schema_cypher(
        self,
        label_or_type: str,
        properties: List[Dict[str, Any]],
        exhaustive: bool,
        is_relationship: bool = False,
    ) -> str:
        """Build the Cypher query that collects enhanced-schema details.

        The query returns a single ``output`` map keyed by property name.
        STRING properties get ``values`` (and ``distinct_count`` where it is
        known), numeric and temporal properties get ``min`` / ``max`` /
        ``distinct_count`` or sampled ``values``, and LIST properties get
        ``min_size`` / ``max_size``. Properties of any other type are left out.

        Args:
            label_or_type: Node label or relationship type to inspect.
            properties: Property descriptors with ``property`` and ``type``.
            exhaustive: If True, aggregate over every element; otherwise only
                over a sample of 5 elements, using RANGE indexes where
                available.
            is_relationship: Whether ``label_or_type`` is a relationship type.

        Returns:
            The Cypher query string.
        """
        if is_relationship:
            match_clause = f"MATCH ()-[n:`{label_or_type}`]->()"
        else:
            match_clause = f"MATCH (n:`{label_or_type}`)"
        if not exhaustive:
            # Just sample 5 random nodes
            match_clause += " WITH n LIMIT 5"

        numeric_or_temporal = (
            "INTEGER",
            "FLOAT",
            "DATE",
            "DATE_TIME",
            "LOCAL_DATE_TIME",
        )
        with_clauses: List[str] = []
        output_dict: Dict[str, str] = {}

        for prop in properties:
            prop_name = prop["property"]
            prop_type = prop["type"]
            # Check if indexed property, we can still do exhaustive
            prop_index = (
                [] if exhaustive else self._range_index_for(label_or_type, prop_name)
            )

            if prop_type == "STRING":
                index_size = prop_index[0].get("size") if prop_index else None
                index_distinct = (
                    prop_index[0].get("distinctValues") if prop_index else None
                )
                if (
                    index_size
                    and index_size > 0
                    and index_distinct is not None
                    and index_distinct <= DISTINCT_VALUE_LIMIT
                ):
                    # Few distinct values and an index: read them from the
                    # index. Label and key go in as parameters rather than
                    # being spliced into the query text.
                    distinct_values = self.query(
                        "CALL apoc.schema.properties.distinct($label, $key) "
                        "YIELD value",
                        {"label": label_or_type, "key": prop_name},
                    )[0]["value"]
                    fragment = (
                        f"values: {distinct_values},"
                        f" distinct_count: {len(distinct_values)}"
                    )
                else:
                    with_clauses.append(
                        f"collect(distinct substring(toString(n.`{prop_name}`)"
                        f", 0, 50)) AS `{prop_name}_values`"
                    )
                    if exhaustive:
                        fragment = (
                            f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
                            f" distinct_count: size(`{prop_name}_values`)"
                        )
                    else:
                        fragment = f"values: `{prop_name}_values`"
            elif prop_type in numeric_or_temporal:
                if exhaustive or prop_index:
                    with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
                    with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
                    with_clauses.append(
                        f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
                    )
                    fragment = (
                        f"min: toString(`{prop_name}_min`), "
                        f"max: toString(`{prop_name}_max`), "
                        f"distinct_count: `{prop_name}_distinct`"
                    )
                else:
                    with_clauses.append(
                        f"collect(distinct toString(n.`{prop_name}`)) "
                        f"AS `{prop_name}_values`"
                    )
                    fragment = f"values: `{prop_name}_values`"
            elif prop_type == "LIST":
                with_clauses.append(
                    f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
                    f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
                )
                fragment = (
                    f"min_size: `{prop_name}_size_min`, "
                    f"max_size: `{prop_name}_size_max`"
                )
            else:
                # BOOLEAN, POINT, DURATION and any other type get no details.
                continue

            output_dict[prop_name] = "{" + fragment + "}"

        return_clause = (
            "RETURN {"
            + ", ".join(f"`{k}`: {v}" for k, v in output_dict.items())
            + "} AS output"
        )
        if not with_clauses:
            # Nothing to aggregate: a bare "WITH" would be invalid Cypher, and
            # the map holds only literals, so it can be returned without MATCH.
            return return_clause

        with_clause = "WITH " + ",\n     ".join(with_clauses)
        # Combine all parts of the Cypher query
        return "\n".join([match_clause, with_clause, return_clause])