Skip to content

Commit a414f2a

Browse files
authored
Make unstructured import send schema of previous chunks (neo4j#62)
1 parent 2d7a869 commit a414f2a

File tree

5 files changed

+60
-24
lines changed

5 files changed

+60
-24
lines changed

.env.example

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,3 @@ NEO4J_URL=neo4j+s://demo.neo4jlabs.com
33
NEO4J_USER=companies
44
NEO4J_PASS=companies
55
NEO4J_DATABASE=companies
6-
RUN_PARALLEL=FALSE

api/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,4 @@ retry==0.9.2
77
tiktoken==0.4.0
88
python-dotenv==1.0.0
99
websockets===11.0.3
10-
joblib===1.2.0
1110
gunicorn===20.1.0

api/src/components/unstructured_data_extractor.py

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from typing import List
44

55
from components.base_component import BaseComponent
6-
from joblib import Parallel, delayed
76
from llm.basellm import BaseLLM
87
from utils.unstructured_data_utils import (
98
nodesTextToListOfDict,
@@ -16,6 +15,7 @@ def generate_system_message_with_schema() -> str:
1615
You are a data scientist working for a company that is building a graph database. Your task is to extract information from data and convert it into a graph database.
1716
Provide a set of Nodes in the form [ENTITY, TYPE, PROPERTIES] and a set of relationships in the form [ENTITY1, RELATIONSHIP, ENTITY2, PROPERTIES].
1817
Pay attention to the type of the properties, if you can't find data for a property set it to null. Don't make anything up and don't add any extra data. If you can't find any data for a node or relationship don't add it.
18+
Only add nodes and relationships that are part of the schema.
1919
2020
Example:
2121
Schema: Nodes: [Person {age: integer, name: string}] Relationships: [Person, roommate, Person]
@@ -39,6 +39,22 @@ def generate_system_message() -> str:
3939
"""
4040

4141

42+
def generate_system_message_with_labels() -> str:
    """Return the system prompt used when extracting with a list of known types.

    The prompt instructs the model to emit nodes as
    [ENTITY_ID, TYPE, PROPERTIES] and relationships as
    [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES], reusing the
    supplied types where possible, and ends with a worked example.

    Returns:
        The static system message text.
    """
    # NOTE: the example's relationship start year must match the example
    # data ("since 2001"); a mismatching year would demonstrate inventing
    # values that are not present in the input.
    return """
You are a data scientist working for a company that is building a graph database. Your task is to extract information from data and convert it into a graph database.
Provide a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of relationships in the form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES].
It is important that the ENTITY_ID_1 and ENTITY_ID_2 exists as nodes with a matching ENTITY_ID. If you can't pair a relationship with a pair of nodes don't add it.
When you find a node or relationship you want to add try to create a generic TYPE for it that describes the entity you can also think of it as a label.
You will be given a list of types that you should try to use when creating the TYPE for a node. If you can't find a type that fits the node you can create a new one.

Example:
Data: Alice lawyer and is 25 years old and Bob is her roommate since 2001. Bob works as a journalist. Alice owns a the webpage www.alice.com and Bob owns the webpage www.bob.com.
Types: ["Person", "Webpage"]
Nodes: ["alice", "Person", {"age": 25, "occupation": "lawyer", "name":"Alice"}], ["bob", "Person", {"occupation": "journalist", "name": "Bob"}], ["alice.com", "Webpage", {"url": "www.alice.com"}], ["bob.com", "Webpage", {"url": "www.bob.com"}]
Relationships: ["alice", "roommate", "bob", {"start": 2001}], ["alice", "owns", "alice.com", {}], ["bob", "owns", "bob.com", {}]
"""
56+
57+
4258
def generate_prompt(data) -> str:
    """Build the user prompt for schema-less extraction.

    Args:
        data: The text chunk to embed in the prompt.

    Returns:
        A string consisting of a newline followed by ``Data: <data>``.
    """
    template = "\nData: {}"
    return template.format(data)
@@ -50,6 +66,12 @@ def generate_prompt_with_schema(data, schema) -> str:
5066
Data: {data}"""
5167

5268

69+
def generate_prompt_with_labels(data, labels) -> str:
    """Build the user prompt that pairs a text chunk with known types.

    Args:
        data: The text chunk to embed in the prompt.
        labels: The types to offer to the model; rendered with its
            default string formatting.

    Returns:
        A string with a leading newline, a ``Data:`` line and a
        ``Types:`` line.
    """
    template = "\nData: {}\nTypes: {}"
    return template.format(data, labels)
73+
74+
5375
def splitString(string, max_length) -> List[str]:
    """Cut ``string`` into consecutive pieces of at most ``max_length`` items.

    Args:
        string: The sequence to split.
        max_length: Maximum length of each piece; used as the step of
            the underlying ``range``.

    Returns:
        The pieces in order; the last one may be shorter than
        ``max_length``. An empty input yields an empty list.
    """
    pieces = []
    for start in range(0, len(string), max_length):
        stop = start + max_length
        pieces.append(string[start:stop])
    return pieces
5577

@@ -83,13 +105,10 @@ def getNodesAndRelationshipsFromResult(result):
83105
relationships = []
84106
for row in result:
85107
parsing = re.match(regex, row, flags=re.S)
86-
print("parsing", parsing)
87108
if parsing == None:
88109
continue
89110
rawNodes = str(parsing.group(1))
90-
print("rawNodes", rawNodes)
91111
rawRelationships = parsing.group(2)
92-
print("rawRelationships", rawRelationships)
93112
nodes.extend(re.findall(internalRegex, rawNodes))
94113
relationships.extend(re.findall(internalRegex, rawRelationships))
95114

@@ -116,6 +135,15 @@ def process(self, chunk):
116135
output = self.llm.generate(messages)
117136
return output
118137

138+
def process_with_labels(self, chunk, labels):
    """Run one text chunk through the LLM together with a list of types.

    Args:
        chunk: The piece of input text to extract from.
        labels: Types to offer to the model in the user prompt
            (presumably the labels collected from earlier chunks;
            verify against the caller).

    Returns:
        Whatever ``self.llm.generate`` returns for the built messages.
    """
    messages = [
        # The user prompt carries a "Types:" list, so it must be paired
        # with the labels system message; the schema system message
        # describes a "Schema:" input that is never supplied here.
        {"role": "system", "content": generate_system_message_with_labels()},
        {"role": "user", "content": generate_prompt_with_labels(chunk, labels)},
    ]
    print(messages)
    output = self.llm.generate(messages)
    return output
146+
119147
def run(self, data: str) -> List[str]:
120148
system_message = generate_system_message()
121149
prompt_string = generate_prompt("")
@@ -125,22 +153,20 @@ def run(self, data: str) -> List[str]:
125153
chunked_data = splitStringToFitTokenSpace(
126154
llm=self.llm, string=data, token_use_per_string=token_usage_per_prompt
127155
)
128-
print("starting multiple procceesing")
129-
results = []
130-
multi_processing = False
131156

132-
if os.environ.get("RUN_PARALLEL", "False") == "True":
133-
multi_processing = True
157+
results = []
158+
labels = set()
159+
print("Starting chunked processing")
160+
for chunk in chunked_data:
161+
proceededChunk = self.process_with_labels(chunk, list(labels))
162+
print("proceededChunk", proceededChunk)
163+
chunkResult = getNodesAndRelationshipsFromResult([proceededChunk])
164+
print("chunkResult", chunkResult)
165+
newLabels = [node["label"] for node in chunkResult["nodes"]]
166+
print("newLabels", newLabels)
167+
results.append(proceededChunk)
168+
labels.update(newLabels)
134169

135-
if multi_processing:
136-
results = Parallel(n_jobs=10)(
137-
delayed(self.process)(chunk) for chunk in chunked_data
138-
)
139-
else:
140-
for chunk in chunked_data:
141-
results.append(self.process(chunk))
142-
print("finished multiple procceesing")
143-
print(results)
144170
return getNodesAndRelationshipsFromResult(results)
145171

146172

@@ -164,7 +190,10 @@ def run(self, data: str, schema: str) -> List[str]:
164190
llm=self.llm, string=data, token_use_per_string=token_usage_per_prompt
165191
)
166192
result = []
193+
print("Starting chunked processing")
194+
167195
for chunk in chunked_data:
196+
print("prompt", generate_prompt_with_schema(chunk, schema))
168197
messages = [
169198
{
170199
"role": "system",

api/src/utils/unstructured_data_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ def nodesTextToListOfDict(nodes):
1010
result = []
1111
for node in nodes:
1212
nodeList = node.split(",")
13+
if len(nodeList) < 2:
14+
continue
15+
1316
name = nodeList[0].strip().replace('"', "")
1417
label = nodeList[1].strip().replace('"', "")
1518
properties = re.search(jsonRegex, node)
@@ -29,8 +32,6 @@ def nodesTextToListOfDict(nodes):
2932
def relationshipTextToListOfDict(relationships):
3033
result = []
3134
for relation in relationships:
32-
print("relation", relation)
33-
3435
relationList = relation.split(",")
3536
if len(relation) < 3:
3637
continue

ui/src/unstructured-import/utils/file-utils.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,11 +190,19 @@ export const saveImportResultAsNeo4jImport = (importResult: ImportResult) => {
190190

191191
const relationshipsCsv = relationshipsToCsv(relationships);
192192
console.log(relationships[0].start);
193+
194+
const fromId = labels.indexOf(startLabel);
195+
const toId = labels.indexOf(endLabel);
196+
197+
if (fromId === -1 || toId === -1) {
198+
return;
199+
}
200+
193201
modelFile.graph.relationships.push({
194202
id: `n${index}`,
195203
type: relationship,
196-
fromId: `n${labels.indexOf(startLabel)}`,
197-
toId: `n${labels.indexOf(endLabel)}`,
204+
fromId: `n${fromId}`,
205+
toId: `n${toId}`,
198206
});
199207

200208
zip.file(`relationships-${relationship}.csv`, relationshipsCsv);

0 commit comments

Comments
 (0)