Skip to content

Commit 05611df

Browse files
authored
perf: vector generate (labring#1748)
1 parent d0085a2 commit 05611df

File tree

2 files changed

+108
-108
lines changed

2 files changed

+108
-108
lines changed

projects/app/src/pages/api/core/dataset/training/rebuildEmbedding.ts

Lines changed: 49 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -81,50 +81,61 @@ async function handler(
8181
});
8282

8383
// seed the training queue with up to vectorMaxProcess * 2 dataset.data entries (20 when vectorMaxProcess is unset)
84-
const arr = new Array(10).fill(0);
84+
const max = global.systemEnv?.vectorMaxProcess || 10;
85+
const arr = new Array(max * 2).fill(0);
86+
8587
for await (const _ of arr) {
86-
await mongoSessionRun(async (session) => {
87-
const data = await MongoDatasetData.findOneAndUpdate(
88-
{
89-
teamId,
90-
datasetId,
91-
rebuilding: true
92-
},
93-
{
94-
$unset: {
95-
rebuilding: null
88+
try {
89+
const hasNext = await mongoSessionRun(async (session) => {
90+
// get next dataset.data
91+
const data = await MongoDatasetData.findOneAndUpdate(
92+
{
93+
rebuilding: true,
94+
teamId,
95+
datasetId
96+
},
97+
{
98+
$unset: {
99+
rebuilding: null
100+
},
101+
updateTime: new Date()
96102
},
97-
updateTime: new Date()
98-
},
99-
{
100-
session
101-
}
102-
).select({
103-
_id: 1,
104-
collectionId: 1
105-
});
106-
107-
if (data) {
108-
await MongoDatasetTraining.create(
109-
[
110-
{
111-
teamId,
112-
tmbId,
113-
datasetId,
114-
collectionId: data.collectionId,
115-
billId,
116-
mode: TrainingModeEnum.chunk,
117-
model: vectorModel,
118-
q: '1',
119-
dataId: data._id
120-
}
121-
],
122103
{
123104
session
124105
}
125-
);
106+
).select({
107+
_id: 1,
108+
collectionId: 1
109+
});
110+
111+
if (data) {
112+
await MongoDatasetTraining.create(
113+
[
114+
{
115+
teamId,
116+
tmbId,
117+
datasetId,
118+
collectionId: data.collectionId,
119+
billId,
120+
mode: TrainingModeEnum.chunk,
121+
model: vectorModel,
122+
q: '1',
123+
dataId: data._id
124+
}
125+
],
126+
{
127+
session
128+
}
129+
);
130+
}
131+
132+
return !!data;
133+
});
134+
135+
if (!hasNext) {
136+
break;
126137
}
127-
});
138+
} catch (error) {}
128139
}
129140

130141
return {};

projects/app/src/service/events/generateVector.ts

Lines changed: 59 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -158,27 +158,69 @@ const rebuildData = async ({
158158

159159
const deleteVectorIdList = mongoData.indexes.map((index) => index.dataId);
160160

161-
const { tokens } = await mongoSessionRun(async (session) => {
162-
// update vector, update dataset.data rebuilding status, delete data from training
163-
const updateResult = await Promise.all(
164-
mongoData.indexes.map(async (index, i) => {
165-
const result = await insertDatasetDataVector({
166-
query: index.text,
167-
model: getVectorModel(trainingData.model),
168-
teamId: mongoData.teamId,
169-
datasetId: mongoData.datasetId,
170-
collectionId: mongoData.collectionId
171-
});
172-
mongoData.indexes[i].dataId = result.insertId;
173-
return result;
174-
})
175-
);
161+
// Find the next data item still flagged as rebuilding and insert it into the training queue
162+
await mongoSessionRun(async (session) => {
163+
// take the next rebuilding dataset.data document (clearing its rebuilding flag) and add it to training
164+
const newRebuildingData = await MongoDatasetData.findOneAndUpdate(
165+
{
166+
teamId: mongoData.teamId,
167+
datasetId: mongoData.datasetId,
168+
rebuilding: true
169+
},
170+
{
171+
$unset: {
172+
rebuilding: null
173+
},
174+
updateTime: new Date()
175+
},
176+
{ session }
177+
).select({
178+
_id: 1,
179+
collectionId: 1
180+
});
176181

177-
// Ensure that the training data is deleted after the Mongo update is successful
182+
if (newRebuildingData) {
183+
await MongoDatasetTraining.create(
184+
[
185+
{
186+
teamId: mongoData.teamId,
187+
tmbId: trainingData.tmbId,
188+
datasetId: mongoData.datasetId,
189+
collectionId: newRebuildingData.collectionId,
190+
billId: trainingData.billId,
191+
mode: TrainingModeEnum.chunk,
192+
model: trainingData.model,
193+
q: '1',
194+
dataId: newRebuildingData._id
195+
}
196+
],
197+
{ session }
198+
);
199+
}
200+
});
201+
202+
// update vector, update dataset_data rebuilding status, delete data from training
203+
// 1. Insert new vector to dataset_data
204+
const updateResult = await Promise.all(
205+
mongoData.indexes.map(async (index, i) => {
206+
const result = await insertDatasetDataVector({
207+
query: index.text,
208+
model: getVectorModel(trainingData.model),
209+
teamId: mongoData.teamId,
210+
datasetId: mongoData.datasetId,
211+
collectionId: mongoData.collectionId
212+
});
213+
mongoData.indexes[i].dataId = result.insertId;
214+
return result;
215+
})
216+
);
217+
const { tokens } = await mongoSessionRun(async (session) => {
218+
// 2. Save the updated mongoData first, so the training data is only deleted after the Mongo update succeeds
178219
await mongoData.save({ session });
220+
// 3. Delete the training data
179221
await trainingData.deleteOne({ session });
180222

181-
// delete old vector
223+
// 4. Delete old vector
182224
await deleteDatasetDataVector({
183225
teamId: mongoData.teamId,
184226
idList: deleteVectorIdList
@@ -189,59 +231,6 @@ const rebuildData = async ({
189231
};
190232
});
191233

192-
// find next data insert to training queue
193-
const arr = new Array(5).fill(0);
194-
195-
for await (const _ of arr) {
196-
try {
197-
const hasNextData = await mongoSessionRun(async (session) => {
198-
// get new mongoData insert to training
199-
const newRebuildingData = await MongoDatasetData.findOneAndUpdate(
200-
{
201-
teamId: mongoData.teamId,
202-
datasetId: mongoData.datasetId,
203-
rebuilding: true
204-
},
205-
{
206-
$unset: {
207-
rebuilding: null
208-
},
209-
updateTime: new Date()
210-
},
211-
{ session }
212-
).select({
213-
_id: 1,
214-
collectionId: 1
215-
});
216-
217-
if (newRebuildingData) {
218-
await MongoDatasetTraining.create(
219-
[
220-
{
221-
teamId: mongoData.teamId,
222-
tmbId: trainingData.tmbId,
223-
datasetId: mongoData.datasetId,
224-
collectionId: newRebuildingData.collectionId,
225-
billId: trainingData.billId,
226-
mode: TrainingModeEnum.chunk,
227-
model: trainingData.model,
228-
q: '1',
229-
dataId: newRebuildingData._id
230-
}
231-
],
232-
{ session }
233-
);
234-
}
235-
236-
return !!newRebuildingData;
237-
});
238-
239-
if (!hasNextData) {
240-
break;
241-
}
242-
} catch (error) {}
243-
}
244-
245234
return { tokens };
246235
};
247236

0 commit comments

Comments
 (0)