Skip to content

Commit cf0aaa1

Browse files
authored
fix: invalid dataset data clear (labring#3927)
* fix: collection list count
* fix: collection list count
* fix: invalid dataset data clear
* update ts
* perf: cron clear invalid data
* perf: init
* perf: clear invalid code
* update init
* perf: clear invalid code
* perf: clear invalid code
* perf: init count
* batch init
* batch init
* batch init
* batch init
* add comment
* perf: init
* fix: api proxy type
1 parent ac4255e commit cf0aaa1

File tree

13 files changed

+286
-96
lines changed

13 files changed

+286
-96
lines changed

docSite/content/zh-cn/docs/development/upgrading/4823.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,5 @@ weight: 802
2727
1. 标签过滤时,子文件夹未成功过滤。
2828
2. 暂时移除 md 阅读优化,避免链接分割错误。
2929
3. 离开团队时,未刷新成员列表。
30-
4. PPTX 编码错误,导致解析失败。
30+
4. PPTX 编码错误,导致解析失败。
31+
5. 删除知识库单条数据时,全文索引未跟随删除。

packages/service/core/dataset/collection/controller.ts

Lines changed: 23 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,13 @@ export const delCollectionRelatedSource = async ({
227227
collections,
228228
session
229229
}: {
230-
collections: DatasetCollectionSchemaType[];
230+
collections: {
231+
teamId: string;
232+
fileId?: string;
233+
metadata?: {
234+
relatedImgId?: string;
235+
};
236+
}[];
231237
session: ClientSession;
232238
}) => {
233239
if (collections.length === 0) return;
@@ -259,11 +265,13 @@ export const delCollectionRelatedSource = async ({
259265
export async function delCollection({
260266
collections,
261267
session,
262-
delRelatedSource
268+
delImg = true,
269+
delFile = true
263270
}: {
264271
collections: DatasetCollectionSchemaType[];
265272
session: ClientSession;
266-
delRelatedSource: boolean;
273+
delImg: boolean;
274+
delFile: boolean;
267275
}) {
268276
if (collections.length === 0) return;
269277

@@ -281,9 +289,18 @@ export async function delCollection({
281289
collectionId: { $in: collectionIds }
282290
});
283291

284-
/* file and imgs */
285-
if (delRelatedSource) {
286-
await delCollectionRelatedSource({ collections, session });
292+
if (delImg) {
293+
await delImgByRelatedId({
294+
teamId,
295+
relateIds: collections.map((item) => item?.metadata?.relatedImgId || '').filter(Boolean),
296+
session
297+
});
298+
}
299+
if (delFile) {
300+
await delFileByFileIdList({
301+
bucketName: BucketNameEnum.dataset,
302+
fileIdList: collections.map((item) => item?.fileId || '').filter(Boolean)
303+
});
287304
}
288305

289306
// Delete dataset_datas
@@ -309,48 +326,3 @@ export async function delCollection({
309326
// no session delete: delete files, vector data
310327
await deleteDatasetDataVector({ teamId, datasetIds, collectionIds });
311328
}
312-
313-
/**
314-
* delete delOnlyCollection
315-
*/
316-
export async function delOnlyCollection({
317-
collections,
318-
session
319-
}: {
320-
collections: DatasetCollectionSchemaType[];
321-
session: ClientSession;
322-
}) {
323-
if (collections.length === 0) return;
324-
325-
const teamId = collections[0].teamId;
326-
327-
if (!teamId) return Promise.reject('teamId is not exist');
328-
329-
const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
330-
const collectionIds = collections.map((item) => String(item._id));
331-
332-
// delete training data
333-
await MongoDatasetTraining.deleteMany({
334-
teamId,
335-
datasetId: { $in: datasetIds },
336-
collectionId: { $in: collectionIds }
337-
});
338-
339-
// delete dataset.datas
340-
await MongoDatasetData.deleteMany(
341-
{ teamId, datasetId: { $in: datasetIds }, collectionId: { $in: collectionIds } },
342-
{ session }
343-
);
344-
345-
// delete collections
346-
await MongoDatasetCollection.deleteMany(
347-
{
348-
teamId,
349-
_id: { $in: collectionIds }
350-
},
351-
{ session }
352-
);
353-
354-
// no session delete: delete files, vector data
355-
await deleteDatasetDataVector({ teamId, datasetIds, collectionIds });
356-
}

packages/service/core/dataset/collection/utils.ts

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,14 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
174174
}
175175

176176
await mongoSessionRun(async (session) => {
177+
// Delete old collection
178+
await delCollection({
179+
collections: [collection],
180+
delImg: false,
181+
delFile: false,
182+
session
183+
});
184+
177185
// Create new collection
178186
await createCollectionAndInsertData({
179187
session,
@@ -208,13 +216,6 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
208216
updateTime: new Date()
209217
}
210218
});
211-
212-
// Delete old collection
213-
await delCollection({
214-
collections: [collection],
215-
delRelatedSource: false,
216-
session
217-
});
218219
});
219220

220221
return DatasetCollectionSyncResultEnum.success;

packages/service/core/dataset/data/dataTextSchema.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { connectionMongo, getMongoModel } from '../../../common/mongo';
22
const { Schema } = connectionMongo;
3-
import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d';
3+
import { DatasetDataTextSchemaType } from '@fastgpt/global/core/dataset/type.d';
44
import { TeamCollectionName } from '@fastgpt/global/support/user/team/constant';
55
import { DatasetCollectionName } from '../schema';
66
import { DatasetColCollectionName } from '../collection/schema';
@@ -45,7 +45,7 @@ try {
4545
console.log(error);
4646
}
4747

48-
export const MongoDatasetDataText = getMongoModel<DatasetDataSchemaType>(
48+
export const MongoDatasetDataText = getMongoModel<DatasetDataTextSchemaType>(
4949
DatasetDataTextCollectionName,
5050
DatasetDataTextSchema
5151
);

projects/app/src/pageComponents/account/model/Channel/ModelTest.tsx

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,6 @@ const ModelTest = ({ models, onClose }: { models: string[]; onClose: () => void
135135
}
136136
);
137137

138-
console.log(testModelList);
139138
return (
140139
<MyModal
141140
iconSrc={'core/chat/sendLight'}

projects/app/src/pageComponents/account/model/Log/index.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ const ChannelLog = ({ Tab }: { Tab: React.ReactNode }) => {
119119
},
120120
...res
121121
];
122-
}, [systemModelList]);
122+
}, [systemModelList, t]);
123123

124124
const { data, isLoading, ScrollData } = useScrollPagination(getChannelLog, {
125125
pageSize: 20,

projects/app/src/pages/api/admin/clearInvalidData.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ async function checkInvalidImg(start: Date, end: Date, limit = 50) {
3535
'metadata.relatedImgId': image.metadata?.relatedId
3636
},
3737
'_id'
38-
);
38+
).lean();
3939

4040
if (!collection) {
4141
await image.deleteOne();
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
import type { NextApiRequest, NextApiResponse } from 'next';
2+
import { jsonRes } from '@fastgpt/service/common/response';
3+
import { connectToDatabase } from '@/service/mongo';
4+
import { authCert } from '@fastgpt/service/support/permission/auth/common';
5+
import { addHours } from 'date-fns';
6+
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
7+
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
8+
import { delay, retryFn } from '@fastgpt/global/common/system/utils';
9+
import { delCollection } from '@fastgpt/service/core/dataset/collection/controller';
10+
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
11+
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
12+
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
13+
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type';
14+
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
15+
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
16+
17+
/**
 * Clean up collections whose parent dataset no longer exists
 * (the dataset was deleted but its collections were left behind).
 *
 * Collections are read in batches ordered by `_id`, using the last `_id` seen as a
 * cursor. Offset (`skip`) pagination is avoided on purpose: this function deletes
 * documents from the collection it is scanning, so an offset would jump over
 * documents that move forward after each delete.
 *
 * Failures while handling a single dataset are logged and do not stop the scan.
 * Failures while reading a batch are retried after a short delay, up to
 * `maxBatchFailures` consecutive times, after which the scan is aborted.
 */
const checkInvalidCollection = async () => {
  const batchSize = 1000;
  // Upper bound on consecutive failed batch reads, so a persistent DB error cannot spin forever.
  const maxBatchFailures = 5;

  let lastId: string | undefined;
  let checked = 0;
  let batchFailures = 0;

  while (true) {
    try {
      const collections = await MongoDatasetCollection.find(
        lastId ? { _id: { $gt: lastId } } : {},
        '_id teamId datasetId fileId metadata'
      )
        .sort({ _id: 1 })
        .limit(batchSize)
        .lean();
      batchFailures = 0;

      if (collections.length === 0) break;

      // Advance the cursor right away so a later failure never re-scans this batch endlessly.
      lastId = String(collections[collections.length - 1]._id);

      // Group the collections of this batch by their datasetId.
      const datasetMap = new Map<string, DatasetCollectionSchemaType[]>();
      for (const collection of collections) {
        const datasetId = String(collection.datasetId);
        const group = datasetMap.get(datasetId);
        if (group) {
          group.push(collection);
        } else {
          datasetMap.set(datasetId, [collection]);
        }
      }

      for (const [datasetId, group] of datasetMap) {
        try {
          await retryFn(async () => {
            const datasetExists = await MongoDataset.findById(datasetId, '_id').lean();
            if (!datasetExists) {
              console.log('清理无效的知识库集合, datasetId', datasetId);
              await mongoSessionRun(async (session) => {
                return await delCollection({
                  collections: group,
                  delImg: true,
                  delFile: true,
                  session
                });
              });
            }
          });
        } catch (error) {
          // Best effort: keep going with the remaining datasets.
          console.log(error);
        }
      }

      // Count what was actually inspected (the last batch is usually smaller than batchSize).
      checked += collections.length;
      console.log(`检测集合完成:${checked}`);
    } catch (error) {
      console.log(error);
      batchFailures += 1;
      if (batchFailures >= maxBatchFailures) {
        console.log('checkInvalidCollection aborted: too many consecutive failures');
        break;
      }
      await delay(1000);
    }
  }
};
83+
84+
/**
 * Clean up data that belongs to collections which no longer exist
 * (the collection was deleted but its data was left behind).
 *
 * Groups all dataset data by `collectionId`, loads the ids of every existing
 * collection, and for each group whose collection is missing deletes the related
 * training records, full-text rows, vectors and finally the data rows themselves.
 * Errors for one collection are logged and do not stop the others.
 */
const checkInvalidData = async () => {
  try {
    // One row per distinct collectionId referenced by dataset data.
    const groupedDatas = (await MongoDatasetData.aggregate([
      {
        $group: {
          _id: '$collectionId',
          teamId: { $first: '$teamId' },
          datasetId: { $first: '$datasetId' },
          collectionId: { $first: '$collectionId' }
        }
      }
    ])) as {
      _id: string;
      teamId: string;
      datasetId: string;
      collectionId: string;
    }[];
    console.log('Total data collections length', groupedDatas.length);

    // Load every existing collection id in one query.
    const collections = await MongoDatasetCollection.find({}, '_id').lean();
    console.log('Total collection length', collections.length);
    const existingCollectionIds = new Set<string>(collections.map((item) => String(item._id)));

    // Remove the content of each collection that no longer exists, one at a time.
    for (const { teamId, datasetId, collectionId } of groupedDatas) {
      if (existingCollectionIds.has(String(collectionId))) continue;

      try {
        console.log('清理无效的知识库集合内容, collectionId', collectionId);
        const filter = { teamId, datasetId, collectionId };
        await retryFn(async () => {
          await MongoDatasetTraining.deleteMany({ ...filter });
          await MongoDatasetDataText.deleteMany({ ...filter });
          await deleteDatasetDataVector({
            teamId,
            datasetIds: [datasetId],
            collectionIds: [collectionId]
          });
          // The data rows go last; they are what this scan groups by.
          await MongoDatasetData.deleteMany({ ...filter });
        });
      } catch (error) {
        console.log(error);
      }
    }

    console.log(`检测集合完成`);
  } catch (error) {
    console.log(error);
  }
};
149+
150+
/**
 * Clean up full-text index rows (data_text) whose data row no longer exists
 * (the data was deleted but its data_text entry was left behind).
 *
 * Loads every `dataId` referenced by data_text and every `_id` in dataset data,
 * then deletes the data_text rows whose `dataId` has no matching data row.
 * Deletions are issued in chunks so a large orphan list cannot produce an
 * oversized `$in` command. Errors are logged rather than silently dropped.
 */
const checkInvalidDataText = async () => {
  // Max number of ids per deleteMany call.
  const deleteBatchSize = 10000;

  try {
    // All dataIds referenced by the index layer.
    const dataTexts = await MongoDatasetDataText.find({}, 'dataId').lean();
    const dataIds = dataTexts.map((item) => String(item.dataId));
    console.log('Total data_text dataIds:', dataIds.length);

    // All ids present in the data layer.
    const datas = await MongoDatasetData.find({}, '_id').lean();
    const datasSet = new Set(datas.map((item) => String(item._id)));
    console.log('Total data length:', datas.length);

    // Present in the index layer but missing from the data layer: the data was deleted.
    const unExistsIds = dataIds.filter((id) => !datasSet.has(id));
    console.log('Total unExists dataIds:', unExistsIds.length);

    for (let offset = 0; offset < unExistsIds.length; offset += deleteBatchSize) {
      await MongoDatasetDataText.deleteMany({
        dataId: { $in: unExistsIds.slice(offset, offset + deleteBatchSize) }
      });
    }
  } catch (error) {
    console.log(error);
  }
};
171+
172+
/**
 * Admin API: start the dirty-data cleanup for datasets.
 *
 * After connecting to the database and passing the `authRoot` check, the cleanup
 * is started in the background and the request is answered with `success`
 * immediately; the response does not wait for the cleanup to finish.
 *
 * The cleanup runs three steps in order:
 *   1. collections whose dataset no longer exists,
 *   2. data whose collection no longer exists,
 *   3. data_text rows whose data no longer exists.
 *
 * The previous version read `start` / `end` from the request body and derived a
 * time window from them, but that window was never applied to any step, so the
 * unused values have been removed. Extra body fields are simply ignored.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    await connectToDatabase();
    await authCert({ req, authRoot: true });

    // Fire and forget: errors are handled inside so the promise never rejects unhandled.
    (async () => {
      try {
        console.log('清理无效的集合');
        await checkInvalidCollection();
        console.log('清理无效的数据');
        await checkInvalidData();
        console.log('清理无效的data_text');
        await checkInvalidDataText();
      } catch (error) {
        // Include the error itself so a failed run can be diagnosed from the logs.
        console.log('执行脏数据清理任务出错了', error);
      }
    })();

    jsonRes(res, {
      message: 'success'
    });
  } catch (error) {
    console.log(error);

    jsonRes(res, {
      code: 500,
      error
    });
  }
}

0 commit comments

Comments
 (0)