diff --git a/.gitignore b/.gitignore index d7fb6d4..c08e1e9 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ Khauneesh/ # DB *metadata.db-shm *metadata.db-wal +telemetry.db # Test and coverage reports .coverage diff --git a/.project-metadata.yaml b/.project-metadata.yaml index 85b9b1b..75b1abc 100644 --- a/.project-metadata.yaml +++ b/.project-metadata.yaml @@ -30,6 +30,10 @@ environment_variables: default: "your huggingface username" description: >- hf_username + CDP_TOKEN: + default: "API key for Cloudera AI Inference" + description: >- + CDP_TOKEN @@ -69,7 +73,7 @@ tasks: script: build/build_client.py arguments: None cpu: 2 - memory: 2 + memory: 4 short_summary: Create job to build client application environment: TASK_TYPE: CREATE/RUN_JOB diff --git a/alembic/versions/1a8fdc23eb6f_add_s3_export_path.py b/alembic/versions/1a8fdc23eb6f_add_s3_export_path.py new file mode 100644 index 0000000..0497be5 --- /dev/null +++ b/alembic/versions/1a8fdc23eb6f_add_s3_export_path.py @@ -0,0 +1,38 @@ +"""add_s3_export_path + +Revision ID: 1a8fdc23eb6f +Revises: 9023b46c8d4c +Create Date: 2025-04-22 20:01:13.247491 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '1a8fdc23eb6f' +down_revision: Union[str, None] = '9023b46c8d4c' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Add s3_export_path column to generation_metadata table + with op.batch_alter_table('generation_metadata', schema=None) as batch_op: + batch_op.add_column(sa.Column('s3_export_path', sa.Text(), nullable=True)) + + # Add s3_export_path column to export_metadata table + with op.batch_alter_table('export_metadata', schema=None) as batch_op: + batch_op.add_column(sa.Column('s3_export_path', sa.Text(), nullable=True)) + + +def downgrade() -> None: + # Remove s3_export_path column from generation_metadata table + with op.batch_alter_table('generation_metadata', schema=None) as batch_op: + batch_op.drop_column('s3_export_path') + + # Remove s3_export_path column from export_metadata table + with op.batch_alter_table('export_metadata', schema=None) as batch_op: + batch_op.drop_column('s3_export_path') diff --git a/app/client/eslint.config.js b/app/client/eslint.config.js index 092408a..6a991c7 100644 --- a/app/client/eslint.config.js +++ b/app/client/eslint.config.js @@ -23,6 +23,7 @@ export default tseslint.config( 'warn', { allowConstantExport: true }, ], + '@typescript-eslint/no-explicit-any': ['warn', { 'fixToUnknown': true, 'ignoreRestArgs': false }] }, }, ) diff --git a/app/client/package.json b/app/client/package.json index f3e0095..bf69255 100644 --- a/app/client/package.json +++ b/app/client/package.json @@ -16,6 +16,8 @@ "@mui/icons-material": "6.1.7", "@mui/material": "6.1.7", "@tanstack/react-query": "5.66.0", + "ag-grid-community": "33.2.4", + "ag-grid-react":"33.2.4", "antd": "5.22.1", "axios": "1.6.7", "lodash": "4.17.21", diff --git a/app/client/src/Container.tsx b/app/client/src/Container.tsx index 94db670..f881d2c 100644 --- a/app/client/src/Container.tsx +++ b/app/client/src/Container.tsx @@ -48,9 +48,6 @@ const PageHeader = styled(Header)` height: fit-content; padding: 5px 15px `; -const StyledImg = styled.img` - height: ${props => props?.height && `${props.height}px`} -` const StyledText = styled.div` font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'Noto Sans', 
sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji'; diff --git a/app/client/src/api/api.ts b/app/client/src/api/api.ts index 19d9073..b343957 100644 --- a/app/client/src/api/api.ts +++ b/app/client/src/api/api.ts @@ -27,8 +27,11 @@ export const useFetchModels = (): UseFetchApiReturn => { return useFetch(url); } -export const useFetchDefaultPrompt = (useCase: string): UseFetchApiReturn => { - const url = `${baseUrl}/${isEmpty(useCase) ? 'custom' : useCase}/gen_prompt`; +export const useFetchDefaultPrompt = (useCase: string, workflowType?: WorkerType): UseFetchApiReturn => { + let url = `${baseUrl}/${isEmpty(useCase) ? 'custom' : useCase}/gen_prompt`; + if (workflowType && workflowType === 'freeform') { + url = `${baseUrl}/${isEmpty(useCase) ? 'custom' : useCase}/gen_freeform_prompt`; + } return useFetch(url); } @@ -42,7 +45,7 @@ export const useFetchDefaultModelParams = (): UseFetchApiReturn() => { - const genDatasetUrl = `${import.meta.env.VITE_AMP_URL}/synthesis/generate`; +export const useTriggerDatagen = (workflow_type: string) => { + const genDatasetUrl = `${import.meta.env.VITE_AMP_URL}/synthesis/${workflow_type === 'freeform' ? 'freeform' : 'generate'}`; return usePostApi(genDatasetUrl); } diff --git a/app/client/src/api/hooks.ts b/app/client/src/api/hooks.ts index e233f18..81dc16a 100644 --- a/app/client/src/api/hooks.ts +++ b/app/client/src/api/hooks.ts @@ -106,7 +106,7 @@ interface UsePostApiReturn { data: T | null; loading: boolean; error: Error | null; - triggerPost: (body: Record) => Promise; + triggerPost: (body: Record) => Promise; } export function usePostApi(url: string): UsePostApiReturn { @@ -114,7 +114,7 @@ export function usePostApi(url: string): UsePostApiReturn { const [loading, setLoading] = useState(false); const [error, setError] = useState(null); - const triggerPost = async (body: Record) => { + const triggerPost = async (body: Record) => { setLoading(true); setError(null); // Reset error on each request diff --git a/app/client/src/components/RouteAccessControl.tsx b/app/client/src/components/RouteAccessControl.tsx index dbd46b3..dc8242d 100644 --- a/app/client/src/components/RouteAccessControl.tsx +++ b/app/client/src/components/RouteAccessControl.tsx @@ -10,7 +10,7 @@ import { Navigate, useLocation } from "react-router-dom"; */ interface RouteACProps{ element: ReactNode; - validator: (state: any | null) => boolean; + validator: (state: unknown | null) => boolean; reroutePath?: string; } const RouteAccessControl: FC = ({ element, validator, reroutePath = '/' }) => { diff --git a/app/client/src/components/TelemetryDashboard.tsx b/app/client/src/components/TelemetryDashboard.tsx index b154186..27d7269 100644 --- a/app/client/src/components/TelemetryDashboard.tsx +++ b/app/client/src/components/TelemetryDashboard.tsx @@ -1,7 +1,7 @@ import React, { useState, useEffect } from 'react'; import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip, Legend, ResponsiveContainer, - LineChart, Line, AreaChart, Area + LineChart, Line } from 'recharts'; import axios from 'axios'; import { @@ -9,7 +9,7 @@ import { } from 'antd'; import { DashboardOutlined, ApiOutlined, CloudServerOutlined, RocketOutlined, SyncOutlined, - PieChartOutlined, BarChartOutlined, CodeOutlined, WarningOutlined, CheckCircleOutlined, CloseCircleOutlined + CodeOutlined, WarningOutlined, CheckCircleOutlined, CloseCircleOutlined } from '@ant-design/icons'; const { Title, Text } = Typography; @@ -19,7 +19,7 @@ const SUCCESS_COLOR = '#52c41a'; const ERROR_COLOR = 
'#f5222d'; const WARNING_COLOR = '#faad14'; const INFO_COLOR = '#1890ff'; -const COLORS = ['#0088FE', '#00C49F', '#FFBB28', '#FF8042', '#8884d8', '#4CAF50', '#F44336', '#9C27B0']; +// const COLORS = ['#0088FE', '#00C49F', '#FFBB28', '#FF8042', '#8884d8', '#4CAF50', '#F44336', '#9C27B0']; const TelemetryDashboard = () => { const [loading, setLoading] = useState(true); diff --git a/app/client/src/pages/DataGenerator/Configure.tsx b/app/client/src/pages/DataGenerator/Configure.tsx index 2db6fd4..cde2cab 100644 --- a/app/client/src/pages/DataGenerator/Configure.tsx +++ b/app/client/src/pages/DataGenerator/Configure.tsx @@ -1,5 +1,6 @@ import endsWith from 'lodash/endsWith'; import isEmpty from 'lodash/isEmpty'; +import isFunction from 'lodash/isFunction'; import { useEffect, useState } from 'react'; import { Flex, Form, Input, Select, Typography } from 'antd'; import styled from 'styled-components'; @@ -10,6 +11,7 @@ import { ModelProviders, ModelProvidersDropdownOpts } from './types'; import { useWizardCtx } from './utils'; import FileSelectorButton from './FileSelectorButton'; + const StepContainer = styled(Flex)` background: white; padding: 40px 0px; @@ -31,7 +33,8 @@ export const USECASE_OPTIONS = [ export const WORKFLOW_OPTIONS = [ { label: 'Supervised Fine-Tuning', value: 'supervised-fine-tuning' }, - { label: 'Custom Data Generation', value: 'custom' } + { label: 'Custom Data Generation', value: 'custom' }, + { label: 'Freefrom Data Generation', value: 'freeform' } ]; export const MODEL_TYPE_OPTIONS: ModelProvidersDropdownOpts = [ @@ -55,16 +58,23 @@ const Configure = () => { delete values.output_value; const allFieldsFilled = Object.values(values).every(value => Boolean(value)); - if (allFieldsFilled) { - setIsStepValid && setIsStepValid(true) - } else { - setIsStepValid && setIsStepValid(false) + if (allFieldsFilled && isFunction(setIsStepValid)) { + setIsStepValid(true) + } else if (isFunction(setIsStepValid)) { + setIsStepValid(false) } } useEffect(() => { validateForm() }, [form, formData]) + // keivan + useEffect(() => { + if (formData && formData?.inference_type === undefined) { + form.setFieldValue('inference_type', ModelProviders.CAII); + } + }, [formData]); + const labelCol = { span: 8 }; @@ -83,7 +93,7 @@ const Configure = () => { form.setFieldValue('doc_paths', paths); } - const onFilesChange = (selections: any) => { + const onFilesChange = (selections: unknown) => { if (Array.isArray(selections) && !isEmpty(selections)) { const paths = selections.map((file: File) => ( { @@ -106,7 +116,6 @@ const Configure = () => { setSelectedFiles([]); } } - return ( @@ -209,7 +218,8 @@ const Configure = () => { )} - {formData?.workflow_type === WorkflowType.SUPERVISED_FINE_TUNING && + {(formData?.workflow_type === WorkflowType.SUPERVISED_FINE_TUNING || + formData?.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION) && { formData?.workflow_type === WorkflowType.CUSTOM_DATA_GENERATION) && { } + {/* {formData?.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION || + + + + + + } */} ) diff --git a/app/client/src/pages/DataGenerator/CustomPromptButton.tsx b/app/client/src/pages/DataGenerator/CustomPromptButton.tsx index e0696f1..dc2fa47 100644 --- a/app/client/src/pages/DataGenerator/CustomPromptButton.tsx +++ b/app/client/src/pages/DataGenerator/CustomPromptButton.tsx @@ -1,9 +1,8 @@ -import { Button, Flex, Form, Input, Modal, notification, Spin } from "antd"; +import { Button, Form, Input, Modal, notification } from "antd"; import { useEffect, useState } from "react"; import { 
useMutation } from "@tanstack/react-query"; import styled from "styled-components"; -import { LoadingOutlined } from '@ant-design/icons'; -import { fetchCustomPrompt, fetchPrompt } from "./hooks"; +import { fetchCustomPrompt } from "./hooks"; import Loading from "../Evaluator/Loading"; interface Props { @@ -16,9 +15,30 @@ interface Props { export const StyledTextArea = styled(Input.TextArea)` margin-bottom: 10px !important; - min-height: 175px !important; + min-height: 275px !important; + margin-bottom: 10px !important; + padding: 15px 20px !important; `; +const StyledModal = styled(Modal)` + .ant-modal-content { + max-height: 90vh; + // height: 760px; + height: 85vh; + width: 750px; + .ant-modal-body { + padding-top: 0; + min-height: 70vh; + } + } + // .ant-modal-content { + // border-radius: 8px; + // box-shadow: 0px 4px 16px rgba(0, 0, 0, 0.1); + // background-color: #ffffff; + // padding: 24px; + // } +` + const CustomPromptButton: React.FC = ({ model_id, inference_type, caii_endpoint, use_case, setPrompt }) => { const [form] = Form.useForm(); const [showModal, setShowModal] = useState(false); @@ -39,7 +59,7 @@ const CustomPromptButton: React.FC = ({ model_id, inference_type, caii_en setShowModal(false); } }, [mutation.error, mutation.isSuccess]); - + const onFinish = async () => { const custom_prompt = form.getFieldValue('custom_prompt_instructions'); try { @@ -67,7 +87,7 @@ const CustomPromptButton: React.FC = ({ model_id, inference_type, caii_en {showModal && ( - = ({ model_id, inference_type, caii_en initialValues={initialValues} onFinish={onSubmit} style={{ marginTop: '24px' }} - disabled={mutation.isLoading} + disabled={mutation.isPending} > - {mutation.isLoading && + {mutation.isPending && } @@ -90,6 +110,8 @@ const CustomPromptButton: React.FC = ({ model_id, inference_type, caii_en name='custom_prompt_instructions' label='Custom Prompt Instructions' rules={[{ required: true, message: "This field is required." 
}]} + labelCol={{ span: 24 }} + wrapperCol={{ span: 24 }} > = ({ model_id, inference_type, caii_en - + ) } diff --git a/app/client/src/pages/DataGenerator/DataGenerator.tsx b/app/client/src/pages/DataGenerator/DataGenerator.tsx index 62a545b..f4d6472 100644 --- a/app/client/src/pages/DataGenerator/DataGenerator.tsx +++ b/app/client/src/pages/DataGenerator/DataGenerator.tsx @@ -2,7 +2,7 @@ import isEmpty from 'lodash/isEmpty'; import isString from 'lodash/isString'; import { useRef, useState } from 'react'; import { useLocation } from 'react-router-dom'; -import { Button, Flex, Form, Layout, Steps, Typography } from 'antd'; +import { Button, Flex, Form, Layout, Steps } from 'antd'; import type { FormInstance } from 'antd'; import ArrowBackIcon from '@mui/icons-material/ArrowBack'; import ArrowForwardIcon from '@mui/icons-material/ArrowForward'; @@ -17,6 +17,7 @@ import Finish from './Finish'; import { DataGenWizardSteps, WizardStepConfig, WorkflowType } from './types'; import { WizardCtx } from './utils'; +import { useGetDatasetDetails } from '../DatasetDetails/hooks'; const { Content } = Layout; // const { Title } = Typography; @@ -98,10 +99,14 @@ const DataGenerator = () => { const [isStepValid, setIsStepValid] = useState(false); // Data passed from listing table to prepopulate form const location = useLocation(); - console.log('DatGenerator >> location?.state?.data:', location?.state?.data); + console.log('location?.state?.data:', location?.state?.data); const initialData = location?.state?.data; + + const datasetDetailsReq = location?.state?.data && useGetDatasetDetails(location?.state?.data?.generate_file_name) if (initialData?.technique) { - initialData.workflow_type = initialData?.technique === 'sft' ? WorkflowType.SUPERVISED_FINE_TUNING : + initialData.workflow_type = initialData?.technique === 'sft' ? + WorkflowType.SUPERVISED_FINE_TUNING : + initialData?.technique === 'freeform' ? 
WorkflowType.FREE_FORM_DATA_GENERATION : WorkflowType.CUSTOM_DATA_GENERATION; } if (Array.isArray(initialData?.doc_paths) && !isEmpty(initialData?.doc_paths) ) { @@ -111,6 +116,12 @@ const DataGenerator = () => { })); } + + // if (datasetDetailsReq && datasetDetailsReq.data && + // !isEmpty(datasetDetailsReq?.data?.generate_file_name)) { + // initialData.example_path = initialData?.example_path; + // } + if (Array.isArray(initialData?.input_paths) && !isEmpty(initialData?.input_paths) ) { initialData.doc_paths = initialData?.input_paths.map((path: string) => ({ value: path, diff --git a/app/client/src/pages/DataGenerator/Examples.tsx b/app/client/src/pages/DataGenerator/Examples.tsx index 2864ba6..b64fbb7 100644 --- a/app/client/src/pages/DataGenerator/Examples.tsx +++ b/app/client/src/pages/DataGenerator/Examples.tsx @@ -1,10 +1,21 @@ -import { Button, Form, Modal, Space, Table, Tooltip, Typography, Flex } from 'antd'; -import { DeleteOutlined, EditOutlined } from '@ant-design/icons'; +import first from 'lodash/first'; +import get from 'lodash/get'; +import isEmpty from 'lodash/isEmpty'; +import React, { useEffect } from 'react'; +import { Button, Form, Modal, Space, Table, Tooltip, Typography, Flex, Input, Empty } from 'antd'; +import { CloudUploadOutlined, DeleteOutlined, EditOutlined } from '@ant-design/icons'; import styled from 'styled-components'; +import { useMutation } from "@tanstack/react-query"; import { useFetchExamples } from '../../api/api'; import TooltipIcon from '../../components/TooltipIcon'; import PCModalContent from './PCModalContent'; -import { QuestionSolution } from './types'; +import { File, QuestionSolution, WorkflowType } from './types'; +import FileSelectorButton from './FileSelectorButton'; + +import { fetchFileContent } from './hooks'; +import { useState } from 'react'; +import FreeFormExampleTable from './FreeFormExampleTable'; +import { useWizardCtx } from './utils'; const { Title } = Typography; const Container = styled.div` @@ -25,20 +36,66 @@ const StyledTable = styled(Table)` cursor: pointer; } ` + +const StyledContainer = styled.div` + margin-bottom: 24px; + height: 48px; + color: rgba(0, 0, 0, 0.45); + svg { + font-size: 48px; + } + +`; + const MAX_EXAMPLES = 5; -const Examples = () => { +enum ExampleType { + FREE_FORM = 'freeform', + PROMPT_COMPLETION = 'promptcompletion' +} + +const Examples: React.FC = () => { const form = Form.useFormInstance(); - // const { setIsStepValid } = useWizardCtx(); - // const _values = Form.useWatch('examples', form); - // useEffect (() => { - // const values = form.getFieldsValue(); - // if (isEmpty(values.examples)) { - // setIsStepValid(false); - // } else if (!isEmpty(values?.examples)) { - // setIsStepValid(true); - // } - // }, [_values]); + const [exampleType, setExampleType] = useState(ExampleType.PROMPT_COMPLETION); + + const mutation = useMutation({ + mutationFn: fetchFileContent + }); + const values = form.getFieldsValue(true) + + useEffect(() => { + const example_path = form.getFieldValue('example_path'); + + if (!isEmpty(example_path)) { + mutation.mutate({ + path: example_path + }); + } + + if (form.getFieldValue('workflow_type') === 'freeform') { + setExampleType(ExampleType.FREE_FORM); + } + + + + }, [form.getFieldValue('example_path'), form.getFieldValue('workflow_type')]); + + useEffect(() => { + if (!isEmpty(mutation.data)) { + form.setFieldValue('examples', mutation.data); + } + }, [mutation.data]); + + const { setIsStepValid } = useWizardCtx(); + const _values = Form.useWatch(['examples', 
'example_path'], form); + useEffect (() => { + const values = form.getFieldsValue(); + if (isEmpty(values.examples) && isEmpty(form.getFieldValue('example_path'))) { + setIsStepValid(false); + } else { + setIsStepValid(true); + } + }, [_values, form.getFieldValue('example_path')]); const columns = [ { @@ -141,6 +198,26 @@ const Examples = () => { form.setFieldValue('examples', examples.examples) } const rowLimitReached = form.getFieldValue('examples')?.length === MAX_EXAMPLES; + const workflowType = form.getFieldValue('workflow_type'); + + const onAddFiles = (files: File[]) => { + if (!isEmpty (files)) { + const file = first(files); + mutation.mutate({ + path: get(file, '_path'), + }); + const values = form.getFieldsValue(); + form.setFieldsValue({ + ...values, + example_path: get(file, '_path') + }); + setExampleType(ExampleType.FREE_FORM); + } + } + + const labelCol = { + span: 10 + }; return ( @@ -151,7 +228,26 @@ const Examples = () => { - + + {workflowType === WorkflowType.FREE_FORM_DATA_GENERATION && + <> + + + + + + } + + {exampleType !== ExampleType.FREE_FORM && + } + + {exampleType !== ExampleType.FREE_FORM && - + } + {exampleType === ExampleType.FREE_FORM && !isEmpty(mutation.data) && + } + {exampleType === ExampleType.FREE_FORM && isEmpty(mutation.data) && !isEmpty(values.examples) && + } + {exampleType === ExampleType.FREE_FORM && isEmpty(mutation.data) && isEmpty(values.examples) && + + + + } + imageStyle={{ + height: 60, + marginBottom: 24 + }} + description={ + <> +

+            <Typography.Text>
+              Upload a JSON file containing examples
+            </Typography.Text>
+            <br />
+            <Typography.Text type="secondary">
+              {'Examples should be a JSON array of objects: each key is a column name and each value is the cell value.'}
+            </Typography.Text>
+ } + {exampleType !== ExampleType.FREE_FORM && @@ -230,7 +358,7 @@ const Examples = () => { rowClassName={() => 'hover-pointer'} rowKey={(_record, index) => `examples-table-${index}`} /> - + }
) diff --git a/app/client/src/pages/DataGenerator/FileSelectorButton.tsx b/app/client/src/pages/DataGenerator/FileSelectorButton.tsx index fd612cf..b8e6f88 100644 --- a/app/client/src/pages/DataGenerator/FileSelectorButton.tsx +++ b/app/client/src/pages/DataGenerator/FileSelectorButton.tsx @@ -1,4 +1,3 @@ -import get from 'lodash/get'; import { Button, Modal } from 'antd'; import React, { useState } from 'react'; import FilesTable from './FilesTable'; @@ -9,9 +8,10 @@ import { File, WorkflowType } from './types'; interface Props { onAddFiles: (files: File[]) => void; workflowType: WorkflowType; + label?: string; } -const FileSelectorButton: React.FC = ({ onAddFiles, workflowType }) => { +const FileSelectorButton: React.FC = ({ onAddFiles, workflowType, label }) => { const [showModal, setShowModal] = useState(false); const [selectedFiles, setSelectedFiles] = useState([]) @@ -31,7 +31,9 @@ const FileSelectorButton: React.FC = ({ onAddFiles, workflowType }) => { style={{ marginLeft: '4px' }} onClick={() => setShowModal(true)} icon={} - /> + > + {label ? label : null} + {showModal && ( = ({ onSelectedRows, workflowType }) => { const [paths, setPaths] = useState(null); const [path, setPath] = useState(null); const [selectedRowKeys, setSelectedRowKeys] = useState([]); - const [selectedRows, setSelectedRows] = useState([]); + const [, setSelectedRows] = useState([]); // row selection map: path as key -> list of row keys const [rowSelectionMap, setRowSelectionMap] = useState({}); // row selection map: path as key -> list of files const [fileSelectionMap, setFileSelectionMap] = useState({}); - const { fetching, listProjectFiles, data } = useGetProjectFiles(paths || []); + const { fetching, listProjectFiles, data } = useGetProjectFiles(); useEffect(() => { if (!isEmpty(path) || paths === null || isEmpty(paths)) { @@ -151,7 +153,7 @@ const FilesTable: React.FC = ({ onSelectedRows, workflowType }) => { key: 'name', ellipsis: true, render: (file: File) => { - const { name, url } = file; + const { name } = file; if (file?.mime !== DIRECTORY_MIME_TYPE) { return ( @@ -205,6 +207,7 @@ const FilesTable: React.FC = ({ onSelectedRows, workflowType }) => { )} + {fetching && } diff --git a/app/client/src/pages/DataGenerator/Finish.tsx b/app/client/src/pages/DataGenerator/Finish.tsx index 00df57f..116bfb7 100644 --- a/app/client/src/pages/DataGenerator/Finish.tsx +++ b/app/client/src/pages/DataGenerator/Finish.tsx @@ -1,6 +1,7 @@ import isNumber from 'lodash/isNumber'; import filter from 'lodash/filter'; import isString from 'lodash/isString'; +import isEmpty from 'lodash/isEmpty'; import { FC, useEffect } from 'react'; import { HomeOutlined, PageviewOutlined } from '@mui/icons-material'; import AssessmentIcon from '@mui/icons-material/Assessment'; @@ -16,10 +17,10 @@ import { useTriggerDatagen } from './../../api/api' import { DEMO_MODE_THRESHOLD } from './constants' import { GenDatasetResponse, QuestionSolution, WorkflowType } from './types'; import { Pages } from '../../types'; -import { isEmpty } from 'lodash'; import CustomResultTable from './CustomResultTable'; import SeedResultTable from './SeedResultTable'; import { getFilesURL } from '../Evaluator/util'; +import FreeFormTable from './FreeFormTable'; const { Title } = Typography; @@ -126,9 +127,10 @@ const isDemoMode = (numQuestions: number, topics: [], form: FormInstance) => { const Finish = () => { const form = Form.useFormInstance(); - const { data: genDatasetResp, loading, error: generationError, triggerPost } = useTriggerDatagen(); - const { 
num_questions, topics } = form.getFieldsValue(true) - const isDemo = isDemoMode(num_questions, topics, form) + const { num_questions, topics, workflow_type } = form.getFieldsValue(true); + const { data: genDatasetResp, loading, error: generationError, triggerPost } = useTriggerDatagen(workflow_type); + + const isDemo = isDemoMode(num_questions, topics, form); useEffect(() => { const formValues = form.getFieldsValue(true); @@ -153,6 +155,8 @@ const Finish = () => { formValues.technique = 'sft'; } else if (formValues.workflow_type === WorkflowType.CUSTOM_DATA_GENERATION) { formValues.technique = 'custom_workflow'; + } else if (formValues.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION) { + formValues.technique = 'freeform'; } // send examples as null when the array is empty if (isEmpty(formValues.examples)) { @@ -173,11 +177,15 @@ const Finish = () => { formValues.doc_paths = doc_paths } + if (formValues.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION) { + delete formValues.examples; + } + const args = {...formValues, is_demo: isDemo, model_params: formValues.model_parameters } triggerPost(args) }, []); - const hasTopics = (genDatasetResp: any) => { + const hasTopics = (genDatasetResp: unknown) => { return !Array.isArray(genDatasetResp?.results) } @@ -192,13 +200,23 @@ const Finish = () => { let topicTabs = []; if (!hasDocSeeds && formValues.workflow_type !== WorkflowType.CUSTOM_DATA_GENERATION && - hasTopics(genDatasetResp)) { - topicTabs = genDatasetResp?.results && Object.keys(genDatasetResp.results).map((topic, i) => ({ + hasTopics(genDatasetResp) && !isEmpty(genDatasetResp?.results)) { + const values = Object.values(genDatasetResp?.results); + + + topicTabs = genDatasetResp?.results && Object.keys(genDatasetResp.results).map((topic, i) => { + return ({ key: `${topic}-${i}`, label: {topic}, value: topic, - children: - })); + children: workflow_type !== WorkflowType.FREE_FORM_DATA_GENERATION ? 
+ : + // + + + + }) + }); } const nextStepsListPreview = [ diff --git a/app/client/src/pages/DataGenerator/FreeFormExampleTable.tsx b/app/client/src/pages/DataGenerator/FreeFormExampleTable.tsx new file mode 100644 index 0000000..c93bceb --- /dev/null +++ b/app/client/src/pages/DataGenerator/FreeFormExampleTable.tsx @@ -0,0 +1,127 @@ + +import isEmpty from 'lodash/isEmpty'; +import first from 'lodash/first'; +import toString from 'lodash/toString'; +import React, { FunctionComponent, useState, useMemo, useCallback, useEffect } from 'react'; +import { AgGridReact } from 'ag-grid-react'; + +// // Register all Community features +// // ModuleRegistry.registerModules([AllCommunityModule]); +import { themeMaterial } from "ag-grid-community"; + +import { + ModuleRegistry, + ClientSideRowModelModule, + ValidationModule, + type ColDef, + type GetRowIdFunc, + type GetRowIdParams + } from 'ag-grid-community'; + +import { TextFilterModule } from 'ag-grid-community'; +import { NumberFilterModule } from 'ag-grid-community'; +import { DateFilterModule } from 'ag-grid-community'; + +// Register all Community features (if needed, specify valid modules here) +ModuleRegistry.registerModules([ + // AllModules, + TextFilterModule, + NumberFilterModule, + DateFilterModule, + // SetFilterModule, + // MultiFilterModule, + // GroupFilterModule, + // CustomFilterModule, + + // ModuleRegistry, + // RowGroupingModule, + // PivotModule, + // TreeDataModule, + ClientSideRowModelModule, + ValidationModule +]); + +interface Props { + data: Record[]; +} + +const FreeFormExampleTable: FunctionComponent = ({ data }) => { + const [colDefs, setColDefs] = useState([]); + const [rowData, setRowData] = useState([]); + + useEffect(() => { + if (!isEmpty(data)) { + const columnNames = Object.keys(first(data)); + const columnDefs = columnNames.map((colName) => ({ + field: colName, + headerName: colName, + width: 250, + filter: true, + sortable: true, + resizable: true + })); + setColDefs(columnDefs); + setRowData(data); + } + } + , [data]); + + const defaultColDef: ColDef = useMemo( + () => ({ + flex: 1, + filter: true, + enableRowGroup: true, + enableValue: true, + + editable: true, + minWidth: 170 + }), + [] + ); + + let index = 0; + const getRowId = useCallback( + ({ data: { ticker } }: GetRowIdParams) => { + index++; + return ticker || toString(index); + }, + [] + ); + + const statusBar = useMemo( + () => ({ + statusPanels: [ + { statusPanel: "agTotalAndFilteredRowCountComponent" }, + { statusPanel: "agTotalRowCountComponent" }, + { statusPanel: "agFilteredRowCountComponent" }, + { statusPanel: "agSelectedRowCountComponent" }, + { statusPanel: "agAggregationComponent" }, + ], + }), + [] + ); + + + return ( + <> +
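+      {/* AG Grid (community modules) renders the uploaded examples; column definitions are derived from the keys of the first row in the data-driven useEffect above */}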
+      <div style={{ height: 500, width: '100%' }}>
+        <AgGridReact
+          theme={themeMaterial}
+          rowData={rowData}
+          columnDefs={colDefs}
+          defaultColDef={defaultColDef}
+          getRowId={getRowId}
+          statusBar={statusBar}
+        />
+      </div>
+ + ); +} +export default FreeFormExampleTable; \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/FreeFormTable.tsx b/app/client/src/pages/DataGenerator/FreeFormTable.tsx new file mode 100644 index 0000000..24fb9f8 --- /dev/null +++ b/app/client/src/pages/DataGenerator/FreeFormTable.tsx @@ -0,0 +1,139 @@ + +import isEmpty from 'lodash/isEmpty'; +import first from 'lodash/first'; +import toString from 'lodash/toString'; +import React, { FunctionComponent, useState, useMemo, useCallback, useEffect } from 'react'; +import { AgGridReact } from 'ag-grid-react'; +// // Register all Community features +// // ModuleRegistry.registerModules([AllCommunityModule]); +import { themeMaterial } from 'ag-grid-community'; + +import { + ModuleRegistry, + ClientSideRowModelModule, + ValidationModule, + type ColDef, + type GetRowIdFunc, + type GetRowIdParams + } from 'ag-grid-community'; + +// import { RowGroupingModule } from 'ag-grid-community'; +// import { PivotModule } from 'ag-grid-community'; +// import { TreeDataModule } from 'ag-grid-community'; +// import { ClientSideRowModelModule } from 'ag-grid-community'; +// import { AllModules } from 'ag-grid-community'; +import { TextFilterModule } from 'ag-grid-community'; +import { NumberFilterModule } from 'ag-grid-community'; +import { DateFilterModule } from 'ag-grid-community'; +// import { SetFilterModule } from 'ag-grid-community'; +// import { MultiFilterModule } from 'ag-grid-community'; +// import { GroupFilterModule } from 'ag-grid-community'; +// import { CustomFilterModule } from 'ag-grid-community'; + +// Register all Community features (if needed, specify valid modules here) +ModuleRegistry.registerModules([ + // AllModules, + TextFilterModule, + NumberFilterModule, + DateFilterModule, + ClientSideRowModelModule, + ValidationModule +]); + +interface Props { + data: Record[]; +} + +const FreeFormTable: FunctionComponent = ({ data }) => { + const [colDefs, setColDefs] = useState([]); + const [rowData, setRowData] = useState([]); + + useEffect(() => { + if (!isEmpty(data)) { + const columnNames = Object.keys(first(data)); + const columnDefs = columnNames.map((colName) => ({ + field: colName, + headerName: colName, + width: 250, + filter: true, + sortable: true, + resizable: true + })); + setColDefs(columnDefs); + setRowData(data); + } + } + , [data]); + // const [rowData, setRowData] = useState([ + // { make: "Tesla", model: "Model Y", price: 64950, electric: true }, + // { make: "Ford", model: "F-Series", price: 33850, electric: false }, + // { make: "Toyota", model: "Corolla", price: 29600, electric: false }, + // ]); + + // // Column Definitions: Defines the columns to be displayed. + // const [colDefs, setColDefs] = useState([ + // { field: "make" }, + // { field: "model" }, + // { field: "price" }, + // { field: "electric" } + // ]); + + const defaultColDef: ColDef = useMemo( + () => ({ + flex: 1, + filter: true, + enableRowGroup: true, + enableValue: true, + + editable: true, + minWidth: 170 + }), + [] + ); + + let index = 0; + const getRowId = useCallback( + ({ data: { ticker } }: GetRowIdParams) => { + index++; + return ticker || toString(index); + }, + [] + ); + + const statusBar = useMemo( + () => ({ + statusPanels: [ + { statusPanel: "agTotalAndFilteredRowCountComponent" }, + { statusPanel: "agTotalRowCountComponent" }, + { statusPanel: "agFilteredRowCountComponent" }, + { statusPanel: "agSelectedRowCountComponent" }, + { statusPanel: "agAggregationComponent" }, + ], + }), + [] + ); + + + return ( + <> +
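+      {/* AG Grid renders the generated freeform rows; columns are derived from the keys of the first result row */}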
+      <div style={{ height: 500, width: '100%' }}>
+        <AgGridReact
+          theme={themeMaterial}
+          rowData={rowData}
+          columnDefs={colDefs}
+          defaultColDef={defaultColDef}
+          getRowId={getRowId}
+          statusBar={statusBar}
+        />
+      </div>
+ + ); +} +export default FreeFormTable; \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/PCModalContent.tsx b/app/client/src/pages/DataGenerator/PCModalContent.tsx index b157ed1..83b7c0e 100644 --- a/app/client/src/pages/DataGenerator/PCModalContent.tsx +++ b/app/client/src/pages/DataGenerator/PCModalContent.tsx @@ -3,7 +3,7 @@ import styled from 'styled-components'; import Markdown from '../../components/Markdown'; import TooltipIcon from '../../components/TooltipIcon'; -import { JustificationScore, QuestionSolution } from './types'; +import { QuestionSolution } from './types'; const { Title } = Typography; const Container = styled(Flex)` diff --git a/app/client/src/pages/DataGenerator/Parameters.tsx b/app/client/src/pages/DataGenerator/Parameters.tsx index 7f1564e..9855568 100644 --- a/app/client/src/pages/DataGenerator/Parameters.tsx +++ b/app/client/src/pages/DataGenerator/Parameters.tsx @@ -1,6 +1,6 @@ import isEmpty from 'lodash/isEmpty'; import { useEffect, useRef, useState } from 'react'; -import { Col, Divider, Form, InputNumber, Row, Slider, Spin, Typography } from 'antd'; +import { Col, Divider, Form, InputNumber, Row, Slider, Typography } from 'antd'; import { merge } from 'lodash'; import styled from 'styled-components'; @@ -58,7 +58,7 @@ const Parameters = () => { const formData = form.getFieldsValue(true); const [values, setValues] = useState(formData?.model_parameters); - const { data: defaultParams, loading: loadingDefaultParams } = useFetchDefaultModelParams(); + const { data: defaultParams } = useFetchDefaultModelParams(); useEffect(() => { if (!isEmpty(formData?.model_parameters)) { @@ -185,7 +185,7 @@ const Parameters = () => { - {/* {LABELS[ModelParameters.MAX_TOKENS]}} labelCol={{ span: 24 }} @@ -215,7 +215,7 @@ const Parameters = () => { /> - */} + ) diff --git a/app/client/src/pages/DataGenerator/Prompt.tsx b/app/client/src/pages/DataGenerator/Prompt.tsx index 71c654d..8ff5ab6 100644 --- a/app/client/src/pages/DataGenerator/Prompt.tsx +++ b/app/client/src/pages/DataGenerator/Prompt.tsx @@ -77,12 +77,12 @@ const Prompt = () => { const output_key = form.getFieldValue('output_key'); const caii_endpoint = form.getFieldValue('caii_endpoint'); - const { data: defaultPrompt, loading: promptsLoading } = useFetchDefaultPrompt(useCase); + const { data: defaultPrompt, loading: promptsLoading } = useFetchDefaultPrompt(useCase, workflow_type); // Page Bootstrap requests and useEffect const { data: defaultTopics, loading: topicsLoading } = usefetchTopics(useCase); const { data: defaultSchema, loading: schemaLoading } = useFetchDefaultSchema(); - const { data: dataset_size, isLoading: datasetSizeLoadin, isError, error } = useDatasetSize( + const { data: dataset_size, isLoading: datasetSizeLoading, isError, error } = useDatasetSize( workflow_type, doc_paths, input_key, @@ -266,7 +266,8 @@ const Prompt = () => { } {isEmpty(doc_paths) && (workflow_type === WorkflowType.SUPERVISED_FINE_TUNING || - workflow_type === WorkflowType.CUSTOM_DATA_GENERATION) && + workflow_type === WorkflowType.CUSTOM_DATA_GENERATION || + workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION) && { { + { validator: (_: unknown, value: string) => { if (items.includes(value)) { return Promise.reject('This seed instruction already exists in the list') } diff --git a/app/client/src/pages/DataGenerator/SeedResultTable.tsx b/app/client/src/pages/DataGenerator/SeedResultTable.tsx index 159eb07..ae416a3 100644 --- a/app/client/src/pages/DataGenerator/SeedResultTable.tsx +++ 
b/app/client/src/pages/DataGenerator/SeedResultTable.tsx @@ -24,7 +24,7 @@ const SeedResultTable: React.FC = ({ results }) => { forEach(seeds, (seed: string) => { const pairs = get(results, `${seed}`); if (Array.isArray(pairs)) { - forEach(pairs, (pair: any) => { + forEach(pairs, (pair: unknown) => { data.push({ seed, question: get(pair, `question`), diff --git a/app/client/src/pages/DataGenerator/Summary.tsx b/app/client/src/pages/DataGenerator/Summary.tsx index 797f110..5aeff01 100644 --- a/app/client/src/pages/DataGenerator/Summary.tsx +++ b/app/client/src/pages/DataGenerator/Summary.tsx @@ -6,6 +6,7 @@ import PCModalContent from './PCModalContent' import { MODEL_PROVIDER_LABELS } from './constants' import { ModelParameters } from '../../types'; import { ModelProviders, QuestionSolution, Usecases } from './types'; +import FreeFormExampleTable from './FreeFormExampleTable'; const { Title } = Typography; const MODEL_PARAMETER_LABELS: Record = { @@ -46,10 +47,11 @@ const Summary= () => { num_questions, custom_prompt, model_parameters, + workflow_type, topics = [], schema, examples = [] - } = form.getFieldsValue(true) + } = form.getFieldsValue(true); const cfgStepDataSource = [ { label: 'Dataset Name', children: display_name }, @@ -72,7 +74,7 @@ const Summary= () => { ellipsis: true, render: (_text: QuestionSolution, record: QuestionSolution) => <>{record.solution} }, - ] + ]; return ( @@ -133,9 +135,11 @@ const Summary= () => { )} - {isEmpty(examples) && + {!isEmpty(examples) &&
{'Examples'} + {workflow_type === 'freeform' ? + : { }) })} rowKey={(_record, index) => `summary-examples-table-${index}`} - /> + />}
}
) diff --git a/app/client/src/pages/DataGenerator/constants.ts b/app/client/src/pages/DataGenerator/constants.ts index 4e5549a..d90b5b2 100644 --- a/app/client/src/pages/DataGenerator/constants.ts +++ b/app/client/src/pages/DataGenerator/constants.ts @@ -19,7 +19,8 @@ export const USECASE_OPTIONS = [ export const WORKFLOW_OPTIONS = [ { label: 'Supervised Fine-Tuning', value: 'sft' }, - { label: 'Custom Data Generation', value: 'custom' } + { label: 'Custom Data Generation', value: 'custom' }, + { label: 'Freeform Data Generation', value: 'freeform' } ]; export const MODEL_TYPE_OPTIONS: ModelProvidersDropdownOpts = [ diff --git a/app/client/src/pages/DataGenerator/hooks.ts b/app/client/src/pages/DataGenerator/hooks.ts index 61bb2a5..eab4e99 100644 --- a/app/client/src/pages/DataGenerator/hooks.ts +++ b/app/client/src/pages/DataGenerator/hooks.ts @@ -3,13 +3,12 @@ import get from 'lodash/get'; import toNumber from 'lodash/toNumber'; import isEmpty from 'lodash/isEmpty'; import isString from 'lodash/isString'; -import { useState } from 'react'; import { useMutation, useQuery } from '@tanstack/react-query'; import { WorkflowType } from './types'; const BASE_API_URL = import.meta.env.VITE_AMP_URL; -export const fetchPrompt = async (use_case: string, params: any) => { +export const fetchPrompt = async (use_case: string, params: unknown) => { if (use_case !== 'custom') { const resp = await fetch(`${BASE_API_URL}/${use_case}/gen_prompt`, { method: 'GET' @@ -63,7 +62,7 @@ export const useGetPromptByUseCase = (use_case: string, { model_id, inference_ty }; } -export const fetchCustomPrompt = async (params: any) => { +export const fetchCustomPrompt = async (params: unknown) => { if (params.use_case !== 'custom') { const resp = await fetch(`${BASE_API_URL}/${params.use_case}/gen_prompt`, { method: 'GET' @@ -89,7 +88,24 @@ export const fetchCustomPrompt = async (params: any) => { } } -export const listModels = async (params: any) => { +export const fetchFileContent = async (params: unknown) => { + const resp = await fetch(`${BASE_API_URL}/json/get_content`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(params), + }); + if (resp.status !== 200) { + const error = await resp.json(); + throw new Error(error.message || error.detail); + } + const body = await resp.json(); + const content = get(body, 'data'); + return content; +} + +export const listModels = async (params: unknown) => { const resp = await fetch(`${BASE_API_URL}/model/model_ID`, { method: 'POST', headers: { @@ -105,7 +121,7 @@ export const listModels = async (params: any) => { return body; } -export const listFilesByPath = async (params: any) => { +export const listFilesByPath = async (params: unknown) => { const resp = await fetch(`${BASE_API_URL}/get_project_files`, { method: 'POST', headers: { @@ -119,7 +135,7 @@ export const listFilesByPath = async (params: any) => { } const body = await resp.json(); const _files = get(body, '_files'); - const files = _files.map((_file: any) => { + const files = _files.map((_file: unknown) => { const name = get(_file, '_path'); const size = toNumber(get(_file, '_file_size')); const _is_dir = get(_file, '_is_dir') @@ -135,9 +151,7 @@ export const listFilesByPath = async (params: any) => { return files; } -export const useGetProjectFiles = (paths: string[]) => { - const [files, setFiles] = useState([]); - +export const useGetProjectFiles = () => { const mutation = useMutation({ mutationFn: listFilesByPath }); @@ -145,12 +159,12 @@ export const 
useGetProjectFiles = (paths: string[]) => { if (mutation.isError) { notification.error({ message: 'Error', - description: `An error occurred while fetching the prompt.\n ${mutation.error}` + description: `An error occurred while fetching the list of project files.\n ${mutation.error}` }); } return { listProjectFiles: mutation.mutate, - fetching: mutation.isLoading, + fetching: mutation.isPending, error: mutation.error, isError: mutation.isError, data: mutation.data @@ -158,7 +172,7 @@ export const useGetProjectFiles = (paths: string[]) => { }; - export const fetchDatasetSize = async (params: any) => { + export const fetchDatasetSize = async (params: unknown) => { const resp = await fetch(`${BASE_API_URL}/json/dataset_size`, { method: 'POST', headers: { diff --git a/app/client/src/pages/DataGenerator/types.ts b/app/client/src/pages/DataGenerator/types.ts index 38a8f25..00be477 100644 --- a/app/client/src/pages/DataGenerator/types.ts +++ b/app/client/src/pages/DataGenerator/types.ts @@ -35,7 +35,7 @@ export interface GenDatasetRequest { topics?: string[]; use_case?: Usecases is_demo?: boolean; - results?: any + results?: unknown } export interface GenDatasetResponse { @@ -104,7 +104,8 @@ export interface File { export enum WorkflowType { SUPERVISED_FINE_TUNING = 'supervised-fine-tuning', - CUSTOM_DATA_GENERATION = "custom" + CUSTOM_DATA_GENERATION = "custom", + FREE_FORM_DATA_GENERATION = "freeform" } export interface CustomResult { @@ -114,5 +115,6 @@ export interface CustomResult { export enum TechniqueType { SFT = 'sft', - CUSTOME_WORKFLOW = 'custom_workflow' + CUSTOME_WORKFLOW = 'custom_workflow', + FREE_FORM = 'freeform' } \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/utils.ts b/app/client/src/pages/DataGenerator/utils.ts index ab9f784..6f7e932 100644 --- a/app/client/src/pages/DataGenerator/utils.ts +++ b/app/client/src/pages/DataGenerator/utils.ts @@ -37,3 +37,35 @@ export const fromNow = time => { } return moment(time).fromNow(); }; + +export const sampleExamplesData = [ + { + "loan_amnt": 10000.00, + "term": "36 months", + "int_rate": 11.44, + "installment": 329.48, + "grade": "B", + "sub_grade": "B4", + "emp_title": "Marketing", + "emp_length": "10+ years", + "home_ownership": "RENT", + "annual_inc": 117000.00, + "verification_status": "Not Verified", + "issue_d": "Jan-2015", + "loan_status": "Fully Paid", + "purpose": "vacation", + "title": "Vacation", + "dti": 26.24, + "earliest_cr_line": "Jun-1990", + "open_acc": 16.00, + "pub_rec": 0.00, + "revol_bal": 36369.00, + "revol_util": 41.80, + "total_acc": 25.00, + "initial_list_status": "w", + "application_type": "INDIVIDUAL", + "mort_acc": 0.00, + "pub_rec_bankruptcies": 0.00, + "address": "0185 Michelle Gateway\r\nMendozaberg, OK 22690" + } +]; diff --git a/app/client/src/pages/DatasetDetails/ConfigurationTab.tsx b/app/client/src/pages/DatasetDetails/ConfigurationTab.tsx index 138adbb..b16ec0c 100644 --- a/app/client/src/pages/DatasetDetails/ConfigurationTab.tsx +++ b/app/client/src/pages/DatasetDetails/ConfigurationTab.tsx @@ -6,6 +6,7 @@ import { Col, Flex, Modal, Row, Space, Table, Tag, Typography } from 'antd'; import ExampleModal from './ExampleModal'; import { QuestionSolution } from '../DataGenerator/types'; import styled from 'styled-components'; +import FreeFormExampleTable from '../DataGenerator/FreeFormExampleTable'; const { Text } = Typography; @@ -149,6 +150,8 @@ const ConfigurationTab: React.FC = ({ dataset }) => { Examples + {dataset.technique === 'freeform' && } + {dataset.technique !== 
'freeform' && = ({ dataset }) => { }) })} rowKey={(_record, index) => `summary-examples-table-${index}`} - /> + />} diff --git a/app/client/src/pages/DatasetDetails/CustomGenerationTable.tsx b/app/client/src/pages/DatasetDetails/CustomGenerationTable.tsx index 1e2ec19..776276f 100644 --- a/app/client/src/pages/DatasetDetails/CustomGenerationTable.tsx +++ b/app/client/src/pages/DatasetDetails/CustomGenerationTable.tsx @@ -1,7 +1,5 @@ import React, { SyntheticEvent, useEffect } from 'react'; - import { Col, Input, Row, Table } from 'antd'; -import { CustomResult } from '../DataGenerator/types'; import { DatasetGeneration } from '../Home/types'; import { sortItemsByKey } from '../../utils/sortutils'; import { SearchProps } from 'antd/es/input/Search'; @@ -51,7 +49,7 @@ const CustomGenerationTable: React.FC = ({ results }) => { } ]; - const onSearch: SearchProps['onSearch'] = (value, _e, info) => { + const onSearch: SearchProps['onSearch'] = (value) => { throttle((value: string) => setSearchQuery(value), 500)(value); } diff --git a/app/client/src/pages/DatasetDetails/DatasetDetailsPage.tsx b/app/client/src/pages/DatasetDetails/DatasetDetailsPage.tsx index 3ef5175..b5de8a0 100644 --- a/app/client/src/pages/DatasetDetails/DatasetDetailsPage.tsx +++ b/app/client/src/pages/DatasetDetails/DatasetDetailsPage.tsx @@ -24,21 +24,6 @@ import { getFilesURL } from '../Evaluator/util'; const { Content } = Layout; const { Title } = Typography; - -const StyledHeader = styled.div` - height: 28px; - flex-grow: 0; - font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; - color: #5a656d; - font-size: 24px; - font-weight: 300; - font-stretch: normal; - font-style: normal; - line-height: 1.4; - letter-spacing: normal; - text-align: left; -`; - const StyledLabel = styled.div` margin-bottom: 4px; font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; @@ -221,7 +206,7 @@ const DatasetDetailsPage: React.FC = () => { - Files + Context {/* {dataset?.custom_prompt} */} diff --git a/app/client/src/pages/DatasetDetails/DatasetGenerationTab.tsx b/app/client/src/pages/DatasetDetails/DatasetGenerationTab.tsx index 9341cf2..5e4419f 100644 --- a/app/client/src/pages/DatasetDetails/DatasetGenerationTab.tsx +++ b/app/client/src/pages/DatasetDetails/DatasetGenerationTab.tsx @@ -5,8 +5,8 @@ import styled from 'styled-components'; import { Dataset } from '../Evaluator/types'; import CustomGenerationTable from './CustomGenerationTable'; import DatasetGenerationTopics from './DatasetGenerationTopics'; -import { CustomResult } from "../DataGenerator/types"; import { DatasetDetails, DatasetGeneration } from '../Home/types'; +import DatasetViewer from './DatasetViewer'; @@ -23,19 +23,17 @@ const Container = styled.div` const DatasetGenerationTab: React.FC = ({ dataset, datasetDetails }) => { - console.log(`DatasetGenerationTab > dataset`, dataset); - console.log(` datasetDetails`, datasetDetails); const topics = get(dataset, 'topics', []); - console.log(` topics`, topics); + const technique = get(dataset, 'technique'); const hasCustomSeeds = !Array.isArray(datasetDetails?.generation) || isEmpty(topics) || topics !== null; - console.log(` hasCustomSeeds`, hasCustomSeeds); return ( - {hasCustomSeeds && } - {!hasCustomSeeds && } + {technique === 'freeform' && } + {(technique !== 'freeform' && hasCustomSeeds) && } + {(technique !== 'freeform' && !hasCustomSeeds) && } diff --git a/app/client/src/pages/DatasetDetails/DatasetGenerationTopics.tsx b/app/client/src/pages/DatasetDetails/DatasetGenerationTopics.tsx index 3d5d529..c74bed7 
100644
--- a/app/client/src/pages/DatasetDetails/DatasetGenerationTopics.tsx
+++ b/app/client/src/pages/DatasetDetails/DatasetGenerationTopics.tsx
@@ -1,44 +1,17 @@
 import get from 'lodash/get';
-import { Card, Table, Tabs, Typography } from "antd";
+import { Card, Tabs, Typography } from "antd";
 import { DatasetGeneration } from "../Home/types";
 import TopicGenerationTable from "./TopicGenerationTable";
 import isEmpty from "lodash/isEmpty";
 import styled from "styled-components";
 import { Dataset } from '../Evaluator/types';
+import FreeFormTable from '../DataGenerator/FreeFormTable';
 
 interface Props {
   data: DatasetGeneration;
   dataset: Dataset;
 }
 
-const StyledTable = styled(Table)`
-  font-family: Roboto, -apple-system, 'Segoe UI', sans-serif;
-  color: #5a656d;
-  .ant-table-thead > tr > th {
-    color: #5a656d;
-    border-bottom: 1px solid #eaebec;
-    font-weight: 500;
-    text-align: left;
-    // background: #ffffff;
-    border-bottom: 1px solid #eaebec;
-    transition: background 0.3s ease;
-  }
-  .ant-table-row {
-    cursor: pointer;
-  }
-  .ant-table-row > td.ant-table-cell {
-    padding: 8px;
-    padding-left: 16px;
-    font-size: 13px;
-    font-family: Roboto, -apple-system, 'Segoe UI', sans-serif;
-    color: #5a656d;
-    .ant-typography {
-      font-size: 13px;
-      font-family: Roboto, -apple-system, 'Segoe UI', sans-serif;
-    }
-  }
-`;
-
 const TabsContainer = styled(Card)`
   .ant-card-body {
     padding: 0;
@@ -59,6 +32,7 @@ const getTopicTree = (data: DatasetGeneration, topics: string[]) => {
 const DatasetGenerationTable: React.FC<Props> = ({ data, dataset }) => {
   const topics = get(dataset, 'topics', []);
+  const technique = get(dataset, 'technique');
   const topicTree = getTopicTree(data, topics);
 
   let topicTabs = [];
@@ -67,7 +41,9 @@ const DatasetGenerationTable: React.FC<Props> = ({ data, dataset }) => {
       key: `${topic}-${i}`,
       label: <Typography.Text>{topic}</Typography.Text>,
       value: topic,
-      children: <TopicGenerationTable results={get(topicTree, topic)} />
+      children: technique !== 'freeform' ?
+        <TopicGenerationTable results={get(topicTree, topic)} /> :
+        <FreeFormTable data={get(topicTree, topic)} />
     }));
   }
diff --git a/app/client/src/pages/DatasetDetails/DatasetViewer.tsx b/app/client/src/pages/DatasetDetails/DatasetViewer.tsx
new file mode 100644
index 0000000..7db1530
--- /dev/null
+++ b/app/client/src/pages/DatasetDetails/DatasetViewer.tsx
@@ -0,0 +1,44 @@
+import { FunctionComponent, useEffect } from "react";
+import { Dataset } from '../Evaluator/types';
+import { useMutation } from "@tanstack/react-query";
+import { fetchFileContent } from "../DataGenerator/hooks";
+import get from "lodash/get";
+import isEmpty from "lodash/isEmpty";
+import { Col, Row } from "antd";
+import FreeFormTable from "../DataGenerator/FreeFormTable";
+
+interface Props {
+  dataset: Dataset;
+}
+
+const DatasetViewer: FunctionComponent<Props> = ({ dataset }) => {
+  const mutation = useMutation({
+    mutationFn: fetchFileContent
+  });
+
+  useEffect(() => {
+    const generate_file_name = get(dataset, 'generate_file_name');
+    if (!isEmpty(generate_file_name)) {
+      mutation.mutate({
+        path: generate_file_name
+      });
+    }
+  }, [dataset]);
+
+  return (
+    <Row>
+      <Col sm={24}>
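+        {/* render fetch state while the generated dataset file loads via fetchFileContent */}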
+        {mutation.isPending && <div>Loading...</div>}
+        {mutation.isError && <div>Error: {(mutation.error as Error)?.message}</div>}
+        {mutation.isSuccess && (
+          <FreeFormTable data={mutation.data as Record<string, unknown>[]} />
+        )}
+      </Col>
+    </Row>
+ ); +} +export default DatasetViewer; \ No newline at end of file diff --git a/app/client/src/pages/DatasetDetails/ExampleModal.tsx b/app/client/src/pages/DatasetDetails/ExampleModal.tsx index 8443537..df3123a 100644 --- a/app/client/src/pages/DatasetDetails/ExampleModal.tsx +++ b/app/client/src/pages/DatasetDetails/ExampleModal.tsx @@ -1,4 +1,4 @@ -import { Flex, Form, Typography } from 'antd'; +import { Flex, Typography } from 'antd'; import styled from 'styled-components'; import Markdown from '../../components/Markdown'; diff --git a/app/client/src/pages/DatasetDetails/ExamplesSection.tsx b/app/client/src/pages/DatasetDetails/ExamplesSection.tsx index aaf5d52..9d1eb02 100644 --- a/app/client/src/pages/DatasetDetails/ExamplesSection.tsx +++ b/app/client/src/pages/DatasetDetails/ExamplesSection.tsx @@ -1,15 +1,12 @@ -import { Collapse, Descriptions, Flex, Modal, Table, Typography } from "antd"; +import { Collapse, Flex, Modal, Table } from "antd"; import styled from "styled-components"; -import Markdown from "../../Markdown"; import { DatasetResponse } from "../../../api/Datasets/response"; import { QuestionSolution } from "../../../pages/DataGenerator/types"; -import { MODEL_PARAMETER_LABELS, ModelParameters, Usecases } from "../../../types"; import { Dataset } from "../../../pages/Evaluator/types"; -import PCModalContent from "../../../pages/DataGenerator/PCModalContent"; import ExampleModal from "./ExampleModal"; +import FreeFormExampleTable from "../DataGenerator/FreeFormExampleTable"; -const { Text, Title } = Typography; const Panel = Collapse.Panel; @@ -41,16 +38,7 @@ const StyledTable = styled(Table)` } `; -const MarkdownWrapper = styled.div` - border: 1px solid #d9d9d9; - border-radius: 6px; - padding: 4px 11px; -`; -const StyledLabel = styled.div` - font-size: 16px; - padding-top: 8px; -`; const StyledCollapse = styled(Collapse)` .ant-collapse-content > .ant-collapse-content-box { @@ -74,6 +62,7 @@ export type DatasetDetailProps = { } const ExamplesSection= ({ datasetDetails }: DatasetDetailProps) => { + const { technique } = datasetDetails; const exampleCols = [ { @@ -99,6 +88,11 @@ const ExamplesSection= ({ datasetDetails }: DatasetDetailProps) => { style={{ padding: 0 }} > + {technique === 'freeform' ? ( + + ) : { }) })} rowKey={(_record, index) => `summary-examples-table-${index}`} - /> - - {/* Model Parameters - ({ - label: MODEL_PARAMETER_LABELS[modelParameterKey as ModelParameters], - children: datasetDetails.model_parameters[modelParameterKey as ModelParameters], - })) - : []}> - - {(datasetDetails.schema && datasetDetails.use_case === Usecases.TEXT2SQL) && ( -
-        <StyledLabel>{'DB Schema'}</StyledLabel>
-        <MarkdownWrapper>
-          <Markdown text={datasetDetails.schema} />
-        </MarkdownWrapper>
-      </Flex>
-    )} */}
+    />}
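// A minimal sketch of the row shape the freeform tables above assume: one flat
// JSON object per row, where each key becomes a grid column. The field names
// mirror sampleExamplesData in app/client/src/pages/DataGenerator/utils.ts.
type FreeFormRow = Record<string, string | number | boolean>;

const exampleRows: FreeFormRow[] = [
  { loan_amnt: 10000.0, term: '36 months', int_rate: 11.44, grade: 'B', purpose: 'vacation' },
];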
diff --git a/app/client/src/pages/DatasetDetails/TopicGenerationTable.tsx b/app/client/src/pages/DatasetDetails/TopicGenerationTable.tsx index 3626f55..a9c2fc7 100644 --- a/app/client/src/pages/DatasetDetails/TopicGenerationTable.tsx +++ b/app/client/src/pages/DatasetDetails/TopicGenerationTable.tsx @@ -1,7 +1,5 @@ import React, { SyntheticEvent, useEffect } from 'react'; - import { Col, Input, Row, Table } from 'antd'; -import { CustomResult } from '../DataGenerator/types'; import { DatasetGeneration } from '../Home/types'; import throttle from 'lodash/throttle'; import { SearchProps } from 'antd/es/input'; @@ -52,7 +50,7 @@ const TopicGenerationTable: React.FC = ({ results }) => { } ]; - const onSearch: SearchProps['onSearch'] = (value, _e, info) => { + const onSearch: SearchProps['onSearch'] = (value) => { throttle((value: string) => setSearchQuery(value), 500)(value); } diff --git a/app/client/src/pages/DatasetDetails/constants.tsx b/app/client/src/pages/DatasetDetails/constants.tsx index 4faa3b4..4446138 100644 --- a/app/client/src/pages/DatasetDetails/constants.tsx +++ b/app/client/src/pages/DatasetDetails/constants.tsx @@ -1,6 +1,4 @@ -import { HomeOutlined, PageviewOutlined } from '@mui/icons-material'; import AssessmentIcon from '@mui/icons-material/Assessment'; -import CheckCircleIcon from '@mui/icons-material/CheckCircle' import GradingIcon from '@mui/icons-material/Grading'; import ModelTrainingIcon from '@mui/icons-material/ModelTraining'; diff --git a/app/client/src/pages/EvaluationDetails/EvaluationConfigurationTab.tsx b/app/client/src/pages/EvaluationDetails/EvaluationConfigurationTab.tsx index c445348..f4dff95 100644 --- a/app/client/src/pages/EvaluationDetails/EvaluationConfigurationTab.tsx +++ b/app/client/src/pages/EvaluationDetails/EvaluationConfigurationTab.tsx @@ -1,8 +1,6 @@ import { Badge, Col, Flex, Modal, Row, Table, Typography } from "antd"; import { Evaluation } from "../Evaluator/types"; import styled from "styled-components"; -import { QuestionSolution } from "../DataGenerator/types"; -import isEmpty from "lodash/isEmpty"; import ExampleModal from "../DatasetDetails/ExampleModal"; import { getColorCode } from "../Evaluator/util"; diff --git a/app/client/src/pages/EvaluationDetails/EvaluationGenerationTab.tsx b/app/client/src/pages/EvaluationDetails/EvaluationGenerationTab.tsx index d9dfcbd..d2e5a7d 100644 --- a/app/client/src/pages/EvaluationDetails/EvaluationGenerationTab.tsx +++ b/app/client/src/pages/EvaluationDetails/EvaluationGenerationTab.tsx @@ -5,6 +5,7 @@ import { Dataset, Evaluation, EvaluationDetails } from '../Evaluator/types'; import styled from 'styled-components'; import { getTopicMap } from '../Evaluator/util'; import EvaluateTopicTable from '../Evaluator/EvaluateTopicTable'; +import FreeFormEvaluationTable from '../Evaluator/FreeFromEvaluationTable'; interface Props { @@ -18,12 +19,13 @@ const Container = styled.div` background-color: #ffffff; `; -const EvaluationGenerationTab: React.FC = ({ dataset, evaluation, evaluationDetails }) => { +const EvaluationGenerationTab: React.FC = ({ dataset, evaluationDetails }) => { const result = get(evaluationDetails, 'evaluation'); + const isFreeForm = get(dataset, 'technique' , false) === 'freeform'; - let topicTabs: any[] = []; + let topicTabs: unknown[] = []; const { topics, topicMap } = getTopicMap({ result }); - if (dataset.topics !== null && !isEmpty(dataset.topics)) { + if (dataset.topics !== null && !isEmpty(dataset.topics) && !isFreeForm) { topicTabs = topics.map((topicName: string, index: 
number) => ({ key: `${topicName}-${index}`, label: topicName, @@ -32,7 +34,7 @@ const EvaluationGenerationTab: React.FC = ({ dataset, evaluation, evaluat })); } - if (isEmpty(topicTabs)) { + if (isEmpty(topicTabs) && !isFreeForm) { const values = Object.values(topicMap); return ( @@ -43,6 +45,16 @@ const EvaluationGenerationTab: React.FC = ({ dataset, evaluation, evaluat ); } + if (isFreeForm) { + return ( + + + + + + ); + } + return ( diff --git a/app/client/src/pages/EvaluationDetails/hooks.ts b/app/client/src/pages/EvaluationDetails/hooks.ts index 7438e51..08a70a4 100644 --- a/app/client/src/pages/EvaluationDetails/hooks.ts +++ b/app/client/src/pages/EvaluationDetails/hooks.ts @@ -31,11 +31,7 @@ const fetchEvaluationDetails = async (evaluate_file_name: string) => { queryKey: ['data', fetchEvaluationDetails], queryFn: () => fetchEvaluationDetails(generate_file_name), placeholderData: (previousData) => previousData - }); - - // const dataset = get(data, 'dataset'); - console.log('data:', data); - console.log('error:', error); + }); if (error) { notification.error({ diff --git a/app/client/src/pages/Evaluator/EvaluateExampleTable.tsx b/app/client/src/pages/Evaluator/EvaluateExampleTable.tsx index 6a8761a..53cf280 100644 --- a/app/client/src/pages/Evaluator/EvaluateExampleTable.tsx +++ b/app/client/src/pages/Evaluator/EvaluateExampleTable.tsx @@ -5,7 +5,7 @@ import { Dataset, EvaluateExample, EvaluateExampleRecord } from "./types"; import React, { useEffect, useState } from 'react'; -import { DeleteOutlined, EditOutlined, Add } from "@mui/icons-material"; +import { DeleteOutlined, EditOutlined } from "@mui/icons-material"; import TooltipIcon from "../../components/TooltipIcon"; import StyledTitle from "./StyledTitle"; import styled from "styled-components"; @@ -79,7 +79,7 @@ const EvaluateExampleTable: React.FC = ({ examples, form }) => { } const onDelete = (index: number) => { - let _promptExamples = clone(evaluateExamples); + const _promptExamples = clone(evaluateExamples); pullAt(_promptExamples, index); setEvaluateExamples(_promptExamples); } @@ -142,7 +142,7 @@ const EvaluateExampleTable: React.FC = ({ examples, form }) => {
- diff --git a/app/client/src/pages/Evaluator/EvaluatorPage.tsx b/app/client/src/pages/Evaluator/EvaluatorPage.tsx index b1f9abb..9864420 100644 --- a/app/client/src/pages/Evaluator/EvaluatorPage.tsx +++ b/app/client/src/pages/Evaluator/EvaluatorPage.tsx @@ -1,8 +1,6 @@ import get from 'lodash/get'; -import set from 'lodash/set'; import isEmpty from 'lodash/isEmpty'; import React, { useEffect, useState } from 'react'; -import { useMutation } from '@tanstack/react-query'; import { useParams } from 'react-router-dom'; import { ModelParameters } from '../../types'; import { Button, Form, FormInstance, Result } from 'antd'; @@ -34,7 +32,7 @@ const EvaluatorPage: React.FC = () => { const values = form.getFieldsValue(); form.setFieldsValue({ ...values, - custom_prompt: '' || prompt, + custom_prompt: prompt || '', top_p: get(parameters, 'top_p'), top_k: get(parameters, 'top_k'), min_p: get(parameters, 'min_p'), @@ -48,16 +46,18 @@ const EvaluatorPage: React.FC = () => { } }, [dataset]); -const mutation = useMutation(async (formData) => { - const response = await fetch(`${BASE_API_URL}/synthesis/evaluate`, { - method: 'POST', - body: JSON.stringify(formData), - }); - return response.json(); - }); +// const mutation = useMutation(async (formData) => { +// const response = await fetch(`${BASE_API_URL}/synthesis/evaluate`, { +// method: 'POST', +// body: JSON.stringify(formData), +// }); +// return response.json(); +// }); -const evaluateDataset = async (formData: any) => { - const response = await fetch(`${BASE_API_URL}/synthesis/evaluate`, { +const evaluateDataset = async (formData: unknown) => { + const url = dataset.technique === 'freeform' ? + `${BASE_API_URL}/synthesis/evaluate_freeform` : `${BASE_API_URL}/synthesis/evaluate`; + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json', diff --git a/app/client/src/pages/Evaluator/EvaluatorSuccess.tsx b/app/client/src/pages/Evaluator/EvaluatorSuccess.tsx index 06ece55..106c43d 100644 --- a/app/client/src/pages/Evaluator/EvaluatorSuccess.tsx +++ b/app/client/src/pages/Evaluator/EvaluatorSuccess.tsx @@ -1,6 +1,6 @@ import get from 'lodash/get'; import isEmpty from 'lodash/isEmpty'; -import React, { useState } from 'react'; +import React from 'react'; import { Link } from 'react-router-dom'; import { Avatar, Button, Card, Flex, Layout, List, Tabs, Typography } from 'antd'; import CheckCircleIcon from '@mui/icons-material/CheckCircle'; @@ -13,13 +13,14 @@ import { getProjectJobsUrl } from './hooks'; import { Dataset } from './types'; import { WorkflowType } from '../DataGenerator/types'; import SeedEvaluateTable from './SeedEvaluateTable'; +import FreeFromEvaluationTable from './FreeFromEvaluationTable'; const { Content } = Layout; const { Title } = Typography; interface Props { - result: any; + result: unknown; demo: boolean; dataset: Dataset; } @@ -40,10 +41,13 @@ const StyleContent = styled(Content)` const EvaluatorSuccess: React.FC = ({ result, dataset, demo }) => { - const hasTopics = (result: any) => { - return !Array.isArray(result?.results) + const hasTopics = (result: unknown) => { + return !Array.isArray(result?.results); } + const isFreeForm = (dataset: Dataset) => + dataset?.technique === 'freeform'; + const hasCustomSeed = (_dataset: Dataset) => (_dataset?.technique === 'sft' && !isEmpty(_dataset?.doc_paths)) || (_dataset?.technique === WorkflowType.CUSTOM_DATA_GENERATION && !isEmpty(_dataset?.input_path)) @@ -88,9 +92,14 @@
{'Your dataset evaluation was successfully generated. You can review your evaluation in the table below.'} - {!isCustom && !isEmpty(topicTabs) && - - } + {!isCustom && !isEmpty(topicTabs) && !isFreeForm(dataset) && + + + } + {isFreeForm(dataset) && + + + } } {isCustom && <> diff --git a/app/client/src/pages/Evaluator/FreeFromEvaluationTable.tsx b/app/client/src/pages/Evaluator/FreeFromEvaluationTable.tsx new file mode 100644 index 0000000..6e905a8 --- /dev/null +++ b/app/client/src/pages/Evaluator/FreeFromEvaluationTable.tsx @@ -0,0 +1,140 @@ +import first from 'lodash/first'; +import isEmpty from 'lodash/isEmpty'; +import React, { useCallback, useEffect, useMemo, useState } from 'react'; +import { AgGridReact } from 'ag-grid-react'; +import Paragraph from 'antd/es/typography/Paragraph'; +// import { TextFilterModule } from 'ag-grid-community'; +// import { NumberFilterModule } from 'ag-grid-community'; +// import { DateFilterModule } from 'ag-grid-community'; +import { + // ModuleRegistry, + // ClientSideRowModelModule, + // ValidationModule, + type ColDef, + type GetRowIdFunc, + type GetRowIdParams + } from 'ag-grid-community'; + + import { themeMaterial } from "ag-grid-community"; +import get from 'lodash/get'; +import { getColorCode } from './util'; +import { Badge, Popover, Tooltip } from 'antd'; +import styled from 'styled-components'; + +interface Props { + data: unknown[]; +} + +const StyledParagraph = styled(Paragraph)` + font-size: 13px; + font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; + color: #5a656d; `; + +const FreeFormEvaluationTable: React.FC = ({ data }) => { + const [colDefs, setColDefs] = useState([]); + const [rowData, setRowData] = useState([]); + + useEffect(() => { + if (!isEmpty(data)) { + const rows = data.map((item) => { + const row = get(item, 'row'); + return { + score: get(item, 'evaluation.score'), + justification: get(item, 'evaluation.justification'), + ...row + } + + }); + + const columnNames = Object.keys(first(rows)); + const columnDefs = columnNames.map((colName) => { + const columnDef = { + field: colName, + headerName: colName, + width: 250, + filter: true, + sortable: true, + resizable: true + } + if (colName === 'score') { + columnDef['width'] = 120 + columnDef['cellRenderer'] = (params: unknown) => { + return <Badge count={params.value} color={getColorCode(params.value)} /> + } + } else if (colName === 'justification') { + columnDef['cellRenderer'] = (params: unknown) => ( + <Popover content={params.value}> + <StyledParagraph>{params.value}</StyledParagraph> + </Popover> + ); + } + + return columnDef; + }); + setColDefs(columnDefs); + setRowData(rows); + } + }, [data]); + + const defaultColDef: ColDef = useMemo( + () => ({ + flex: 1, + filter: true, + enableRowGroup: true, + enableValue: true, + editable: true, + minWidth: 170 + }), + [] + ); + + let index = 0; + const getRowId = useCallback( + ({ data: { ticker } }: GetRowIdParams) => { + index++; + return ticker || String(index); + }, + [] + ); + + const statusBar = useMemo( + () => ({ + statusPanels: [ + { statusPanel: "agTotalAndFilteredRowCountComponent" }, + { statusPanel: "agTotalRowCountComponent" }, + { statusPanel: "agFilteredRowCountComponent" }, + { statusPanel: "agSelectedRowCountComponent" }, + { statusPanel: "agAggregationComponent" }, + ], + }), + [] + ); + + + return ( + <>
+ <AgGridReact rowData={rowData} columnDefs={colDefs} defaultColDef={defaultColDef} getRowId={getRowId} statusBar={statusBar} theme={themeMaterial} />
+ + ); +} + +export default FreeFormEvaluationTable; \ No newline at end of file diff --git a/app/client/src/pages/Evaluator/GeneratedEvaluationModal.tsx b/app/client/src/pages/Evaluator/GeneratedEvaluationModal.tsx index 3df4f4e..057fcc0 100644 --- a/app/client/src/pages/Evaluator/GeneratedEvaluationModal.tsx +++ b/app/client/src/pages/Evaluator/GeneratedEvaluationModal.tsx @@ -2,14 +2,12 @@ import get from 'lodash/get'; import isString from 'lodash/isString'; import React from 'react'; import { EvaluatedPair } from "./types"; -import { Badge, Button, Flex, Layout, Modal, Tooltip } from 'antd'; +import { Badge, Button, Flex, Modal, Tooltip } from 'antd'; import { QuestionCircleOutlined } from '@ant-design/icons'; import styled from 'styled-components'; import Markdown from '../../components/Markdown'; import { getColorCode } from './util'; -const { Content } = Layout; - interface Props { evaluatedPair: EvaluatedPair; onClose: () => void; diff --git a/app/client/src/pages/Evaluator/ReevaluatorPage.tsx b/app/client/src/pages/Evaluator/ReevaluatorPage.tsx index 8d9e841..d8f3a97 100644 --- a/app/client/src/pages/Evaluator/ReevaluatorPage.tsx +++ b/app/client/src/pages/Evaluator/ReevaluatorPage.tsx @@ -21,8 +21,7 @@ const ReevaluatorPage: React.FC = () => { evaluate, dataset, prompt, - examples, - isLoading + examples } = useGetEvaluate(evaluate_file_name as string); const modelsReq = useModels(); @@ -33,9 +32,8 @@ const ReevaluatorPage: React.FC = () => { useEffect(() => { if (!isEmpty(evaluate)) { const parameters: ModelParameters = get(evaluate, 'model_parameters'); - console.log('parameters', parameters); const values = form.getFieldsValue(); - console.log('prompt', prompt); + form.setFieldsValue({ ...values, display_name: get(evaluate, 'display_name'), @@ -52,7 +50,7 @@ const ReevaluatorPage: React.FC = () => { } }, [evaluate]); - const evaluateDataset = async (formData: any) => { + const evaluateDataset = async (formData: unknown) => { const response = await fetch(`${BASE_API_URL}/synthesis/evaluate`, { method: 'POST', headers: { diff --git a/app/client/src/pages/Evaluator/SeedEvaluateTable.tsx b/app/client/src/pages/Evaluator/SeedEvaluateTable.tsx index 668667c..9b9f877 100644 --- a/app/client/src/pages/Evaluator/SeedEvaluateTable.tsx +++ b/app/client/src/pages/Evaluator/SeedEvaluateTable.tsx @@ -6,7 +6,7 @@ import { getColorCode } from './util'; import { Badge, Table } from 'antd'; interface Props { - results: any; + results: unknown; } const SeedEvaluateTable: React.FC = ({ results }) => { @@ -17,10 +17,10 @@ const SeedEvaluateTable: React.FC = ({ results }) => { } const seeds = Object.values(result); const data = []; - forEach(seeds, (seed: any) => { + forEach(seeds, (seed: unknown) => { const pairs = get(seed, `evaluated_pairs`); if (Array.isArray(pairs)) { - forEach(pairs, (pair: any) => { + forEach(pairs, (pair: unknown) => { data.push({ seed, question: get(pair, `question`), diff --git a/app/client/src/pages/Home/DatasetsTab.tsx b/app/client/src/pages/Home/DatasetsTab.tsx index 84b54ea..0149df5 100644 --- a/app/client/src/pages/Home/DatasetsTab.tsx +++ b/app/client/src/pages/Home/DatasetsTab.tsx @@ -86,7 +86,7 @@ const DatasetsTab: React.FC = () => { } }, [exportResult, notificationInstance]) - const onSearch: SearchProps['onSearch'] = (value, _e, info) => { + const onSearch: SearchProps['onSearch'] = (value: unknown) => { throttle((value: string) => setSearchQuery(value), 500)(value); } @@ -116,12 +116,14 @@ const DatasetsTab: React.FC = () => { title: 'Dataset Name', dataIndex: 
'generate_file_name', sorter: sortItemsByKey('generate_file_name'), + width: 250, render: (generate_file_name) => {generate_file_name} }, { key: 'model_id', title: 'Model', dataIndex: 'model_id', sorter: sortItemsByKey('model_id'), + width: 250, render: (modelId) => {modelId} }, { key: 'num_questions', diff --git a/app/client/src/pages/Home/EvaluateButton.tsx b/app/client/src/pages/Home/EvaluateButton.tsx index 4cb5d32..f9ea18a 100644 --- a/app/client/src/pages/Home/EvaluateButton.tsx +++ b/app/client/src/pages/Home/EvaluateButton.tsx @@ -8,7 +8,6 @@ import { isEmpty } from "lodash"; import { Dataset } from "../Evaluator/types"; import { Pages } from "../../types"; -const { Option } = Select; const EvaluateButton: React.FC = () => { const [form] = Form.useForm(); @@ -40,7 +39,7 @@ const EvaluateButton: React.FC = () => { } } - const options = datasets.map((dataset: any) => ({ + const options = datasets.map((dataset: unknown) => ({ value: dataset.display_name, label: dataset.display_name, key: `${dataset?.display_name}-${dataset?.generate_file_name}` diff --git a/app/client/src/pages/Home/EvaluationsTab.tsx b/app/client/src/pages/Home/EvaluationsTab.tsx index 62e8510..36b786f 100644 --- a/app/client/src/pages/Home/EvaluationsTab.tsx +++ b/app/client/src/pages/Home/EvaluationsTab.tsx @@ -65,7 +65,7 @@ const EvaluationsTab: React.FC = () => { } }, [isError]); - const onSearch: SearchProps['onSearch'] = (value, _e, info) => { + const onSearch: SearchProps['onSearch'] = (value: unknown) => { throttle((value: string) => setSearchQuery(value), 500)(value); } diff --git a/app/client/src/pages/Home/hooks.ts b/app/client/src/pages/Home/hooks.ts index b588282..dee238e 100644 --- a/app/client/src/pages/Home/hooks.ts +++ b/app/client/src/pages/Home/hooks.ts @@ -38,7 +38,7 @@ export const useDatasets = () => { } ); if (searchQuery !== null && !isEmpty(searchQuery)) { - const filteredData = data?.datasets.filter((dataset: any) => { + const filteredData = data?.datasets.filter((dataset: unknown) => { return dataset.display_name.toLowerCase().includes(searchQuery.toLowerCase()); }); @@ -71,7 +71,7 @@ export const useEvaluations = () => { } ); if (searchQuery !== null && !isEmpty(searchQuery)) { - const filteredData = data?.evaluations.filter((evaluation: any) => { + const filteredData = data?.evaluations.filter((evaluation: unknown) => { return evaluation.display_name.toLowerCase().includes(searchQuery.toLowerCase()); }); diff --git a/app/client/src/utils/sortutils.ts b/app/client/src/utils/sortutils.ts index 396e14d..1754c7e 100644 --- a/app/client/src/utils/sortutils.ts +++ b/app/client/src/utils/sortutils.ts @@ -1,5 +1,5 @@ type Key = string | number; -type Item = { [x: string]: any }; +type Item = { [x: string]: unknown }; /** * Helper function to use as a comparer when sorting an array of items based on a key in the item. 
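A note on the `onSearch` handlers touched in the client changes above (`TopicGenerationTable`, `DatasetsTab`, `EvaluationsTab`): `throttle((value: string) => setSearchQuery(value), 500)(value)` constructs a fresh throttled wrapper on every invocation, so the 500 ms window never carries over between keystrokes and nothing is actually throttled. The client is TypeScript, but the pitfall is language-agnostic; here is a minimal Python sketch (the `throttle` decorator below is hypothetical, written only to illustrate the per-call-state point):

```python
import time

def throttle(wait: float):
    """Hypothetical stand-in for lodash's throttle: drop calls that arrive
    within `wait` seconds of the last accepted one."""
    def decorate(fn):
        last_accepted = [float("-inf")]
        def wrapper(*args, **kwargs):
            now = time.monotonic()
            if now - last_accepted[0] >= wait:
                last_accepted[0] = now
                return fn(*args, **kwargs)
        return wrapper
    return decorate

calls = []

# Broken: a fresh wrapper per keystroke, so its state never persists and
# every call slips through -- this mirrors throttle(...)(value) in a handler.
for query in ["a", "ab", "abc"]:
    throttle(0.5)(calls.append)(query)
assert calls == ["a", "ab", "abc"]

# Fixed: create the throttled function once, then reuse it.
calls.clear()
throttled_append = throttle(0.5)(calls.append)
for query in ["a", "ab", "abc"]:
    throttled_append(query)
assert calls == ["a"]  # the two rapid follow-up calls were dropped
```

On the React side the equivalent fix is to create the throttled callback once (for example with `useMemo` or `useRef`) rather than inside the handler.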
diff --git a/app/core/config.py b/app/core/config.py index efd3405..ea8d6f1 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -5,6 +5,10 @@ import requests import json from fastapi.responses import JSONResponse +import os +from pathlib import Path +from dotenv import load_dotenv +from fastapi import HTTPException, status +load_dotenv() class UseCase(str, Enum): CODE_GENERATION = "code_generation" @@ -281,18 +285,144 @@ def get_examples_for_topic(use_case: UseCase, topic: str) -> List[Dict[str, str] } } +JWT_PATH = Path("/tmp/jwt") + +def _get_caii_token() -> str: + if (tok := os.getenv("CDP_TOKEN")): + return tok + try: + payload = json.loads(open(JWT_PATH).read()) + except FileNotFoundError: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="No CDP_TOKEN env var and no /tmp/jwt file") + except json.JSONDecodeError: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Malformed /tmp/jwt") + + if not (tok := payload.get("access_token")): + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="access_token missing in /tmp/jwt") + return tok + +def caii_check(endpoint: str, timeout: int = 3) -> requests.Response: + """ + Return the GET /models response if everything is healthy. + Raise HTTPException on *any* problem. + """ + if not endpoint: + raise HTTPException(400, "CAII endpoint not provided") + + token = _get_caii_token() + url = endpoint.removesuffix("/chat/completions") + "/models" + + try: + r = requests.get(url, + headers={"Authorization": f"Bearer {token}"}, + timeout=timeout) + except requests.exceptions.RequestException as exc: + raise HTTPException(503, f"CAII endpoint unreachable: {exc}") + + if r.status_code in (401, 403): + raise HTTPException(403, "Token is valid but has no access to this environment") + if r.status_code == 404: + raise HTTPException(404, "CAII endpoint or resource not found") + if 500 <= r.status_code < 600: + raise HTTPException(503, "CAII endpoint is downscaled; retry in ~15 min") + if r.status_code != 200: + raise HTTPException(r.status_code, r.text) + + return r + +LENDING_DATA_PROMPT = """ + Create profile data for the LendingClub company which specialises in lending various types of loans to urban customers. + + Background: + LendingClub is a peer-to-peer lending platform connecting borrowers with investors. The dataset captures loan applications, + borrower profiles, and outcomes to assess credit risk, predict defaults, and determine interest rates. + + + Loan Record field: + + Each generated record must include the following fields in the exact order provided, with values generated as specified: + + - loan_amnt: The listed amount of the loan applied for by the borrower. If at some point in time, the credit department + reduces the loan amount, then it will be reflected in this value. + - term: The number of payments on the loan. Values are in months and can be either " 36 months" or " 60 months". + - int_rate: Interest Rate on the loan + - installment: The monthly payment owed by the borrower if the loan originates. + - grade: LC assigned loan grade (Possible values: A, B, C, D, E, F, G) + - sub_grade: LC assigned loan subgrade (Possible sub-values: 1-5 i.e. A5) + - emp_title: The job title supplied by the Borrower when applying for the loan. + - emp_length: Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 + means ten or more years.
+ - home_ownership: The home ownership status provided by the borrower during registration or obtained from the credit report. + Possible values are: RENT, OWN, MORTGAGE, ANY, OTHER + - annual_inc: The self-reported annual income provided by the borrower during registration. + - verification_status: Indicates if income was verified by LC, not verified, or if the income source was verified + - issue_d: The month which the loan was funded + - loan_status: Current status of the loan (Possible values: "Fully Paid", "Charged Off") + - purpose: A category provided by the borrower for the loan request. + - title: The loan title provided by the borrower + - dti: A ratio calculated using the borrower's total monthly debt payments on the total debt obligations, excluding mortgage + and the requested LC loan, divided by the borrower's self-reported monthly income. + - earliest_cr_line: The month the borrower's earliest reported credit line was opened + - open_acc: The number of open credit lines in the borrower's credit file. + - pub_rec: Number of derogatory public records + - revol_bal: Total credit revolving balance + - revol_util: Revolving line utilization rate, or the amount of credit the borrower is using relative to all available + revolving credit. + - total_acc: The total number of credit lines currently in the borrower's credit file + - initial_list_status: The initial listing status of the loan. Possible values are: w, f + - application_type: Indicates whether the loan is an individual application or a joint application with two co-borrowers + - mort_acc: Number of mortgage accounts. + - pub_rec_bankruptcies: Number of public record bankruptcies + - address: The physical address of the person + + In addition to the definitions above, when generating samples, adhere to the following guidelines: + + Privacy Compliance guidelines: + 1) Ensure PII from examples such as addresses is not used in the generated data to minimize any privacy concerns. + 2) Avoid real PII in addresses. Use generic street names and cities. + + Formatting guidelines: + 1) Use consistent decimal precision (e.g., "10000.00" for loan_amnt). + 2) Dates (e.g. issue_d, earliest_cr_line) should follow the "Jan-YYYY" format. + 3) term has a leading space before the number of months (i.e. " 36 months") + 4) The address field is a special case where the State zipcode needs to be exactly as specified in the seed instructions. + The person's address must follow the format as specified in the examples with the State zipcode coming last. + 5) Any other formatting guidelines that can be inferred from the examples or field definitions but are not listed above. + + Cross-row guidelines: + 1) Generated data should maintain consistency with all statistical parameters and distributions defined in the seed instruction + across records (e.g., 60% of `term` as " 36 months"). + + Cross-column guidelines: + 1) Ensure logical and realistic consistency and correlations between variables. Examples include but are not limited to: + a) Grade/Sub-grade consistency: Sub-grade must match the grade (e.g., "B" grade → "B1" to "B5"). + b) Interest Rate vs Grade/Subgrade relationship: Higher subgrades (e.g., A5) could have higher `int_rate` than lower subgrades (e.g., A3). + c) Mortgage Consistency: `mort_acc` should be 1 or more if `home_ownership` is `MORTGAGE`. + d) Open Accounts: `open_acc` ≤ `total_acc`.
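Before the prompt's remaining guideline sections below, one hypothetical record that satisfies the field list and the formatting and cross-column rules above may help anchor them (illustrative values only, not drawn from any real LendingClub data):

```python
# Hypothetical record consistent with the LENDING_DATA_PROMPT guidelines above
sample_record = {
    "loan_amnt": 10000.00,           # consistent decimal precision
    "term": " 36 months",            # leading space, per formatting guideline 3
    "int_rate": 11.44,
    "installment": 329.48,           # roughly 10000 at 11.44% over 36 months
    "grade": "B",
    "sub_grade": "B4",               # sub_grade agrees with grade
    "emp_title": "Marketing Analyst",
    "emp_length": 10,                # ten or more years
    "home_ownership": "RENT",
    "annual_inc": 117000.00,
    "verification_status": "Not Verified",
    "issue_d": "Jan-2015",           # "Jan-YYYY" format
    "loan_status": "Fully Paid",
    "purpose": "vacation",
    "title": "Vacation",
    "dti": 26.24,
    "earliest_cr_line": "Jun-1990",
    "open_acc": 16,                  # open_acc <= total_acc
    "pub_rec": 0,
    "revol_bal": 36369.00,
    "revol_util": 41.8,
    "total_acc": 25,
    "initial_list_status": "w",
    "application_type": "INDIVIDUAL",
    "mort_acc": 0,                   # 0 is consistent with RENT
    "pub_rec_bankruptcies": 0,
    "address": "174 Atwood Lane\nGreenville, OK 22690",  # generic, state + zip last
}
```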
+ + Data distribution guidelines: + 1) Continuous Variables (e.g., `loan_amnt`, `annual_inc`): Adhere to the mean and standard deviation given in the seed + instructions for each variable. + 2) Categorical variables (e.g., `term`, `home_ownership`): Use probability distributions given in the seed instructions + (e.g. 60% for " 36 months", 40% for " 60 months"). + 3) Discrete Variables (e.g., `pub_rec`, `mort_acc`): Adhere to value ranges and statistical parameters + provided in the seed instructions. + 4) Any other logical data distribution guidelines that can be inferred from the seed instructions or field definitions + and are not specified above. + + Background knowledge and realism guidelines: + 1) Ensure fields such as interest rates reflect real-world interest rates at the time the loan is issued. + 2) Generate values that are plausible (e.g., `annual_inc` ≤ $500,000 for most `emp_length` ranges). + 3) Avoid unrealistic values (e.g., `revol_util` as "200%" is unrealistic). + 4) Ensure that the generated data is realistic and plausible, avoiding extreme or impossible values. + 5) Ensure that the generated data is diverse and not repetitive, avoiding identical or very similar records. + 6) Ensure that the generated data is coherent and consistent, avoiding contradictions or inconsistencies between fields. + 7) Ensure that the generated data is relevant to the LendingClub use case and adheres to the guidelines provided.""" -def caii_check(caii_endpoint): - API_KEY = json.load(open("/tmp/jwt"))["access_token"] - headers = { - "Authorization": f"Bearer {API_KEY}" - } - - - if caii_endpoint: - caii_endpoint = caii_endpoint.removesuffix('/chat/completions') - caii_endpoint = caii_endpoint + "/models" - response = requests.get(caii_endpoint, headers=headers, timeout=3) # Will raise RequestException if fails - - return response diff --git a/app/core/data_analyser.py b/app/core/data_analyser.py new file mode 100644 index 0000000..977de10 --- /dev/null +++ b/app/core/data_analyser.py @@ -0,0 +1,357 @@ +import pandas as pd +import numpy as np +import warnings +from typing import Dict, List, Any, Union, Optional, Tuple +import math + +class DataAnalyser: + """Utility class for analyzing datasets and providing statistical insights.""" + + @classmethod + def analyse(cls, df: pd.DataFrame, correlation_threshold: float = 0.7) -> Dict[str, Any]: + """ + Analyze a DataFrame and extract useful statistics and insights. 
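A usage sketch for the `analyse` entry point whose signature appears above (the frame is invented, and it assumes the `app.core.data_analyser` module added in this diff is importable): the return value is a plain dict keyed by the five sections initialized in the method body.

```python
import pandas as pd
from app.core.data_analyser import DataAnalyser

# Invented 12-row frame: one numeric, one categorical, one datetime column
df = pd.DataFrame({
    "loan_amnt": [10000.0, 12000.0] * 6,
    "grade": ["A", "B"] * 6,  # 2/12 unique values -> treated as categorical
    "issue_d": pd.date_range("2015-01-01", periods=12, freq="MS"),
})

report = DataAnalyser.analyse(df, correlation_threshold=0.7)
print(sorted(report))
# ['columns', 'cross_column_relationship', 'cross_row_relationship',
#  'grp_columns', 'statistical_analysis']
print(report["grp_columns"]["categorical"])                             # ['grade']
print(report["statistical_analysis"]["numeric"]["loan_amnt"]["mean"])   # 11000.0
```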
+ + Args: + df: Input DataFrame to analyze + correlation_threshold: Threshold for identifying strong correlations + + Returns: + Dictionary containing analysis results + """ + print("Analyzing data...") + + # Initialize results structure + results = {"columns": [], + "grp_columns": {}, + "statistical_analysis": {}, + "cross_row_relationship": {}, + "cross_column_relationship": {} + } + + # Categorize columns + results["grp_columns"] = cls.categorize_columns(df) + results["columns"]= df.columns.tolist() + + # Analyze each type of column + stats = {} + if results["grp_columns"]["numeric"]: + stats["numeric"] = cls.analyze_numeric_columns(df, results["grp_columns"]["numeric"]) + + if results["grp_columns"]["categorical"]: + stats["categorical"] = cls.analyze_categorical_columns(df, results["grp_columns"]["categorical"]) + + if results["grp_columns"]["datetime"]: + stats["datetime"] = cls.analyze_datetime_columns(df, results["grp_columns"]["datetime"]) + + results["statistical_analysis"] = stats + + # Analyze cross-row relationships + results["cross_row_relationship"] = cls.analyze_cross_row_relationships(df) + + # Analyze cross-column relationships + if results["grp_columns"]["numeric"] and len(results["grp_columns"]["numeric"]) > 1: + results["cross_column_relationship"] = cls.analyze_cross_column_relationships( + df, results["grp_columns"]["numeric"], correlation_threshold + ) + + return results + + @classmethod + def categorize_columns(cls, df: pd.DataFrame) -> Dict[str, List[str]]: + """ + Categorize DataFrame columns by their data types. + + Args: + df: Input DataFrame + + Returns: + Dictionary mapping column types to lists of column names + """ + result = { + "numeric": [], + "categorical": [], + "datetime": [], + "text": [], + "other": [] + } + + for col in df.columns: + column = df[col] + + # Check if already datetime type - most reliable method + if pd.api.types.is_datetime64_any_dtype(column): + result["datetime"].append(col) + + # Check numeric types + elif pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column): + result["numeric"].append(col) + + # Check categorical and boolean + elif pd.api.types.is_categorical_dtype(column) or pd.api.types.is_bool_dtype(column): + result["categorical"].append(col) + + # Check for text columns + elif pd.api.types.is_string_dtype(column) or pd.api.types.is_object_dtype(column): + # Check if more than 50% of non-null values are likely categorical (few unique values) + non_null_count = column.count() + if non_null_count > 0: + unique_ratio = column.nunique() / non_null_count + if unique_ratio < 0.2: # If less than 20% of values are unique, consider categorical + result["categorical"].append(col) + else: + result["text"].append(col) + else: + result["text"].append(col) + + # Everything else + else: + result["other"].append(col) + + # Verify all columns are categorized + categorized = [] + for category, cols in result.items(): + categorized.extend(cols) + + missing = set(df.columns) - set(categorized) + if missing: + print(f"Found uncategorized columns: {missing}") + result["other"].extend(list(missing)) + + return result + + @classmethod + def analyze_numeric_columns(cls, df: pd.DataFrame, numeric_columns: List[str]) -> Dict[str, Dict[str, Any]]: + """ + Analyze numeric columns to extract statistical information. 
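Stepping back to `categorize_columns` above: the object-column branch rests on a single heuristic, namely that a text column is treated as categorical when fewer than 20% of its non-null values are unique. Stripped of the surrounding plumbing, the decision reduces to this (a sketch using the same threshold; the series are invented):

```python
import pandas as pd

def looks_categorical(column: pd.Series, max_unique_ratio: float = 0.2) -> bool:
    """Mirror of the heuristic categorize_columns applies to object columns."""
    non_null = column.count()
    if non_null == 0:
        return False
    return column.nunique() / non_null < max_unique_ratio

states = pd.Series(["OK", "NY", "CA", "OK"] * 4)          # 3/16 unique -> categorical
titles = pd.Series([f"Engineer {i}" for i in range(16)])  # 16/16 unique -> free text
print(looks_categorical(states))  # True
print(looks_categorical(titles))  # False
```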
+ + Args: + df: Input DataFrame + numeric_columns: List of numeric column names + + Returns: + Dictionary mapping column names to their statistics + """ + result = {} + + for col in numeric_columns: + # Skip columns with all NaN values + if df[col].isna().all(): + continue + + stats = {} + + # Basic statistics + stats["count"] = int(df[col].count()) + stats["mean"] = float(df[col].mean()) + stats["median"] = float(df[col].median()) + stats["std"] = float(df[col].std()) + stats["min"] = float(df[col].min()) + stats["max"] = float(df[col].max()) + + # Calculate percentiles + for p in [25, 75, 90, 95, 99]: + stats[f"p{p}"] = float(df[col].quantile(p/100)) + + # Null value statistics + null_count = int(df[col].isna().sum()) + stats["null_count"] = null_count + stats["null_percentage"] = float((null_count / len(df)) * 100) + + result[col] = stats + + return result + + @classmethod + def analyze_categorical_columns(cls, df: pd.DataFrame, categorical_columns: List[str]) -> Dict[str, Dict[str, Any]]: + """ + Analyze categorical columns to extract distribution information. + + Args: + df: Input DataFrame + categorical_columns: List of categorical column names + + Returns: + Dictionary mapping column names to their statistics + """ + result = {} + + for col in categorical_columns: + # Skip columns with all NaN values + if df[col].isna().all(): + continue + + stats = {} + + # Basic statistics + stats["count"] = int(df[col].count()) + stats["unique_count"] = int(df[col].nunique()) + + # Value distribution (top 10 most common values) + value_counts = df[col].value_counts().head(10).to_dict() + # Convert any non-string keys to strings for JSON compatibility + top_values = {} + for k, v in value_counts.items(): + key = str(k) if not isinstance(k, str) else k + top_values[key] = int(v) + + stats["top_values"] = top_values + + # Calculate entropy to measure randomness + counts = df[col].value_counts() + probs = counts / counts.sum() + entropy = -np.sum(probs * np.log2(probs)) + stats["entropy"] = float(entropy) + + # Null value statistics + null_count = int(df[col].isna().sum()) + stats["null_count"] = null_count + stats["null_percentage"] = float((null_count / len(df)) * 100) + + result[col] = stats + + return result + + @classmethod + def analyze_datetime_columns(cls, df: pd.DataFrame, datetime_columns: List[str]) -> Dict[str, Dict[str, Any]]: + """ + Analyze datetime columns to extract temporal patterns. 
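The `entropy` field computed in `analyze_categorical_columns` above is the Shannon entropy of the value distribution, -Σ p·log2(p), a single number summarizing how evenly values are spread. A quick worked check with an invented 60/40 split:

```python
import numpy as np
import pandas as pd

col = pd.Series([" 36 months"] * 6 + [" 60 months"] * 4)  # 60/40 split
counts = col.value_counts()
probs = counts / counts.sum()
entropy = -np.sum(probs * np.log2(probs))
# -(0.6*log2(0.6) + 0.4*log2(0.4)) ≈ 0.971 bits; a uniform 50/50 split
# would give exactly 1.0 and a constant column would give 0.0
print(round(float(entropy), 3))  # 0.971
```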
+ + Args: + df: Input DataFrame + datetime_columns: List of datetime column names + + Returns: + Dictionary mapping column names to their statistics + """ + result = {} + + for col in datetime_columns: + # Skip columns with all NaN values + if df[col].isna().all(): + continue + + stats = {} + + # Basic statistics + stats["count"] = int(df[col].count()) + stats["min"] = str(df[col].min()) + stats["max"] = str(df[col].max()) + + # Calculate temporal span + min_date = df[col].min() + max_date = df[col].max() + if pd.notna(min_date) and pd.notna(max_date): + span_days = (max_date - min_date).total_seconds() / (60 * 60 * 24) + stats["span_days"] = float(span_days) + + # Extract date parts distribution + date_parts = {} + + # Year distribution + if df[col].dt.year.nunique() > 1: + year_counts = df[col].dt.year.value_counts().to_dict() + date_parts["year"] = {str(k): int(v) for k, v in year_counts.items()} + + # Month distribution + month_counts = df[col].dt.month.value_counts().to_dict() + date_parts["month"] = {str(k): int(v) for k, v in month_counts.items()} + + # Day of week distribution + dow_counts = df[col].dt.dayofweek.value_counts().to_dict() + date_parts["day_of_week"] = {str(k): int(v) for k, v in dow_counts.items()} + + # Hour distribution (if time component exists) + if (df[col].dt.hour != 0).any(): + hour_counts = df[col].dt.hour.value_counts().to_dict() + date_parts["hour"] = {str(k): int(v) for k, v in hour_counts.items()} + + stats["date_parts"] = date_parts + + # Null value statistics + null_count = int(df[col].isna().sum()) + stats["null_count"] = null_count + stats["null_percentage"] = float((null_count / len(df)) * 100) + + result[col] = stats + + return result + + @classmethod + def analyze_cross_row_relationships(cls, df: pd.DataFrame) -> Dict[str, Any]: + """ + Analyze relationships across rows, such as duplicates and null patterns. + + Args: + df: Input DataFrame + + Returns: + Dictionary containing cross-row relationship information + """ + result = {} + + # Analyze duplicates + duplicates = df.duplicated() + duplicate_count = int(duplicates.sum()) + duplicate_percentage = float((duplicate_count / len(df)) * 100) + + result["duplicates"] = { + "count": duplicate_count, + "percentage": duplicate_percentage + } + + # Analyze rows with null values + rows_with_null = df.isna().any(axis=1) + null_rows_count = int(rows_with_null.sum()) + null_rows_percentage = float((null_rows_count / len(df)) * 100) + + result["null_rows"] = { + "count": null_rows_count, + "percentage": null_rows_percentage + } + + return result + + @classmethod + def analyze_cross_column_relationships( + cls, df: pd.DataFrame, numeric_columns: List[str], correlation_threshold: float + ) -> Dict[str, Any]: + """ + Analyze relationships between columns, such as correlations. 
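The correlation pass whose body follows keeps only column pairs whose absolute Pearson correlation meets the threshold; conceptually it reduces to this filter over the upper triangle of `df.corr()` (invented frame, default 0.7 threshold):

```python
import pandas as pd

df = pd.DataFrame({
    "loan_amnt":   [5000, 10000, 15000, 20000, 25000],
    "installment": [165.0, 330.0, 495.0, 660.0, 825.0],  # tracks loan_amnt exactly
    "dti":         [30.1, 5.2, 18.9, 9.4, 22.7],         # unrelated noise
})

corr = df.corr()
threshold = 0.7
strong = {
    f"{a} - {b}": float(corr.loc[a, b])
    for i, a in enumerate(corr.columns)
    for b in corr.columns[i + 1:]
    if abs(corr.loc[a, b]) >= threshold
}
print(strong)  # {'loan_amnt - installment': 1.0}; both dti pairs fall below 0.7
```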
+ + Args: + df: Input DataFrame + numeric_columns: List of numeric column names + correlation_threshold: Threshold for identifying strong correlations + + Returns: + Dictionary containing cross-column relationship information + """ + result = {} + + # Calculate correlations between numeric columns + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + corr_matrix = df[numeric_columns].corr() + + # Extract strong correlations (ignore self-correlations) + strong_correlations = {} + for i in range(len(numeric_columns)): + for j in range(i+1, len(numeric_columns)): + col1 = numeric_columns[i] + col2 = numeric_columns[j] + corr_value = corr_matrix.iloc[i, j] + + # Skip NaN correlations + if pd.isna(corr_value): + continue + + # Store absolute correlation values above threshold + if abs(corr_value) >= correlation_threshold: + pair_name = f"{col1} - {col2}" + strong_correlations[pair_name] = float(corr_value) + + if strong_correlations: + result["correlations"] = strong_correlations + + return result \ No newline at end of file diff --git a/app/core/data_loader.py b/app/core/data_loader.py new file mode 100644 index 0000000..6a7fbc4 --- /dev/null +++ b/app/core/data_loader.py @@ -0,0 +1,160 @@ +import pandas as pd +import numpy as np +import json +import os +import warnings +from pathlib import Path +from typing import Optional, Union + +class DataLoader: + """Load arbitrary tabular data into a DataFrame with robust error handling.""" + + @staticmethod + def load(path: str, sample_rows: int = 100000) -> pd.DataFrame: + """ + Load data from various file formats into a pandas DataFrame. + + Args: + path: Path to the data file + sample_rows: Maximum number of rows to load for large files + + Returns: + pandas DataFrame with the loaded data + """ + # Validate the path exists + if not os.path.exists(path): + raise FileNotFoundError(f"File not found: {path}") + + # Get file extension + ext = Path(path).suffix.lower() + + try: + if ext == ".csv": + # Try different encoding and delimiter options + try: + df = pd.read_csv(path, encoding='utf-8') + except: + try: + df = pd.read_csv(path, encoding='latin1') + except: + try: + # Try with different delimiters + df = pd.read_csv(path, sep=None, engine='python') + except: + # Last resort - try reading with very permissive settings + df = pd.read_csv(path, sep=None, engine='python', + encoding='latin1', on_bad_lines='skip') + elif ext == ".tsv": + df = pd.read_csv(path, sep='\t') + elif ext == ".json": + # Try multiple JSON formats + try: + # Try JSONL format first + df = pd.read_json(path, lines=True) + except ValueError: + try: + # Then try normal JSON + df = pd.read_json(path) + except: + # Try loading as raw JSON and converting + with open(path, 'r', encoding='utf-8') as f: + data = json.load(f) + if isinstance(data, list): + df = pd.DataFrame(data) + elif isinstance(data, dict): + # If it's a dict, try to extract a list or convert the dict itself + for k, v in data.items(): + if isinstance(v, list) and len(v) > 0: + df = pd.DataFrame(v) + break + else: + df = pd.DataFrame([data]) + else: + raise ValueError(f"Unsupported JSON structure in {path}") + elif ext in (".xls", ".xlsx"): + try: + # Try with openpyxl first + df = pd.read_excel(path, engine="openpyxl") + except: + # Fall back to xlrd for older Excel files + df = pd.read_excel(path) + elif ext == ".xlsb": + df = pd.read_excel(path, engine="pyxlsb") + elif ext == ".parquet": + df = pd.read_parquet(path) + elif ext == ".feather": + df = pd.read_feather(path) + elif ext == ".pickle" or ext 
== ".pkl": + df = pd.read_pickle(path) + elif ext == ".sas7bdat": + df = pd.read_sas(path) + elif ext == ".dta": + df = pd.read_stata(path) + elif ext == ".h5" or ext == ".hdf5": + df = pd.read_hdf(path) + else: + raise ValueError(f"Unsupported file extension: {ext}") + + # Clean up the DataFrame + # Replace infinite values with NaN + df = df.replace([np.inf, -np.inf], np.nan) + + # Handle duplicate column names + if df.columns.duplicated().any(): + df.columns = [f"{col}_{i}" if i > 0 else col + for i, col in enumerate(df.columns)] + + # Keep memory/latency bounded + if len(df) > sample_rows: + df = df.sample(sample_rows, random_state=42) + + # Process column types + df = DataLoader.infer_dtypes(df) + + return df.reset_index(drop=True) + + except Exception as e: + print(f"Error loading data from {path}: {str(e)}") + # Return an empty DataFrame with a message column + return pd.DataFrame({"error_message": [f"Failed to load data: {str(e)}"]}) + + @staticmethod + def parse_datetime(series): + """ + Parse datetime with appropriate format while suppressing warnings. + """ + # Skip if already datetime + if pd.api.types.is_datetime64_any_dtype(series): + return series + + # Suppress warnings and use dateutil parser + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return pd.to_datetime(series, errors='coerce') + + @staticmethod + def infer_dtypes(df: pd.DataFrame) -> pd.DataFrame: + """Attempt to infer correct data types for all columns.""" + for col in df.columns: + # Skip columns that are already numeric or datetime + if pd.api.types.is_numeric_dtype(df[col]) or pd.api.types.is_datetime64_any_dtype(df[col]): + continue + + # Try to convert to numeric + numeric_series = pd.to_numeric(df[col], errors='coerce') + if numeric_series.notna().sum() > 0.8 * df[col].count(): # Over 80% valid numbers + df[col] = numeric_series + continue + + # Try to convert to datetime - with warnings suppressed + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + datetime_series = pd.to_datetime(df[col], errors='coerce') + if datetime_series.notna().sum() > 0.8 * df[col].count(): # Over 80% valid dates + df[col] = datetime_series + continue + except: + pass + + return df \ No newline at end of file diff --git a/app/core/database.py b/app/core/database.py index 3e9ee2b..64433a8 100644 --- a/app/core/database.py +++ b/app/core/database.py @@ -60,6 +60,7 @@ def init_db(self): display_name TEXT, local_export_path TEXT, hf_export_path TEXT, + s3_export_path TEXT, num_questions FLOAT, total_count FLOAT, topics TEXT, @@ -107,6 +108,7 @@ def init_db(self): display_name TEXT, local_export_path TEXT, hf_export_path TEXT, + s3_export_path TEXT, job_id TEXT, job_name TEXT UNIQUE, job_status TEXT, @@ -145,29 +147,24 @@ def save_generation_metadata(self, metadata: Dict) -> int: try: # Prepare data outside transaction if metadata.get('generate_file_name'): - output_paths = metadata.get('output_path', {}) else: - output_paths = {} - - # Use a single connection with enhanced settings with self.get_connection() as conn: conn.execute("BEGIN IMMEDIATE") - cursor = conn.cursor() query = """ INSERT INTO generation_metadata ( - timestamp, technique, model_id, inference_type, caii_endpoint, use_case, - custom_prompt, model_parameters, input_key, output_key, output_value, generate_file_name, - display_name, local_export_path, hf_export_path, - num_questions, total_count, topics, examples, - schema, doc_paths, input_path, job_id, job_name, job_status, job_creator_name - ) VALUES (?, ?, ?, ?, ?, ?, ?, 
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + timestamp, technique, model_id, inference_type, caii_endpoint, use_case, + custom_prompt, model_parameters, input_key, output_key, output_value, generate_file_name, + display_name, local_export_path, hf_export_path, s3_export_path, + num_questions, total_count, topics, examples, + schema, doc_paths, input_path, job_id, job_name, job_status, job_creator_name + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """ values = ( @@ -186,6 +183,7 @@ def save_generation_metadata(self, metadata: Dict) -> int: metadata.get('display_name', None), output_paths.get('local', None), output_paths.get('huggingface', None), + output_paths.get('s3', None), metadata.get('num_questions', None), metadata.get('total_count', None), metadata.get('topics', None), @@ -198,20 +196,18 @@ def save_generation_metadata(self, metadata: Dict) -> int: metadata.get('job_status', None), metadata.get('job_creator_name', None) ) - #print(values) cursor.execute(query, values) conn.commit() return cursor.lastrowid except sqlite3.OperationalError as e: - if conn: + if 'conn' in locals(): conn.rollback() print(f"Database operation error in save_generation_metadata: {e}") - raise except Exception as e: - if conn: + if 'conn' in locals(): conn.rollback() print(f"Error saving metadata to database: {str(e)}") raise @@ -359,7 +355,6 @@ def save_evaluation_metadata(self, metadata: Dict) -> int: def save_export_metadata(self, metadata: Dict) -> int: """Save export metadata to database with prepared transaction""" try: - # Use a single connection with enhanced settings with self.get_connection() as conn: conn.execute("BEGIN IMMEDIATE") @@ -373,11 +368,12 @@ def save_export_metadata(self, metadata: Dict) -> int: display_name, local_export_path, hf_export_path, + s3_export_path, job_id, job_name, job_status, job_creator_name - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """ values = ( @@ -386,6 +382,7 @@ def save_export_metadata(self, metadata: Dict) -> int: metadata.get('display_name'), metadata.get('local_export_path', None), metadata.get('hf_export_path', None), + metadata.get('s3_export_path', None), # Add this line metadata.get('job_id', None), metadata.get('job_name', None), metadata.get('job_status', None), @@ -1131,4 +1128,21 @@ def backup_and_restore_db(self, force_restore: bool = False) -> bool: print(f"Force restore failed: {str(restore_error)}") return False + def update_s3_path(self, file_name: str, s3_path: str): + """Update s3_export_path for a generation""" + try: + with self.get_connection() as conn: + conn.execute("BEGIN IMMEDIATE") + cursor = conn.cursor() + + # Update the s3_path + cursor.execute( + "UPDATE generation_metadata SET s3_export_path = ? 
WHERE generate_file_name = ?", + (s3_path, file_name) + ) + conn.commit() + print(f"S3 path update successful for file: {file_name}") + except Exception as e: + print(f"Error updating S3 export path: {str(e)}") + raise diff --git a/app/core/model_handlers.py b/app/core/model_handlers.py index 74ce723..64ae81f 100644 --- a/app/core/model_handlers.py +++ b/app/core/model_handlers.py @@ -11,6 +11,7 @@ from openai import OpenAI from app.core.exceptions import APIError, InvalidModelError, ModelHandlerError, JSONParsingError from app.core.telemetry_integration import track_llm_operation +from app.core.config import _get_caii_token @@ -280,7 +281,8 @@ def _handle_bedrock_request(self, prompt: str, retry_with_reduced_tokens: bool): def _handle_caii_request(self, prompt: str): """Original CAII implementation""" try: - API_KEY = json.load(open("/tmp/jwt"))["access_token"] + #API_KEY = json.load(open("/tmp/jwt"))["access_token"] + API_KEY = _get_caii_token() MODEL_ID = self.model_id caii_endpoint = self.caii_endpoint diff --git a/app/core/prompt_templates.py b/app/core/prompt_templates.py index ad838ee..9785636 100644 --- a/app/core/prompt_templates.py +++ b/app/core/prompt_templates.py @@ -2,8 +2,13 @@ import json import csv import os +import pandas as pd +import numpy as np from app.models.request_models import Example, Example_eval -from app.core.config import UseCase, Technique, ModelFamily, get_model_family,USE_CASE_CONFIGS +from app.core.config import UseCase, Technique, ModelFamily, get_model_family,USE_CASE_CONFIGS, LENDING_DATA_PROMPT +from app.core.data_loader import DataLoader +from app.core.data_analyser import DataAnalyser +from app.core.summary_formatter import SummaryFormatter DEFAULT_SCHEMA = """CREATE TABLE employees ( id INT PRIMARY KEY, @@ -580,52 +585,303 @@ def get_freeform_eval_prompt(model_id: str, return final_prompt - @staticmethod - def create_custom_prompt(model_id: str, - custom_prompt:str - ) -> str: + # @staticmethod + # def create_custom_prompt(model_id: str, + # custom_prompt:str, + # example_path: Optional[str], + # ) -> str: - final_instruction = f"""You are a brilliant prompt engineer. Your job is to create a best prompt for provided task: {custom_prompt} which can get - best response from large language model - The prompt should Focus on: + # final_instruction = f"""You are a brilliant prompt engineer. Your job is to create a best prompt for provided task: {custom_prompt} which can get + # best response from large language model + # The prompt should Focus on: + + # - The core task objective + # - Key aspects to consider or maintain + # - Any special requirements specific to the task. + # For example the prompt for code generation is below + # {DEFAULT_CODE_GENERATION_PROMPT} + # Make sure you just give the prompt in your response which can be directly used by large language model. + # No need to give any explanation but just the prompt in same format as the example given above. + # """ + # model_family = get_model_family(model_id) + + # if model_family== ModelFamily.LLAMA: + + # final_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + '\n' + final_instruction + '\n' + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" - - The core task objective - - Key aspects to consider or maintain - - Any special requirements specific to the task.
- for example the prompt for code generation is below - {DEFAULT_CODE_GENERATION_PROMPT} - Make sure you just give the prompt in your response which can be directly used by large language model. - No need to give any explanation but just the prompt in same format as the example given above. - """ - model_family = get_model_family(model_id) + # elif model_family== ModelFamily.MISTRAL: + + # final_prompt = '[INST]' + "\n" + final_instruction + '\n' + '[/INST]' + + # elif model_family == ModelFamily.CLAUDE: + # final_prompt = "\n" + final_instruction + + # elif model_family== ModelFamily.QWEN: + # system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." + + # final_prompt = f'''<|im_start|>system + # {system_prompt}<|im_end|> + # <|im_start|>user + + # {final_instruction}<|im_end|> + # <|im_start|>assistant + # ''' + + + # else: + # final_prompt = "\n" + final_instruction + # return final_prompt + + @staticmethod + def create_custom_prompt( + model_id: str, + custom_prompt: str, + example_path: str | None = None, +) -> str: + """ + Create a custom prompt for a language model, optionally including dataset analysis. - if model_family== ModelFamily.LLAMA: + Args: + model_id: The ID of the model to create the prompt for + custom_prompt: The base custom prompt text + example_path: Optional path to an example dataset - final_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + '\n' + final_instruction + '\n' + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + Returns: + A formatted prompt suitable for the specified model + """ + summary_block = "" + example_block = "" - elif model_family== ModelFamily.MISTRAL: + if example_path: + print(f"Loading example data from: {example_path}") + try: + df = DataLoader.load(example_path) + #print(f"Loaded DataFrame with shape: {df.shape}") + + # Apply type inference to improve analysis + df = DataLoader.infer_dtypes(df) + + if "error_message" in df.columns and len(df.columns) == 1: + # Data loading failed + print(f"Error loading data: {df['error_message'][0]}") + # Keep summary and example blocks as empty strings + elif not df.empty: + # ---------- build summary block ---------- + try: + print("Analyzing data...") + summary_dict = DataAnalyser.analyse(df) + + # Create a more structured summary with explanations + summary_block = ( + "\n" + "INSTRUCTIONS: The following analysis provides key insights about the dataset that should guide your synthetic data generation. Use these signals to match distributions and relationships when generating synthetic data.\n\n" + ) + + # Add section for columns classification + if "columns" in summary_dict: + summary_block += (""" + ## Column Types\n + These are all columns identified in the dataset in given specific order:\n\n + Make sure to provide definitions of each column in the same order as they are in the dataset. + Don't change or skip any column name or order.
+ """) + + + + + summary_block += "\n".join(f"- {col}" for col in summary_dict["columns"]) + "\n\n" + + # Add section for statistical analysis + if "statistical_analysis" in summary_dict: + summary_block += ( + "## Statistical Analysis\n" + "These statistics describe the distributions of values in the dataset:\n\n" + ) + + if "numeric" in summary_dict["statistical_analysis"]: + summary_block += ( + "### Numeric Statistics\n" + "Key statistics for numeric columns (mean, median, min, max, etc.):\n" + f"{json.dumps(summary_dict['statistical_analysis']['numeric'], indent=2)}\n\n" + ) + + if "categorical" in summary_dict["statistical_analysis"]: + summary_block += ( + "### Categorical Statistics\n" + "Distribution of values in categorical columns:\n" + f"{json.dumps(summary_dict['statistical_analysis']['categorical'], indent=2)}\n\n" + ) + + if "datetime" in summary_dict["statistical_analysis"]: + summary_block += ( + "### DateTime Statistics\n" + "Temporal patterns and ranges in datetime columns:\n" + f"{json.dumps(summary_dict['statistical_analysis']['datetime'], indent=2)}\n\n" + ) + + # Add section for cross-row relationships + if "cross_row_relationship" in summary_dict: + summary_block += ( + "## Cross-Row Relationships\n" + "These insights describe patterns across rows in the dataset:\n\n" + f"{json.dumps(summary_dict['cross_row_relationship'], indent=2)}\n\n" + ) + + # Add section for cross-column relationships + if "cross_column_relationship" in summary_dict: + summary_block += ( + "## Cross-Column Relationships\n" + "These insights describe correlations and dependencies between columns:\n\n" + f"{json.dumps(summary_dict['cross_column_relationship'], indent=2)}\n\n" + ) + + # Close the data summary block + summary_block += "\n" + + print("Data analysis completed successfully.") + + except Exception as e: + # Analysis failed → keep summary_block as empty string + print(f"Error in data analysis: {str(e)}") + # Do NOT add any error messages to blocks + + # ---------- build example block ---------- + try: + print("Creating CSV snippet...") + csv_snippet = SummaryFormatter.first_rows_block(df) + example_block = ( + "\n" + "INSTRUCTIONS: The CSV snippet shows the first 10 rows of the " + "original dataset. Preserve this column order, header names, " + "and data types while creating new rows. " + "Use this to create a comprehensive list of all columns and their definitions. " + "Make sure the list covers all details and columns which will be required " + "to create data.\n" + f"{csv_snippet}" + "\n" + ) + print("CSV snippet created successfully.") + except Exception as e: + # Snippet failed → keep example_block as empty string + print(f"Error creating CSV snippet: {str(e)}") + # Do NOT add any error messages to blocks + except Exception as e: + print(f"Error processing example file: {str(e)}") + # Keep blocks as empty strings + # Do NOT add any error messages to blocks + + # Construct the final instruction with proper error handling for missing constants + try: - final_prompt = '[INST]' + "\n" + final_instruction + '\n' + '[/INST]' + + if example_path: + #Construct the final instruction + final_instruction = f"""You are a brilliant prompt engineer. + Your task: **{custom_prompt}** - elif model_family == ModelFamily.CLAUDE: - final_prompt = "\n" + final_instruction + {summary_block}{example_block}Return **only** the finished prompt that can be sent directly to a language model. + Now that you have complete information about the task, follow the below instructions to create prompt. 
- + - Look at column list and include all columns in your prompt with their definitions. The list should be exhaustive and cover all columns. + - Make sure to have all statistical analysis, cross-row and cross-column relationships in your prompt. + - The prompt should be absolutely clear in its final goal and there should not be any ambiguity or vagueness in the prompt. + - The prompt should be clear and exhaustive in its column details. + + + A few examples are given below for your reference + Code Generation: + + {DEFAULT_CODE_GENERATION_PROMPT} + + Lending Data Generation: + {LENDING_DATA_PROMPT} + + Make sure you just give the prompt in your response which can be directly used by large language model. + No need to give any explanation but just the prompt in same format as the example given above. + Never mention how many rows or dataset size needs to be generated in the final output. + + """ + else: + + final_instruction = f"""You are a brilliant prompt engineer. + Your task: **{custom_prompt}** + + {summary_block}{example_block} + + Return a well-crafted prompt that focuses on: + - The core task objective + - Clear and exhaustive column details + - Key aspects to consider or maintain + - Special requirements for the task + + A few examples are given below for your reference + Code Generation: + + {DEFAULT_CODE_GENERATION_PROMPT} + + Text to SQL: + {DEFAULT_TEXT2SQL_PROMPT} + + Make sure you just give the prompt in your response which can be directly used by large language model. + No need to give any explanation but just the prompt in same format as the example given above. + Never mention how many rows or dataset size needs to be generated in the final output. + """ + except Exception as e: + print(f"Error constructing instruction template: {str(e)}") + # Fallback to a simpler template that still includes any successful blocks + final_instruction = f"""You are a brilliant prompt engineer. + Your task: **{custom_prompt}** + + {summary_block}{example_block} + + Return a well-crafted prompt that focuses on: + - The core task objective + - Clear and exhaustive column details + - Key aspects to consider or maintain + - Special requirements for the task + + A few examples are given below for your reference + Code Generation: + + {DEFAULT_CODE_GENERATION_PROMPT} + + Text to SQL: + {DEFAULT_TEXT2SQL_PROMPT} + + Make sure you just give the prompt in your response which can be directly used by large language model. + No need to give any explanation but just the prompt in same format as the example given above. + Never mention how many rows or dataset size needs to be generated in the final output. + """ + + # Format according to model family + try: + family = get_model_family(model_id) - if model_family== ModelFamily.LLAMA: + if family == ModelFamily.LLAMA: + return "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n" \ + f"{final_instruction}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + elif family == ModelFamily.MISTRAL: + return f"[INST]\n{final_instruction}\n[/INST]" + elif family == ModelFamily.CLAUDE: + return "\n" + final_instruction + elif family == ModelFamily.QWEN: + system = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
+ return f"""<|im_start|>system + {system}<|im_end|> <|im_start|>user - + {final_instruction}<|im_end|> <|im_start|>assistant - ''' - - - else: - final_prompt = "\n" + final_instruction - return final_prompt + """ + else: + # Default format if model family is unknown + return "\n" + final_instruction + except Exception as e: + print(f"Error formatting for model family: {str(e)}") + # Return the raw instruction if formatting fails + return final_instruction + + @staticmethod def generate_result_prompt(model_id: str, @@ -715,20 +971,32 @@ def get_freeform_prompt(model_id: str, ) -> str: if example_path: - file_extension = os.path.splitext(example_path)[1].lower() - - with open(example_path, 'r') as f: - if file_extension == '.json': - # Handle JSON files - example_upload = json.load(f) - examples_str = json.dumps(example_upload, indent=2) - elif file_extension == '.csv': - # Handle CSV files - csv_reader = csv.DictReader(f) - example_upload = list(csv_reader) - examples_str = json.dumps(example_upload, indent=2) # Convert CSV data to JSON string format - else: - raise ValueError(f"Unsupported file extension: {file_extension}. Only .json and .csv are supported.") + try: + # Use DataLoader to load the file, limiting to 10 rows + df = DataLoader.load(example_path, sample_rows=10) + + # Convert DataFrame to list of dictionaries + example_upload = df.head(10).to_dict(orient='records') + + # Handle non-serializable objects + def json_serializable(obj): + if isinstance(obj, (pd.Timestamp, np.datetime64)): + return obj.isoformat() + elif isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return str(obj) + + # Convert to JSON string with custom serialization + examples_str = json.dumps(example_upload, indent=2, default=json_serializable) + + except Exception as e: + print(f"Error processing example file: {str(e)}") + examples_str = "" else: if example_custom: @@ -876,10 +1144,11 @@ def build_generate_result_prompt(model_id: str, @staticmethod def build_custom_prompt(model_id: str, - custom_prompt = Optional[str] + custom_prompt: Optional[str] = None, + example_path: Optional[str] = None ) -> str: - return ModelPrompts.create_custom_prompt(model_id, custom_prompt) + return ModelPrompts.create_custom_prompt(model_id, custom_prompt, example_path) @staticmethod def build_freeform_prompt(model_id: str, diff --git a/app/core/summary_formatter.py b/app/core/summary_formatter.py new file mode 100644 index 0000000..aea0406 --- /dev/null +++ b/app/core/summary_formatter.py @@ -0,0 +1,51 @@ +import json +import pandas as pd +import numpy as np +from typing import Dict, Any, Optional + +class SummaryFormatter: + """Build XML-ish blocks for prompt ingestion with error handling.""" + + @staticmethod + def first_rows_block(df: pd.DataFrame, n: int = 10) -> str: + """Generate a CSV snippet of the first n rows.""" + try: + # Handle potential issues with object serialization + # Replace problematic values with their string representation + safe_df = df.head(n).copy() + + for col in safe_df.columns: + # Replace problematic values in object columns + if safe_df[col].dtype == 'object': + safe_df[col] = safe_df[col].apply(lambda x: + str(x) if x is not None else None) + + # Use CSV repr so models instantly see delimiters + return safe_df.to_csv(index=False) + except Exception as e: + return f"Error generating CSV snippet: {str(e)}\n" + + @staticmethod + def json_block(summary: Dict[str, Any]) -> str: + """Convert
summary dict to a JSON string, handling problematic values."""
+        try:
+            # Handle non-serializable objects
+            def clean_for_json(obj):
+                if isinstance(obj, dict):
+                    return {k: clean_for_json(v) for k, v in obj.items()}
+                elif isinstance(obj, list):
+                    return [clean_for_json(item) for item in obj]
+                elif isinstance(obj, (int, float, str, bool, type(None))):
+                    return obj
+                elif isinstance(obj, np.number):
+                    return float(obj)
+                else:
+                    return str(obj)
+
+            # Clean the summary dict
+            clean_summary = clean_for_json(summary)
+
+            # Convert to JSON
+            return json.dumps(clean_summary, separators=(",", ":"), ensure_ascii=False)
+        except Exception as e:
+            return f"Error generating JSON summary: {str(e)}"
\ No newline at end of file
diff --git a/app/main.py b/app/main.py
index ec8b7e9..216eb8d 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,6 +1,7 @@
 import os
 import boto3
 from datetime import datetime, timezone
+from typing import Any, Dict
 from botocore.config import Config
 from fastapi import FastAPI, HTTPException, Request, status
 from fastapi.responses import JSONResponse
@@ -9,6 +10,8 @@
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
+import pandas as pd
+import numpy as np
 from typing import Dict, List, Optional
 import subprocess
 import asyncio
@@ -22,6 +25,8 @@
 import sys
 import json
 import uuid
+import math  # needed by deep_sanitize_nans below
+from fastapi.encoders import jsonable_encoder
 print(os.getcwd())
 # Setup absolute paths
 ROOT_DIR = Path(__file__).parent.parent # Goes up one level from app/main.py to reach project root
@@ -34,7 +38,7 @@
 sys.path.append(str(ROOT_DIR))
 
 from app.services.evaluator_service import EvaluatorService
-from app.models.request_models import SynthesisRequest, EvaluationRequest, Export_synth, ModelParameters, CustomPromptRequest, JsonDataSize, RelativePath
+from app.models.request_models import SynthesisRequest, EvaluationRequest, Export_synth, ModelParameters, CustomPromptRequest, JsonDataSize, RelativePath, Technique
 from app.services.synthesis_service import SynthesisService
 from app.services.export_results import Export_Service
@@ -180,7 +184,34 @@ def restart_application():
         print(f"Error restarting application: {e}")
         raise
 
-
+def deep_sanitize_nans(obj: Any) -> Any:
+    """
+    Recursively traverse all data structures and replace NaN with None.
+    This handles all nested structures.
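+
+    Example (illustrative values; assumes numpy is imported as np, as above):
+        >>> deep_sanitize_nans({"a": float("nan"), "b": [1.0, float("inf")], "c": np.int64(3)})
+        {'a': None, 'b': [1.0, None], 'c': 3}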
+    """
+    if isinstance(obj, dict):
+        return {k: deep_sanitize_nans(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [deep_sanitize_nans(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(deep_sanitize_nans(item) for item in obj)
+    elif isinstance(obj, set):
+        return {deep_sanitize_nans(item) for item in obj}
+    elif isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)):
+        return None
+    elif isinstance(obj, (pd.Timestamp, np.datetime64)):
+        return obj.isoformat()
+    elif isinstance(obj, np.integer):
+        return int(obj)
+    elif isinstance(obj, np.floating):
+        if np.isnan(obj) or np.isinf(obj):
+            return None
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return deep_sanitize_nans(obj.tolist())
+    elif pd.isna(obj):
+        return None
+    return obj
 
 # Add these models
@@ -276,7 +307,9 @@ def get_timeout_for_request(request: Request) -> float:
     if path.endswith("/generate"):
         return 200.0 # 2 minutes for generation
     elif path.endswith("/freeform"):
-        return 200.0 # 2 minutes for generation
+        return 300.0 # 5 minutes for generation
+    elif path.endswith("/create_custom_prompt"):
+        return 300.0 # 5 minutes for custom prompt creation
     elif path.endswith("/evaluate"):
         return 200.0 # 2 minutes for evaluation
     elif path.endswith("/export_results"):
@@ -284,7 +317,7 @@
     elif "health" in path:
         return 5.0 # Quick timeout for health checks
     elif path.endswith("/upgrade"):
-        return 1200 # timeout increase for upgrade
+        return 2000.0 # longer timeout for upgrade
     else:
         return 60.0 # Default timeout
@@ -363,6 +396,41 @@ async def get_dataset_size(request:JsonDataSize):
     return {"dataset_size": len(inputs)}
 
+@app.post("/json/get_content", include_in_schema=True, responses=responses,
+          description="Get the JSON content of a file")
+async def get_json_content(request: RelativePath):
+
+    if not request.path:
+        return JSONResponse(
+            status_code=400,
+            content={"status": "failed", "error": "No file path provided"}
+        )
+
+    path = path_manager.get_str_path(request.path)
+
+    try:
+        with open(path) as f:
+            data = json.load(f)
+
+    except json.JSONDecodeError as e:
+        error_msg = f"Invalid JSON format in file {path}: {str(e)}"
+        print(error_msg)
+        return JSONResponse(
+            status_code=400,
+            content={"status": "failed", "error": error_msg}
+        )
+    except (KeyError, ValueError) as e:
+        print(str(e))
+        return JSONResponse(
+            status_code=400,
+            content={"status": "failed", "error": str(e)}
+        )
+    except Exception as e:
+        error_msg = f"Error processing {path}: {str(e)}"
+        print(error_msg)
+        return JSONResponse(
+            status_code=400,
+            content={"status": "failed", "error": error_msg}
+        )
+
+    return {"data": data}
+
 @app.post("/synthesis/generate",
           include_in_schema=True,
           responses=responses,
@@ -373,15 +441,10 @@ async def generate_examples(request: SynthesisRequest):
     # Generate a request ID
     request_id = str(uuid.uuid4())
 
-    if request.inference_type== "CAII":
+    if request.inference_type == "CAII":
         caii_endpoint = request.caii_endpoint
-        response = caii_check(caii_endpoint)
-        message = "The CAII endpoint you are tring to reach is downscaled, please try after >15 minutes while it autoscales, meanwhile please try another model"
-        if response.status_code != 200:
-            return JSONResponse(
-                status_code=503, # Service Unavailable
-                content={"status": "failed", "error": message}
-            )
+
+        caii_check(caii_endpoint)
 
     is_demo = request.is_demo
@@ -429,13 +492,7 @@ async def generate_freeform_data(request: SynthesisRequest):
 
     if request.inference_type == "CAII":
         caii_endpoint = request.caii_endpoint
-        response = caii_check(caii_endpoint)
-        message = "The CAII endpoint you are trying to reach is
downscaled, please try after >15 minutes while it autoscales, meanwhile please try another model"
-        if response.status_code != 200:
-            return JSONResponse(
-                status_code=503, # Service Unavailable
-                content={"status": "failed", "error": message}
-            )
+        caii_check(caii_endpoint)
 
     is_demo = request.is_demo
     mem = 4
@@ -462,13 +519,21 @@
     core = 2
 
     if is_demo:
-        return await synthesis_service.generate_freeform(request, is_demo=is_demo, request_id=request_id )
+        result = await synthesis_service.generate_freeform(request, is_demo=is_demo, request_id=request_id)
+        # Apply our deep sanitization to handle all NaN values
+        sanitized_result = deep_sanitize_nans(result)
+
+        # Then use jsonable_encoder for FastAPI-specific conversions
+        final_result = jsonable_encoder(sanitized_result)
+
+        return final_result
     else:
         # Pass additional parameter to indicate this is a freeform request
         request_dict = request.model_dump()
         freeform = True
         # Convert back to SynthesisRequest object
         freeform_request = SynthesisRequest(**request_dict)
+
         return synthesis_job.generate_job(freeform_request, core, mem, request_id=request_id, freeform = freeform)
 
 @app.post("/synthesis/evaluate",
@@ -479,15 +544,9 @@ async def evaluate_examples(request: EvaluationRequest):
     """Evaluate generated QA pairs"""
     request_id = str(uuid.uuid4())
 
-    if request.inference_type== "CAII":
+    if request.inference_type == "CAII":
         caii_endpoint = request.caii_endpoint
-        response = caii_check(caii_endpoint)
-        message = "The CAII endpoint you are tring to reach is downscaled, please try after >15 minutes while it autoscales, meanwhile please try another model"
-        if response.status_code != 200:
-            return JSONResponse(
-                status_code=503, # Service Unavailable
-                content={"status": "failed", "error": message}
-            )
+        caii_check(caii_endpoint)
 
     is_demo = request.is_demo
     if is_demo:
@@ -506,13 +565,8 @@ async def evaluate_freeform(request: EvaluationRequest):
 
     if request.inference_type == "CAII":
         caii_endpoint = request.caii_endpoint
-        response = caii_check(caii_endpoint)
-        message = "The CAII endpoint you are trying to reach is downscaled, please try after >15 minutes while it autoscales, meanwhile please try another model"
-        if response.status_code != 200:
-            return JSONResponse(
-                status_code=503, # Service Unavailable
-                content={"status": "failed", "error": message}
-            )
+        caii_check(caii_endpoint)
+
     is_demo = getattr(request, 'is_demo', True)
 
     if is_demo:
@@ -592,8 +646,9 @@ async def create_custom_prompt(request: CustomPromptRequest, request_id = None):
         prompt = PromptBuilder.build_custom_prompt(
             model_id=request.model_id,
             custom_prompt=request.custom_prompt,
+            example_path=request.example_path
         )
-        #print(prompt)
+        print(prompt)
 
         prompt_gen = model_handler.generate_response(prompt, request_id=request_id)
         return {"generated_prompt":prompt_gen}
@@ -900,6 +955,149 @@ async def get_model_parameters() -> Dict:
 
 
 
+@app.post("/complete_gen_prompt")
+async def complete_gen_prompt(request: SynthesisRequest):
+    """Allow users to see the whole prompt that is finally sent to the LLM"""
+    try:
+        topic = request.topics[0]
+        batch_size = 5 if request.num_questions >= 5 else request.num_questions
+        omit_questions = []
+
+        if request.technique == Technique.Freeform:
+            prompt = PromptBuilder.build_freeform_prompt(
+                model_id=request.model_id,
+                use_case=request.use_case,
+                topic=topic,
+                num_questions=batch_size,
+                omit_questions=omit_questions,
+                example_custom=request.example_custom or [],
+                example_path=request.example_path,
+
custom_prompt=request.custom_prompt, + schema=request.schema, + ) + elif request.technique == Technique.Custom_Workflow: + + inputs = [] + path = None # Initialize path + + try: + if not request.input_path: + raise ValueError("input_path must not be empty or None") + if not isinstance(request.input_path, (list, tuple)): + # Or handle a single string case if needed, e.g., path = request.input_path + raise TypeError("input_path must be a list or tuple of paths") + if not request.input_path[0]: + raise ValueError("First path in input_path is empty") + + path = request.input_path[0] + + except (ValueError, TypeError, IndexError) as e: # Catch specific errors for clarity + # Raise appropriate HTTP exception for bad request data + raise HTTPException(status_code=400, detail=f"Invalid input_path: {str(e)}") + except Exception as e: # Catch any other unexpected errors getting the path + raise HTTPException(status_code=500, detail=f"Unexpected error getting input path: {str(e)}") + + + # Proceed only if path was successfully retrieved + try: + with open(path) as f: + data = json.load(f) + + # Assuming data is a list of dicts + if not isinstance(data, list): + raise ValueError(f"Expected JSON data in {path} to be a list, but got {type(data).__name__}") + + inputs.extend(item.get(request.input_key, '') for item in data if isinstance(item, dict)) # Ensure item is a dict + + except FileNotFoundError: + raise HTTPException(status_code=404, detail=f"Input file not found: {path}") + except json.JSONDecodeError: + raise HTTPException(status_code=400, detail=f"Invalid JSON in file: {path}") + except ValueError as e: # For the list/dict structure check + raise HTTPException(status_code=400, detail=f"Invalid data structure in {path}: {str(e)}") + except Exception as e: # Catch any other unexpected file processing errors + raise HTTPException(status_code=500, detail=f"Error processing file {path}: {str(e)}") + + + # Check if inputs list is empty before accessing index 0 + if not inputs: + # Raise an error indicating no data was extracted based on the key + raise HTTPException(status_code=400, detail=f"No data extracted from {path} using key '{request.input_key}'. 
The file might be empty, the key might not exist, or the JSON structure is unexpected.")
+
+            input_data = inputs[0]
+
+            prompt = PromptBuilder.build_generate_result_prompt(
+                model_id=request.model_id,
+                use_case=request.use_case,
+                input=input_data,
+                examples=request.examples or [],
+                schema=request.schema,
+                custom_prompt=request.custom_prompt,
+            )
+        elif request.technique == Technique.SFT:
+            prompt = PromptBuilder.build_prompt(
+                model_id=request.model_id,
+                use_case=request.use_case,
+                topic=topic,
+                num_questions=batch_size,
+                omit_questions=omit_questions,
+                examples=request.examples or [],
+                technique=request.technique,
+                schema=request.schema,
+                custom_prompt=request.custom_prompt,
+            )
+
+        return {"complete_prompt": prompt}
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/complete_eval_prompt")
+async def complete_eval_prompt(request: EvaluationRequest):
+    """Allow users to see the whole prompt that is finally sent to the LLM"""
+    try:
+        if request.technique == Technique.Freeform:
+            with open(request.import_path, 'r') as file:
+                data = json.load(file)
+
+            # Ensure data is a list of rows
+            rows = data if isinstance(data, list) else [data]
+            prompt = PromptBuilder.build_freeform_eval_prompt(
+                request.model_id,
+                request.use_case,
+                rows[0],
+                request.examples,
+                request.custom_prompt
+            )
+
+        elif request.technique == Technique.SFT or request.technique == Technique.Custom_Workflow:
+
+            with open(request.import_path, 'r') as file:
+                data = json.load(file)
+            qa_pairs = [{
+                request.output_key: item.get(request.output_key, ''), # Use get() with default value
+                request.output_value: item.get(request.output_value, '') # Use get() with default value
+            } for item in data]
+            qa_pair = qa_pairs[0]
+            prompt = PromptBuilder.build_eval_prompt(
+                request.model_id,
+                request.use_case,
+                qa_pair[request.output_key],
+                qa_pair[request.output_value],
+                request.examples,
+                request.custom_prompt
+            )
+
+        return {"complete_prompt": prompt}
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+
+
 @app.get("/{use_case}/gen_prompt")
 async def customise_prompt(use_case: UseCase):
diff --git a/app/migrations/alembic_schema_models.py b/app/migrations/alembic_schema_models.py
index 4035467..3967a11 100644
--- a/app/migrations/alembic_schema_models.py
+++ b/app/migrations/alembic_schema_models.py
@@ -23,6 +23,7 @@ class GenerationMetadataModel(Base):
     display_name = Column(Text)
     local_export_path = Column(Text)
     hf_export_path = Column(Text)
+    s3_export_path = Column(Text)
     num_questions = Column(Float)
     total_count = Column(Float)
     topics = Column(Text)
@@ -66,6 +67,7 @@ class ExportMetadataModel(Base):
     display_name = Column(Text)
     local_export_path = Column(Text)
     hf_export_path = Column(Text)
+    s3_export_path = Column(Text)
     job_id = Column(Text)
     job_name = Column(Text, unique=True)
     job_status = Column(Text)
diff --git a/app/models/request_models.py b/app/models/request_models.py
index 08252a3..bf47e5d 100644
--- a/app/models/request_models.py
+++ b/app/models/request_models.py
@@ -13,6 +13,7 @@ class Technique(str, Enum):
     SFT = "sft"
     Custom_Workflow = "custom_workflow"
     Model_Alignment = "model_alignment"
+    Freeform = "freeform"
 
 class Example(BaseModel):
@@ -44,10 +45,12 @@ class Example_eval(BaseModel):
     )
 
 
 class S3Config(BaseModel):
     """S3 export configuration"""
     bucket: str
-    key: str
+    key: str = "" # Make key optional with default empty string
+    create_if_not_exists: bool = True # Flag to create bucket if it
doesn't exist class HFConfig(BaseModel): """HF export configuration""" @@ -58,41 +61,40 @@ class HFConfig(BaseModel): hf_commit_message: Optional[str] = "Hugging face export" # Commit message class Export_synth(BaseModel): - # Export configuration - export_type: List[str] = Field(default_factory=lambda: ["huggingface"]) # Accept multiple export types (e.g., ["s3", "huggingface"]) - file_path:str - display_name:Optional[str]= None + # Existing fields... + export_type: List[str] = Field(default_factory=lambda: ["huggingface"]) + file_path: str + display_name: Optional[str] = None output_key: Optional[str] = 'Prompt' output_value: Optional[str] = 'Completion' - # Hugging Face-specific fields - hf_config:HFConfig + hf_config: Optional[HFConfig] = None # Make HF config optional # Optional s3 config s3_config: Optional[S3Config] = None - model_config = ConfigDict(protected_namespaces=(), + model_config = ConfigDict( + protected_namespaces=(), json_schema_extra={ "example": { - "export_type": [ - "huggingface" - ], - "file_path": "qa_pairs_claude_20241204_132411_test.json", - "hf_config": { - "hf_token": "your token", - "hf_username": "your_username", - "hf_repo_name": "file_name", - "hf_commit_message": "dataset trial" - } - - + "export_type": ["huggingface", "s3"], + "file_path": "qa_pairs_claude_20241204_132411_test.json", + "hf_config": { + "hf_token": "your token", + "hf_username": "your_username", + "hf_repo_name": "file_name", + "hf_commit_message": "dataset trial" + }, + "s3_config": { + "bucket": "my-dataset-bucket", + "create_if_not_exists": True + } } } ) - class ModelParameters(BaseModel): """Low-level model parameters""" temperature: float = Field(default=0.0, ge=0.0, le=2.0, description="Controls randomness (0.0 to 1.0)") @@ -193,6 +195,7 @@ class SynthesisResponse(BaseModel): class EvaluationRequest(BaseModel): """Request model for evaluating generated QA pairs""" use_case: UseCase + technique: Technique | None = Field(default=Technique.SFT) model_id: str import_path: Optional[str] = None import_type: str = "local" @@ -237,6 +240,7 @@ class CustomPromptRequest(BaseModel): inference_type :Optional[str] = "aws_bedrock" caii_endpoint: Optional[str] = None + example_path: Optional[str] = None custom_p:bool =True model_config = ConfigDict(protected_namespaces=(), diff --git a/app/run_export_job.py b/app/run_export_job.py index 067babe..99fb48d 100644 --- a/app/run_export_job.py +++ b/app/run_export_job.py @@ -8,24 +8,6 @@ os.chdir("/home/cdsw/synthetic-data-studio") -# def check_and_install_requirements(): -# """Check and install requirements from requirements.txt""" -# # Get the current working directory instead of using __file__ -# current_dir = os.getcwd() -# requirements_path = os.path.join(current_dir, 'requirements.txt') - -# if os.path.exists(requirements_path): -# try: -# print(f"Installing requirements from: {requirements_path}") -# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', requirements_path]) -# except subprocess.CalledProcessError as e: -# print(f"Error installing requirements: {e}") -# sys.exit(1) -# else: -# print("No requirements.txt found, continuing with existing packages") - -# # Run installation check at start -# check_and_install_requirements() # Get the current notebook's directory notebook_dir = os.getcwd() diff --git a/app/services/export_results.py b/app/services/export_results.py index e233e77..89205ba 100644 --- a/app/services/export_results.py +++ b/app/services/export_results.py @@ -15,6 +15,7 @@ from app.models.request_models import 
Export_synth
 from app.core.database import DatabaseManager
+from app.services.s3_export import export_to_s3
 
 import logging
 from logging.handlers import RotatingFileHandler
@@ -101,30 +102,62 @@ def _create_dataset(self, records:List, output_key, output_value, file_path) ->
 
         return dataset
 
-    def export(self,request:Export_synth):
+    def export(self, request: Export_synth):
         try:
             export_paths = {}
             file_name = os.path.basename(request.file_path)
-            try:
-                with open(request.file_path, 'r') as f:
-                    output_data = json.load(f)
-            except FileNotFoundError:
-                raise HTTPException(status_code=404, detail=f"File not found: {request.file_path}")
-            except json.JSONDecodeError as e:
-                raise HTTPException(status_code=400, detail=f"Invalid JSON file: {str(e)}")
 
             for export_type in request.export_type:
-                if export_type == "s3" and request.s3_config:
-                    s3_client = boto3.client("s3")
-                    s3_client.put_object(
-                        Bucket=request.s3_config.bucket,
-                        Key=request.s3_config.key,
-                        Body=json.dumps(output_data, indent=2),
-                    )
-                    export_paths['s3']= f"s3://{request.s3_config.bucket}/{request.s3_config.key}"
-                    self.logger.info(f"Results saved to S3: {export_paths['s3']}")
-
+                # S3 export
+                if export_type == "s3":
+                    if not request.s3_config:
+                        raise HTTPException(status_code=400, detail="S3 configuration required for S3 export")
+
+                    try:
+                        # Get bucket and key from request
+                        bucket_name = request.s3_config.bucket
+                        key = request.s3_config.key or file_name
+
+                        # Override with display_name if provided
+                        if request.display_name and not request.s3_config.key:
+                            key = f"{request.display_name}.json"
+
+                        create_bucket = getattr(request.s3_config, 'create_if_not_exists', True)
+
+                        s3_result = export_to_s3(
+                            file_path=request.file_path,
+                            bucket_name=bucket_name,
+                            key=key,
+                            create_bucket=create_bucket
+                        )
+
+                        s3_path = s3_result['s3']
+                        self.logger.info(f"Results saved to S3: {s3_path}")
+
+                        # Update database with S3 path
+                        self.db.update_s3_path(file_name, s3_path)
+                        self.logger.info(f"Generation Metadata updated for s3_path: {s3_path}")
+
+                        export_paths['s3'] = s3_path
+
+                    except Exception as e:
+                        self.logger.error(f"Error exporting to S3: {str(e)}", exc_info=True)
+                        raise APIError(f"S3 export failed: {str(e)}")
+
+                # HuggingFace export
                 elif export_type == "huggingface" and request.hf_config:
+                    # We still need to read the file for HuggingFace export
+                    try:
+                        with open(request.file_path, 'r') as f:
+                            output_data = json.load(f)
+                    except FileNotFoundError:
+                        raise HTTPException(status_code=404, detail=f"File not found: {request.file_path}")
+                    except json.JSONDecodeError as e:
+                        raise HTTPException(status_code=400, detail=f"Invalid JSON file: {str(e)}")
+
                     self.logger.info(f"Creating HuggingFace dataset: {request.hf_config.hf_repo_name}")
 
                     # Set up HuggingFace authentication
@@ -132,7 +165,6 @@ def export(self,request:Export_synth):
 
                     # Convert JSON to dataset
                     dataset = self._create_dataset(output_data, request.output_key, request.output_value, request.file_path)
-                    print(dataset)
 
                     # Push to HuggingFace Hub as a dataset
                     repo_id = f"{request.hf_config.hf_username}/{request.hf_config.hf_repo_name}"
@@ -146,8 +178,8 @@
                     self.logger.info(f"Dataset published to HuggingFace: {export_paths['huggingface']}")
                     self.db.update_hf_path(file_name, export_paths['huggingface'])
                     self.logger.info(f"Generation Metadata updated for hf_path: {export_paths['huggingface']}")
-
-            return export_paths
+
+            return export_paths
 
         except Exception as e:
             self.logger.error(f"Error saving results: {str(e)}",
exc_info=True)
diff --git a/app/services/s3_export.py b/app/services/s3_export.py
new file mode 100644
index 0000000..ff5e774
--- /dev/null
+++ b/app/services/s3_export.py
@@ -0,0 +1,96 @@
+import os
+import boto3
+from botocore.exceptions import ClientError
+from typing import Dict, Optional
+
+def export_to_s3(file_path: str, bucket_name: str, key: str = "",
+                 create_bucket: bool = True, access_key: Optional[str] = None,
+                 secret_key: Optional[str] = None, region: Optional[str] = None) -> Dict[str, str]:
+    """
+    Export a dataset to AWS S3
+
+    Args:
+        file_path: Path to the JSON file to export
+        bucket_name: Name of the S3 bucket
+        key: Optional key name for the file in S3 (path/to/file.json)
+        create_bucket: Whether to create the bucket if it doesn't exist
+        access_key: AWS access key (defaults to environment variable)
+        secret_key: AWS secret key (defaults to environment variable)
+        region: AWS region (defaults to environment variable)
+
+    Returns:
+        Dictionary with the S3 path of the exported file
+    """
+    try:
+        # Check if file exists
+        if not os.path.exists(file_path):
+            raise ValueError(f"File not found: {file_path}")
+
+        # Use provided credentials or environment variables
+        access_key = access_key or os.environ.get('AWS_ACCESS_KEY_ID')
+        secret_key = secret_key or os.environ.get('AWS_SECRET_ACCESS_KEY')
+        region = region or os.environ.get('AWS_DEFAULT_REGION')
+
+        if not access_key or not secret_key:
+            raise ValueError("AWS credentials not provided and not found in environment variables")
+
+        # Set up S3 client
+        s3_args = {
+            'aws_access_key_id': access_key,
+            'aws_secret_access_key': secret_key
+        }
+
+        if region:
+            s3_args['region_name'] = region
+
+        s3_client = boto3.client('s3', **s3_args)
+
+        # Create key name if not provided
+        if not key:
+            key = os.path.basename(file_path)
+
+        # Check if bucket exists
+        try:
+            s3_client.head_bucket(Bucket=bucket_name)
+        except ClientError as e:
+            error_code = e.response.get('Error', {}).get('Code', '')
+            if error_code == '404' and create_bucket:
+                # If bucket doesn't exist and create_bucket is True, create it
+                try:
+                    if region and region != 'us-east-1':
+                        s3_client.create_bucket(
+                            Bucket=bucket_name,
+                            CreateBucketConfiguration={'LocationConstraint': region}
+                        )
+                    else:
+                        s3_client.create_bucket(Bucket=bucket_name)
+                    print(f"Bucket {bucket_name} created successfully")
+                except ClientError as create_error:
+                    raise ValueError(f"Failed to create bucket: {str(create_error)}")
+            else:
+                # If there's another error or create_bucket is False
+                if error_code == '404':
+                    raise ValueError(f"Bucket {bucket_name} does not exist and create_bucket is False")
+                else:
+                    raise ValueError(f"Error accessing bucket: {str(e)}")
+
+        # Upload file to S3 using upload_file method
+        try:
+            s3_client.upload_file(
+                file_path,
+                bucket_name,
+                key,
+                ExtraArgs={'ContentType': 'application/json'}
+            )
+        except ClientError as e:
+            raise ValueError(f"Error uploading file to S3: {str(e)}")
+
+        s3_path = f"s3://{bucket_name}/{key}"
+        print(f"File successfully uploaded to {s3_path}")
+
+        return {'s3': s3_path}
+
+    except Exception as e:
+        error_msg = f"Error exporting to S3: {str(e)}"
+        print(error_msg)
+        raise Exception(error_msg)
\ No newline at end of file
diff --git a/app/services/synthesis_job.py b/app/services/synthesis_job.py
index fc3f02f..323fa7f 100644
--- a/app/services/synthesis_job.py
+++ b/app/services/synthesis_job.py
@@ -206,28 +206,49 @@ def evaluate_job(self, request: Any, cpu: int = 2, memory: int = 4, request_id =
         return {"job_name":
job_name, "job_id": job_run.job_id}
 
     #@track_job("export")
     def export_job(self, request: Any, cpu: int = 2, memory: int = 4) -> Dict[str, str]:
         """Create and run an export job"""
         params = request.model_dump()
 
+        # Generate job name based on export type
+        if "s3" in request.export_type and request.s3_config:
+            job_name_prefix = f"s3_{request.s3_config.bucket}"
+        elif "huggingface" in request.export_type and request.hf_config:
+            job_name_prefix = f"hf_{request.hf_config.hf_repo_name}"
+        else:
+            job_name_prefix = "export"
+
        job_name, job_run, file_name = self._create_and_run_job(
             "run_export_job.py",
-            f"hf_{request.hf_config.hf_repo_name}",
+            job_name_prefix,
             params,
             cpu=cpu,
             memory=memory
         )
-        repo_id = f"{request.hf_config.hf_username}/{request.hf_config.hf_repo_name}"
-        export_path = f"https://huggingface.co/datasets/{repo_id}"
+        # Initialize export paths
+        export_paths = {}
+
+        # Add HF export path if applicable
+        if "huggingface" in request.export_type and request.hf_config:
+            repo_id = f"{request.hf_config.hf_username}/{request.hf_config.hf_repo_name}"
+            export_paths['huggingface'] = f"https://huggingface.co/datasets/{repo_id}"
+
+        # Add S3 export path if applicable
+        if "s3" in request.export_type and request.s3_config:
+            key = request.s3_config.key or os.path.basename(request.file_path)
+            if request.display_name and not request.s3_config.key:
+                key = f"{request.display_name}.json"
+            export_paths['s3'] = f"s3://{request.s3_config.bucket}/{key}"
 
         metadata = {
             "timestamp": datetime.now(timezone.utc).isoformat(),
-            "display_export_name": request.hf_config.hf_repo_name,
+            "display_export_name": request.display_name or os.path.basename(request.file_path),
             "display_name": request.display_name,
             "local_export_path": request.file_path,
-            "hf_export_path": export_path,
+            "hf_export_path": export_paths.get('huggingface', ''),
+            "s3_export_path": export_paths.get('s3', ''),
             "job_id": job_run.job_id,
             "job_name": job_name,
             "job_status": self.get_job_status(job_run.job_id),
@@ -235,13 +256,21 @@
             "cpu": cpu,
             "memory": memory
         }
-
+
         self.db_manager.save_export_metadata(metadata)
 
-        return {
+
+        result = {
             "job_name": job_name,
             "job_id": job_run.job_id,
-            "hf_link": export_path
         }
+
+        # Add export paths to result
+        if 'huggingface' in export_paths:
+            result["hf_link"] = export_paths['huggingface']
+        if 's3' in export_paths:
+            result["s3_link"] = export_paths['s3']
+
+        return result
 
     def _calculate_total_count(self, request: Any) -> int:
diff --git a/app/services/synthesis_service.py b/app/services/synthesis_service.py
index 30a630a..5edf9d7 100644
--- a/app/services/synthesis_service.py
+++ b/app/services/synthesis_service.py
@@ -14,6 +14,9 @@
 import asyncio
 from fastapi import FastAPI, BackgroundTasks, HTTPException
 from app.core.exceptions import APIError, InvalidModelError, ModelHandlerError, JSONParsingError
+from app.core.data_loader import DataLoader
+import pandas as pd
+import numpy as np
 from app.models.request_models import SynthesisRequest, Example, ModelParameters
 from app.core.model_handlers import create_handler
@@ -1017,20 +1020,32 @@ async def generate_freeform(self, request: SynthesisRequest, job_name=None, is_d
 
             # For examples
             if request.example_path:
-                file_extension = os.path.splitext(request.example_path)[1].lower()
-
-                with open(request.example_path, 'r') as f:
-                    if file_extension == '.json':
-                        # Handle JSON files
-                        example_upload = json.load(f)
-                        examples_str =
json.dumps(example_upload, indent=2) - elif file_extension == '.csv': - # Handle CSV files - csv_reader = csv.DictReader(f) - example_upload = list(csv_reader) - examples_str = json.dumps(example_upload, indent=2) # Convert CSV data to JSON string format - else: - raise ValueError(f"Unsupported file extension: {file_extension}. Only .json and .csv are supported.") + try: + # Use DataLoader to load the file, limiting to 10 rows + df = DataLoader.load(request.example_path, sample_rows=10) + + # Convert DataFrame to list of dictionaries + example_upload = df.head(10).to_dict(orient='records') + + # Handle non-serializable objects + def json_serializable(obj): + if isinstance(obj, (pd.Timestamp, np.datetime64)): + return obj.isoformat() + elif isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return str(obj) + + # Convert to JSON string with custom serialization + examples_str = json.dumps(example_upload, indent=2, default=json_serializable) + + except Exception as e: + print(f"Error processing example file: {str(e)}") + examples_str = "" else: examples_value = request.example_custom if hasattr(request, 'example_custom') else None examples_str = self.safe_json_dumps(examples_value) diff --git a/build/shell_scripts/build_client.sh b/build/shell_scripts/build_client.sh index d70a30c..2a0e6bb 100644 --- a/build/shell_scripts/build_client.sh +++ b/build/shell_scripts/build_client.sh @@ -16,6 +16,7 @@ fi # Activate virtual environment - using relative path source .venv/bin/activate +export NODE_OPTIONS=--max-old-space-size=16384 # Build frontend cd "$CLIENT_DIR" rm -rf node_modules/ diff --git a/pyproject.toml b/pyproject.toml index 965dbe1..864e889 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ version = "0.1.0" description = "Synthetic Data Studio Project" requires-python = ">=3.10" dependencies = [ + # ── existing ───────────────────────────────────────────── "fastapi==0.109.2", "uvicorn==0.27.1", "pydantic==2.10.4", @@ -31,8 +32,15 @@ dependencies = [ "pytest-asyncio==0.25.3", "pytest-cov==6.0.0", "psutil==5.9.8", + "pandas>=2.2.3", + # ── new packages for data‑analysis layer ───────────────── + "numpy>=1.24.0", # explicit, for dcor/scipy (already a transitive dep of pandas) + "scipy>=1.12.0", # chi‑square, ANOVA, etc. 
+ "dcor>=0.6", # distance‑correlation metric + "openpyxl>=3.1.2", # read .xlsx files + "pyxlsb>=1.0.9", # read .xlsb files ] [tool.hatch.build.targets.wheel] -packages = ["app"] \ No newline at end of file +packages = ["app"] diff --git a/uv.lock b/uv.lock index a50b04f..dded261 100644 --- a/uv.lock +++ b/uv.lock @@ -134,12 +134,17 @@ dependencies = [ { name = "boto3" }, { name = "botocore" }, { name = "datasets" }, + { name = "dcor" }, { name = "fastapi" }, { name = "httpx" }, { name = "huggingface-hub" }, { name = "loguru" }, { name = "nest-asyncio" }, + { name = "numpy" }, { name = "openai" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "psutil" }, { name = "pydantic" }, { name = "pyflakes" }, { name = "pymupdf" }, @@ -148,6 +153,8 @@ dependencies = [ { name = "pytest-cov" }, { name = "python-docx" }, { name = "python-dotenv" }, + { name = "pyxlsb" }, + { name = "scipy" }, { name = "sqlalchemy" }, { name = "typing-extensions" }, { name = "uvicorn" }, @@ -160,12 +167,17 @@ requires-dist = [ { name = "boto3", specifier = "==1.35.48" }, { name = "botocore", specifier = "==1.35.48" }, { name = "datasets", specifier = "==2.20.0" }, + { name = "dcor", specifier = ">=0.6" }, { name = "fastapi", specifier = "==0.109.2" }, { name = "httpx", specifier = "==0.27.2" }, { name = "huggingface-hub", specifier = "==0.23.5" }, { name = "loguru", specifier = "==0.7.2" }, { name = "nest-asyncio", specifier = "==1.5.8" }, + { name = "numpy", specifier = ">=1.24.0" }, { name = "openai", specifier = "==1.57.2" }, + { name = "openpyxl", specifier = ">=3.1.2" }, + { name = "pandas", specifier = ">=2.2.3" }, + { name = "psutil", specifier = "==5.9.8" }, { name = "pydantic", specifier = "==2.10.4" }, { name = "pyflakes", specifier = "==3.2.0" }, { name = "pymupdf", specifier = "==1.25.1" }, @@ -174,6 +186,8 @@ requires-dist = [ { name = "pytest-cov", specifier = "==6.0.0" }, { name = "python-docx", specifier = "==1.1.2" }, { name = "python-dotenv", specifier = "==1.0.0" }, + { name = "pyxlsb", specifier = ">=1.0.9" }, + { name = "scipy", specifier = ">=1.12.0" }, { name = "sqlalchemy", specifier = "==2.0.38" }, { name = "typing-extensions", specifier = "==4.12.2" }, { name = "uvicorn", specifier = "==0.27.1" }, @@ -448,6 +462,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/2d/963b266bb8f88492d5ab4232d74292af8beb5b6fdae97902df9e284d4c32/datasets-2.20.0-py3-none-any.whl", hash = "sha256:76ac02e3bdfff824492e20678f0b6b1b6d080515957fe834b00c2ba8d6b18e5e", size = 547777 }, ] +[[package]] +name = "dcor" +version = "0.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numba" }, + { name = "numpy" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/a7/1d06e98f1b123be60ba5de004edba510025da689c8cfb501299a8f2ba1d1/dcor-0.6.tar.gz", hash = "sha256:f5d39776101db4787348e6be6cd9369341efeb40b070509a30d5c57185558431", size = 45509 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/f3/49770c523067d2179a600f236ea6d55f0a02909a424d055dbc50e04c4860/dcor-0.6-py3-none-any.whl", hash = "sha256:de306fc666668188749730fc803fc1d4d804d9886c92b622ba57b434fed395a2", size = 55545 }, +] + [[package]] name = "dill" version = "0.3.8" @@ -475,6 +504,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl", hash = "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af", size = 
548181 }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 }, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -783,6 +821,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256 }, ] +[[package]] +name = "joblib" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/08/8bd4a0250247861420a040b33ccf42f43c426ac91d99405374ef117e5872/joblib-1.5.0.tar.gz", hash = "sha256:d8757f955389a3dd7a23152e43bc297c2e0c2d3060056dad0feefc88a06939b5", size = 330234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/d3/13ee227a148af1c693654932b8b0b02ed64af5e1f7406d56b088b57574cd/joblib-1.5.0-py3-none-any.whl", hash = "sha256:206144b320246485b712fc8cc51f017de58225fa8b414a1fe1764a7231aca491", size = 307682 }, +] + +[[package]] +name = "llvmlite" +version = "0.44.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/89/6a/95a3d3610d5c75293d5dbbb2a76480d5d4eeba641557b69fe90af6c5b84e/llvmlite-0.44.0.tar.gz", hash = "sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4", size = 171880 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/75/d4863ddfd8ab5f6e70f4504cf8cc37f4e986ec6910f4ef8502bb7d3c1c71/llvmlite-0.44.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:9fbadbfba8422123bab5535b293da1cf72f9f478a65645ecd73e781f962ca614", size = 28132306 }, + { url = "https://files.pythonhosted.org/packages/37/d9/6e8943e1515d2f1003e8278819ec03e4e653e2eeb71e4d00de6cfe59424e/llvmlite-0.44.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cccf8eb28f24840f2689fb1a45f9c0f7e582dd24e088dcf96e424834af11f791", size = 26201096 }, + { url = "https://files.pythonhosted.org/packages/aa/46/8ffbc114def88cc698906bf5acab54ca9fdf9214fe04aed0e71731fb3688/llvmlite-0.44.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7202b678cdf904823c764ee0fe2dfe38a76981f4c1e51715b4cb5abb6cf1d9e8", size = 42361859 }, + { url = "https://files.pythonhosted.org/packages/30/1c/9366b29ab050a726af13ebaae8d0dff00c3c58562261c79c635ad4f5eb71/llvmlite-0.44.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:40526fb5e313d7b96bda4cbb2c85cd5374e04d80732dd36a282d72a560bb6408", size = 41184199 }, + { url = "https://files.pythonhosted.org/packages/69/07/35e7c594b021ecb1938540f5bce543ddd8713cff97f71d81f021221edc1b/llvmlite-0.44.0-cp310-cp310-win_amd64.whl", hash = "sha256:41e3839150db4330e1b2716c0be3b5c4672525b4c9005e17c7597f835f351ce2", size = 30332381 }, + { url = "https://files.pythonhosted.org/packages/b5/e2/86b245397052386595ad726f9742e5223d7aea999b18c518a50e96c3aca4/llvmlite-0.44.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = 
"sha256:eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3", size = 28132305 }, + { url = "https://files.pythonhosted.org/packages/ff/ec/506902dc6870249fbe2466d9cf66d531265d0f3a1157213c8f986250c033/llvmlite-0.44.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ace564d9fa44bb91eb6e6d8e7754977783c68e90a471ea7ce913bff30bd62427", size = 26201090 }, + { url = "https://files.pythonhosted.org/packages/99/fe/d030f1849ebb1f394bb3f7adad5e729b634fb100515594aca25c354ffc62/llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5d22c3bfc842668168a786af4205ec8e3ad29fb1bc03fd11fd48460d0df64c1", size = 42361858 }, + { url = "https://files.pythonhosted.org/packages/d7/7a/ce6174664b9077fc673d172e4c888cb0b128e707e306bc33fff8c2035f0d/llvmlite-0.44.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f01a394e9c9b7b1d4e63c327b096d10f6f0ed149ef53d38a09b3749dcf8c9610", size = 41184200 }, + { url = "https://files.pythonhosted.org/packages/5f/c6/258801143975a6d09a373f2641237992496e15567b907a4d401839d671b8/llvmlite-0.44.0-cp311-cp311-win_amd64.whl", hash = "sha256:d8489634d43c20cd0ad71330dde1d5bc7b9966937a263ff1ec1cebb90dc50955", size = 30331193 }, + { url = "https://files.pythonhosted.org/packages/15/86/e3c3195b92e6e492458f16d233e58a1a812aa2bfbef9bdd0fbafcec85c60/llvmlite-0.44.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:1d671a56acf725bf1b531d5ef76b86660a5ab8ef19bb6a46064a705c6ca80aad", size = 28132297 }, + { url = "https://files.pythonhosted.org/packages/d6/53/373b6b8be67b9221d12b24125fd0ec56b1078b660eeae266ec388a6ac9a0/llvmlite-0.44.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5f79a728e0435493611c9f405168682bb75ffd1fbe6fc360733b850c80a026db", size = 26201105 }, + { url = "https://files.pythonhosted.org/packages/cb/da/8341fd3056419441286c8e26bf436923021005ece0bff5f41906476ae514/llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0143a5ef336da14deaa8ec26c5449ad5b6a2b564df82fcef4be040b9cacfea9", size = 42361901 }, + { url = "https://files.pythonhosted.org/packages/53/ad/d79349dc07b8a395a99153d7ce8b01d6fcdc9f8231355a5df55ded649b61/llvmlite-0.44.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d752f89e31b66db6f8da06df8b39f9b91e78c5feea1bf9e8c1fba1d1c24c065d", size = 41184247 }, + { url = "https://files.pythonhosted.org/packages/e2/3b/a9a17366af80127bd09decbe2a54d8974b6d8b274b39bf47fbaedeec6307/llvmlite-0.44.0-cp312-cp312-win_amd64.whl", hash = "sha256:eae7e2d4ca8f88f89d315b48c6b741dcb925d6a1042da694aa16ab3dd4cbd3a1", size = 30332380 }, + { url = "https://files.pythonhosted.org/packages/89/24/4c0ca705a717514c2092b18476e7a12c74d34d875e05e4d742618ebbf449/llvmlite-0.44.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:319bddd44e5f71ae2689859b7203080716448a3cd1128fb144fe5c055219d516", size = 28132306 }, + { url = "https://files.pythonhosted.org/packages/01/cf/1dd5a60ba6aee7122ab9243fd614abcf22f36b0437cbbe1ccf1e3391461c/llvmlite-0.44.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c58867118bad04a0bb22a2e0068c693719658105e40009ffe95c7000fcde88e", size = 26201090 }, + { url = "https://files.pythonhosted.org/packages/d2/1b/656f5a357de7135a3777bd735cc7c9b8f23b4d37465505bd0eaf4be9befe/llvmlite-0.44.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46224058b13c96af1365290bdfebe9a6264ae62fb79b2b55693deed11657a8bf", size = 42361904 }, + { url = 
"https://files.pythonhosted.org/packages/d8/e1/12c5f20cb9168fb3464a34310411d5ad86e4163c8ff2d14a2b57e5cc6bac/llvmlite-0.44.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa0097052c32bf721a4efc03bd109d335dfa57d9bffb3d4c24cc680711b8b4fc", size = 41184245 }, + { url = "https://files.pythonhosted.org/packages/d0/81/e66fc86539293282fd9cb7c9417438e897f369e79ffb62e1ae5e5154d4dd/llvmlite-0.44.0-cp313-cp313-win_amd64.whl", hash = "sha256:2fb7c4f2fb86cbae6dca3db9ab203eeea0e22d73b99bc2341cdf9de93612e930", size = 30331193 }, +] + [[package]] name = "loguru" version = "0.7.2" @@ -1047,6 +1122,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/d3/48c01d1944e0ee49fdc005bf518a68b0582d3bd201e5401664890b62a647/nest_asyncio-1.5.8-py3-none-any.whl", hash = "sha256:accda7a339a70599cb08f9dd09a67e0c2ef8d8d6f4c07f96ab203f2ae254e48d", size = 5268 }, ] +[[package]] +name = "numba" +version = "0.61.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "llvmlite" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1c/a0/e21f57604304aa03ebb8e098429222722ad99176a4f979d34af1d1ee80da/numba-0.61.2.tar.gz", hash = "sha256:8750ee147940a6637b80ecf7f95062185ad8726c8c28a2295b8ec1160a196f7d", size = 2820615 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/ca/f470be59552ccbf9531d2d383b67ae0b9b524d435fb4a0d229fef135116e/numba-0.61.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a", size = 2775663 }, + { url = "https://files.pythonhosted.org/packages/f5/13/3bdf52609c80d460a3b4acfb9fdb3817e392875c0d6270cf3fd9546f138b/numba-0.61.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd", size = 2778344 }, + { url = "https://files.pythonhosted.org/packages/e2/7d/bfb2805bcfbd479f04f835241ecf28519f6e3609912e3a985aed45e21370/numba-0.61.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae8c7a522c26215d5f62ebec436e3d341f7f590079245a2f1008dfd498cc1642", size = 3824054 }, + { url = "https://files.pythonhosted.org/packages/e3/27/797b2004745c92955470c73c82f0e300cf033c791f45bdecb4b33b12bdea/numba-0.61.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd1e74609855aa43661edffca37346e4e8462f6903889917e9f41db40907daa2", size = 3518531 }, + { url = "https://files.pythonhosted.org/packages/b1/c6/c2fb11e50482cb310afae87a997707f6c7d8a48967b9696271347441f650/numba-0.61.2-cp310-cp310-win_amd64.whl", hash = "sha256:ae45830b129c6137294093b269ef0a22998ccc27bf7cf096ab8dcf7bca8946f9", size = 2831612 }, + { url = "https://files.pythonhosted.org/packages/3f/97/c99d1056aed767503c228f7099dc11c402906b42a4757fec2819329abb98/numba-0.61.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2", size = 2775825 }, + { url = "https://files.pythonhosted.org/packages/95/9e/63c549f37136e892f006260c3e2613d09d5120672378191f2dc387ba65a2/numba-0.61.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:49c980e4171948ffebf6b9a2520ea81feed113c1f4890747ba7f59e74be84b1b", size = 2778695 }, + { url = "https://files.pythonhosted.org/packages/97/c8/8740616c8436c86c1b9a62e72cb891177d2c34c2d24ddcde4c390371bf4c/numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3945615cd73c2c7eba2a85ccc9c1730c21cd3958bfcf5a44302abae0fb07bb60", size = 3829227 }, + { url = 
"https://files.pythonhosted.org/packages/fc/06/66e99ae06507c31d15ff3ecd1f108f2f59e18b6e08662cd5f8a5853fbd18/numba-0.61.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbfdf4eca202cebade0b7d43896978e146f39398909a42941c9303f82f403a18", size = 3523422 }, + { url = "https://files.pythonhosted.org/packages/0f/a4/2b309a6a9f6d4d8cfba583401c7c2f9ff887adb5d54d8e2e130274c0973f/numba-0.61.2-cp311-cp311-win_amd64.whl", hash = "sha256:76bcec9f46259cedf888041b9886e257ae101c6268261b19fda8cfbc52bec9d1", size = 2831505 }, + { url = "https://files.pythonhosted.org/packages/b4/a0/c6b7b9c615cfa3b98c4c63f4316e3f6b3bbe2387740277006551784218cd/numba-0.61.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2", size = 2776626 }, + { url = "https://files.pythonhosted.org/packages/92/4a/fe4e3c2ecad72d88f5f8cd04e7f7cff49e718398a2fac02d2947480a00ca/numba-0.61.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ddce10009bc097b080fc96876d14c051cc0c7679e99de3e0af59014dab7dfe8", size = 2779287 }, + { url = "https://files.pythonhosted.org/packages/9a/2d/e518df036feab381c23a624dac47f8445ac55686ec7f11083655eb707da3/numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b1bb509d01f23d70325d3a5a0e237cbc9544dd50e50588bc581ba860c213546", size = 3885928 }, + { url = "https://files.pythonhosted.org/packages/10/0f/23cced68ead67b75d77cfcca3df4991d1855c897ee0ff3fe25a56ed82108/numba-0.61.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:48a53a3de8f8793526cbe330f2a39fe9a6638efcbf11bd63f3d2f9757ae345cd", size = 3577115 }, + { url = "https://files.pythonhosted.org/packages/68/1d/ddb3e704c5a8fb90142bf9dc195c27db02a08a99f037395503bfbc1d14b3/numba-0.61.2-cp312-cp312-win_amd64.whl", hash = "sha256:97cf4f12c728cf77c9c1d7c23707e4d8fb4632b46275f8f3397de33e5877af18", size = 2831929 }, + { url = "https://files.pythonhosted.org/packages/0b/f3/0fe4c1b1f2569e8a18ad90c159298d862f96c3964392a20d74fc628aee44/numba-0.61.2-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:3a10a8fc9afac40b1eac55717cece1b8b1ac0b946f5065c89e00bde646b5b154", size = 2771785 }, + { url = "https://files.pythonhosted.org/packages/e9/71/91b277d712e46bd5059f8a5866862ed1116091a7cb03bd2704ba8ebe015f/numba-0.61.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7d3bcada3c9afba3bed413fba45845f2fb9cd0d2b27dd58a1be90257e293d140", size = 2773289 }, + { url = "https://files.pythonhosted.org/packages/0d/e0/5ea04e7ad2c39288c0f0f9e8d47638ad70f28e275d092733b5817cf243c9/numba-0.61.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bdbca73ad81fa196bd53dc12e3aaf1564ae036e0c125f237c7644fe64a4928ab", size = 3893918 }, + { url = "https://files.pythonhosted.org/packages/17/58/064f4dcb7d7e9412f16ecf80ed753f92297e39f399c905389688cf950b81/numba-0.61.2-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:5f154aaea625fb32cfbe3b80c5456d514d416fcdf79733dd69c0df3a11348e9e", size = 3584056 }, + { url = "https://files.pythonhosted.org/packages/af/a4/6d3a0f2d3989e62a18749e1e9913d5fa4910bbb3e3311a035baea6caf26d/numba-0.61.2-cp313-cp313-win_amd64.whl", hash = "sha256:59321215e2e0ac5fa928a8020ab00b8e57cda8a97384963ac0dfa4d4e6aa54e7", size = 2831846 }, +] + [[package]] name = "numpy" version = "2.2.3" @@ -1128,6 +1235,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/37/e7/95437fb676381e927d4cb3f9f8dd90ed24cfd264f572db4d395037428594/openai-1.57.2-py3-none-any.whl", hash = "sha256:f7326283c156fdee875746e7e54d36959fb198eadc683952ee05e3302fbd638d", 
size = 389873 }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 }, +] + [[package]] name = "packaging" version = "24.2" @@ -1283,6 +1402,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/35/6c4c6fc8774a9e3629cd750dc24a7a4fb090a25ccd5c3246d127b70f9e22/propcache-0.3.0-py3-none-any.whl", hash = "sha256:67dda3c7325691c2081510e92c561f465ba61b975f481735aefdfc845d2cd043", size = 12101 }, ] +[[package]] +name = "psutil" +version = "5.9.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/90/c7/6dc0a455d111f68ee43f27793971cf03fe29b6ef972042549db29eec39a2/psutil-5.9.8.tar.gz", hash = "sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c", size = 503247 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/e3/07ae864a636d70a8a6f58da27cb1179192f1140d5d1da10886ade9405797/psutil-5.9.8-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81", size = 248702 }, + { url = "https://files.pythonhosted.org/packages/b3/bd/28c5f553667116b2598b9cc55908ec435cb7f77a34f2bff3e3ca765b0f78/psutil-5.9.8-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421", size = 285242 }, + { url = "https://files.pythonhosted.org/packages/c5/4f/0e22aaa246f96d6ac87fe5ebb9c5a693fbe8877f537a1022527c47ca43c5/psutil-5.9.8-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4", size = 288191 }, + { url = "https://files.pythonhosted.org/packages/6e/f5/2aa3a4acdc1e5940b59d421742356f133185667dd190b166dbcfcf5d7b43/psutil-5.9.8-cp37-abi3-win32.whl", hash = "sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0", size = 251252 }, + { url = "https://files.pythonhosted.org/packages/93/52/3e39d26feae7df0aa0fd510b14012c3678b36ed068f7d78b8d8784d61f0e/psutil-5.9.8-cp37-abi3-win_amd64.whl", hash = "sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf", size = 255090 }, + { url = "https://files.pythonhosted.org/packages/05/33/2d74d588408caedd065c2497bdb5ef83ce6082db01289a1e1147f6639802/psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8", size = 249898 }, +] + [[package]] name = "pyarrow" version = "19.0.1" @@ -1541,6 +1674,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57", size = 507930 }, ] +[[package]] +name = "pyxlsb" +version = "1.0.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/3f/13/eebaeb7a40b062d1c6f7f91d09e73d30a69e33e4baa7cbe4b7658548b1cd/pyxlsb-1.0.10.tar.gz", hash = "sha256:8062d1ea8626d3f1980e8b1cfe91a4483747449242ecb61013bc2df85435f685", size = 22424 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/92/345823838ae367c59b63e03aef9c331f485370f9df6d049256a61a28f06d/pyxlsb-1.0.10-py2.py3-none-any.whl", hash = "sha256:87c122a9a622e35ca5e741d2e541201d28af00fb46bec492cfa9586890b120b4", size = 23849 }, +] + [[package]] name = "pyyaml" version = "6.0.2" @@ -1624,6 +1766,62 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/66/05/7957af15543b8c9799209506df4660cba7afc4cf94bfb60513827e96bed6/s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e", size = 83175 }, ] +[[package]] +name = "scipy" +version = "1.15.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/b9/31ba9cd990e626574baf93fbc1ac61cf9ed54faafd04c479117517661637/scipy-1.15.2.tar.gz", hash = "sha256:cd58a314d92838f7e6f755c8a2167ead4f27e1fd5c1251fd54289569ef3495ec", size = 59417316 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/df/ef233fff6838fe6f7840d69b5ef9f20d2b5c912a8727b21ebf876cb15d54/scipy-1.15.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9", size = 38692502 }, + { url = "https://files.pythonhosted.org/packages/5c/20/acdd4efb8a68b842968f7bc5611b1aeb819794508771ad104de418701422/scipy-1.15.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5", size = 30085508 }, + { url = "https://files.pythonhosted.org/packages/42/55/39cf96ca7126f1e78ee72a6344ebdc6702fc47d037319ad93221063e6cf4/scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ecf797d2d798cf7c838c6d98321061eb3e72a74710e6c40540f0e8087e3b499e", size = 22359166 }, + { url = "https://files.pythonhosted.org/packages/51/48/708d26a4ab8a1441536bf2dfcad1df0ca14a69f010fba3ccbdfc02df7185/scipy-1.15.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:9b18aa747da280664642997e65aab1dd19d0c3d17068a04b3fe34e2559196cb9", size = 25112047 }, + { url = "https://files.pythonhosted.org/packages/dd/65/f9c5755b995ad892020381b8ae11f16d18616208e388621dfacc11df6de6/scipy-1.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87994da02e73549dfecaed9e09a4f9d58a045a053865679aeb8d6d43747d4df3", size = 35536214 }, + { url = "https://files.pythonhosted.org/packages/de/3c/c96d904b9892beec978562f64d8cc43f9cca0842e65bd3cd1b7f7389b0ba/scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69ea6e56d00977f355c0f84eba69877b6df084516c602d93a33812aa04d90a3d", size = 37646981 }, + { url = "https://files.pythonhosted.org/packages/3d/74/c2d8a24d18acdeae69ed02e132b9bc1bb67b7bee90feee1afe05a68f9d67/scipy-1.15.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:888307125ea0c4466287191e5606a2c910963405ce9671448ff9c81c53f85f58", size = 37230048 }, + { url = "https://files.pythonhosted.org/packages/42/19/0aa4ce80eca82d487987eff0bc754f014dec10d20de2f66754fa4ea70204/scipy-1.15.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9412f5e408b397ff5641080ed1e798623dbe1ec0d78e72c9eca8992976fa65aa", size = 40010322 }, + { url = 
"https://files.pythonhosted.org/packages/d0/d2/f0683b7e992be44d1475cc144d1f1eeae63c73a14f862974b4db64af635e/scipy-1.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:b5e025e903b4f166ea03b109bb241355b9c42c279ea694d8864d033727205e65", size = 41233385 }, + { url = "https://files.pythonhosted.org/packages/40/1f/bf0a5f338bda7c35c08b4ed0df797e7bafe8a78a97275e9f439aceb46193/scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:92233b2df6938147be6fa8824b8136f29a18f016ecde986666be5f4d686a91a4", size = 38703651 }, + { url = "https://files.pythonhosted.org/packages/de/54/db126aad3874601048c2c20ae3d8a433dbfd7ba8381551e6f62606d9bd8e/scipy-1.15.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:62ca1ff3eb513e09ed17a5736929429189adf16d2d740f44e53270cc800ecff1", size = 30102038 }, + { url = "https://files.pythonhosted.org/packages/61/d8/84da3fffefb6c7d5a16968fe5b9f24c98606b165bb801bb0b8bc3985200f/scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c6676490ad76d1c2894d77f976144b41bd1a4052107902238047fb6a473e971", size = 22375518 }, + { url = "https://files.pythonhosted.org/packages/44/78/25535a6e63d3b9c4c90147371aedb5d04c72f3aee3a34451f2dc27c0c07f/scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:a8bf5cb4a25046ac61d38f8d3c3426ec11ebc350246a4642f2f315fe95bda655", size = 25142523 }, + { url = "https://files.pythonhosted.org/packages/e0/22/4b4a26fe1cd9ed0bc2b2cb87b17d57e32ab72c346949eaf9288001f8aa8e/scipy-1.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a8e34cf4c188b6dd004654f88586d78f95639e48a25dfae9c5e34a6dc34547e", size = 35491547 }, + { url = "https://files.pythonhosted.org/packages/32/ea/564bacc26b676c06a00266a3f25fdfe91a9d9a2532ccea7ce6dd394541bc/scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28a0d2c2075946346e4408b211240764759e0fabaeb08d871639b5f3b1aca8a0", size = 37634077 }, + { url = "https://files.pythonhosted.org/packages/43/c2/bfd4e60668897a303b0ffb7191e965a5da4056f0d98acfb6ba529678f0fb/scipy-1.15.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:42dabaaa798e987c425ed76062794e93a243be8f0f20fff6e7a89f4d61cb3d40", size = 37231657 }, + { url = "https://files.pythonhosted.org/packages/4a/75/5f13050bf4f84c931bcab4f4e83c212a36876c3c2244475db34e4b5fe1a6/scipy-1.15.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f5e296ec63c5da6ba6fa0343ea73fd51b8b3e1a300b0a8cae3ed4b1122c7462", size = 40035857 }, + { url = "https://files.pythonhosted.org/packages/b9/8b/7ec1832b09dbc88f3db411f8cdd47db04505c4b72c99b11c920a8f0479c3/scipy-1.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:597a0c7008b21c035831c39927406c6181bcf8f60a73f36219b69d010aa04737", size = 41217654 }, + { url = "https://files.pythonhosted.org/packages/4b/5d/3c78815cbab499610f26b5bae6aed33e227225a9fa5290008a733a64f6fc/scipy-1.15.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c4697a10da8f8765bb7c83e24a470da5797e37041edfd77fd95ba3811a47c4fd", size = 38756184 }, + { url = "https://files.pythonhosted.org/packages/37/20/3d04eb066b471b6e171827548b9ddb3c21c6bbea72a4d84fc5989933910b/scipy-1.15.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:869269b767d5ee7ea6991ed7e22b3ca1f22de73ab9a49c44bad338b725603301", size = 30163558 }, + { url = "https://files.pythonhosted.org/packages/a4/98/e5c964526c929ef1f795d4c343b2ff98634ad2051bd2bbadfef9e772e413/scipy-1.15.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:bad78d580270a4d32470563ea86c6590b465cb98f83d760ff5b0990cb5518a93", size = 22437211 }, + { url = 
"https://files.pythonhosted.org/packages/1d/cd/1dc7371e29195ecbf5222f9afeedb210e0a75057d8afbd942aa6cf8c8eca/scipy-1.15.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b09ae80010f52efddb15551025f9016c910296cf70adbf03ce2a8704f3a5ad20", size = 25232260 }, + { url = "https://files.pythonhosted.org/packages/f0/24/1a181a9e5050090e0b5138c5f496fee33293c342b788d02586bc410c6477/scipy-1.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6fd6eac1ce74a9f77a7fc724080d507c5812d61e72bd5e4c489b042455865e", size = 35198095 }, + { url = "https://files.pythonhosted.org/packages/c0/53/eaada1a414c026673eb983f8b4a55fe5eb172725d33d62c1b21f63ff6ca4/scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b871df1fe1a3ba85d90e22742b93584f8d2b8e6124f8372ab15c71b73e428b8", size = 37297371 }, + { url = "https://files.pythonhosted.org/packages/e9/06/0449b744892ed22b7e7b9a1994a866e64895363572677a316a9042af1fe5/scipy-1.15.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:03205d57a28e18dfd39f0377d5002725bf1f19a46f444108c29bdb246b6c8a11", size = 36872390 }, + { url = "https://files.pythonhosted.org/packages/6a/6f/a8ac3cfd9505ec695c1bc35edc034d13afbd2fc1882a7c6b473e280397bb/scipy-1.15.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:601881dfb761311045b03114c5fe718a12634e5608c3b403737ae463c9885d53", size = 39700276 }, + { url = "https://files.pythonhosted.org/packages/f5/6f/e6e5aff77ea2a48dd96808bb51d7450875af154ee7cbe72188afb0b37929/scipy-1.15.2-cp312-cp312-win_amd64.whl", hash = "sha256:e7c68b6a43259ba0aab737237876e5c2c549a031ddb7abc28c7b47f22e202ded", size = 40942317 }, + { url = "https://files.pythonhosted.org/packages/53/40/09319f6e0f276ea2754196185f95cd191cb852288440ce035d5c3a931ea2/scipy-1.15.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01edfac9f0798ad6b46d9c4c9ca0e0ad23dbf0b1eb70e96adb9fa7f525eff0bf", size = 38717587 }, + { url = "https://files.pythonhosted.org/packages/fe/c3/2854f40ecd19585d65afaef601e5e1f8dbf6758b2f95b5ea93d38655a2c6/scipy-1.15.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:08b57a9336b8e79b305a143c3655cc5bdbe6d5ece3378578888d2afbb51c4e37", size = 30100266 }, + { url = "https://files.pythonhosted.org/packages/dd/b1/f9fe6e3c828cb5930b5fe74cb479de5f3d66d682fa8adb77249acaf545b8/scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:54c462098484e7466362a9f1672d20888f724911a74c22ae35b61f9c5919183d", size = 22373768 }, + { url = "https://files.pythonhosted.org/packages/15/9d/a60db8c795700414c3f681908a2b911e031e024d93214f2d23c6dae174ab/scipy-1.15.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf72ff559a53a6a6d77bd8eefd12a17995ffa44ad86c77a5df96f533d4e6c6bb", size = 25154719 }, + { url = "https://files.pythonhosted.org/packages/37/3b/9bda92a85cd93f19f9ed90ade84aa1e51657e29988317fabdd44544f1dd4/scipy-1.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de9d1416b3d9e7df9923ab23cd2fe714244af10b763975bea9e4f2e81cebd27", size = 35163195 }, + { url = "https://files.pythonhosted.org/packages/03/5a/fc34bf1aa14dc7c0e701691fa8685f3faec80e57d816615e3625f28feb43/scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb530e4794fc8ea76a4a21ccb67dea33e5e0e60f07fc38a49e821e1eae3b71a0", size = 37255404 }, + { url = "https://files.pythonhosted.org/packages/4a/71/472eac45440cee134c8a180dbe4c01b3ec247e0338b7c759e6cd71f199a7/scipy-1.15.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:5ea7ed46d437fc52350b028b1d44e002646e28f3e8ddc714011aaf87330f2f32", size = 36860011 }, + { url = "https://files.pythonhosted.org/packages/01/b3/21f890f4f42daf20e4d3aaa18182dddb9192771cd47445aaae2e318f6738/scipy-1.15.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11e7ad32cf184b74380f43d3c0a706f49358b904fa7d5345f16ddf993609184d", size = 39657406 }, + { url = "https://files.pythonhosted.org/packages/0d/76/77cf2ac1f2a9cc00c073d49e1e16244e389dd88e2490c91d84e1e3e4d126/scipy-1.15.2-cp313-cp313-win_amd64.whl", hash = "sha256:a5080a79dfb9b78b768cebf3c9dcbc7b665c5875793569f48bf0e2b1d7f68f6f", size = 40961243 }, + { url = "https://files.pythonhosted.org/packages/4c/4b/a57f8ddcf48e129e6054fa9899a2a86d1fc6b07a0e15c7eebff7ca94533f/scipy-1.15.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:447ce30cee6a9d5d1379087c9e474628dab3db4a67484be1b7dc3196bfb2fac9", size = 38870286 }, + { url = "https://files.pythonhosted.org/packages/0c/43/c304d69a56c91ad5f188c0714f6a97b9c1fed93128c691148621274a3a68/scipy-1.15.2-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c90ebe8aaa4397eaefa8455a8182b164a6cc1d59ad53f79943f266d99f68687f", size = 30141634 }, + { url = "https://files.pythonhosted.org/packages/44/1a/6c21b45d2548eb73be9b9bff421aaaa7e85e22c1f9b3bc44b23485dfce0a/scipy-1.15.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:def751dd08243934c884a3221156d63e15234a3155cf25978b0a668409d45eb6", size = 22415179 }, + { url = "https://files.pythonhosted.org/packages/74/4b/aefac4bba80ef815b64f55da06f62f92be5d03b467f2ce3668071799429a/scipy-1.15.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:302093e7dfb120e55515936cb55618ee0b895f8bcaf18ff81eca086c17bd80af", size = 25126412 }, + { url = "https://files.pythonhosted.org/packages/b1/53/1cbb148e6e8f1660aacd9f0a9dfa2b05e9ff1cb54b4386fe868477972ac2/scipy-1.15.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd5b77413e1855351cdde594eca99c1f4a588c2d63711388b6a1f1c01f62274", size = 34952867 }, + { url = "https://files.pythonhosted.org/packages/2c/23/e0eb7f31a9c13cf2dca083828b97992dd22f8184c6ce4fec5deec0c81fcf/scipy-1.15.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d0194c37037707b2afa7a2f2a924cf7bac3dc292d51b6a925e5fcb89bc5c776", size = 36890009 }, + { url = "https://files.pythonhosted.org/packages/03/f3/e699e19cabe96bbac5189c04aaa970718f0105cff03d458dc5e2b6bd1e8c/scipy-1.15.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:bae43364d600fdc3ac327db99659dcb79e6e7ecd279a75fe1266669d9a652828", size = 36545159 }, + { url = "https://files.pythonhosted.org/packages/af/f5/ab3838e56fe5cc22383d6fcf2336e48c8fe33e944b9037fbf6cbdf5a11f8/scipy-1.15.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f031846580d9acccd0044efd1a90e6f4df3a6e12b4b6bd694a7bc03a89892b28", size = 39136566 }, + { url = "https://files.pythonhosted.org/packages/0a/c8/b3f566db71461cabd4b2d5b39bcc24a7e1c119535c8361f81426be39bb47/scipy-1.15.2-cp313-cp313t-win_amd64.whl", hash = "sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db", size = 40477705 }, +] + [[package]] name = "six" version = "1.17.0"